Skip to content

Commit ba9644b

Browse files
AndreasArvidssonpre-commit-ci[bot]pokey
authored
Add word separator setting (#1078)
* Added word separator setting. Removed language specific regex components. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated tests * Word separator is now a list * Update documentation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Mock word separators for tests * Minor tweaks * Update src/core/tokenizerConfiguration.ts Co-authored-by: Pokey Rule <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert "Minor tweaks" This reverts commit b7f3695. * cleanup * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use join to create key * Update src/core/tokenizer.ts Co-authored-by: Pokey Rule <[email protected]> * Update docs Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pokey Rule <[email protected]>
1 parent 04c14ba commit ba9644b

File tree

9 files changed

+104
-53
lines changed

9 files changed

+104
-53
lines changed

docs/user/customization.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,20 @@ While the hats are hidden, you will not be able to address any marks, eg `"take
7575

7676
If you'd like to map a voice command to toggle the hats, have a look at https://youtu.be/oWUJyDgz63k
7777

78+
## Updating word separators
79+
80+
The word separators are characters that define the boundary between words in an identifier. For example, `hello_world` is an identifier with two words separated by `_`. If you'd like to support other separators, like `-` in `hello-world`, you can change the `cursorless.wordSeparators` setting. This setting can also be overridden per language.
81+
82+
```json
83+
// Sets the word separator for all languages
84+
"cursorless.wordSeparators": ["_"]
85+
86+
// Sets the word separator for css only
87+
"[css]": {
88+
"cursorless.wordSeparators": ["_", "-"]
89+
}
90+
```
91+
7892
## Cursorless public API
7993

8094
Cursorless exposes a couple talon actions and captures that you can use to define your own custom command grammar leveraging cursorless targets.

package.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,18 @@
207207
"description": "How much to vertically shift the hats as a percentage of font size; positive is up",
208208
"order": 1
209209
},
210+
"cursorless.wordSeparators": {
211+
"type": "array",
212+
"items": {
213+
"type": "string"
214+
},
215+
"default": [
216+
"_"
217+
],
218+
"scope": "language-overridable",
219+
"markdownDescription": "A list of characters that separate words in identifiers. For example `_` splits `hello_world` into two words.",
220+
"order": 6
221+
},
210222
"cursorless.colors.dark": {
211223
"description": "Colors to use for dark theme",
212224
"type": "object",

src/core/Decorations.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ export default class Decorations {
5555
},
5656
),
5757

58+
// Don't use fine grained settings here until tokenizer has migrated to graph
5859
vscode.workspace.onDidChangeConfiguration(this.recomputeDecorationStyles),
5960
);
6061
}

src/core/languageTokenizers.ts

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/core/tokenizer.ts

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
import { escapeRegExp, mapValues } from "lodash";
2-
import { LanguageId, SupportedLanguageId } from "../languages/constants";
3-
1+
import { escapeRegExp } from "lodash";
42
import { matchAll } from "../util/regex";
5-
import { languageWithDashedIdentifiers } from "./languageTokenizers";
6-
import {
7-
LanguageTokenizerComponents,
8-
LanguageTokenizerOverrides,
9-
} from "./tokenizer.types";
3+
import { LanguageTokenizerComponents } from "./tokenizer.types";
4+
import { tokenizerConfiguration } from "./tokenizerConfiguration";
105

116
const REPEATABLE_SYMBOLS = [
127
"-",
@@ -46,27 +41,17 @@ const FIXED_TOKENS = [
4641
];
4742

4843
export const IDENTIFIER_WORD_REGEXES = ["\\p{L}", "\\p{M}", "\\p{N}"];
49-
const IDENTIFIER_WORD_DELIMITERS = ["_"];
5044
const SINGLE_SYMBOLS_REGEX = "[^\\s\\w]";
5145
const NUMBERS_REGEX = "(?<=[^.\\d]|^)\\d+\\.\\d+(?=[^.\\d]|$)"; // (not-dot/digit digits dot digits not-dot/digit)
5246

53-
const defaultLanguageTokenizerComponents: LanguageTokenizerComponents = {
54-
fixedTokens: FIXED_TOKENS,
55-
repeatableSymbols: REPEATABLE_SYMBOLS,
56-
identifierWordRegexes: IDENTIFIER_WORD_REGEXES,
57-
identifierWordDelimiters: IDENTIFIER_WORD_DELIMITERS,
58-
numbersRegex: NUMBERS_REGEX,
59-
singleSymbolsRegex: SINGLE_SYMBOLS_REGEX,
60-
};
6147
interface Matcher {
6248
tokenMatcher: RegExp;
6349
identifierMatcher: RegExp;
6450
wordMatcher: RegExp;
6551
}
66-
const defaultMatcher = generateMatcher();
6752

6853
function generateMatcher(
69-
languageOverrides: LanguageTokenizerOverrides = {},
54+
languageComponents: LanguageTokenizerComponents,
7055
): Matcher {
7156
const {
7257
fixedTokens,
@@ -75,10 +60,7 @@ function generateMatcher(
7560
identifierWordDelimiters,
7661
numbersRegex,
7762
singleSymbolsRegex,
78-
}: LanguageTokenizerComponents = {
79-
...defaultLanguageTokenizerComponents,
80-
...languageOverrides,
81-
};
63+
} = languageComponents;
8264

8365
const repeatableSymbolsRegex = repeatableSymbols
8466
.map(escapeRegExp)
@@ -109,24 +91,25 @@ function generateMatcher(
10991
};
11092
}
11193

112-
const languageTokenizerOverrides: Partial<
113-
Record<LanguageId, LanguageTokenizerOverrides>
114-
> = {
115-
css: languageWithDashedIdentifiers,
116-
scss: languageWithDashedIdentifiers,
117-
shellscript: languageWithDashedIdentifiers,
118-
};
119-
120-
const tokenMatchersForLanguage: Partial<Record<LanguageId, Matcher>> =
121-
mapValues(languageTokenizerOverrides, (val: LanguageTokenizerComponents) =>
122-
generateMatcher(val),
123-
);
94+
const matchers = new Map<string, Matcher>();
12495

12596
export function getMatcher(languageId: string): Matcher {
126-
return (
127-
tokenMatchersForLanguage[languageId as SupportedLanguageId] ??
128-
defaultMatcher
129-
);
97+
const wordSeparators = tokenizerConfiguration.getWordSeparators(languageId);
98+
const key = wordSeparators.join("\u0000");
99+
100+
if (!matchers.has(key)) {
101+
const components: LanguageTokenizerComponents = {
102+
fixedTokens: FIXED_TOKENS,
103+
repeatableSymbols: REPEATABLE_SYMBOLS,
104+
identifierWordRegexes: IDENTIFIER_WORD_REGEXES,
105+
identifierWordDelimiters: wordSeparators,
106+
numbersRegex: NUMBERS_REGEX,
107+
singleSymbolsRegex: SINGLE_SYMBOLS_REGEX,
108+
};
109+
matchers.set(key, generateMatcher(components));
110+
}
111+
112+
return matchers.get(key)!;
130113
}
131114

132115
export function tokenize<T>(

src/core/tokenizer.types.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,3 @@ export interface LanguageTokenizerComponents {
2020
repeatableSymbols: string[];
2121
singleSymbolsRegex: string;
2222
}
23-
24-
export type LanguageTokenizerOverrides = Partial<LanguageTokenizerComponents>;

src/core/tokenizerConfiguration.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/**
2+
* TODO: This is just an ugly mock since that tokenizer doesn't have access to the graph/ide
3+
* Remove this once https://github.com/cursorless-dev/cursorless/issues/785 is implemented
4+
*/
5+
6+
import * as vscode from "vscode";
7+
8+
const defaultGetWordSeparators = (languageId: string) => {
9+
// FIXME: The reason this code will auto-reload on settings change is that we don't use fine-grained settings listener in `Decorations`:
10+
// https://github.com/cursorless-dev/cursorless/blob/c914d477c9624c498a47c964088b34e484eac494/src/core/Decorations.ts#L58
11+
return vscode.workspace
12+
.getConfiguration("cursorless", { languageId })
13+
.get<string[]>("wordSeparators", ["_"]);
14+
};
15+
16+
let getWordSeparators = defaultGetWordSeparators;
17+
18+
export const tokenizerConfiguration = {
19+
getWordSeparators: (languageId: string) => {
20+
return getWordSeparators(languageId);
21+
},
22+
// For testing purposes, we override the word separator in a few languages to
23+
// make sure that overriding the word separator works. Note that in
24+
// production, we tokenize all languages the same way by default.
25+
mockWordSeparators: () => {
26+
getWordSeparators = (languageId: string) => {
27+
switch (languageId) {
28+
case "css":
29+
case "scss":
30+
case "shellscript":
31+
return ["-", "_"];
32+
}
33+
return ["_"];
34+
};
35+
},
36+
};

src/extension.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import * as vscode from "vscode";
22
import CommandRunner from "./core/commandRunner/CommandRunner";
33
import { ThatMark } from "./core/ThatMark";
4+
import { tokenizerConfiguration } from "./core/tokenizerConfiguration";
45
import isTesting from "./testUtil/isTesting";
56
import { Graph } from "./typings/Types";
67
import { getCommandServerApi, getParseTreeApi } from "./util/getExtensionApi";
@@ -42,6 +43,11 @@ export async function activate(context: vscode.ExtensionContext) {
4243
// TODO: Do this using the graph once we migrate its dependencies onto the graph
4344
new CommandRunner(graph, thatMark, sourceMark);
4445

46+
// TODO: Remove this once tokenizer has access to graph
47+
if (isTesting()) {
48+
tokenizerConfiguration.mockWordSeparators();
49+
}
50+
4551
return {
4652
thatMark,
4753
sourceMark,

src/test/suite/tokenizer.test.ts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
import * as assert from "assert";
2-
import { tokenize } from "../../core/tokenizer";
32
import { flatten, range } from "lodash";
3+
import { tokenize } from "../../core/tokenizer";
4+
import { tokenizerConfiguration } from "../../core/tokenizerConfiguration";
45
import { LanguageId } from "../../languages/constants";
56

67
type TestCase = [string, string[]];
7-
/** Language-specific tokenizer test configuration object */
8+
/**
9+
* Language-specific tokenizer test configuration object. Note that these
10+
* languages don't actually behave differently in production, we just mock
11+
* overriding the word separator for a few languages to make sure that
12+
* overriding works.
13+
*/
814
interface LanguageTokenizerTests {
915
/** Language-specific test cases to run in addition to the global tests for this language */
1016
additionalTests: TestCase[];
@@ -109,6 +115,11 @@ const languageTokenizerTests: Partial<
109115
};
110116

111117
suite("tokenizer", () => {
118+
// TODO: Remove this once tokenizer has access to graph
119+
suiteSetup(() => {
120+
tokenizerConfiguration.mockWordSeparators();
121+
});
122+
112123
globalTests.forEach(([input, expectedOutput]) => {
113124
test(`tokenizer test, input: "${input}"`, () => {
114125
const output = tokenize(input, "anyLang", (match) => match[0]);

0 commit comments

Comments
 (0)