Skip to content

Commit ba9644b

Browse files
AndreasArvidssonpre-commit-ci[bot]pokey
authored
Add word separator setting (#1078)
* Added word separator setting. Removed language specific regex components. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated tests * Word separator is now a list * Update documentation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Mock word separators for tests * Minor tweaks * Update src/core/tokenizerConfiguration.ts Co-authored-by: Pokey Rule <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert "Minor tweaks" This reverts commit b7f3695. * cleanup * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use join to create key * Update src/core/tokenizer.ts Co-authored-by: Pokey Rule <[email protected]> * Update docs Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Pokey Rule <[email protected]>
1 parent 04c14ba commit ba9644b

File tree

9 files changed

+104
-53
lines changed

9 files changed

+104
-53
lines changed

docs/user/customization.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,20 @@ While the hats are hidden, you will not be able to address any marks, eg `"take
7575

7676
If you'd like to map a voice command to toggle the hats, have a look at https://youtu.be/oWUJyDgz63k
7777

78+
## Updating word separators
79+
80+
The word separators are characters that define the boundary between words in an identifier. For example, `hello_world` is an identifier with two words separated by `_`. If you'd like to support other separators, like `-` in `hello-world`, you can change the `cursorless.wordSeparators` setting. This setting can also be overridden per language.
81+
82+
```json
83+
// Sets the word separator for all languages
84+
"cursorless.wordSeparators": ["_"]
85+
86+
// Sets the word separator for css only
87+
"[css]": {
88+
"cursorless.wordSeparators": ["_", "-"]
89+
}
90+
```
91+
7892
## Cursorless public API
7993

8094
Cursorless exposes a couple talon actions and captures that you can use to define your own custom command grammar leveraging cursorless targets.

package.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,18 @@
207207
"description": "How much to vertically shift the hats as a percentage of font size; positive is up",
208208
"order": 1
209209
},
210+
"cursorless.wordSeparators": {
211+
"type": "array",
212+
"items": {
213+
"type": "string"
214+
},
215+
"default": [
216+
"_"
217+
],
218+
"scope": "language-overridable",
219+
"markdownDescription": "A list of characters that separate words in identifiers. For example `_` splits `hello_world` into two words.",
220+
"order": 6
221+
},
210222
"cursorless.colors.dark": {
211223
"description": "Colors to use for dark theme",
212224
"type": "object",

src/core/Decorations.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ export default class Decorations {
5555
},
5656
),
5757

58+
// Don't use fine grained settings here until tokenizer has migrated to graph
5859
vscode.workspace.onDidChangeConfiguration(this.recomputeDecorationStyles),
5960
);
6061
}

src/core/languageTokenizers.ts

Lines changed: 0 additions & 10 deletions
This file was deleted.

src/core/tokenizer.ts

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,7 @@
1-
import { escapeRegExp, mapValues } from "lodash";
2-
import { LanguageId, SupportedLanguageId } from "../languages/constants";
3-
1+
import { escapeRegExp } from "lodash";
42
import { matchAll } from "../util/regex";
5-
import { languageWithDashedIdentifiers } from "./languageTokenizers";
6-
import {
7-
LanguageTokenizerComponents,
8-
LanguageTokenizerOverrides,
9-
} from "./tokenizer.types";
3+
import { LanguageTokenizerComponents } from "./tokenizer.types";
4+
import { tokenizerConfiguration } from "./tokenizerConfiguration";
105

116
const REPEATABLE_SYMBOLS = [
127
"-",
@@ -46,27 +41,17 @@ const FIXED_TOKENS = [
4641
];
4742

4843
export const IDENTIFIER_WORD_REGEXES = ["\\p{L}", "\\p{M}", "\\p{N}"];
49-
const IDENTIFIER_WORD_DELIMITERS = ["_"];
5044
const SINGLE_SYMBOLS_REGEX = "[^\\s\\w]";
5145
const NUMBERS_REGEX = "(?<=[^.\\d]|^)\\d+\\.\\d+(?=[^.\\d]|$)"; // (not-dot/digit digits dot digits not-dot/digit)
5246

53-
const defaultLanguageTokenizerComponents: LanguageTokenizerComponents = {
54-
fixedTokens: FIXED_TOKENS,
55-
repeatableSymbols: REPEATABLE_SYMBOLS,
56-
identifierWordRegexes: IDENTIFIER_WORD_REGEXES,
57-
identifierWordDelimiters: IDENTIFIER_WORD_DELIMITERS,
58-
numbersRegex: NUMBERS_REGEX,
59-
singleSymbolsRegex: SINGLE_SYMBOLS_REGEX,
60-
};
6147
interface Matcher {
6248
tokenMatcher: RegExp;
6349
identifierMatcher: RegExp;
6450
wordMatcher: RegExp;
6551
}
66-
const defaultMatcher = generateMatcher();
6752

6853
function generateMatcher(
69-
languageOverrides: LanguageTokenizerOverrides = {},
54+
languageComponents: LanguageTokenizerComponents,
7055
): Matcher {
7156
const {
7257
fixedTokens,
@@ -75,10 +60,7 @@ function generateMatcher(
7560
identifierWordDelimiters,
7661
numbersRegex,
7762
singleSymbolsRegex,
78-
}: LanguageTokenizerComponents = {
79-
...defaultLanguageTokenizerComponents,
80-
...languageOverrides,
81-
};
63+
} = languageComponents;
8264

8365
const repeatableSymbolsRegex = repeatableSymbols
8466
.map(escapeRegExp)
@@ -109,24 +91,25 @@ function generateMatcher(
10991
};
11092
}
11193

112-
const languageTokenizerOverrides: Partial<
113-
Record<LanguageId, LanguageTokenizerOverrides>
114-
> = {
115-
css: languageWithDashedIdentifiers,
116-
scss: languageWithDashedIdentifiers,
117-
shellscript: languageWithDashedIdentifiers,
118-
};
119-
120-
const tokenMatchersForLanguage: Partial<Record<LanguageId, Matcher>> =
121-
mapValues(languageTokenizerOverrides, (val: LanguageTokenizerComponents) =>
122-
generateMatcher(val),
123-
);
94+
const matchers = new Map<string, Matcher>();
12495

12596
export function getMatcher(languageId: string): Matcher {
126-
return (
127-
tokenMatchersForLanguage[languageId as SupportedLanguageId] ??
128-
defaultMatcher
129-
);
97+
const wordSeparators = tokenizerConfiguration.getWordSeparators(languageId);
98+
const key = wordSeparators.join("\u0000");
99+
100+
if (!matchers.has(key)) {
101+
const components: LanguageTokenizerComponents = {
102+
fixedTokens: FIXED_TOKENS,
103+
repeatableSymbols: REPEATABLE_SYMBOLS,
104+
identifierWordRegexes: IDENTIFIER_WORD_REGEXES,
105+
identifierWordDelimiters: wordSeparators,
106+
numbersRegex: NUMBERS_REGEX,
107+
singleSymbolsRegex: SINGLE_SYMBOLS_REGEX,
108+
};
109+
matchers.set(key, generateMatcher(components));
110+
}
111+
112+
return matchers.get(key)!;
130113
}
131114

132115
export function tokenize<T>(

src/core/tokenizer.types.ts

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,3 @@ export interface LanguageTokenizerComponents {
2020
repeatableSymbols: string[];
2121
singleSymbolsRegex: string;
2222
}
23-
24-
export type LanguageTokenizerOverrides = Partial<LanguageTokenizerComponents>;

src/core/tokenizerConfiguration.ts

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/**
2+
* TODO: This is just an ugly mock since that tokenizer doesn't have access to the graph/ide
3+
* Remove this once https://github.com/cursorless-dev/cursorless/issues/785 is implemented
4+
*/
5+
6+
import * as vscode from "vscode";
7+
8+
const defaultGetWordSeparators = (languageId: string) => {
9+
// FIXME: The reason this code will auto-reload on settings change is that we don't use fine-grained settings listener in `Decorations`:
10+
// https://github.com/cursorless-dev/cursorless/blob/c914d477c9624c498a47c964088b34e484eac494/src/core/Decorations.ts#L58
11+
return vscode.workspace
12+
.getConfiguration("cursorless", { languageId })
13+
.get<string[]>("wordSeparators", ["_"]);
14+
};
15+
16+
let getWordSeparators = defaultGetWordSeparators;
17+
18+
export const tokenizerConfiguration = {
19+
getWordSeparators: (languageId: string) => {
20+
return getWordSeparators(languageId);
21+
},
22+
// For testing purposes, we override the word separator in a few languages to
23+
// make sure that overriding the word separator works. Note that in
24+
// production, we tokenize all languages the same way by default.
25+
mockWordSeparators: () => {
26+
getWordSeparators = (languageId: string) => {
27+
switch (languageId) {
28+
case "css":
29+
case "scss":
30+
case "shellscript":
31+
return ["-", "_"];
32+
}
33+
return ["_"];
34+
};
35+
},
36+
};

src/extension.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import * as vscode from "vscode";
22
import CommandRunner from "./core/commandRunner/CommandRunner";
33
import { ThatMark } from "./core/ThatMark";
4+
import { tokenizerConfiguration } from "./core/tokenizerConfiguration";
45
import isTesting from "./testUtil/isTesting";
56
import { Graph } from "./typings/Types";
67
import { getCommandServerApi, getParseTreeApi } from "./util/getExtensionApi";
@@ -42,6 +43,11 @@ export async function activate(context: vscode.ExtensionContext) {
4243
// TODO: Do this using the graph once we migrate its dependencies onto the graph
4344
new CommandRunner(graph, thatMark, sourceMark);
4445

46+
// TODO: Remove this once tokenizer has access to graph
47+
if (isTesting()) {
48+
tokenizerConfiguration.mockWordSeparators();
49+
}
50+
4551
return {
4652
thatMark,
4753
sourceMark,

src/test/suite/tokenizer.test.ts

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
import * as assert from "assert";
2-
import { tokenize } from "../../core/tokenizer";
32
import { flatten, range } from "lodash";
3+
import { tokenize } from "../../core/tokenizer";
4+
import { tokenizerConfiguration } from "../../core/tokenizerConfiguration";
45
import { LanguageId } from "../../languages/constants";
56

67
type TestCase = [string, string[]];
7-
/** Language-specific tokenizer test configuration object */
8+
/**
9+
* Language-specific tokenizer test configuration object. Note that these
10+
* languages don't actually behave differently in production, we just mock
11+
* overriding the word separator for a few languages to make sure that
12+
* overriding works.
13+
*/
814
interface LanguageTokenizerTests {
915
/** Language-specific test cases to run in addition to the global tests for this language */
1016
additionalTests: TestCase[];
@@ -109,6 +115,11 @@ const languageTokenizerTests: Partial<
109115
};
110116

111117
suite("tokenizer", () => {
118+
// TODO: Remove this once tokenizer has access to graph
119+
suiteSetup(() => {
120+
tokenizerConfiguration.mockWordSeparators();
121+
});
122+
112123
globalTests.forEach(([input, expectedOutput]) => {
113124
test(`tokenizer test, input: "${input}"`, () => {
114125
const output = tokenize(input, "anyLang", (match) => match[0]);

0 commit comments

Comments
 (0)