From 0624c93395e122b53640e7f5606ff9e711d1b696 Mon Sep 17 00:00:00 2001
From: Jason Dent <jason@streetsidesoftware.nl>
Date: Fri, 29 Jul 2022 20:04:21 +0200
Subject: [PATCH 1/2] feat: Support Ignoring characters before checking

Added the ability to specify characters to be ignored (removed) from a word before checking the word in the dictionary.

Related to Arabic Harakat vowel accents.
https://github.com/streetsidesoftware/cspell-dicts/issues/1314
---
 cspell-dict.txt                               |  1 +
 cspell.schema.json                            |  4 +++
 .../SpellingDictionaryCollection.test.ts      | 28 +++++++++++++------
 .../SpellingDictionaryFromTrie.ts             | 21 ++++++++++++++
 .../src/SpellingDictionary/charset.ts         | 12 ++++++++
 packages/cspell-types/cspell.schema.json      |  4 +++
 .../cspell-types/src/DictionaryInformation.ts | 10 +++++++
 7 files changed, 72 insertions(+), 8 deletions(-)
 create mode 100644 packages/cspell-lib/src/SpellingDictionary/charset.ts

diff --git a/cspell-dict.txt b/cspell-dict.txt
index 78bf8ff6238..ce1aa72022a 100644
--- a/cspell-dict.txt
+++ b/cspell-dict.txt
@@ -17,6 +17,7 @@ estree
 exonum
 gimu
 globstar
+Harakat
 jamstack
 lcov
 licia
diff --git a/cspell.schema.json b/cspell.schema.json
index b15eb5e5562..dd179fa5caf 100644
--- a/cspell.schema.json
+++ b/cspell.schema.json
@@ -434,6 +434,10 @@
           "$ref": "#/definitions/HunspellInformation",
           "description": "Used by dictionary authors"
         },
+        "ignore": {
+          "$ref": "#/definitions/CharacterSet",
+          "description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported."
+        },
         "locale": {
           "description": "The locale of the dictionary. Example: `nl,nl-be`",
           "type": "string"
diff --git a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts
index b29b730f2d4..e572096962e 100644
--- a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts
+++ b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts
@@ -32,6 +32,7 @@ describe('Verify using multiple dictionaries', () => {
     const wordsC = ['ant', 'snail', 'beetle', 'worm', 'stink bug', 'centipede', 'millipede', 'flea', 'fly'];
     const wordsD = ['red*', 'green*', 'blue*', 'pink*', 'black*', '*berry', '+-fruit', '*bug', 'pinkie'];
     const wordsF = ['!pink*', '+berry', '+bug', '!stinkbug'];
+    const wordsG = ['café', 'accent'];
 
     const wordsLegacy = ['error', 'code', 'system', 'ctrl'];
 
@@ -181,24 +182,35 @@ describe('Verify using multiple dictionaries', () => {
     });
 
     test.each`
-        word            | expected
-        ${'redberry'}   | ${true}
-        ${'pink'}       | ${false}
-        ${'bug'}        | ${true}
-        ${'blackberry'} | ${true}
-        ${'pinkbug'}    | ${true}
+        word              | expected
+        ${'redberry'}     | ${true}
+        ${'pink'}         | ${false}
+        ${'bug'}          | ${true}
+        ${'blackberry'}   | ${true}
+        ${'pinkbug'}      | ${true}
+        ${'cafe'}         | ${false}
+        ${'café'}         | ${true}
+        ${'cafe\u0301'}   | ${true}
+        ${'accent'}       | ${true}
+        ${'áccent'}       | ${true /* ignore the accent. cspell:disable-line */}
+        ${'a\u0301ccent'} | ${true /* ignore the accent. cspell:disable-line */}
+        ${'applé'}        | ${true /* ignore the accent. cspell:disable-line */}
     `('checks has word: "$word"', ({ word, expected }) => {
         const dicts = [
-            createSpellingDictionary(wordsA, 'wordsA', 'test', undefined),
+            createSpellingDictionary(wordsA, 'wordsA', 'test', { dictionaryInformation: { ignore: '\u0300-\u0362' } }),
             createSpellingDictionary(wordsB, 'wordsB', 'test', undefined),
             createSpellingDictionary(wordsC, 'wordsC', 'test', undefined),
             createSpellingDictionary(wordsD, 'wordsD', 'test', undefined),
             createSpellingDictionary(wordsF, 'wordsF', 'test', undefined),
+            createSpellingDictionary(wordsG, 'wordsA', 'test', {
+                dictionaryInformation: { ignore: '\u0300-\u0362' },
+                caseSensitive: true,
+            }),
             createForbiddenWordsDictionary(['Avocado'], 'flag_words', 'test', undefined),
         ];
 
         const dictCollection = createCollection(dicts, 'test');
-        expect(dictCollection.has(word)).toEqual(expected);
+        expect(dictCollection.has(word, { ignoreCase: false })).toEqual(expected);
     });
 
     test.each`
diff --git a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts
index 9c7b68417c2..49ca52da889 100644
--- a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts
+++ b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts
@@ -10,6 +10,7 @@ import { getDefaultSettings } from '../Settings';
 import { memorizer } from '../util/Memorizer';
 import { createMapper } from '../util/repMap';
 import { clean } from '../util/util';
+import { charsetToRegExp } from './charset';
 import {
     FindResult,
     HasOptions,
@@ -36,6 +37,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
     readonly type = 'SpellingDictionaryFromTrie';
     readonly isDictionaryCaseSensitive: boolean;
     readonly containsNoSuggestWords: boolean;
+    readonly ignoreCharactersRegExp: RegExp | undefined;
 
     private weightMap: WeightMap | undefined;
 
@@ -51,6 +53,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
         this.containsNoSuggestWords = options.noSuggest || false;
         this._size = size || 0;
         this.weightMap = options.weightMap || createWeightMapFromDictionaryInformation(options.dictionaryInformation);
+        this.ignoreCharactersRegExp = charsetToRegExp(this.options.dictionaryInformation?.ignore);
     }
 
     public get size(): number {
@@ -103,6 +106,24 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
         word: string,
         useCompounds: number | boolean | undefined,
         ignoreCase: boolean
+    ): FindAnyFormResult | undefined {
+        const outerForms = new Set([word]);
+        if (this.ignoreCharactersRegExp) {
+            outerForms.add(word.replace(this.ignoreCharactersRegExp, ''));
+            outerForms.add(word.normalize('NFD').replace(this.ignoreCharactersRegExp, ''));
+            outerForms.add(word.normalize('NFC').replace(this.ignoreCharactersRegExp, ''));
+        }
+        for (const form of outerForms) {
+            const r = this._findAnyForm(form, useCompounds, ignoreCase);
+            if (r) return r;
+        }
+        return undefined;
+    }
+
+    private _findAnyForm(
+        word: string,
+        useCompounds: number | boolean | undefined,
+        ignoreCase: boolean
     ): FindAnyFormResult | undefined {
         const mWord = this.mapWord(word.normalize('NFC'));
         const opts: FindWordOptions = { caseSensitive: !ignoreCase };
diff --git a/packages/cspell-lib/src/SpellingDictionary/charset.ts b/packages/cspell-lib/src/SpellingDictionary/charset.ts
new file mode 100644
index 00000000000..34bddd361da
--- /dev/null
+++ b/packages/cspell-lib/src/SpellingDictionary/charset.ts
@@ -0,0 +1,12 @@
+import { CharacterSet } from '@cspell/cspell-types';
+
+export function charsetToRegExp(charset: CharacterSet | undefined): RegExp | undefined {
+    if (!charset) return undefined;
+
+    try {
+        const reg = `[${charset.replace(/[\][]/g, '\\$&')}]`;
+        return new RegExp(reg, 'g');
+    } catch (e) {
+        return undefined;
+    }
+}
diff --git a/packages/cspell-types/cspell.schema.json b/packages/cspell-types/cspell.schema.json
index b15eb5e5562..dd179fa5caf 100644
--- a/packages/cspell-types/cspell.schema.json
+++ b/packages/cspell-types/cspell.schema.json
@@ -434,6 +434,10 @@
           "$ref": "#/definitions/HunspellInformation",
           "description": "Used by dictionary authors"
         },
+        "ignore": {
+          "$ref": "#/definitions/CharacterSet",
+          "description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported."
+        },
         "locale": {
           "description": "The locale of the dictionary. Example: `nl,nl-be`",
           "type": "string"
diff --git a/packages/cspell-types/src/DictionaryInformation.ts b/packages/cspell-types/src/DictionaryInformation.ts
index b426f53b358..c1de1603744 100644
--- a/packages/cspell-types/src/DictionaryInformation.ts
+++ b/packages/cspell-types/src/DictionaryInformation.ts
@@ -47,6 +47,16 @@ export interface DictionaryInformation {
      * If the word matches the pattern, then the penalty is applied.
      */
     adjustments?: PatternAdjustment[];
+
+    /**
+     * An optional set of characters that can possibly be removed from a word before
+     * checking it.
+     *
+     * This is useful in languages like Arabic where Harakat accents are optional.
+     *
+     * Note: All matching characters are removed or none. Partial removal is not supported.
+     */
+    ignore?: CharacterSet;
 }
 
 // cspell:ignore aeistlunkodmrvpgjhäõbüoöfcwzxðqþ aàâä eéèêë iîïy

From 454ed13fc276e02a8a90395e70a0a713824dba22 Mon Sep 17 00:00:00 2001
From: Jason Dent <jason@streetsidesoftware.nl>
Date: Fri, 29 Jul 2022 20:13:44 +0200
Subject: [PATCH 2/2] Update charset.ts: also escape backslashes in the character set

---
 packages/cspell-lib/src/SpellingDictionary/charset.ts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/cspell-lib/src/SpellingDictionary/charset.ts b/packages/cspell-lib/src/SpellingDictionary/charset.ts
index 34bddd361da..8b4211d829e 100644
--- a/packages/cspell-lib/src/SpellingDictionary/charset.ts
+++ b/packages/cspell-lib/src/SpellingDictionary/charset.ts
@@ -4,7 +4,7 @@ export function charsetToRegExp(charset: CharacterSet | undefined): RegExp | und
     if (!charset) return undefined;
 
     try {
-        const reg = `[${charset.replace(/[\][]/g, '\\$&')}]`;
+        const reg = `[${charset.replace(/[\][\\]/g, '\\$&')}]`;
         return new RegExp(reg, 'g');
     } catch (e) {
         return undefined;