From 0624c93395e122b53640e7f5606ff9e711d1b696 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Fri, 29 Jul 2022 20:04:21 +0200 Subject: [PATCH 1/2] feat: Support Ignoring characters before checking Added the ability to specify characters to be ignored (removed) from a word before checking the word in the dictionary. Related to Arabic Harakat vowel accents. https://github.com/streetsidesoftware/cspell-dicts/issues/1314 --- cspell-dict.txt | 1 + cspell.schema.json | 4 +++ .../SpellingDictionaryCollection.test.ts | 28 +++++++++++++------ .../SpellingDictionaryFromTrie.ts | 21 ++++++++++++++ .../src/SpellingDictionary/charset.ts | 12 ++++++++ packages/cspell-types/cspell.schema.json | 4 +++ .../cspell-types/src/DictionaryInformation.ts | 10 +++++++ 7 files changed, 72 insertions(+), 8 deletions(-) create mode 100644 packages/cspell-lib/src/SpellingDictionary/charset.ts diff --git a/cspell-dict.txt b/cspell-dict.txt index 78bf8ff6238..ce1aa72022a 100644 --- a/cspell-dict.txt +++ b/cspell-dict.txt @@ -17,6 +17,7 @@ estree exonum gimu globstar +Harakat jamstack lcov licia diff --git a/cspell.schema.json b/cspell.schema.json index b15eb5e5562..dd179fa5caf 100644 --- a/cspell.schema.json +++ b/cspell.schema.json @@ -434,6 +434,10 @@ "$ref": "#/definitions/HunspellInformation", "description": "Used by dictionary authors" }, + "ignore": { + "$ref": "#/definitions/CharacterSet", + "description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported." + }, "locale": { "description": "The locale of the dictionary. 
Example: `nl,nl-be`", "type": "string" diff --git a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts index b29b730f2d4..e572096962e 100644 --- a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts +++ b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryCollection.test.ts @@ -32,6 +32,7 @@ describe('Verify using multiple dictionaries', () => { const wordsC = ['ant', 'snail', 'beetle', 'worm', 'stink bug', 'centipede', 'millipede', 'flea', 'fly']; const wordsD = ['red*', 'green*', 'blue*', 'pink*', 'black*', '*berry', '+-fruit', '*bug', 'pinkie']; const wordsF = ['!pink*', '+berry', '+bug', '!stinkbug']; + const wordsG = ['café', 'accent']; const wordsLegacy = ['error', 'code', 'system', 'ctrl']; @@ -181,24 +182,35 @@ describe('Verify using multiple dictionaries', () => { }); test.each` - word | expected - ${'redberry'} | ${true} - ${'pink'} | ${false} - ${'bug'} | ${true} - ${'blackberry'} | ${true} - ${'pinkbug'} | ${true} + word | expected + ${'redberry'} | ${true} + ${'pink'} | ${false} + ${'bug'} | ${true} + ${'blackberry'} | ${true} + ${'pinkbug'} | ${true} + ${'cafe'} | ${false} + ${'café'} | ${true} + ${'cafe\u0301'} | ${true} + ${'accent'} | ${true} + ${'áccent'} | ${true /* ignore the accent. cspell:disable-line */} + ${'a\u0301ccent'} | ${true /* ignore the accent. cspell:disable-line */} + ${'applé'} | ${true /* ignore the accent. 
cspell:disable-line */} `('checks has word: "$word"', ({ word, expected }) => { const dicts = [ - createSpellingDictionary(wordsA, 'wordsA', 'test', undefined), + createSpellingDictionary(wordsA, 'wordsA', 'test', { dictionaryInformation: { ignore: '\u0300-\u0362' } }), createSpellingDictionary(wordsB, 'wordsB', 'test', undefined), createSpellingDictionary(wordsC, 'wordsC', 'test', undefined), createSpellingDictionary(wordsD, 'wordsD', 'test', undefined), createSpellingDictionary(wordsF, 'wordsF', 'test', undefined), + createSpellingDictionary(wordsG, 'wordsA', 'test', { + dictionaryInformation: { ignore: '\u0300-\u0362' }, + caseSensitive: true, + }), createForbiddenWordsDictionary(['Avocado'], 'flag_words', 'test', undefined), ]; const dictCollection = createCollection(dicts, 'test'); - expect(dictCollection.has(word)).toEqual(expected); + expect(dictCollection.has(word, { ignoreCase: false })).toEqual(expected); }); test.each` diff --git a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts index 9c7b68417c2..49ca52da889 100644 --- a/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts +++ b/packages/cspell-lib/src/SpellingDictionary/SpellingDictionaryFromTrie.ts @@ -10,6 +10,7 @@ import { getDefaultSettings } from '../Settings'; import { memorizer } from '../util/Memorizer'; import { createMapper } from '../util/repMap'; import { clean } from '../util/util'; +import { charsetToRegExp } from './charset'; import { FindResult, HasOptions, @@ -36,6 +37,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary { readonly type = 'SpellingDictionaryFromTrie'; readonly isDictionaryCaseSensitive: boolean; readonly containsNoSuggestWords: boolean; + readonly ignoreCharactersRegExp: RegExp | undefined; private weightMap: WeightMap | undefined; @@ -51,6 +53,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary { 
this.containsNoSuggestWords = options.noSuggest || false; this._size = size || 0; this.weightMap = options.weightMap || createWeightMapFromDictionaryInformation(options.dictionaryInformation); + this.ignoreCharactersRegExp = charsetToRegExp(this.options.dictionaryInformation?.ignore); } public get size(): number { @@ -103,6 +106,24 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary { word: string, useCompounds: number | boolean | undefined, ignoreCase: boolean + ): FindAnyFormResult | undefined { + const outerForms = new Set([word]); + if (this.ignoreCharactersRegExp) { + outerForms.add(word.replace(this.ignoreCharactersRegExp, '')); + outerForms.add(word.normalize('NFD').replace(this.ignoreCharactersRegExp, '')); + outerForms.add(word.normalize('NFC').replace(this.ignoreCharactersRegExp, '')); + } + for (const form of outerForms) { + const r = this._findAnyForm(form, useCompounds, ignoreCase); + if (r) return r; + } + return undefined; + } + + private _findAnyForm( + word: string, + useCompounds: number | boolean | undefined, + ignoreCase: boolean ): FindAnyFormResult | undefined { const mWord = this.mapWord(word.normalize('NFC')); const opts: FindWordOptions = { caseSensitive: !ignoreCase }; diff --git a/packages/cspell-lib/src/SpellingDictionary/charset.ts b/packages/cspell-lib/src/SpellingDictionary/charset.ts new file mode 100644 index 00000000000..34bddd361da --- /dev/null +++ b/packages/cspell-lib/src/SpellingDictionary/charset.ts @@ -0,0 +1,12 @@ +import { CharacterSet } from '@cspell/cspell-types'; + +export function charsetToRegExp(charset: CharacterSet | undefined): RegExp | undefined { + if (!charset) return undefined; + + try { + const reg = `[${charset.replace(/[\][]/g, '\\$&')}]`; + return new RegExp(reg, 'g'); + } catch (e) { + return undefined; + } +} diff --git a/packages/cspell-types/cspell.schema.json b/packages/cspell-types/cspell.schema.json index b15eb5e5562..dd179fa5caf 100644 --- 
a/packages/cspell-types/cspell.schema.json +++ b/packages/cspell-types/cspell.schema.json @@ -434,6 +434,10 @@ "$ref": "#/definitions/HunspellInformation", "description": "Used by dictionary authors" }, + "ignore": { + "$ref": "#/definitions/CharacterSet", + "description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported." + }, "locale": { "description": "The locale of the dictionary. Example: `nl,nl-be`", "type": "string" diff --git a/packages/cspell-types/src/DictionaryInformation.ts b/packages/cspell-types/src/DictionaryInformation.ts index b426f53b358..c1de1603744 100644 --- a/packages/cspell-types/src/DictionaryInformation.ts +++ b/packages/cspell-types/src/DictionaryInformation.ts @@ -47,6 +47,16 @@ export interface DictionaryInformation { * If the word matches the pattern, then the penalty is applied. */ adjustments?: PatternAdjustment[]; + + /** + * An optional set of characters that can possibly be removed from a word before + * checking it. + * + * This is useful in languages like Arabic where Harakat accents are optional. + * + * Note: All matching characters are removed or none. Partial removal is not supported. 
+ */ + ignore?: CharacterSet; } // cspell:ignore aeistlunkodmrvpgjhäõbüoöfcwzxðqþ aàâä eéèêë iîïy From 454ed13fc276e02a8a90395e70a0a713824dba22 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Fri, 29 Jul 2022 20:13:44 +0200 Subject: [PATCH 2/2] Update charset.ts --- packages/cspell-lib/src/SpellingDictionary/charset.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/cspell-lib/src/SpellingDictionary/charset.ts b/packages/cspell-lib/src/SpellingDictionary/charset.ts index 34bddd361da..8b4211d829e 100644 --- a/packages/cspell-lib/src/SpellingDictionary/charset.ts +++ b/packages/cspell-lib/src/SpellingDictionary/charset.ts @@ -4,7 +4,7 @@ export function charsetToRegExp(charset: CharacterSet | undefined): RegExp | und if (!charset) return undefined; try { - const reg = `[${charset.replace(/[\][]/g, '\\$&')}]`; + const reg = `[${charset.replace(/[\][\\]/g, '\\$&')}]`; return new RegExp(reg, 'g'); } catch (e) { return undefined;