Skip to content

Commit

Permalink
feat: Support Ignoring characters before checking (#3311)
Browse files Browse the repository at this point in the history
* feat: Support Ignoring characters before checking

   Added the ability to specify characters to be ignored (removed) from a word before checking the word in the dictionary.

   Related to Arabic Harakat vowel accents.
   streetsidesoftware/cspell-dicts#1314
  • Loading branch information
Jason3S committed Jul 29, 2022
1 parent 6057fa3 commit d3fbe6c
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 8 deletions.
1 change: 1 addition & 0 deletions cspell-dict.txt
Expand Up @@ -17,6 +17,7 @@ estree
exonum
gimu
globstar
Harakat
jamstack
lcov
licia
Expand Down
4 changes: 4 additions & 0 deletions cspell.schema.json
Expand Up @@ -434,6 +434,10 @@
"$ref": "#/definitions/HunspellInformation",
"description": "Used by dictionary authors"
},
"ignore": {
"$ref": "#/definitions/CharacterSet",
"description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported."
},
"locale": {
"description": "The locale of the dictionary. Example: `nl,nl-be`",
"type": "string"
Expand Down
Expand Up @@ -32,6 +32,7 @@ describe('Verify using multiple dictionaries', () => {
const wordsC = ['ant', 'snail', 'beetle', 'worm', 'stink bug', 'centipede', 'millipede', 'flea', 'fly'];
const wordsD = ['red*', 'green*', 'blue*', 'pink*', 'black*', '*berry', '+-fruit', '*bug', 'pinkie'];
const wordsF = ['!pink*', '+berry', '+bug', '!stinkbug'];
const wordsG = ['café', 'accent'];

const wordsLegacy = ['error', 'code', 'system', 'ctrl'];

Expand Down Expand Up @@ -181,24 +182,35 @@ describe('Verify using multiple dictionaries', () => {
});

test.each`
word | expected
${'redberry'} | ${true}
${'pink'} | ${false}
${'bug'} | ${true}
${'blackberry'} | ${true}
${'pinkbug'} | ${true}
word | expected
${'redberry'} | ${true}
${'pink'} | ${false}
${'bug'} | ${true}
${'blackberry'} | ${true}
${'pinkbug'} | ${true}
${'cafe'} | ${false}
${'café'} | ${true}
${'cafe\u0301'} | ${true}
${'accent'} | ${true}
${'áccent'} | ${true /* ignore the accent. cspell:disable-line */}
${'a\u0301ccent'} | ${true /* ignore the accent. cspell:disable-line */}
${'applé'} | ${true /* ignore the accent. cspell:disable-line */}
`('checks has word: "$word"', ({ word, expected }) => {
const dicts = [
createSpellingDictionary(wordsA, 'wordsA', 'test', undefined),
createSpellingDictionary(wordsA, 'wordsA', 'test', { dictionaryInformation: { ignore: '\u0300-\u0362' } }),
createSpellingDictionary(wordsB, 'wordsB', 'test', undefined),
createSpellingDictionary(wordsC, 'wordsC', 'test', undefined),
createSpellingDictionary(wordsD, 'wordsD', 'test', undefined),
createSpellingDictionary(wordsF, 'wordsF', 'test', undefined),
createSpellingDictionary(wordsG, 'wordsA', 'test', {
dictionaryInformation: { ignore: '\u0300-\u0362' },
caseSensitive: true,
}),
createForbiddenWordsDictionary(['Avocado'], 'flag_words', 'test', undefined),
];

const dictCollection = createCollection(dicts, 'test');
expect(dictCollection.has(word)).toEqual(expected);
expect(dictCollection.has(word, { ignoreCase: false })).toEqual(expected);
});

test.each`
Expand Down
Expand Up @@ -10,6 +10,7 @@ import { getDefaultSettings } from '../Settings';
import { memorizer } from '../util/Memorizer';
import { createMapper } from '../util/repMap';
import { clean } from '../util/util';
import { charsetToRegExp } from './charset';
import {
FindResult,
HasOptions,
Expand All @@ -36,6 +37,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
readonly type = 'SpellingDictionaryFromTrie';
readonly isDictionaryCaseSensitive: boolean;
readonly containsNoSuggestWords: boolean;
readonly ignoreCharactersRegExp: RegExp | undefined;

private weightMap: WeightMap | undefined;

Expand All @@ -51,6 +53,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
this.containsNoSuggestWords = options.noSuggest || false;
this._size = size || 0;
this.weightMap = options.weightMap || createWeightMapFromDictionaryInformation(options.dictionaryInformation);
this.ignoreCharactersRegExp = charsetToRegExp(this.options.dictionaryInformation?.ignore);
}

public get size(): number {
Expand Down Expand Up @@ -103,6 +106,24 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
word: string,
useCompounds: number | boolean | undefined,
ignoreCase: boolean
): FindAnyFormResult | undefined {
const outerForms = new Set([word]);
if (this.ignoreCharactersRegExp) {
outerForms.add(word.replace(this.ignoreCharactersRegExp, ''));
outerForms.add(word.normalize('NFD').replace(this.ignoreCharactersRegExp, ''));
outerForms.add(word.normalize('NFC').replace(this.ignoreCharactersRegExp, ''));
}
for (const form of outerForms) {
const r = this._findAnyForm(form, useCompounds, ignoreCase);
if (r) return r;
}
return undefined;
}

private _findAnyForm(
word: string,
useCompounds: number | boolean | undefined,
ignoreCase: boolean
): FindAnyFormResult | undefined {
const mWord = this.mapWord(word.normalize('NFC'));
const opts: FindWordOptions = { caseSensitive: !ignoreCase };
Expand Down
12 changes: 12 additions & 0 deletions packages/cspell-lib/src/SpellingDictionary/charset.ts
@@ -0,0 +1,12 @@
import { CharacterSet } from '@cspell/cspell-types';

/**
 * Build a global regular expression that matches any single character in `charset`.
 *
 * The charset text is placed inside a RegExp character class (`[...]`), so a `-`
 * between two characters is interpreted as a range (e.g. `\u0300-\u0362`).
 * Characters that would otherwise change the meaning of the class are escaped:
 * `[`, `]`, `\`, and `^`. Escaping `^` matters because a charset that starts with
 * `^` would otherwise produce a *negated* class (`"^"` alone would become `[^]`,
 * which matches every character).
 *
 * @param charset - the characters to match; `undefined` or an empty string means "no characters".
 * @returns a RegExp with the `g` flag, or `undefined` when `charset` is empty/undefined
 *   or when the resulting pattern is not a valid regular expression (e.g. a reversed range).
 */
export function charsetToRegExp(charset: CharacterSet | undefined): RegExp | undefined {
    if (!charset) return undefined;

    try {
        // Escape `]`, `[`, `\` and `^` so they are matched literally inside the class.
        const reg = `[${charset.replace(/[\][\\^]/g, '\\$&')}]`;
        return new RegExp(reg, 'g');
    } catch {
        // An unusable charset is treated the same as "nothing to ignore".
        return undefined;
    }
}
4 changes: 4 additions & 0 deletions packages/cspell-types/cspell.schema.json
Expand Up @@ -434,6 +434,10 @@
"$ref": "#/definitions/HunspellInformation",
"description": "Used by dictionary authors"
},
"ignore": {
"$ref": "#/definitions/CharacterSet",
"description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported."
},
"locale": {
"description": "The locale of the dictionary. Example: `nl,nl-be`",
"type": "string"
Expand Down
10 changes: 10 additions & 0 deletions packages/cspell-types/src/DictionaryInformation.ts
Expand Up @@ -47,6 +47,16 @@ export interface DictionaryInformation {
* If the word matches the pattern, then the penalty is applied.
*/
adjustments?: PatternAdjustment[];

/**
* An optional set of characters that can possibly be removed from a word before
* checking it.
*
* This is useful in languages like Arabic where Harakat accents are optional.
*
* Note: All matching characters are removed or none. Partial removal is not supported.
*/
ignore?: CharacterSet;
}

// cspell:ignore aeistlunkodmrvpgjhäõbüoöfcwzxðqþ aàâä eéèêë iîïy
Expand Down

0 comments on commit d3fbe6c

Please sign in to comment.