Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support Ignoring characters before checking #3311

Merged
merged 2 commits into from Jul 29, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions cspell-dict.txt
Expand Up @@ -17,6 +17,7 @@ estree
exonum
gimu
globstar
Harakat
jamstack
lcov
licia
Expand Down
4 changes: 4 additions & 0 deletions cspell.schema.json
Expand Up @@ -434,6 +434,10 @@
"$ref": "#/definitions/HunspellInformation",
"description": "Used by dictionary authors"
},
"ignore": {
"$ref": "#/definitions/CharacterSet",
"description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported."
},
"locale": {
"description": "The locale of the dictionary. Example: `nl,nl-be`",
"type": "string"
Expand Down
Expand Up @@ -32,6 +32,7 @@ describe('Verify using multiple dictionaries', () => {
const wordsC = ['ant', 'snail', 'beetle', 'worm', 'stink bug', 'centipede', 'millipede', 'flea', 'fly'];
const wordsD = ['red*', 'green*', 'blue*', 'pink*', 'black*', '*berry', '+-fruit', '*bug', 'pinkie'];
const wordsF = ['!pink*', '+berry', '+bug', '!stinkbug'];
const wordsG = ['café', 'accent'];

const wordsLegacy = ['error', 'code', 'system', 'ctrl'];

Expand Down Expand Up @@ -181,24 +182,35 @@ describe('Verify using multiple dictionaries', () => {
});

test.each`
word | expected
${'redberry'} | ${true}
${'pink'} | ${false}
${'bug'} | ${true}
${'blackberry'} | ${true}
${'pinkbug'} | ${true}
word | expected
${'redberry'} | ${true}
${'pink'} | ${false}
${'bug'} | ${true}
${'blackberry'} | ${true}
${'pinkbug'} | ${true}
${'cafe'} | ${false}
${'café'} | ${true}
${'cafe\u0301'} | ${true}
${'accent'} | ${true}
${'áccent'} | ${true /* ignore the accent. cspell:disable-line */}
${'a\u0301ccent'} | ${true /* ignore the accent. cspell:disable-line */}
${'applé'} | ${true /* ignore the accent. cspell:disable-line */}
`('checks has word: "$word"', ({ word, expected }) => {
const dicts = [
createSpellingDictionary(wordsA, 'wordsA', 'test', undefined),
createSpellingDictionary(wordsA, 'wordsA', 'test', { dictionaryInformation: { ignore: '\u0300-\u0362' } }),
createSpellingDictionary(wordsB, 'wordsB', 'test', undefined),
createSpellingDictionary(wordsC, 'wordsC', 'test', undefined),
createSpellingDictionary(wordsD, 'wordsD', 'test', undefined),
createSpellingDictionary(wordsF, 'wordsF', 'test', undefined),
createSpellingDictionary(wordsG, 'wordsA', 'test', {
dictionaryInformation: { ignore: '\u0300-\u0362' },
caseSensitive: true,
}),
createForbiddenWordsDictionary(['Avocado'], 'flag_words', 'test', undefined),
];

const dictCollection = createCollection(dicts, 'test');
expect(dictCollection.has(word)).toEqual(expected);
expect(dictCollection.has(word, { ignoreCase: false })).toEqual(expected);
});

test.each`
Expand Down
Expand Up @@ -10,6 +10,7 @@ import { getDefaultSettings } from '../Settings';
import { memorizer } from '../util/Memorizer';
import { createMapper } from '../util/repMap';
import { clean } from '../util/util';
import { charsetToRegExp } from './charset';
import {
FindResult,
HasOptions,
Expand All @@ -36,6 +37,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
readonly type = 'SpellingDictionaryFromTrie';
readonly isDictionaryCaseSensitive: boolean;
readonly containsNoSuggestWords: boolean;
readonly ignoreCharactersRegExp: RegExp | undefined;

private weightMap: WeightMap | undefined;

Expand All @@ -51,6 +53,7 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
this.containsNoSuggestWords = options.noSuggest || false;
this._size = size || 0;
this.weightMap = options.weightMap || createWeightMapFromDictionaryInformation(options.dictionaryInformation);
this.ignoreCharactersRegExp = charsetToRegExp(this.options.dictionaryInformation?.ignore);
}

public get size(): number {
Expand Down Expand Up @@ -103,6 +106,24 @@ export class SpellingDictionaryFromTrie implements SpellingDictionary {
word: string,
useCompounds: number | boolean | undefined,
ignoreCase: boolean
): FindAnyFormResult | undefined {
const outerForms = new Set([word]);
if (this.ignoreCharactersRegExp) {
outerForms.add(word.replace(this.ignoreCharactersRegExp, ''));
outerForms.add(word.normalize('NFD').replace(this.ignoreCharactersRegExp, ''));
outerForms.add(word.normalize('NFC').replace(this.ignoreCharactersRegExp, ''));
}
for (const form of outerForms) {
const r = this._findAnyForm(form, useCompounds, ignoreCase);
if (r) return r;
}
return undefined;
}

private _findAnyForm(
word: string,
useCompounds: number | boolean | undefined,
ignoreCase: boolean
): FindAnyFormResult | undefined {
const mWord = this.mapWord(word.normalize('NFC'));
const opts: FindWordOptions = { caseSensitive: !ignoreCase };
Expand Down
12 changes: 12 additions & 0 deletions packages/cspell-lib/src/SpellingDictionary/charset.ts
@@ -0,0 +1,12 @@
import { CharacterSet } from '@cspell/cspell-types';

/**
 * Build a global regular expression that matches any single character contained
 * in a dictionary `CharacterSet`.
 *
 * The set is placed inside a RegExp character class, so `-` between two characters
 * keeps its meaning as a range (for example `\u0300-\u0362`). Every other character
 * that could change the structure of the class is escaped and matched literally:
 * - `[` and `]` would open / close a class,
 * - `\` would start an escape sequence (or leave the class unterminated when it is
 *   the last character),
 * - `^` would negate the whole class when it is the first character.
 *
 * @param charset - the characters to match; `undefined` or an empty string means "no set".
 * @returns a RegExp with the `g` flag, or `undefined` when there is no set or the set
 *   cannot be turned into a valid expression (for example an out-of-order range
 *   such as `z-a`).
 */
export function charsetToRegExp(charset: CharacterSet | undefined): RegExp | undefined {
    if (!charset) return undefined;

    // Escape the class delimiters, the escape character itself and the negation marker.
    const escaped = charset.replace(/[\][\\^]/g, '\\$&');

    try {
        return new RegExp(`[${escaped}]`, 'g');
    } catch {
        // Best effort: an unusable set simply turns the feature off instead of throwing.
        return undefined;
    }
}
4 changes: 4 additions & 0 deletions packages/cspell-types/cspell.schema.json
Expand Up @@ -434,6 +434,10 @@
"$ref": "#/definitions/HunspellInformation",
"description": "Used by dictionary authors"
},
"ignore": {
"$ref": "#/definitions/CharacterSet",
"description": "An optional set of characters that can possibly be removed from a word before checking it.\n\nThis is useful in languages like Arabic where Harakat accents are optional.\n\nNote: All matching characters are removed or none. Partial removal is not supported."
},
"locale": {
"description": "The locale of the dictionary. Example: `nl,nl-be`",
"type": "string"
Expand Down
10 changes: 10 additions & 0 deletions packages/cspell-types/src/DictionaryInformation.ts
Expand Up @@ -47,6 +47,16 @@ export interface DictionaryInformation {
* If the word matches the pattern, then the penalty is applied.
*/
adjustments?: PatternAdjustment[];

/**
* An optional set of characters that can possibly be removed from a word before
* checking it.
*
* This is useful in languages like Arabic where Harakat accents are optional.
*
* Note: All matching characters are removed or none. Partial removal is not supported.
*/
ignore?: CharacterSet;
}

// cspell:ignore aeistlunkodmrvpgjhäõbüoöfcwzxðqþ aàâä eéèêë iîïy
Expand Down