From ac1ab980ff42ac147da200b978c3895ecaeee207 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Mon, 6 Sep 2021 10:27:25 +0200 Subject: [PATCH 1/2] fix: Perf - Use PairingHeap for SortedQueue --- ...rtedQueue.test.ts => MinHeapQueue.test.ts} | 12 +- .../util/{SortedQueue.ts => MinHeapQueue.ts} | 13 ++- .../cspell-lib/src/util/PairingHeap.test.ts | 51 +++++++++ packages/cspell-lib/src/util/PairingHeap.ts | 107 ++++++++++++++++++ packages/cspell-lib/src/util/wordSplitter.ts | 14 +-- 5 files changed, 179 insertions(+), 18 deletions(-) rename packages/cspell-lib/src/util/{SortedQueue.test.ts => MinHeapQueue.test.ts} (90%) rename packages/cspell-lib/src/util/{SortedQueue.ts => MinHeapQueue.ts} (87%) create mode 100644 packages/cspell-lib/src/util/PairingHeap.test.ts create mode 100644 packages/cspell-lib/src/util/PairingHeap.ts diff --git a/packages/cspell-lib/src/util/SortedQueue.test.ts b/packages/cspell-lib/src/util/MinHeapQueue.test.ts similarity index 90% rename from packages/cspell-lib/src/util/SortedQueue.test.ts rename to packages/cspell-lib/src/util/MinHeapQueue.test.ts index d0e86ac15bd..61468d0a4af 100644 --- a/packages/cspell-lib/src/util/SortedQueue.test.ts +++ b/packages/cspell-lib/src/util/MinHeapQueue.test.ts @@ -1,5 +1,5 @@ import { compare } from './Comparable'; -import { __testing__, SortedQueue } from './SortedQueue'; +import { __testing__, MinHeapQueue } from './MinHeapQueue'; const { addToHeap, takeFromHeap } = __testing__; @@ -25,7 +25,7 @@ describe('Validate Mere Sort methods', () => { ${'abc def ghi jkl mno pqr stu vwx yz'} ${'aaaaaaaaaaaaaaaaaa'} `('Merge Queue $letters', ({ letters }: { letters: string }) => { - const q = new SortedQueue(compare); + const q = new MinHeapQueue(compare); const values = letters.split(''); q.concat(values); expect(q.length).toBe(values.length); @@ -35,7 +35,7 @@ describe('Validate Mere Sort methods', () => { }); test('Queue', () => { - const q = new SortedQueue(compare); + const q = new MinHeapQueue(compare); 
expect(q.length).toBe(0); q.add('one'); q.add('two'); @@ -60,13 +60,13 @@ describe('Validate Mere Sort methods', () => { ]; const sorted = values.concat().sort(compare); - const q = new SortedQueue(compare); + const q = new MinHeapQueue(compare); q.concat(values); expect([...q]).toEqual(sorted); }); test('Queue Random', () => { - const q = new SortedQueue(compare); + const q = new MinHeapQueue(compare); for (let i = 0; i < 100; ++i) { const s = Math.random(); const n = Math.floor(100 * s); @@ -83,7 +83,7 @@ describe('Validate Mere Sort methods', () => { }); test('Clone', () => { - const q = new SortedQueue(compare); + const q = new MinHeapQueue(compare); for (let i = 0; i < 10; ++i) { const s = Math.random(); const n = Math.floor(100 * s); diff --git a/packages/cspell-lib/src/util/SortedQueue.ts b/packages/cspell-lib/src/util/MinHeapQueue.ts similarity index 87% rename from packages/cspell-lib/src/util/SortedQueue.ts rename to packages/cspell-lib/src/util/MinHeapQueue.ts index 83c5fd7f2da..672ef390c97 100644 --- a/packages/cspell-lib/src/util/SortedQueue.ts +++ b/packages/cspell-lib/src/util/MinHeapQueue.ts @@ -46,11 +46,14 @@ function takeFromHeap(t: T[], compare: (a: T, b: T) => number): T | undefined return result; } -export class SortedQueue implements IterableIterator { +/** + * MinHeapQueue - based upon a minHeap array. 
+ */ +export class MinHeapQueue implements IterableIterator { private values: T[] = []; constructor(readonly compare: (a: T, b: T) => number) {} - add(t: T): SortedQueue { + add(t: T): MinHeapQueue { addToHeap(this.values, t, this.compare); return this; } @@ -63,7 +66,7 @@ export class SortedQueue implements IterableIterator { return takeFromHeap(this.values, this.compare); } - concat(i: Iterable): SortedQueue { + concat(i: Iterable): MinHeapQueue { for (const v of i) { this.add(v); } @@ -86,8 +89,8 @@ export class SortedQueue implements IterableIterator { return this; } - clone(): SortedQueue { - const clone = new SortedQueue(this.compare); + clone(): MinHeapQueue { + const clone = new MinHeapQueue(this.compare); clone.values = this.values.concat(); return clone; } diff --git a/packages/cspell-lib/src/util/PairingHeap.test.ts b/packages/cspell-lib/src/util/PairingHeap.test.ts new file mode 100644 index 00000000000..4a4cea2f41b --- /dev/null +++ b/packages/cspell-lib/src/util/PairingHeap.test.ts @@ -0,0 +1,51 @@ +import { PairingHeap } from './PairingHeap'; + +describe('PairingHeap', () => { + test('Basic add and remove', () => { + const compare = new Intl.Collator().compare; + const values = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']; + const sorted = values.concat().sort(compare); + const heap = new PairingHeap(compare); + values.forEach((v) => heap.add(v)); + expect(heap.length).toBe(values.length); + const result = [...heap]; + expect(result).toEqual(sorted); + expect(heap.length).toBe(0); + }); + + interface Person { + name: string; + } + + test('FIFO for latest', () => { + const compareStr = new Intl.Collator().compare; + const compare = (a: Person, b: Person) => compareStr(a.name, b.name); + const names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July']; + const people: Person[] = names.map((name) => ({ name })); + const sorted = people.concat().sort(compare); + const heap = new PairingHeap(compare); + + heap.add(people[0]); + 
expect(heap.dequeue()).toBe(people[0]); + expect(heap.length).toBe(0); + expect(heap.peek()).toBeUndefined(); + expect(heap.dequeue()).toBeUndefined(); + + heap.concat(people); + expect(heap.dequeue()).toBe(sorted[0]); + expect(heap.dequeue()).toBe(sorted[1]); + heap.concat(people); + expect(heap.dequeue()).toBe(sorted[0]); + expect(heap.dequeue()).toBe(sorted[1]); + expect(heap.peek()).toBe(sorted[2]); + expect(heap.dequeue()).toBe(sorted[2]); + expect(heap.dequeue()).toBe(sorted[2]); + expect(heap.peek()).toBe(sorted[3]); + + heap.add(sorted[0]); + expect(heap.peek()).toBe(sorted[0]); + const copy = { ...sorted[0] }; + // Make sure we get back the one we added. + expect(heap.add(copy).peek()).toBe(copy); + }); +}); diff --git a/packages/cspell-lib/src/util/PairingHeap.ts b/packages/cspell-lib/src/util/PairingHeap.ts new file mode 100644 index 00000000000..2669bb558e4 --- /dev/null +++ b/packages/cspell-lib/src/util/PairingHeap.ts @@ -0,0 +1,107 @@ +export interface PairHeapNode { + /** Value */ + v: T; + /** Siblings */ + s: PairHeapNode | undefined; + /** Children */ + c: PairHeapNode | undefined; +} + +export type CompareFn = (a: T, b: T) => number; + +export class PairingHeap implements IterableIterator { + private _heap: PairHeapNode | undefined; + private _size = 0; + + constructor(readonly compare: CompareFn) {} + + add(v: T): this { + this._heap = insert(this.compare, this._heap, v); + ++this._size; + return this; + } + + dequeue(): T | undefined { + const n = this.next(); + if (n.done) return undefined; + return n.value; + } + + concat(i: Iterable): this { + for (const v of i) { + this.add(v); + } + return this; + } + + next(): IteratorResult { + if (!this._heap) { + return { value: undefined, done: true }; + } + const value = this._heap.v; + --this._size; + this._heap = removeHead(this.compare, this._heap); + return { value }; + } + + peek(): T | undefined { + return this._heap?.v; + } + + [Symbol.iterator](): IterableIterator { + return this; + } + + 
get length(): number { + return this._size; + } +} + +function removeHead(compare: CompareFn, heap: PairHeapNode | undefined): PairHeapNode | undefined { + if (!heap || !heap.c) return undefined; + return mergeSiblings(compare, heap.c); +} + +function insert(compare: CompareFn, heap: PairHeapNode | undefined, v: T): PairHeapNode { + const n: PairHeapNode = { + v, + s: undefined, + c: undefined, + }; + + if (!heap || compare(v, heap.v) <= 0) { + n.c = heap; + return n; + } + + n.s = heap.c; + heap.c = n; + return heap; +} + +function merge(compare: CompareFn, a: PairHeapNode, b: PairHeapNode): PairHeapNode { + if (compare(a.v, b.v) <= 0) { + a.s = undefined; + b.s = a.c; + a.c = b; + return a; + } + b.s = undefined; + a.s = b.c; + b.c = a; + return b; +} + +function mergeSiblings(compare: CompareFn, n: PairHeapNode): PairHeapNode { + if (!n.s) return n; + const s = n.s; + const ss = s.s; + const m = merge(compare, n, s); + return ss ? merge(compare, m, mergeSiblings(compare, ss)) : m; +} + +export const heapMethods = { + insert, + merge, + mergeSiblings, +}; diff --git a/packages/cspell-lib/src/util/wordSplitter.ts b/packages/cspell-lib/src/util/wordSplitter.ts index f4836f5b9b8..a6e09ac0cac 100644 --- a/packages/cspell-lib/src/util/wordSplitter.ts +++ b/packages/cspell-lib/src/util/wordSplitter.ts @@ -1,15 +1,15 @@ +import { PairingHeap } from './PairingHeap'; +import { escapeRegEx } from './regexHelper'; import { TextOffset } from './text'; import { - regExWordsAndDigits, + regExDanglingQuote, + regExEscapeCharacters, + regExPossibleWordBreaks, regExSplitWords, regExSplitWords2, - regExPossibleWordBreaks, - regExEscapeCharacters, - regExDanglingQuote, regExTrailingEndings, + regExWordsAndDigits, } from './textRegex'; -import { SortedQueue } from './SortedQueue'; -import { escapeRegEx } from './regexHelper'; const ignoreBreak: readonly number[] = Object.freeze([] as number[]); @@ -388,7 +388,7 @@ function splitIntoWords( } let maxCost = lineSeg.relEnd - 
lineSeg.relStart; - const candidates = new SortedQueue(compare); + const candidates = new PairingHeap(compare); const text = lineSeg.line.text; candidates.concat(makeCandidates(undefined, lineSeg.relStart, 0, 0)); let attempts = 0; From 2f3f08ce0b7106bf536a73f70e4e8ee256c1d881 Mon Sep 17 00:00:00 2001 From: Jason Dent Date: Mon, 6 Sep 2021 10:28:10 +0200 Subject: [PATCH 2/2] dev: Try using an A* algorithm to find suggestions. --- .../src/lib/PairingHeap.test.ts | 51 ++ .../cspell-trie-lib/src/lib/PairingHeap.ts | 107 +++++ .../cspell-trie-lib/src/lib/orthography.ts | 2 +- .../src/lib/suggest-en-a-star.test.ts | 179 +++++++ packages/cspell-trie-lib/src/lib/suggest.ts | 4 +- .../src/lib/suggestAStar.test.ts | 262 +++++++++++ .../cspell-trie-lib/src/lib/suggestAStar.ts | 445 ++++++++++++++++++ 7 files changed, 1047 insertions(+), 3 deletions(-) create mode 100644 packages/cspell-trie-lib/src/lib/PairingHeap.test.ts create mode 100644 packages/cspell-trie-lib/src/lib/PairingHeap.ts create mode 100644 packages/cspell-trie-lib/src/lib/suggest-en-a-star.test.ts create mode 100644 packages/cspell-trie-lib/src/lib/suggestAStar.test.ts create mode 100644 packages/cspell-trie-lib/src/lib/suggestAStar.ts diff --git a/packages/cspell-trie-lib/src/lib/PairingHeap.test.ts b/packages/cspell-trie-lib/src/lib/PairingHeap.test.ts new file mode 100644 index 00000000000..4a4cea2f41b --- /dev/null +++ b/packages/cspell-trie-lib/src/lib/PairingHeap.test.ts @@ -0,0 +1,51 @@ +import { PairingHeap } from './PairingHeap'; + +describe('PairingHeap', () => { + test('Basic add and remove', () => { + const compare = new Intl.Collator().compare; + const values = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']; + const sorted = values.concat().sort(compare); + const heap = new PairingHeap(compare); + values.forEach((v) => heap.add(v)); + expect(heap.length).toBe(values.length); + const result = [...heap]; + expect(result).toEqual(sorted); + expect(heap.length).toBe(0); + }); + + interface 
Person { + name: string; + } + + test('FIFO for latest', () => { + const compareStr = new Intl.Collator().compare; + const compare = (a: Person, b: Person) => compareStr(a.name, b.name); + const names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July']; + const people: Person[] = names.map((name) => ({ name })); + const sorted = people.concat().sort(compare); + const heap = new PairingHeap(compare); + + heap.add(people[0]); + expect(heap.dequeue()).toBe(people[0]); + expect(heap.length).toBe(0); + expect(heap.peek()).toBeUndefined(); + expect(heap.dequeue()).toBeUndefined(); + + heap.concat(people); + expect(heap.dequeue()).toBe(sorted[0]); + expect(heap.dequeue()).toBe(sorted[1]); + heap.concat(people); + expect(heap.dequeue()).toBe(sorted[0]); + expect(heap.dequeue()).toBe(sorted[1]); + expect(heap.peek()).toBe(sorted[2]); + expect(heap.dequeue()).toBe(sorted[2]); + expect(heap.dequeue()).toBe(sorted[2]); + expect(heap.peek()).toBe(sorted[3]); + + heap.add(sorted[0]); + expect(heap.peek()).toBe(sorted[0]); + const copy = { ...sorted[0] }; + // Make sure we get back the one we added. 
+ expect(heap.add(copy).peek()).toBe(copy); + }); +}); diff --git a/packages/cspell-trie-lib/src/lib/PairingHeap.ts b/packages/cspell-trie-lib/src/lib/PairingHeap.ts new file mode 100644 index 00000000000..2669bb558e4 --- /dev/null +++ b/packages/cspell-trie-lib/src/lib/PairingHeap.ts @@ -0,0 +1,107 @@ +export interface PairHeapNode { + /** Value */ + v: T; + /** Siblings */ + s: PairHeapNode | undefined; + /** Children */ + c: PairHeapNode | undefined; +} + +export type CompareFn = (a: T, b: T) => number; + +export class PairingHeap implements IterableIterator { + private _heap: PairHeapNode | undefined; + private _size = 0; + + constructor(readonly compare: CompareFn) {} + + add(v: T): this { + this._heap = insert(this.compare, this._heap, v); + ++this._size; + return this; + } + + dequeue(): T | undefined { + const n = this.next(); + if (n.done) return undefined; + return n.value; + } + + concat(i: Iterable): this { + for (const v of i) { + this.add(v); + } + return this; + } + + next(): IteratorResult { + if (!this._heap) { + return { value: undefined, done: true }; + } + const value = this._heap.v; + --this._size; + this._heap = removeHead(this.compare, this._heap); + return { value }; + } + + peek(): T | undefined { + return this._heap?.v; + } + + [Symbol.iterator](): IterableIterator { + return this; + } + + get length(): number { + return this._size; + } +} + +function removeHead(compare: CompareFn, heap: PairHeapNode | undefined): PairHeapNode | undefined { + if (!heap || !heap.c) return undefined; + return mergeSiblings(compare, heap.c); +} + +function insert(compare: CompareFn, heap: PairHeapNode | undefined, v: T): PairHeapNode { + const n: PairHeapNode = { + v, + s: undefined, + c: undefined, + }; + + if (!heap || compare(v, heap.v) <= 0) { + n.c = heap; + return n; + } + + n.s = heap.c; + heap.c = n; + return heap; +} + +function merge(compare: CompareFn, a: PairHeapNode, b: PairHeapNode): PairHeapNode { + if (compare(a.v, b.v) <= 0) { + a.s = 
undefined; + b.s = a.c; + a.c = b; + return a; + } + b.s = undefined; + a.s = b.c; + b.c = a; + return b; +} + +function mergeSiblings(compare: CompareFn, n: PairHeapNode): PairHeapNode { + if (!n.s) return n; + const s = n.s; + const ss = s.s; + const m = merge(compare, n, s); + return ss ? merge(compare, m, mergeSiblings(compare, ss)) : m; +} + +export const heapMethods = { + insert, + merge, + mergeSiblings, +}; diff --git a/packages/cspell-trie-lib/src/lib/orthography.ts b/packages/cspell-trie-lib/src/lib/orthography.ts index 70f82399f1b..1e7d00fa560 100644 --- a/packages/cspell-trie-lib/src/lib/orthography.ts +++ b/packages/cspell-trie-lib/src/lib/orthography.ts @@ -58,7 +58,7 @@ export const visualLetterMaskMap: Record = calcVisua */ function calcVisualLetterMasks(groups: string[]): Record { // map each letter in a group to the index of the group. - const map: Record = {}; + const map: Record = Object.create(null); for (let i = 0; i < groups.length; ++i) { const m = 1 << i; const g = groups[i]; diff --git a/packages/cspell-trie-lib/src/lib/suggest-en-a-star.test.ts b/packages/cspell-trie-lib/src/lib/suggest-en-a-star.test.ts new file mode 100644 index 00000000000..5d697347d31 --- /dev/null +++ b/packages/cspell-trie-lib/src/lib/suggest-en-a-star.test.ts @@ -0,0 +1,179 @@ +import { suggestionCollector, SuggestionResult, SuggestionCollectorOptions } from './suggest'; +import { suggest, genCompoundableSuggestions } from './suggestAStar'; +import { CompoundWordsMethod } from './walker'; +import { readTrie } from './dictionaries.test.helper'; + +function getTrie() { + return readTrie('@cspell/dict-en_us/cspell-ext.json'); +} + +const timeout = 10000; + +interface ExpectedSuggestion extends Partial { + word: string; +} + +describe('Validate English Suggestions', () => { + interface WordSuggestionsTest { + word: string; + expected: ExpectedSuggestion[]; + } + + // cspell:ignore emplode ballence catagory cateogry + test.each` + word | expected + ${'hello'} | ${sr({ 
word: 'hello', cost: 0 })} + ${'apple'} | ${sr({ word: 'apple', cost: 0 }, { word: 'apples', cost: 100 })} + ${'emplode'} | ${sr('implode')} + ${'dont'} | ${sr("don't")} + ${'ballence'} | ${sr('balance')} + ${'catagory'} | ${sr('category')} + ${'cateogry'} | ${sr({ word: 'category', cost: 75 })} + `('suggestions for $word', async ({ word, expected }: WordSuggestionsTest) => { + const trie = await getTrie(); + const x = suggest(trie.root, word); + expect(x).toEqual(expect.arrayContaining(expected.map((e) => expect.objectContaining(e)))); + }); + + test( + 'Tests suggestions "joyful"', + async () => { + const trie = await getTrie(); + const collector = suggestionCollector('joyful', opts(8, undefined, 1)); + collector.collect(genCompoundableSuggestions(trie.root, collector.word, CompoundWordsMethod.NONE)); + const results = collector.suggestions; + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(expect.arrayContaining(['joyful'])); + expect(suggestions[0]).toBe('joyful'); + }, + timeout + ); + + // test( + // 'Tests suggestions "joyfull"', + // async () => { + // const trie = await getTrie(); + // // cspell:ignore joyfull + // const collector = suggestionCollector('joyfull', opts(8)); + // collector.collect( + // genCompoundableSuggestions(trie.root, collector.word, CompoundWordsMethod.SEPARATE_WORDS) + // ); + // const results = collector.suggestions; + // const suggestions = results.map((s) => s.word); + // expect(suggestions).toEqual(expect.arrayContaining(['joyful'])); + // expect(suggestions[0]).toBe('joyfully'); + // expect(suggestions[1]).toBe('joyful'); + // expect(suggestions).toHaveLength(collector.maxNumSuggestions); + // }, + // timeout + // ); + + // test( + // 'Tests compound SEPARATE_WORDS suggestions', + // async () => { + // const trie = await getTrie(); + // // cspell:ignore onetwothreefour + // const collector = suggestionCollector('onetwothreefour', opts(8, undefined, 3.3)); + // collector.collect( + // 
genCompoundableSuggestions(trie.root, collector.word, CompoundWordsMethod.SEPARATE_WORDS) + // ); + // const results = collector.suggestions; + // const suggestions = results.map((s) => s.word); + // expect(suggestions).toEqual(expect.arrayContaining(['one two three four'])); + // expect(suggestions[0]).toBe('one two three four'); + // expect(suggestions).toHaveLength(collector.maxNumSuggestions); + // }, + // timeout + // ); + + // test( + // 'Tests compound JOIN_WORDS suggestions', + // async () => { + // const trie = await getTrie(); + // // cspell:ignore onetwothrefour + // const collector = suggestionCollector('onetwothreefour', opts(8, undefined, 3)); + // collector.collect(genCompoundableSuggestions(trie.root, collector.word, CompoundWordsMethod.JOIN_WORDS)); + // const results = collector.suggestions; + // const suggestions = results.map((s) => s.word); + // expect(suggestions).toEqual(expect.arrayContaining(['one+two+three+four'])); + // expect(suggestions).toHaveLength(collector.maxNumSuggestions); + // }, + // timeout + // ); + + // test( + // 'Tests compound suggestions', + // async () => { + // const trie = await getTrie(); + // // cspell:ignore onetwothrefour + // const collector = suggestionCollector('onetwothreefour', opts(8, undefined, 3)); + // collector.collect(genCompoundableSuggestions(trie.root, collector.word, CompoundWordsMethod.JOIN_WORDS)); + // const results = collector.suggestions; + // const suggestions = results.map((s) => s.word); + // expect(suggestions).toEqual(expect.arrayContaining(['one+two+three+four'])); + // expect(suggestions).toHaveLength(collector.maxNumSuggestions); + // }, + // timeout + // ); + + // // Takes a long time. 
+ // test( + // 'Tests long compound suggestions `testscomputesuggestions`', + // async () => { + // const trie = await getTrie(); + // // cspell:ignore testscomputesuggestions + // const collector = suggestionCollector('testscomputesuggestions', opts(2, undefined, 3, true)); + // collector.collect( + // genCompoundableSuggestions(trie.root, collector.word, CompoundWordsMethod.SEPARATE_WORDS) + // ); + // const results = collector.suggestions; + // const suggestions = results.map((s) => s.word); + // expect(suggestions).toHaveLength(collector.maxNumSuggestions); + // expect(suggestions).toEqual(['tests compute suggestions', 'test compute suggestions']); + // expect(suggestions[0]).toBe('tests compute suggestions'); + // }, + // timeout + // ); + + // // Takes a long time. + // test( + // 'Tests long compound suggestions `testscompundsuggestions`', + // async () => { + // const trie = await getTrie(); + // // cspell:ignore testscompundsuggestions + // const collector = suggestionCollector('testscompundsuggestions', opts(1, undefined, 3)); + // collector.collect( + // genCompoundableSuggestions(trie.root, collector.word, CompoundWordsMethod.SEPARATE_WORDS) + // ); + // const results = collector.suggestions; + // const suggestions = results.map((s) => s.word); + // expect(suggestions).toHaveLength(collector.maxNumSuggestions); + // expect(suggestions).toEqual(expect.arrayContaining(['tests compound suggestions'])); + // expect(suggestions[0]).toBe('tests compound suggestions'); + // }, + // timeout + // ); +}); + +function opts( + numSuggestions: number, + filter?: SuggestionCollectorOptions['filter'], + changeLimit?: number, + includeTies?: boolean, + ignoreCase?: boolean +): SuggestionCollectorOptions { + return { + numSuggestions, + filter, + changeLimit, + includeTies, + ignoreCase, + }; +} + +function sr(...sugs: (string | ExpectedSuggestion)[]): ExpectedSuggestion[] { + return sugs.map((s) => { + if (typeof s === 'string') return { word: s }; + return s; + }); 
+} diff --git a/packages/cspell-trie-lib/src/lib/suggest.ts b/packages/cspell-trie-lib/src/lib/suggest.ts index b916c71b77e..2812b1211bf 100644 --- a/packages/cspell-trie-lib/src/lib/suggest.ts +++ b/packages/cspell-trie-lib/src/lib/suggest.ts @@ -102,7 +102,7 @@ export function* genCompoundableSuggestions( [JOIN_SEPARATOR]: insertSpaceCost, }; - let costLimit: MaxCost = Math.min(bc * word.length * maxCostScale, bc * maxNumChanges); + let costLimit: MaxCost = bc * Math.min(word.length * maxCostScale, maxNumChanges); const a = 0; let b = 0; for (let i = 0, c = 0; i <= mx && c <= costLimit; ++i) { @@ -289,7 +289,7 @@ export function suggestionCollector(wordToMatch: string, options: SuggestionColl const { filter = () => true, changeLimit = maxNumChanges, includeTies = false, ignoreCase = true } = options; const numSuggestions = Math.max(options.numSuggestions, 0) || 0; const sugs = new Map(); - let maxCost: number = Math.min(baseCost * wordToMatch.length * maxAllowedCostScale, baseCost * changeLimit); + let maxCost: number = baseCost * Math.min(wordToMatch.length * maxAllowedCostScale, changeLimit); function dropMax() { if (sugs.size < 2) { diff --git a/packages/cspell-trie-lib/src/lib/suggestAStar.test.ts b/packages/cspell-trie-lib/src/lib/suggestAStar.test.ts new file mode 100644 index 00000000000..f803c8567ff --- /dev/null +++ b/packages/cspell-trie-lib/src/lib/suggestAStar.test.ts @@ -0,0 +1,262 @@ +import { suggestionCollector } from '.'; +import { parseDictionary } from './SimpleDictionaryParser'; +import { SuggestionCollectorOptions } from './suggest'; +import * as Sug from './suggestAStar'; +import { Trie } from './trie'; +import * as Walker from './walker'; + +const defaultOptions: SuggestionCollectorOptions = { + numSuggestions: 10, + ignoreCase: undefined, + changeLimit: undefined, + includeTies: true, +}; + +const stopHere = true; + +describe('Validate Suggest', () => { + test('Tests suggestions for valid word talks', () => { + const trie = 
Trie.create(sampleWords); + const results = Sug.suggest(trie.root, 'talks'); + expect(results).toEqual([ + { cost: 0, word: 'talks' }, + { cost: 100, word: 'talk' }, + { cost: 125, word: 'walks' }, + { cost: 200, word: 'talked' }, + { cost: 200, word: 'talker' }, + { cost: 225, word: 'walk' }, + ]); + }); + + test('Tests suggestions for valid word', () => { + const trie = Trie.create(sampleWords); + const results = Sug.suggest(trie.root, 'talks'); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(expect.arrayContaining(['talks'])); + expect(suggestions).toEqual(expect.arrayContaining(['talk'])); + expect(suggestions[0]).toBe('talks'); + expect(suggestions[1]).toBe('talk'); + expect(suggestions).toEqual(['talks', 'talk', 'walks', 'talked', 'talker', 'walk']); + }); + + test('Tests suggestions for invalid word', () => { + const trie = Trie.create(sampleWords); + // cspell:ignore tallk + const results = Sug.suggest(trie.root, 'tallk'); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(expect.arrayContaining(['talks'])); + expect(suggestions).toEqual(expect.arrayContaining(['talk'])); + expect(suggestions[1]).toBe('talks'); + expect(suggestions[0]).toBe('talk'); + expect(suggestions).toEqual(['talk', 'talks', 'walk', 'talked', 'talker', 'walks']); + }); + + // cspell:ignore jernals + test('Tests suggestions jernals', () => { + const trie = Trie.create(sampleWords); + const results = Sug.suggest(trie.root, 'jernals'); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(['journals', 'journal']); + }); + + // cspell:ignore juornals + test('Tests suggestions for `juornals` (reduced cost for swap)', () => { + const trie = Trie.create(sampleWords); + const results = Sug.suggest(trie.root, 'juornals'); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(['journals', 'journal', 'journalism', 'journalist']); + }); + + test('Tests suggestions for joyfull', () => { 
+ const trie = Trie.create(sampleWords); + const results = Sug.suggest(trie.root, 'joyfull'); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(['joyful', 'joyfully', 'joyfuller', 'joyous', 'joyfullest']); + }); + + test('Tests suggestions', () => { + const trie = Trie.create(sampleWords); + const results = Sug.suggest(trie.root, ''); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual([]); + }); + + // cspell:ignore joyfull + test('Tests suggestions with low max num', () => { + const trie = Trie.create(sampleWords); + const results = Sug.suggest(trie.root, 'joyfull', 3); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(['joyful', 'joyfully', 'joyfuller']); + }); + + test('Tests genSuggestions', () => { + const trie = Trie.create(sampleWords); + const collector = suggestionCollector( + 'joyfull', + sugOpts({ + numSuggestions: 3, + filter: (word) => word !== 'joyfully', + }) + ); + collector.collect(Sug.genSuggestions(trie.root, collector.word)); + const suggestions = collector.suggestions.map((s) => s.word); + expect(suggestions).toEqual(expect.not.arrayContaining(['joyfully'])); + // We get 4 because they are tied + expect(suggestions).toEqual(['joyful', 'joyfuller', 'joyous', 'joyfullest']); + expect(collector.maxCost).toBeLessThanOrEqual(300); + }); + + test('Tests genSuggestions wanting 0', () => { + const trie = Trie.create(sampleWords); + const collector = suggestionCollector('joyfull', sugOptsMaxNum(0)); + collector.collect(Sug.genSuggestions(trie.root, collector.word)); + const suggestions = collector.suggestions.map((s) => s.word); + expect(suggestions).toHaveLength(0); + }); + + test('Tests genSuggestions wanting -10', () => { + const trie = Trie.create(sampleWords); + const collector = suggestionCollector('joyfull', sugOptsMaxNum(-10)); + collector.collect(Sug.genSuggestions(trie.root, collector.word)); + const suggestions = collector.suggestions.map((s) => s.word); 
+ expect(suggestions).toHaveLength(0); + }); + + // cspell:ignore wålk + test('that accents are closer', () => { + const trie = Trie.create(sampleWords); + const collector = suggestionCollector('wålk', sugOptsMaxNum(3)); + collector.collect( + Sug.genCompoundableSuggestions(trie.root, collector.word, Walker.CompoundWordsMethod.JOIN_WORDS) + ); + const suggestions = collector.suggestions.map((s) => s.word); + expect(suggestions).toEqual(['walk', 'walks', 'talk']); + }); + + // cspell:ignore wâlkéd + test('that multiple accents are closer', () => { + const trie = Trie.create(sampleWords); + const collector = suggestionCollector('wâlkéd', sugOptsMaxNum(3)); + collector.collect( + Sug.genCompoundableSuggestions(trie.root, collector.word, Walker.CompoundWordsMethod.JOIN_WORDS) + ); + const suggestions = collector.suggestions.map((s) => s.word); + expect(suggestions).toEqual(['walked', 'walker', 'talked']); + }); + + if (stopHere) return; + + // cspell:ignore walkingtalkingjoy + test('Tests compound suggestions', () => { + const trie = Trie.create(sampleWords); + const results = Sug.suggest(trie.root, 'walkingtalkingjoy', 1, Walker.CompoundWordsMethod.SEPARATE_WORDS); + const suggestions = results.map((s) => s.word); + expect(suggestions).toEqual(['walking talking joy']); + }); + + // cspell:ignore joyfullwalk + test('Tests genSuggestions with compounds SEPARATE_WORDS', () => { + const trie = Trie.create(sampleWords); + const collector = suggestionCollector('joyfullwalk', sugOptsMaxNum(3)); + collector.collect( + Sug.genCompoundableSuggestions(trie.root, collector.word, Walker.CompoundWordsMethod.SEPARATE_WORDS) + ); + const suggestions = collector.suggestions.map((s) => s.word); + expect(suggestions).toEqual(['joyful walk', 'joyful walks', 'joyfully walk']); + expect(collector.maxCost).toBeLessThan(300); + }); + + // cspell:ignore joyfullwalk joyfulwalk joyfulwalks joyfullywalk, joyfullywalks + test('Tests genSuggestions with compounds JOIN_WORDS', () => { + const trie 
= Trie.create(sampleWords); + const collector = suggestionCollector('joyfullwalk', sugOptsMaxNum(3)); + collector.collect( + Sug.genCompoundableSuggestions(trie.root, collector.word, Walker.CompoundWordsMethod.JOIN_WORDS) + ); + const suggestions = collector.suggestions.map((s) => s.word); + expect(suggestions).toEqual(['joyful+walk', 'joyful+walks', 'joyfully+walk']); + expect(collector.maxCost).toBeLessThan(300); + }); + + // cspell:ignore walkingtree talkingtree + test('that forbidden words are not included (collector)', () => { + const trie = parseDictionary(` + walk + walking* + *stick + talking* + *tree + !walkingtree + `); + expect(trie.suggest('walkingstick', 1)).toEqual(['walkingstick']); + expect(trie.suggest('walkingtree', 1)).toEqual([]); + expect(trie.suggest('walking*', 1)).toEqual(['walking']); + const collector = suggestionCollector('walkingtree', sugOptsMaxNum(2)); + trie.genSuggestions(collector); + expect(collector.suggestions).toEqual([ + { word: 'talkingtree', cost: 99 }, + { word: 'walkingstick', cost: 359 }, + ]); + }); +}); + +function sugOpts(opts: Partial): SuggestionCollectorOptions { + return { + ...defaultOptions, + ...opts, + }; +} + +function sugOptsMaxNum(maxNumSuggestions: number): SuggestionCollectorOptions { + return sugOpts({ numSuggestions: maxNumSuggestions }); +} + +const sampleWords = [ + 'walk', + 'walked', + 'walker', + 'walking', + 'walks', + 'talk', + 'talks', + 'talked', + 'talker', + 'talking', + 'lift', + 'lifts', + 'lifted', + 'lifter', + 'lifting', + 'journal', + 'journals', + 'journalism', + 'journalist', + 'journalistic', + 'journey', + 'journeyer', + 'journeyman', + 'journeymen', + 'joust', + 'jouster', + 'jousting', + 'jovial', + 'joviality', + 'jowl', + 'jowly', + 'joy', + 'joyful', + 'joyfuller', + 'joyfullest', + 'joyfully', + 'joyfulness', + 'joyless', + 'joylessness', + 'joyous', + 'joyousness', + 'joyridden', + 'joyride', + 'joyrider', + 'joyriding', + 'joyrode', + 'joystick', +]; diff --git 
a/packages/cspell-trie-lib/src/lib/suggestAStar.ts b/packages/cspell-trie-lib/src/lib/suggestAStar.ts new file mode 100644 index 00000000000..1c7d24a63a2 --- /dev/null +++ b/packages/cspell-trie-lib/src/lib/suggestAStar.ts @@ -0,0 +1,445 @@ +import { TrieRoot, TrieNode } from './TrieNode'; +import { CompoundWordsMethod } from './walker'; +import { SuggestionIterator } from './suggest'; +import { PairingHeap } from './PairingHeap'; +import { suggestionCollector, SuggestionResult } from '.'; +import { visualLetterMaskMap } from './orthography'; + +export function* genCompoundableSuggestions( + root: TrieRoot, + word: string, + _compoundMethod: CompoundWordsMethod, + ignoreCase = true +): SuggestionIterator { + const len = word.length; + + const nodes = determineInitialNodes(root, ignoreCase); + const noFollow = determineNoFollow(root); + + function compare(a: Candidate, b: Candidate): number { + const deltaCost = a.g - b.g; + if (deltaCost) return deltaCost; + // The costs are the same; return the one with the most progress. + + return b.i - a.i; + } + + const opCosts = { + baseCost: 100, + swapCost: 75, + duplicateLetterCost: 25, + visuallySimilar: 1, + firstLetterBias: 25, + } as const; + + const bc = opCosts.baseCost; + const maxNumChanges = 3; + const maxCostScale = 1.03 / 2; + const optimalCost = 0; + const mapSugCost = opCosts.visuallySimilar; + + /** costLimit is the delta between ideal cost and actual cost. 
*/ + let costLimit = bc * Math.min(len * maxCostScale, maxNumChanges); + + const candidates = new PairingHeap(compare); + const locationCache: LocationCache = new Map(); + const wordsToEmit: EmitWord[] = []; + const pathToLocation: Map = new Map(); + + const edgesToResolve: EdgeToResolve[] = []; + + const emittedWords = new Map(); + + function getLocationNode(path: Path): LocationNode { + const index = path.i; + const node = path.n; + const foundByIndex = locationCache.get(index); + const byTrie: LocationByTriNode = foundByIndex || new Map(); + if (!foundByIndex) locationCache.set(index, byTrie); + const f = byTrie.get(node); + const n: LocationNode = f || { in: new Map(), bc: 0, p: path, sbc: -1, sfx: [] }; + if (!f) byTrie.set(node, n); + return n; + } + + function* emitWord(word: string, cost: number): SuggestionIterator { + if (cost <= costLimit) { + // console.log(`e: ${word} ${cost}`); + const f = emittedWords.get(word); + if (f !== undefined && f <= cost) return undefined; + emittedWords.set(word, cost); + const lastChar = word[word.length - 1]; + if (!noFollow[lastChar]) { + const changeLimit = (yield { word: word, cost: cost - optimalCost }) ?? costLimit - optimalCost; + costLimit = Math.min(changeLimit + optimalCost, costLimit); + } + } + return undefined; + } + + function* emitWords(): SuggestionIterator { + for (const w of wordsToEmit) { + yield* emitWord(w.word, w.cost); + } + wordsToEmit.length = 0; + return undefined; + } + + function addEdge(path: Path, edge: Edge): Edge | undefined { + const g = path.g + edge.c; + const i = edge.i; + const h = 0; // (len - i) * bc; + const f = g + h; + if (f > costLimit) return undefined; + + const { n } = edge; + const w = path.w + edge.s; + const can: Path = { e: edge, n, i, w, g, f, r: new Set(), a: true }; + const location = getLocationNode(can); + + // Is Location Resolved + if (location.sbc >= 0 && location.sbc <= can.g) { + // No need to go further, this node has been resolved. 
+ // Return the edge to be resolved + path.r.add(edge); + edgesToResolve.push({ edge, suffixes: location.sfx }); + return undefined; + } + const found = location.in.get(can.w); + if (found) { + // If the existing path is cheaper or the same, keep it. + // Do not add the edge. + if (found.g <= can.g) return undefined; + // Otherwise mark it as inactive and queue its edge for resolution. + const e = found.e; + if (e) { + edgesToResolve.push({ edge: e, suffixes: [] }); + } + found.a = false; + } + location.in.set(can.w, can); + if (location.p.g > can.g) { + pathToLocation.delete(location.p); + location.p = can; + } + if (location.p === can) { + // Make this path the representation of this location. + pathToLocation.set(can, location); + candidates.add(can); + } + path.r.add(edge); + return edge; + } + + function opInsert(best: Candidate): void { + const children = best.n.c; + if (!children) return; + const i = best.i; + const c = bc; + for (const [s, n] of children) { + const e: Edge = { p: best, n, i, s, c, a: Action.Insert }; + addEdge(best, e); + } + } + + function opDelete(best: Candidate, num = 1): Edge | undefined { + const e: Edge = { p: best, n: best.n, i: best.i + num, s: '', c: bc * num, a: Action.Delete }; + return addEdge(best, e); + } + + function opIdentity(best: Candidate): void { + const s = word[best.i]; + const n = best.n.c?.get(s); + if (!n) return; + const i = best.i + 1; + const e: Edge = { p: best, n, i, s, c: 0, a: Action.Identity }; + addEdge(best, e); + } + + function opReplace(best: Candidate): void { + const children = best.n.c; + if (!children) return; + const wc = word[best.i]; + const wg = visualLetterMaskMap[wc] || 0; + const i = best.i + 1; + const cost = bc + (best.i ? 0 : opCosts.firstLetterBias); + for (const [s, n] of children) { + if (s == wc) continue; + const sg = visualLetterMaskMap[s] || 0; + const c = wg & sg ? 
mapSugCost : cost; + const e: Edge = { p: best, n, i, s, c, a: Action.Replace }; + addEdge(best, e); + } + } + + function opSwap(best: Candidate): void { + const children = best.n.c; + const i = best.i; + const i2 = i + 1; + if (!children || len <= i2) return; + const wc1 = word[i]; + const wc2 = word[i2]; + if (wc1 === wc2) return; + const n = best.n.c?.get(wc2); + const n2 = n?.c?.get(wc1); + if (!n || !n2) return; + const e: Edge = { p: best, n: n2, i: i2 + 1, s: wc2 + wc1, c: opCosts.swapCost, a: Action.Swap }; + addEdge(best, e); + } + + function opDuplicate(best: Candidate): void { + const children = best.n.c; + const i = best.i; + const i2 = i + 1; + if (!children || len <= i2) return; + const wc1 = word[i]; + const wc2 = word[i2]; + const n = best.n.c?.get(wc1); + if (!n) return; + if (wc1 === wc2) { + // convert double letter to single + const e: Edge = { p: best, n, i: i + 2, s: wc1, c: opCosts.duplicateLetterCost, a: Action.Delete }; + addEdge(best, e); + return; + } + const n2 = n?.c?.get(wc1); + if (!n2) return; + // convert single to double letter + const e: Edge = { p: best, n: n2, i: i2, s: wc1 + wc1, c: opCosts.duplicateLetterCost, a: Action.Insert }; + addEdge(best, e); + } + + function resolveEdges() { + let e: EdgeToResolve | undefined; + while ((e = edgesToResolve.shift())) { + resolveEdge(e); + } + } + + function resolveEdge({ edge, suffixes }: EdgeToResolve) { + const { p, s: es, c: ec } = edge; + if (!p.r.has(edge)) return; + const edgeSuffixes = suffixes.map((sfx) => ({ s: es + sfx.s, c: ec + sfx.c })); + for (const { s, c } of edgeSuffixes) { + const cost = p.g + c; + if (cost <= costLimit) { + const word = p.w + s; + wordsToEmit.push({ word, cost }); + } + } + p.r.delete(edge); + const location = pathToLocation.get(p); + if (location?.p === p) { + location.sfx = location.sfx.concat(edgeSuffixes); + if (!p.r.size) { + location.sbc = p.g; + for (const inPath of location.in.values()) { + const { e: edge } = inPath; + if (!edge) continue; + 
edgesToResolve.push({ edge, suffixes: edgeSuffixes }); + } + } + } else if (!p.r.size) { + if (p.e) { + // Keep rolling up. + edgesToResolve.push({ edge: p.e, suffixes: edgeSuffixes }); + } + } + } + + /************ + * Below is the core of the A* algorithm + */ + + nodes.forEach((node, idx) => { + const g = idx ? 1 : 0; + candidates.add({ e: undefined, n: node, i: 0, w: '', g, f: optimalCost + g, r: new Set(), a: true }); + }); + + let maxSize = 0; + let best: Candidate | undefined; + // const bc2 = 2 * bc; + while ((best = candidates.dequeue())) { + maxSize = Math.max(maxSize, candidates.length); + if (best.f > costLimit) break; + if (!best.a) continue; + + const bi = best.i; + if (best.n.f) { + const toDelete = len - bi; + const e: Edge = { p: best, n: best.n, i: bi, s: '', c: bc * toDelete, a: Action.Delete }; + best.r.add(e); + edgesToResolve.push({ edge: e, suffixes: [{ s: '', c: 0 }] }); + } + const children = best.n.c; + if (!children) continue; + + if (bi === len) { + opInsert(best); + } else { + opIdentity(best); + opReplace(best); + opDelete(best); + opInsert(best); + opSwap(best); + opDuplicate(best); + } + resolveEdges(); + yield* emitWords(); + } + + // console.log(` + // word: ${word} + // maxSize: ${maxSize} + // length: ${candidates.length} + // `); + + return undefined; +} + +enum Action { + Identity, + Replace, + Delete, + Insert, + Swap, +} + +interface Edge { + /** from */ + p: Path; + /** to Node */ + n: TrieNode; + /** index into the original word */ + i: number; + /** suffix character to add to Path p. */ + s: string; + /** edge cost */ + c: number; + /** Action */ + a: Action; +} + +interface Path { + /** Edge taken to get here */ + e: Edge | undefined; + /** to Node */ + n: TrieNode; + /** index into the original word */ + i: number; + /** Suggested word so far */ + w: string; + /** cost so far */ + g: number; + /** expected total cost */ + f: number; + /** active */ + a: boolean; + /** Edges to be resolved. 
*/ + r: Set; +} + +interface Suffix { + /** suffix */ + s: string; + /** Cost of using suffix */ + c: number; +} + +interface LocationNode { + /** Incoming Paths */ + in: Map; + /** Best Possible cost - only non-zero when location has been resolved. */ + bc: number; + /** Pending Path to be resolved */ + p: Path; + /** + * Suffix Base Cost + * The base cost used when calculating the suffixes. + * If a new path comes in with a lower base cost, + * then the suffixes need to be recalculated. + */ + sbc: number; + /** Set of suffixes, calculated when location has been resolved. */ + sfx: Suffix[]; +} + +type LocationByTriNode = Map; + +type LocationCache = Map; + +type Candidate = Path; + +type NoFollow = Record; + +interface EmitWord { + word: string; + cost: number; +} + +interface EdgeToResolve { + edge: Edge; + suffixes: Suffix[]; +} + +const defaultMaxNumberSuggestions = 10; +const maxNumChanges = 5; + +export function suggest( + root: TrieRoot | TrieRoot[], + word: string, + numSuggestions: number = defaultMaxNumberSuggestions, + compoundMethod: CompoundWordsMethod = CompoundWordsMethod.NONE, + numChanges: number = maxNumChanges, + ignoreCase?: boolean +): SuggestionResult[] { + const collector = suggestionCollector(word, { + numSuggestions: numSuggestions, + changeLimit: numChanges, + includeTies: true, + ignoreCase, + }); + collector.collect(genSuggestions(root, word, compoundMethod)); + return collector.suggestions; +} + +export function* genSuggestions( + root: TrieRoot | TrieRoot[], + word: string, + compoundMethod: CompoundWordsMethod = CompoundWordsMethod.NONE +): SuggestionIterator { + const roots = Array.isArray(root) ? 
root : [root]; + for (const r of roots) { + yield* genCompoundableSuggestions(r, word, compoundMethod); + } + return undefined; +} + +function determineNoFollow(root: TrieRoot): NoFollow { + const noFollow: NoFollow = Object.assign(Object.create(null), { + [root.compoundCharacter]: true, + [root.forbiddenWordPrefix]: true, + [root.stripCaseAndAccentsPrefix]: true, + }); + return noFollow; +} + +function determineInitialNodes(root: TrieRoot | TrieRoot[], ignoreCase: boolean): TrieNode[] { + const roots = Array.isArray(root) ? root : [root]; + const rootNodes: TrieNode[] = roots.map((r) => { + const noFollow = determineNoFollow(r); + return { c: new Map([...r.c].filter(([c]) => !noFollow[c])) }; + }); + const noCaseNodes = ignoreCase + ? roots + .filter((r) => r.stripCaseAndAccentsPrefix) + .map((n) => n.c?.get(n.stripCaseAndAccentsPrefix)) + .filter(isDefined) + : []; + const nodes: TrieNode[] = rootNodes.concat(noCaseNodes); + return nodes; +} + +function isDefined(v: T | undefined): v is T { + return v !== undefined; +}