Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: char index #5926

Merged
merged 3 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@ describe('CharIndexBuilder', () => {
const letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'];
const indexes = letters.map((c) => charIndexBuilder.getUtf8Value(c));
expect(indexes).toEqual(letters.map((c) => c.codePointAt(0)));
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij');
expect(r).toEqual([...textEncoder.encode('abcdefghij')]);
expect(charIndexBuilder.size).toBe(11); // One extra for the empty string.
const r = charIndexBuilder.wordToUtf8Seq('abcdefghij⚁⚂⚃⚄⚀');
expect(r).toEqual([...textEncoder.encode('abcdefghij⚁⚂⚃⚄⚀')]);
expect(charIndexBuilder.size).toBe(16); // One extra for the empty string.

// Add the same letters again.
expect(letters.map((c) => charIndexBuilder.getUtf8Value(c))).toEqual(letters.map((c) => c.codePointAt(0)));

const charIndex = charIndexBuilder.build();
expect(charIndex.size).toBe(11);
expect(charIndex.size).toBe(16);
expect(charIndex.wordToUtf8Seq('abcdefghij')).toEqual([...textEncoder.encode('abcdefghij')]);
});
});
86 changes: 25 additions & 61 deletions packages/cspell-trie-lib/src/lib/TrieBlob/CharIndex.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import { encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';
import { encodeTextToUtf8, encodeUtf8N_BE, type Utf8BE32 } from './Utf8.js';

export type Utf8Seq = Readonly<number[]>;

export type CharIndexMap = Record<string, Utf8BE32>;

export type RO_CharIndexMap = Readonly<CharIndexMap>;

export type CharIndexSeqMap = Record<string, Utf8Seq | number>;
export type CharIndexSeqMap = Record<string, Utf8Seq>;

export type RO_CharIndexSeqMap = Readonly<CharIndexSeqMap>;

Expand All @@ -15,52 +15,29 @@ const emptySeq: Utf8Seq = [0];
Object.freeze(emptySeq);

export class CharIndex {
readonly charToUtf8Map: RO_CharIndexMap;
readonly charToUtf8SeqMap: RO_CharIndexSeqMap;
#charToUtf8SeqMap: CharIndexSeqMap;

#lastWord = '';
#lastWordSeq: Utf8Seq = [];
#multiByteChars: boolean;

constructor(readonly charIndex: readonly string[]) {
this.charToUtf8Map = buildCharIndexMap(charIndex);
this.charToUtf8SeqMap = buildCharIndexSequenceMap(this.charToUtf8Map);
}

getUtf8Value(c: string): number {
return this.charToUtf8Map[c] || 0;
this.#charToUtf8SeqMap = buildCharIndexSequenceMap(charIndex);
this.#multiByteChars = Object.values(this.#charToUtf8SeqMap).some((c) => c.length > 1);
}

getCharUtf8Seq(c: string): Utf8Seq {
const r = this.charToUtf8SeqMap[c] ?? emptySeq;
return typeof r === 'number' ? [r] : r;
}

__wordToUtf8Seq(word: string): Utf8Seq {
// Note: Array.flatMap is very slow
const seq: number[] = new Array(word.length);
let i = 0;
for (const c of word) {
const cSep = this.charToUtf8SeqMap[c];
if (typeof cSep === 'number') {
seq[i++] = cSep;
continue;
}
if (!cSep) {
seq[i++] = 0;
continue;
}
for (const cIdx of cSep) {
seq[i++] = cIdx;
}
}
if (seq.length !== i) seq.length = i;
return seq;
const found = this.#charToUtf8SeqMap[c];
if (found) return found;
const s = encodeTextToUtf8(c);
this.#charToUtf8SeqMap[c] = s;
return s;
}

wordToUtf8Seq(word: string): Utf8Seq {
if (this.#lastWord === word) return this.#lastWordSeq;

const seq = this.__wordToUtf8Seq(word);
const seq = encodeTextToUtf8(word);

this.#lastWord = word;
this.#lastWordSeq = seq;
Expand All @@ -69,7 +46,7 @@ export class CharIndex {
}

indexContainsMultiByteChars(): boolean {
return Object.values(this.charToUtf8Map).some((v) => v >= 0x80);
return this.#multiByteChars;
}

get size(): number {
Expand All @@ -81,22 +58,10 @@ export class CharIndex {
}
}

/**
 * Build a lookup table from characters to encoded values.
 *
 * For every entry of `charIndex`, the first code point of its NFC form
 * (or 0 when there is none) is passed to `encodeUtf8N_BE`. The result is
 * stored under three keys: the entry as given, its NFC form, and its NFD form.
 *
 * @param charIndex - the characters to index.
 * @returns a prototype-less map from character to encoded value.
 */
function buildCharIndexMap(charIndex: readonly string[]): CharIndexMap {
    const result: CharIndexMap = Object.create(null);
    for (const ch of charIndex) {
        const composed = ch.normalize('NFC');
        const decomposed = ch.normalize('NFD');
        const encoded = encodeUtf8N_BE(composed.codePointAt(0) || 0);
        // Register all spellings of the character so any of them resolves to the same value.
        result[ch] = encoded;
        result[composed] = encoded;
        result[decomposed] = encoded;
    }
    return result;
}

function buildCharIndexSequenceMap(charIndexMap: RO_CharIndexMap): CharIndexSeqMap {
function buildCharIndexSequenceMap(charIndex: readonly string[]): CharIndexSeqMap {
const map: CharIndexSeqMap = Object.create(null);
for (const [key, value] of Object.entries(charIndexMap)) {
map[key] = splitUtf8IfNeeded(value);
for (const key of charIndex) {
map[key] = encodeTextToUtf8(key);
}
return map;
}
Expand All @@ -106,7 +71,7 @@ export class CharIndexBuilder {
readonly charIndexMap: CharIndexMap = Object.create(null);
readonly charIndexSeqMap: CharIndexSeqMap = Object.create(null);

readonly #mapIdxToSeq = new Map<number, number[] | number>();
readonly #mapIdxToSeq = new Map<number, number[]>();

constructor() {
this.getUtf8Value('');
Expand All @@ -126,24 +91,22 @@ export class CharIndexBuilder {
return utf8;
}

utf8ValueToUtf8Seq(idx: number): number[] | number {
utf8ValueToUtf8Seq(idx: number): number[] {
const found = this.#mapIdxToSeq.get(idx);
if (found !== undefined) {
return found;
}
const seq = splitUtf8IfNeeded(idx);
const seq = splitUtf8(idx);
this.#mapIdxToSeq.set(idx, seq);
return seq;
}

charToUtf8Seq(c: string): number[] {
const idx = this.getUtf8Value(c);
const s = this.utf8ValueToUtf8Seq(idx);
return typeof s === 'number' ? [s] : s;
return this.utf8ValueToUtf8Seq(idx);
}

wordToUtf8Seq(word: string): number[] {
// word = word.normalize('NFC');
const seq: number[] = new Array(word.length);
let i = 0;
for (const c of word) {
Expand All @@ -170,8 +133,9 @@ export class CharIndexBuilder {
}
}

function splitUtf8IfNeeded(utf8: number): number | number[] {
if (utf8 < 0x80) return utf8;
const s = [(utf8 >> 24) & 0xff, (utf8 >> 16) & 0xff, (utf8 >> 8) & 0xff, utf8 & 0xff].filter((v) => v);
return s.length ? s : s[0];
/**
 * Split a packed, big-endian UTF-8 value into its individual bytes,
 * most significant byte first.
 *
 * The value is first reinterpreted as an unsigned 32-bit integer. A packed
 * 4-byte sequence has its top bit set, so a value assembled with JavaScript's
 * 32-bit bitwise operators is a negative number; without the normalization it
 * would be mistaken for a single-byte value.
 *
 * @param utf8 - up to four UTF-8 bytes packed into one number.
 * @returns 1 to 4 byte values, each in the range 0..255.
 */
function splitUtf8(utf8: number): number[] {
    // `>>> 0` maps a negative int32 onto the equivalent unsigned value and
    // leaves non-negative 32-bit values unchanged.
    const u = utf8 >>> 0;
    if (u <= 0xff) return [u];
    if (u <= 0xffff) return [u >>> 8, u & 0xff];
    if (u <= 0xff_ffff) return [u >>> 16, (u >>> 8) & 0xff, u & 0xff];
    // Above 0xff_ffff the lead byte is non-zero; the remaining bytes are kept
    // as-is, the same way the shorter branches keep theirs.
    return [u >>> 24, (u >>> 16) & 0xff, (u >>> 8) & 0xff, u & 0xff];
}
2 changes: 1 addition & 1 deletion packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ export class FastTrieBlob implements TrieData {

static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot {
return new FastTrieBlobIRoot(
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo, trie.sorted),
new FastTrieBlobInternals(trie.nodes, trie._charIndex, trie.bitMasksInfo),
0,
trie.info,
);
Expand Down
33 changes: 0 additions & 33 deletions packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
for (let i = 0; i < utf8Seq.length; ++i) {
insertCharIndexes(utf8Seq[i], pDepth);
}
// dumpState({ step: 'insertChar', char });
};

/**
Expand Down Expand Up @@ -174,8 +173,6 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
const pos = s.pos;
const node = nodes[nodeIdx];
node[pos] = (refNodeIdx << NodeChildRefShift) | (node[pos] & LetterMask);

// dumpState({ step: 'reference', refId, refNodeIdx });
};

const backStep = (num: number) => {
Expand All @@ -186,38 +183,8 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
depth = stack[depth].pDepth;
}
nodeIdx = stack[depth + 1].nodeIdx;

// dumpState({ step: 'backStep', num });
};

// function dumpNode(node: number[]): string {
// const n = node
// .map((n, i) => {
// if (!i) return `w: ${(n & NodeMaskEOW && 1) || 0}`;
// return `{ c: ${(n & LetterMask).toString(16).padStart(2, '0')}, r: ${n >>> NodeChildRefShift} }`;
// })
// .join(', ');
// return `[${n}]`;
// }

// function dumpNodes(nodes: FastTrieBlobNode[]) {
// return nodes.map((n, i) => `${i}: ${dumpNode(n)}`);
// }

// const debug = false;

// function dumpState(extra?: Record<string, unknown>) {
// debug &&
// console.warn('%o', {
// stack: stack.slice(0, depth + 1),
// nodes: dumpNodes(nodes),
// nodeIdx,
// depth,
// refNodes,
// ...extra,
// });
// }

const c: BuilderCursor = {
insertChar,
markEOW,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
readonly nodes: number[][],
readonly charIndex: CharIndex,
maskInfo: FastTrieBlobBitMaskInfo,
sorted = false,
) {
const { NodeMaskEOW, NodeMaskChildCharIndex, NodeChildRefShift } = maskInfo;
this.NodeMaskEOW = NodeMaskEOW;
this.NodeMaskChildCharIndex = NodeMaskChildCharIndex;
this.NodeChildRefShift = NodeChildRefShift;
this.isIndexDecoderNeeded = charIndex.indexContainsMultiByteChars();
!sorted && sortNodes(nodes, this.NodeMaskChildCharIndex);
sortNodes(nodes, this.NodeMaskChildCharIndex);
}
}

Expand All @@ -30,6 +29,10 @@ export class FastTrieBlobInternals implements FastTrieBlobBitMaskInfo {
* @returns
*/
export function sortNodes(nodes: number[][], mask: number): number[][] {
if (Object.isFrozen(nodes)) {
assertSorted(nodes, mask);
return nodes;
}
for (let i = 0; i < nodes.length; ++i) {
let node = nodes[i];
if (node.length > 2) {
Expand Down
4 changes: 2 additions & 2 deletions packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ export class TrieBlob implements TrieData {
this.#nonStrictIdx = this._lookupNode(0, this.info.stripCaseAndAccentsPrefix);
}

public wordToNodeCharIndexSequence(word: string): Utf8Seq {
public wordToUtf8Seq(word: string): Utf8Seq {
return this.charIndex.wordToUtf8Seq(word);
}

Expand Down Expand Up @@ -159,7 +159,7 @@ export class TrieBlob implements TrieData {
const NodeChildRefShift = TrieBlob.NodeChildRefShift;
const nodes = this.nodes;
const nodes8 = this.#nodes8;
const wordIndexes = this.wordToNodeCharIndexSequence(word);
const wordIndexes = this.wordToUtf8Seq(word);
const lookup = this.#nodeIdxLookup;
const len = wordIndexes.length;
let p = 0;
Expand Down
Loading