Skip to content

Commit fef2d95

Browse files
✨ feat(search): Improve patient searching.
By dropping language-dependent particle splitting and, for approximate matching, using trigrams of both concatenated names and unconcatenated names (with word boundaries) instead of substrings of unconcatenated names.
1 parent 736e639 commit fef2d95

File tree

5 files changed

+311
-238
lines changed

5 files changed

+311
-238
lines changed

imports/api/patients.ts

+24-16
Original file line numberDiff line numberDiff line change
@@ -13,36 +13,44 @@ import {insurances} from './insurances';
1313
import {doctors} from './doctors';
1414
import {allergies} from './allergies';
1515

16-
import {makeIndex, shatter, normalized, normalizeSearch} from './string';
16+
import {
17+
makeIndex,
18+
words,
19+
normalized,
20+
keepUnique,
21+
stringTrigrams,
22+
boundaryTrigrams,
23+
} from './string';
1724

1825
export const BIRTHDATE_FORMAT = 'yyyy-MM-dd';
1926
export const SEX_ALLOWED = [undefined, '', 'male', 'female', 'other'];
2027

2128
function normalizedName(firstname, lastname) {
22-
const lastnameHash = normalizeSearch(lastname || '').replace(' ', '-');
29+
const lastnameHash = normalized(lastname || '').replace(' ', '');
2330
const firstnameHash = normalized(firstname || '').split(' ')[0];
2431
return `${lastnameHash} ${firstnameHash}`;
2532
}
2633

2734
function updateIndex(userId: string, _id: string, fields) {
2835
const {niss, firstname, lastname, birthdate, sex} = fields;
29-
const patientIndex = {};
30-
if (firstname) {
31-
const nameIndex = shatter(firstname);
32-
for (const [key, value] of Object.entries(nameIndex)) {
33-
patientIndex['firstname_' + key] = value;
34-
}
35-
}
36+
const firstnameWords = keepUnique(words(firstname ?? ''));
37+
const lastnameWords = keepUnique(words(lastname ?? ''));
3638

37-
if (lastname) {
38-
const nameIndex = shatter(lastname);
39-
for (const [key, value] of Object.entries(nameIndex)) {
40-
patientIndex['lastname_' + key] = value;
41-
}
42-
}
39+
const innerTrigrams = keepUnique(
40+
stringTrigrams(firstname ?? ''),
41+
stringTrigrams(lastname ?? ''),
42+
);
43+
44+
const outerTrigrams = keepUnique(
45+
boundaryTrigrams([...lastnameWords, ...firstnameWords]),
46+
boundaryTrigrams([...firstnameWords, ...lastnameWords]),
47+
);
4348

4449
const upsertFields = {
45-
...patientIndex,
50+
firstnameWords,
51+
lastnameWords,
52+
innerTrigrams,
53+
outerTrigrams,
4654
niss,
4755
firstname,
4856
lastname,

imports/api/string.ts

+42-95
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@ import {all, min} from '@iterable-iterator/reduce';
55
import {list} from '@iterable-iterator/list';
66
import {len} from '@functional-abstraction/operator';
77
import {map} from '@iterable-iterator/map';
8+
import {_chain} from '@iterable-iterator/chain';
89
import {sorted} from '@iterable-iterator/sorted';
9-
import {increasing, decreasing} from '@total-order/primitive';
10+
import {window} from '@iterable-iterator/window';
11+
import {increasing} from '@total-order/primitive';
1012
import {len as byLength} from '@total-order/key';
13+
import {combinations} from '@combinatorics/n-combinations';
1114
import escapeStringRegexp from 'escape-string-regexp';
1215

1316
export {default as escapeStringRegexp} from 'escape-string-regexp';
@@ -28,6 +31,9 @@ export const onlyASCII = (string: string) => deburr(string);
2831
export const onlyLowerCaseASCII = (string: string) =>
2932
onlyASCII(string.toLowerCase());
3033

34+
export const onlyLowerCaseAlphabetical = (string: string, replacement = '') =>
35+
onlyLowerCaseASCII(string).replace(/[^a-z]+/g, replacement);
36+
3137
export const makeIndex = (data: string) => {
3238
const needles = onlyLowerCaseASCII(data).split(' ');
3339
return (query: string) => {
@@ -92,109 +98,50 @@ export const makeRegExpIndex = (patterns: Iterable<string>) => {
9298
};
9399
};
94100

95-
const PARTICLES_FR: string[] = ['du', 'de', 'des', "d'", 'le', 'la'];
96-
const PARTICLES_NL: string[] = [
97-
'de',
98-
'den',
99-
'op',
100-
"t'",
101-
"'t",
102-
'ten',
103-
'ter',
104-
'te',
105-
'van',
106-
'der',
107-
];
108-
const PARTICLES_DE: string[] = [
109-
'am',
110-
'an',
111-
'af',
112-
'auf',
113-
'aus',
114-
'der',
115-
'im',
116-
'von',
117-
'und',
118-
'zu',
119-
'zum',
120-
'zur',
121-
];
101+
const split = (string: string): string[] => {
102+
const trimmed = string.replace(/^\s+/, '').replace(/\s+$/, '');
103+
return trimmed === '' ? [] : trimmed.split(/\s+/);
104+
};
122105

123-
const PARTICLES: Set<string> = new Set([
124-
...PARTICLES_FR,
125-
...PARTICLES_NL,
126-
...PARTICLES_DE,
127-
]);
128-
const PARTICLES_ORDERED: string[] = sorted(byLength(decreasing), PARTICLES);
129-
130-
const words = (string: string) => string.trim().split(/\s+/);
131-
132-
function* splitParticles(data: string) {
133-
const queue = words(data).reverse();
134-
outer: while (queue.length > 0) {
135-
const word = queue.pop();
136-
// greedy match
137-
// TODO use prefix tree
138-
for (const particle of PARTICLES_ORDERED) {
139-
if (word.startsWith(particle)) {
140-
yield particle;
141-
const rest = word.slice(particle.length);
142-
if (rest) queue.push(rest);
143-
continue outer;
144-
}
145-
}
106+
export const words = (string: string): string[] =>
107+
split(onlyLowerCaseAlphabetical(string, ' '));
146108

147-
yield word;
148-
}
149-
}
109+
const trigrams = (string: string): IterableIterator<string> =>
110+
map(([a, b, c]: string[]) => a + b + c, window(3, string));
150111

151-
export const normalizeSearch = (data: string) =>
152-
[...splitParticles(onlyLowerCaseASCII(data))].join(' ');
112+
/**
 * Surrounds a trigram with a `0` character on each side.
 *
 * @param x - The trigram (or any string) to wrap.
 * @returns The input prefixed and suffixed with `0`.
 */
const wrapTrigram = (x: string): string => {
	const marker = '0';
	return marker + x + marker;
};
153113

154-
function* nonEmptySubstrings(string: string) {
155-
const n = string.length;
156-
for (let i = 0; i < n; ++i) {
157-
for (let j = i + 1; j <= n; ++j) {
158-
yield string.slice(i, j);
159-
}
160-
}
161-
}
114+
export const stringTrigrams = (string: string) =>
115+
map(wrapTrigram, trigrams(onlyLowerCaseAlphabetical(string)));
116+
const textTrigrams = (text: string) =>
117+
map(wrapTrigram, trigrams(`11${words(text).join('1')}1`));
162118

163-
const SHATTER_SHORT = 2;
164-
const SHATTER_MEDIUM = 4;
165-
const SHATTER_LONG = 6;
166-
167-
export const shatter = (data: string) => {
168-
const parts = splitParticles(onlyLowerCaseASCII(data));
169-
const index = {
170-
whole: [],
171-
particles: [],
172-
substring_long: [],
173-
substring_medium: [],
174-
substring_short: [],
175-
};
176-
for (const part of parts) {
177-
if (PARTICLES.has(part)) {
178-
index.particles.push(part);
179-
} else {
180-
for (const _part of part.split(/[^a-z]+/)) {
181-
index.whole.push(_part);
182-
for (const substring of nonEmptySubstrings(_part)) {
183-
if (substring.length < SHATTER_SHORT) continue;
184-
else if (substring.length === _part.length) continue;
185-
else if (substring.length < SHATTER_MEDIUM)
186-
index.substring_short.push(substring);
187-
else if (substring.length < SHATTER_LONG)
188-
index.substring_medium.push(substring);
189-
else index.substring_long.push(substring);
190-
}
191-
}
192-
}
119+
const _boundaryTrigrams = function* (
120+
strings: string[],
121+
): IterableIterator<string> {
122+
for (const a of strings) {
123+
assert(a.length > 0);
124+
const wrapped = `11${a}1`;
125+
yield wrapped.slice(0, 3);
126+
yield wrapped.slice(1, 4);
127+
yield wrapped.slice(-3);
193128
}
194129

195-
return index;
130+
for (const [a, b] of combinations(strings, 2)) {
131+
yield `${a[a.length - 1]}1${b[0]}`;
132+
}
196133
};
197134

135+
export const boundaryTrigrams = (strings: string[]) =>
136+
map(wrapTrigram, _boundaryTrigrams(strings));
137+
138+
export const normalizeSearch = (data: string) =>
139+
[...words(data), ...textTrigrams(data), ...stringTrigrams(data)].join(' ');
140+
141+
export const keepUnique = <T>(...iterables: Array<Iterable<T>>) => [
142+
...new Set<T>(_chain(iterables)),
143+
];
144+
198145
const alphabet = 'abcdefghijklmnopqrstuvwxyz';
199146

200147
const _isPositiveIntegerStrict_regex = (base: number) => {

0 commit comments

Comments
 (0)