@@ -5,9 +5,12 @@ import {all, min} from '@iterable-iterator/reduce';
5
5
import { list } from '@iterable-iterator/list' ;
6
6
import { len } from '@functional-abstraction/operator' ;
7
7
import { map } from '@iterable-iterator/map' ;
8
+ import { _chain } from '@iterable-iterator/chain' ;
8
9
import { sorted } from '@iterable-iterator/sorted' ;
9
- import { increasing , decreasing } from '@total-order/primitive' ;
10
+ import { window } from '@iterable-iterator/window' ;
11
+ import { increasing } from '@total-order/primitive' ;
10
12
import { len as byLength } from '@total-order/key' ;
13
+ import { combinations } from '@combinatorics/n-combinations' ;
11
14
import escapeStringRegexp from 'escape-string-regexp' ;
12
15
13
16
export { default as escapeStringRegexp } from 'escape-string-regexp' ;
@@ -28,6 +31,9 @@ export const onlyASCII = (string: string) => deburr(string);
28
31
/**
 * Lower-cases the input and then passes the result through `onlyASCII`.
 *
 * @param string - The text to normalise.
 * @returns Whatever `onlyASCII` returns for the lower-cased text.
 */
export const onlyLowerCaseASCII = (string: string) => {
  const lowered = string.toLowerCase();
  return onlyASCII(lowered);
};
30
33
34
/**
 * Normalises the input with `onlyLowerCaseASCII`, then substitutes every
 * maximal run of characters outside `a`-`z` with `replacement`.
 *
 * @param string - The text to normalise.
 * @param replacement - Substitute for each non-letter run (default: removed).
 *   It is handed to `String.prototype.replace` as-is, so `$`-patterns in it
 *   are interpreted by `replace`.
 * @returns The normalised text.
 */
export const onlyLowerCaseAlphabetical = (string: string, replacement = '') => {
  const lowered = onlyLowerCaseASCII(string);
  return lowered.replace(/[^a-z]+/g, replacement);
};
36
+
31
37
export const makeIndex = ( data : string ) => {
32
38
const needles = onlyLowerCaseASCII ( data ) . split ( ' ' ) ;
33
39
return ( query : string ) => {
@@ -92,109 +98,50 @@ export const makeRegExpIndex = (patterns: Iterable<string>) => {
92
98
} ;
93
99
} ;
94
100
95
- const PARTICLES_FR : string [ ] = [ 'du' , 'de' , 'des' , "d'" , 'le' , 'la' ] ;
96
- const PARTICLES_NL : string [ ] = [
97
- 'de' ,
98
- 'den' ,
99
- 'op' ,
100
- "t'" ,
101
- "'t" ,
102
- 'ten' ,
103
- 'ter' ,
104
- 'te' ,
105
- 'van' ,
106
- 'der' ,
107
- ] ;
108
- const PARTICLES_DE : string [ ] = [
109
- 'am' ,
110
- 'an' ,
111
- 'af' ,
112
- 'auf' ,
113
- 'aus' ,
114
- 'der' ,
115
- 'im' ,
116
- 'von' ,
117
- 'und' ,
118
- 'zu' ,
119
- 'zum' ,
120
- 'zur' ,
121
- ] ;
101
/**
 * Splits a string into its whitespace-separated tokens.
 *
 * @param string - The text to split.
 * @returns The non-empty tokens in order of appearance; an empty array when
 *   the input is empty or consists only of whitespace.
 */
const split = (string: string): string[] => {
  // `trim()` strips the same characters the `\s` class matches, so it replaces
  // the two anchored regex passes without changing the result.
  const trimmed = string.trim();
  // Guard the empty case: `''.split(/\s+/)` would yield `['']`, not `[]`.
  return trimmed === '' ? [] : trimmed.split(/\s+/);
};
122
105
123
- const PARTICLES : Set < string > = new Set ( [
124
- ...PARTICLES_FR ,
125
- ...PARTICLES_NL ,
126
- ...PARTICLES_DE ,
127
- ] ) ;
128
- const PARTICLES_ORDERED : string [ ] = sorted ( byLength ( decreasing ) , PARTICLES ) ;
129
-
130
- const words = ( string : string ) => string . trim ( ) . split ( / \s + / ) ;
131
-
132
- function * splitParticles ( data : string ) {
133
- const queue = words ( data ) . reverse ( ) ;
134
- outer: while ( queue . length > 0 ) {
135
- const word = queue . pop ( ) ;
136
- // greedy match
137
- // TODO use prefix tree
138
- for ( const particle of PARTICLES_ORDERED ) {
139
- if ( word . startsWith ( particle ) ) {
140
- yield particle ;
141
- const rest = word . slice ( particle . length ) ;
142
- if ( rest ) queue . push ( rest ) ;
143
- continue outer;
144
- }
145
- }
106
/**
 * Extracts the words of a text: the text is normalised with
 * `onlyLowerCaseAlphabetical`, using a single space as the replacement for
 * every non-letter run, and the result is split on whitespace.
 *
 * @param string - The text to tokenise.
 * @returns The list of words (empty when `split` finds no token).
 */
export const words = (string: string): string[] => {
  const spaced = onlyLowerCaseAlphabetical(string, ' ');
  return split(spaced);
};
146
108
147
- yield word ;
148
- }
149
- }
109
/**
 * Lazily produces the trigrams of a string: each item that `window(3, string)`
 * yields is concatenated back into a single string.
 *
 * @param string - The text to slide the width-3 window over.
 * @returns An iterator over the concatenated windows.
 */
const trigrams = (string: string): IterableIterator<string> => {
  const concatenate = ([first, second, third]: string[]) =>
    first + second + third;
  return map(concatenate, window(3, string));
};
150
111
151
- export const normalizeSearch = ( data : string ) =>
152
- [ ...splitParticles ( onlyLowerCaseASCII ( data ) ) ] . join ( ' ' ) ;
112
/**
 * Surrounds a trigram with a `0` character on each side.
 *
 * @param x - The trigram to wrap.
 * @returns `x` prefixed and suffixed with `0`.
 */
const wrapTrigram = (x: string) => {
  return '0' + x + '0';
};
153
113
154
- function * nonEmptySubstrings ( string : string ) {
155
- const n = string . length ;
156
- for ( let i = 0 ; i < n ; ++ i ) {
157
- for ( let j = i + 1 ; j <= n ; ++ j ) {
158
- yield string . slice ( i , j ) ;
159
- }
160
- }
161
- }
114
/**
 * Wrapped trigrams of a string with everything but lowercase letters removed:
 * the input goes through `onlyLowerCaseAlphabetical` (default replacement, so
 * non-letter runs are dropped), then `trigrams`, then `wrapTrigram`.
 *
 * @param string - The text to index.
 * @returns A lazy iterator over the wrapped trigrams.
 */
export const stringTrigrams = (string: string) => {
  const letters = onlyLowerCaseAlphabetical(string);
  return map(wrapTrigram, trigrams(letters));
};
116
/**
 * Wrapped trigrams of a text with word boundaries made explicit: the words of
 * the text are joined with `1`, the result is prefixed with `11` and suffixed
 * with `1`, and the trigrams of that string are passed through `wrapTrigram`.
 *
 * @param text - The text to index.
 * @returns A lazy iterator over the wrapped trigrams.
 */
const textTrigrams = (text: string) => {
  const joined = words(text).join('1');
  const delimited = `11${joined}1`;
  return map(wrapTrigram, trigrams(delimited));
};
162
118
163
- const SHATTER_SHORT = 2 ;
164
- const SHATTER_MEDIUM = 4 ;
165
- const SHATTER_LONG = 6 ;
166
-
167
- export const shatter = ( data : string ) => {
168
- const parts = splitParticles ( onlyLowerCaseASCII ( data ) ) ;
169
- const index = {
170
- whole : [ ] ,
171
- particles : [ ] ,
172
- substring_long : [ ] ,
173
- substring_medium : [ ] ,
174
- substring_short : [ ] ,
175
- } ;
176
- for ( const part of parts ) {
177
- if ( PARTICLES . has ( part ) ) {
178
- index . particles . push ( part ) ;
179
- } else {
180
- for ( const _part of part . split ( / [ ^ a - z ] + / ) ) {
181
- index . whole . push ( _part ) ;
182
- for ( const substring of nonEmptySubstrings ( _part ) ) {
183
- if ( substring . length < SHATTER_SHORT ) continue ;
184
- else if ( substring . length === _part . length ) continue ;
185
- else if ( substring . length < SHATTER_MEDIUM )
186
- index . substring_short . push ( substring ) ;
187
- else if ( substring . length < SHATTER_LONG )
188
- index . substring_medium . push ( substring ) ;
189
- else index . substring_long . push ( substring ) ;
190
- }
191
- }
192
- }
119
/**
 * Generates the (unwrapped) trigrams that touch a word boundary, where `1`
 * stands for a boundary.
 *
 * For every string `s`, three slices of `11<s>1` are emitted: the first three
 * characters, characters 1 to 3, and the last three characters. Then, for
 * every pair `[a, b]` produced by `combinations(strings, 2)`, the trigram made
 * of the last character of `a`, a `1`, and the first character of `b` is
 * emitted.
 *
 * @param strings - The words; each one is asserted to be non-empty.
 * @returns A lazy iterator over the boundary trigrams (duplicates possible).
 */
const _boundaryTrigrams = function* (
  strings: string[],
): IterableIterator<string> {
  for (const word of strings) {
    assert(word.length > 0);
    const padded = '11' + word + '1';
    // Leading boundary: `11x`, then `1xy` (or `1x1` for a one-letter word).
    yield padded.slice(0, 3);
    yield padded.slice(1, 4);
    // Trailing boundary: the last two characters followed by `1`.
    yield padded.slice(-3);
  }

  for (const [left, right] of combinations(strings, 2)) {
    const tail = left[left.length - 1];
    const head = right[0];
    yield `${tail}1${head}`;
  }
};
197
134
135
/**
 * Boundary trigrams of a list of words, each passed through `wrapTrigram`.
 *
 * @param strings - The words, forwarded unchanged to `_boundaryTrigrams`.
 * @returns A lazy iterator over the wrapped boundary trigrams.
 */
export const boundaryTrigrams = (strings: string[]) => {
  const unwrapped = _boundaryTrigrams(strings);
  return map(wrapTrigram, unwrapped);
};
137
+
138
/**
 * Builds the normalised search string for a piece of text: its words, then
 * its text trigrams, then its string trigrams, all joined by single spaces.
 *
 * @param data - The text to normalise.
 * @returns The space-separated list of tokens.
 */
export const normalizeSearch = (data: string) => {
  const tokens = [
    ...words(data),
    ...textTrigrams(data),
    ...stringTrigrams(data),
  ];
  return tokens.join(' ');
};
140
+
141
/**
 * Deduplicates the items of any number of iterables.
 *
 * The iterables are fed through `_chain` into a `Set`, so each distinct item
 * (by `Set` equality) is kept once, at the position where the chained
 * sequence first produced it.
 *
 * @param iterables - The iterables whose items should be merged.
 * @returns An array of the distinct items.
 */
export const keepUnique = <T>(...iterables: Array<Iterable<T>>) => {
  const distinct = new Set<T>(_chain(iterables));
  return Array.from(distinct);
};
144
+
198
145
// The 26 lowercase ASCII letters, 'a' through 'z', in alphabetical order.
const alphabet = 'abcdefghijklmnopqrstuvwxyz' ;
199
146
200
147
const _isPositiveIntegerStrict_regex = ( base : number ) => {
0 commit comments