-
Notifications
You must be signed in to change notification settings - Fork 551
/
builder.js
360 lines (315 loc) · 12.9 KB
/
builder.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
/*!
* lunr.Builder
* Copyright (C) @YEAR Oliver Nightingale
*/
/**
* lunr.Builder performs indexing on a set of documents and
* returns instances of lunr.Index ready for querying.
*
* All configuration of the index is done via the builder, the
* fields to index, the document reference, the text processing
* pipeline and document scoring parameters are all set on the
* builder before indexing.
*
* @constructor
* @property {string} _ref - Internal reference to the document reference field.
* @property {string[]} _fields - Internal reference to the document fields to index.
* @property {object} invertedIndex - The inverted index maps terms to document fields.
* @property {object} documentTermFrequencies - Keeps track of document term frequencies.
* @property {object} documentLengths - Keeps track of the length of documents added to the index.
* @property {lunr.tokenizer} tokenizer - Function for splitting strings into tokens for indexing.
* @property {lunr.Pipeline} pipeline - The pipeline performs text processing on tokens before indexing.
* @property {lunr.Pipeline} searchPipeline - A pipeline for processing search terms before querying the index.
* @property {number} documentCount - Keeps track of the total number of documents indexed.
* @property {number} _b - A parameter to control field length normalization, setting this to 0 disabled normalization, 1 fully normalizes field lengths, the default value is 0.75.
* @property {number} _k1 - A parameter to control how quickly an increase in term frequency results in term frequency saturation, the default value is 1.2.
* @property {number} termIndex - A counter incremented for each unique term, used to identify a terms position in the vector space.
* @property {array} metadataWhitelist - A list of metadata keys that have been whitelisted for entry in the index.
*/
lunr.Builder = function () {
this._ref = "id"
this._fields = Object.create(null)
this._documents = Object.create(null)
this.invertedIndex = Object.create(null)
this.fieldTermFrequencies = {}
this.fieldLengths = {}
this.tokenizer = lunr.tokenizer
this.pipeline = new lunr.Pipeline
this.searchPipeline = new lunr.Pipeline
this.documentCount = 0
this._b = 0.75
this._k1 = 1.2
this.termIndex = 0
this.metadataWhitelist = []
}
/**
* Sets the document field used as the document reference. Every document must have this field.
* The type of this field in the document should be a string, if it is not a string it will be
* coerced into a string by calling toString.
*
* The default ref is 'id'.
*
* The ref should _not_ be changed during indexing, it should be set before any documents are
* added to the index. Changing it during indexing can lead to inconsistent results.
*
* @param {string} ref - The name of the reference field in the document.
*/
lunr.Builder.prototype.ref = function (ref) {
this._ref = ref
}
/**
* A function that is used to extract a field from a document.
*
* Lunr expects a field to be at the top level of a document, if however the field
* is deeply nested within a document an extractor function can be used to extract
* the right field for indexing.
*
* @callback fieldExtractor
* @param {object} doc - The document being added to the index.
* @returns {?(string|object|object[])} obj - The object that will be indexed for this field.
* @example <caption>Extracting a nested field</caption>
* function (doc) { return doc.nested.field }
*/
/**
* Adds a field to the list of document fields that will be indexed. Every document being
* indexed should have this field. Null values for this field in indexed documents will
* not cause errors but will limit the chance of that document being retrieved by searches.
*
* All fields should be added before adding documents to the index. Adding fields after
* a document has been indexed will have no effect on already indexed documents.
*
* Fields can be boosted at build time. This allows terms within that field to have more
* importance when ranking search results. Use a field boost to specify that matches within
* one field are more important than other fields.
*
* @param {string} fieldName - The name of a field to index in all documents.
* @param {object} attributes - Optional attributes associated with this field.
* @param {number} [attributes.boost=1] - Boost applied to all terms within this field.
* @param {fieldExtractor} [attributes.extractor] - Function to extract a field from a document.
* @throws {RangeError} fieldName cannot contain unsupported characters '/'
*/
lunr.Builder.prototype.field = function (fieldName, attributes) {
if (/\//.test(fieldName)) {
throw new RangeError ("Field '" + fieldName + "' contains illegal character '/'")
}
this._fields[fieldName] = attributes || {}
}
/**
* A parameter to tune the amount of field length normalisation that is applied when
* calculating relevance scores. A value of 0 will completely disable any normalisation
* and a value of 1 will fully normalise field lengths. The default is 0.75. Values of b
* will be clamped to the range 0 - 1.
*
* @param {number} number - The value to set for this tuning parameter.
*/
lunr.Builder.prototype.b = function (number) {
if (number < 0) {
this._b = 0
} else if (number > 1) {
this._b = 1
} else {
this._b = number
}
}
/**
* A parameter that controls the speed at which a rise in term frequency results in term
* frequency saturation. The default value is 1.2. Setting this to a higher value will give
* slower saturation levels, a lower value will result in quicker saturation.
*
* @param {number} number - The value to set for this tuning parameter.
*/
lunr.Builder.prototype.k1 = function (number) {
this._k1 = number
}
/**
* Adds a document to the index.
*
* Before adding fields to the index the index should have been fully setup, with the document
* ref and all fields to index already having been specified.
*
* The document must have a field name as specified by the ref (by default this is 'id') and
* it should have all fields defined for indexing, though null or undefined values will not
* cause errors.
*
* Entire documents can be boosted at build time. Applying a boost to a document indicates that
* this document should rank higher in search results than other documents.
*
* @param {object} doc - The document to add to the index.
* @param {object} attributes - Optional attributes associated with this document.
* @param {number} [attributes.boost=1] - Boost applied to all terms within this document.
*/
lunr.Builder.prototype.add = function (doc, attributes) {
var docRef = doc[this._ref],
fields = Object.keys(this._fields)
this._documents[docRef] = attributes || {}
this.documentCount += 1
for (var i = 0; i < fields.length; i++) {
var fieldName = fields[i],
extractor = this._fields[fieldName].extractor,
field = extractor ? extractor(doc) : doc[fieldName],
tokens = this.tokenizer(field, {
fields: [fieldName]
}),
terms = this.pipeline.run(tokens),
fieldRef = new lunr.FieldRef (docRef, fieldName),
fieldTerms = Object.create(null)
this.fieldTermFrequencies[fieldRef] = fieldTerms
this.fieldLengths[fieldRef] = 0
// store the length of this field for this document
this.fieldLengths[fieldRef] += terms.length
// calculate term frequencies for this field
for (var j = 0; j < terms.length; j++) {
var term = terms[j]
if (fieldTerms[term] == undefined) {
fieldTerms[term] = 0
}
fieldTerms[term] += 1
// add to inverted index
// create an initial posting if one doesn't exist
if (this.invertedIndex[term] == undefined) {
var posting = Object.create(null)
posting["_index"] = this.termIndex
this.termIndex += 1
for (var k = 0; k < fields.length; k++) {
posting[fields[k]] = Object.create(null)
}
this.invertedIndex[term] = posting
}
// add an entry for this term/fieldName/docRef to the invertedIndex
if (this.invertedIndex[term][fieldName][docRef] == undefined) {
this.invertedIndex[term][fieldName][docRef] = Object.create(null)
}
// store all whitelisted metadata about this token in the
// inverted index
for (var l = 0; l < this.metadataWhitelist.length; l++) {
var metadataKey = this.metadataWhitelist[l],
metadata = term.metadata[metadataKey]
if (this.invertedIndex[term][fieldName][docRef][metadataKey] == undefined) {
this.invertedIndex[term][fieldName][docRef][metadataKey] = []
}
this.invertedIndex[term][fieldName][docRef][metadataKey].push(metadata)
}
}
}
}
/**
* Calculates the average document length for this index
*
* @private
*/
lunr.Builder.prototype.calculateAverageFieldLengths = function () {
var fieldRefs = Object.keys(this.fieldLengths),
numberOfFields = fieldRefs.length,
accumulator = {},
documentsWithField = {}
for (var i = 0; i < numberOfFields; i++) {
var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),
field = fieldRef.fieldName
documentsWithField[field] || (documentsWithField[field] = 0)
documentsWithField[field] += 1
accumulator[field] || (accumulator[field] = 0)
accumulator[field] += this.fieldLengths[fieldRef]
}
var fields = Object.keys(this._fields)
for (var i = 0; i < fields.length; i++) {
var fieldName = fields[i]
accumulator[fieldName] = accumulator[fieldName] / documentsWithField[fieldName]
}
this.averageFieldLength = accumulator
}
/**
* Builds a vector space model of every document using lunr.Vector
*
* @private
*/
lunr.Builder.prototype.createFieldVectors = function () {
var fieldVectors = {},
fieldRefs = Object.keys(this.fieldTermFrequencies),
fieldRefsLength = fieldRefs.length,
termIdfCache = Object.create(null)
for (var i = 0; i < fieldRefsLength; i++) {
var fieldRef = lunr.FieldRef.fromString(fieldRefs[i]),
fieldName = fieldRef.fieldName,
fieldLength = this.fieldLengths[fieldRef],
fieldVector = new lunr.Vector,
termFrequencies = this.fieldTermFrequencies[fieldRef],
terms = Object.keys(termFrequencies),
termsLength = terms.length
var fieldBoost = this._fields[fieldName].boost || 1,
docBoost = this._documents[fieldRef.docRef].boost || 1
for (var j = 0; j < termsLength; j++) {
var term = terms[j],
tf = termFrequencies[term],
termIndex = this.invertedIndex[term]._index,
idf, score, scoreWithPrecision
if (termIdfCache[term] === undefined) {
idf = lunr.idf(this.invertedIndex[term], this.documentCount)
termIdfCache[term] = idf
} else {
idf = termIdfCache[term]
}
score = idf * ((this._k1 + 1) * tf) / (this._k1 * (1 - this._b + this._b * (fieldLength / this.averageFieldLength[fieldName])) + tf)
score *= fieldBoost
score *= docBoost
scoreWithPrecision = Math.round(score * 1000) / 1000
// Converts 1.23456789 to 1.234.
// Reducing the precision so that the vectors take up less
// space when serialised. Doing it now so that they behave
// the same before and after serialisation. Also, this is
// the fastest approach to reducing a number's precision in
// JavaScript.
fieldVector.insert(termIndex, scoreWithPrecision)
}
fieldVectors[fieldRef] = fieldVector
}
this.fieldVectors = fieldVectors
}
/**
* Creates a token set of all tokens in the index using lunr.TokenSet
*
* @private
*/
lunr.Builder.prototype.createTokenSet = function () {
this.tokenSet = lunr.TokenSet.fromArray(
Object.keys(this.invertedIndex).sort()
)
}
/**
* Builds the index, creating an instance of lunr.Index.
*
* This completes the indexing process and should only be called
* once all documents have been added to the index.
*
* @returns {lunr.Index}
*/
lunr.Builder.prototype.build = function () {
this.calculateAverageFieldLengths()
this.createFieldVectors()
this.createTokenSet()
return new lunr.Index({
invertedIndex: this.invertedIndex,
fieldVectors: this.fieldVectors,
tokenSet: this.tokenSet,
fields: Object.keys(this._fields),
pipeline: this.searchPipeline
})
}
/**
* Applies a plugin to the index builder.
*
* A plugin is a function that is called with the index builder as its context.
* Plugins can be used to customise or extend the behaviour of the index
* in some way. A plugin is just a function, that encapsulated the custom
* behaviour that should be applied when building the index.
*
* The plugin function will be called with the index builder as its argument, additional
* arguments can also be passed when calling use. The function will be called
* with the index builder as its context.
*
* @param {Function} plugin The plugin to apply.
*/
lunr.Builder.prototype.use = function (fn) {
var args = Array.prototype.slice.call(arguments, 1)
args.unshift(this)
fn.apply(this, args)
}