Skip to content

Commit

Permalink
feat: speeds up insertion for number properties (#756)
Browse files Browse the repository at this point in the history
  • Loading branch information
micheleriva authored Jul 28, 2024
1 parent 77ba63c commit 71702f1
Show file tree
Hide file tree
Showing 10 changed files with 212 additions and 115 deletions.
12 changes: 8 additions & 4 deletions packages/orama/src/components/facets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ export async function getFacets<T extends AnyOrama>(
case 'string[]': {
const alreadyInsertedValues = new Set<string>()
const innerType = propertyType === 'boolean[]' ? 'boolean' : 'string'
const calculateBooleanStringOrEnumFacet = calculateBooleanStringOrEnumFacetBuilder(facetValues, innerType, alreadyInsertedValues)
const calculateBooleanStringOrEnumFacet = calculateBooleanStringOrEnumFacetBuilder(
facetValues,
innerType,
alreadyInsertedValues
)
for (const v of facetValue as Array<FacetValue>) {
calculateBooleanStringOrEnumFacet(v)
}
Expand Down Expand Up @@ -140,13 +144,13 @@ function calculateNumberFacetBuilder(
if (alreadyInsertedValues?.has(value)) {
continue
}

if (facetValue >= range.from && facetValue <= range.to) {
if (values[value] === undefined) {
values[value] = 1
} else {
values[value]++

alreadyInsertedValues?.add(value)
}
}
Expand All @@ -159,7 +163,7 @@ function calculateBooleanStringOrEnumFacetBuilder(
propertyType: 'string' | 'boolean' | 'enum',
alreadyInsertedValues?: Set<string>
) {
const defaultValue = (propertyType === 'boolean' ? 'false' : '')
const defaultValue = propertyType === 'boolean' ? 'false' : ''
return (facetValue: FacetValue) => {
// String or boolean based facets
const value = facetValue?.toString() ?? defaultValue
Expand Down
27 changes: 20 additions & 7 deletions packages/orama/src/components/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import type {
VectorType,
WhereCondition
} from '../types.js'
import type { InsertOptions } from '../methods/insert.js'
import { createError } from '../errors.js'
import {
create as avlCreate,
Expand Down Expand Up @@ -293,7 +294,8 @@ function insertScalarBuilder(
id: DocumentID,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number
docsCount: number,
options?: InsertOptions
) {
return async (value: SearchableValue): Promise<void> => {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)
Expand All @@ -305,7 +307,8 @@ function insertScalarBuilder(
break
}
case 'AVL': {
avlInsert(node, value as number, [internalId])
const avlRebalanceThreshold = options?.avlRebalanceThreshold ?? 1
avlInsert(node, value as number, [internalId], avlRebalanceThreshold)
break
}
case 'Radix': {
Expand Down Expand Up @@ -341,13 +344,14 @@ export async function insert(
schemaType: SearchableType,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number
docsCount: number,
options?: InsertOptions
): Promise<void> {
if (isVectorType(schemaType)) {
return insertVector(index, prop, value as number[] | Float32Array, id)
}

const insertScalar = insertScalarBuilder(implementation, index, prop, id, language, tokenizer, docsCount)
const insertScalar = insertScalarBuilder(implementation, index, prop, id, language, tokenizer, docsCount, options)

if (!isArrayType(schemaType)) {
return insertScalar(value)
Expand Down Expand Up @@ -553,7 +557,10 @@ export async function searchByWhereClause<T extends AnyOrama, ResultDocument = T
highPrecision
)
// @todo: convert this into a for loop
safeArrayPush(filtersMap[param], ids.flatMap(({ docIDs }) => docIDs))
safeArrayPush(
filtersMap[param],
ids.flatMap(({ docIDs }) => docIDs)
)
} else {
const {
coordinates,
Expand All @@ -562,7 +569,10 @@ export async function searchByWhereClause<T extends AnyOrama, ResultDocument = T
} = operation[reqOperation] as GeosearchPolygonOperator['polygon']
const ids = searchByPolygon(node.root, coordinates as BKDGeoPoint[], inside, undefined, highPrecision)
// @todo: convert this into a for loop
safeArrayPush(filtersMap[param], ids.flatMap(({ docIDs }) => docIDs))
safeArrayPush(
filtersMap[param],
ids.flatMap(({ docIDs }) => docIDs)
)
}

continue
Expand All @@ -588,7 +598,10 @@ export async function searchByWhereClause<T extends AnyOrama, ResultDocument = T

if (type === 'Flat') {
const flatOperation = isArray ? flatFilterArr : flatFilter
safeArrayPush(filtersMap[param], flatOperation(node, operation as EnumComparisonOperator & EnumArrComparisonOperator))
safeArrayPush(
filtersMap[param],
flatOperation(node, operation as EnumComparisonOperator & EnumArrComparisonOperator)
)

continue
}
Expand Down
18 changes: 13 additions & 5 deletions packages/orama/src/methods/insert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,23 @@ import { createError } from '../errors.js'
import { Point } from '../trees/bkd.js'
import { AnyOrama, PartialSchemaDeep, SortValue, TypedDocument } from '../types.js'

/**
 * Optional settings accepted by `insert` and forwarded down to the index insert step.
 */
export type InsertOptions = {
/**
 * Rebalance threshold handed to the AVL tree insert for AVL-backed properties.
 * The tree only runs its rebalancing step when its insert counter is a multiple of this value.
 * When omitted, the index layer falls back to 1; `innerInsertMultiple` passes the batch length.
 */
avlRebalanceThreshold?: number
}

export async function insert<T extends AnyOrama>(
orama: T,
doc: PartialSchemaDeep<TypedDocument<T>>,
language?: string,
skipHooks?: boolean
skipHooks?: boolean,
options?: InsertOptions
): Promise<string> {
const errorProperty = await orama.validateSchema(doc, orama.schema)
if (errorProperty) {
throw createError('SCHEMA_VALIDATION_FAILURE', errorProperty)
}

return innerInsert(orama, doc, language, skipHooks)
return innerInsert(orama, doc, language, skipHooks, options)
}

const ENUM_TYPE = new Set(['enum', 'enum[]'])
Expand All @@ -26,7 +31,8 @@ async function innerInsert<T extends AnyOrama>(
orama: T,
doc: PartialSchemaDeep<TypedDocument<T>>,
language?: string,
skipHooks?: boolean
skipHooks?: boolean,
options?: InsertOptions
): Promise<string> {
const { index, docs } = orama.data

Expand Down Expand Up @@ -111,7 +117,8 @@ async function innerInsert<T extends AnyOrama>(
expectedType,
language,
orama.tokenizer,
docsCount
docsCount,
options
)
await orama.index.afterInsert?.(
orama.data.index,
Expand Down Expand Up @@ -199,7 +206,8 @@ export async function innerInsertMultiple<T extends AnyOrama>(

for (const doc of batch) {
try {
const id = await insert(orama, doc, language, skipHooks)
const options = { avlRebalanceThreshold: batch.length }
const id = await insert(orama, doc, language, skipHooks, options)
ids.push(id)
} catch (err) {
reject(err)
Expand Down
66 changes: 40 additions & 26 deletions packages/orama/src/trees/avl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,14 @@ export function rangeSearch<K, V>(node: RootNode<K, V>, min: K, max: K): V {
export function greaterThan<K, V>(node: RootNode<K, V>, key: K, inclusive = false): V {
const result: V[] = []

if (node === null) return result as V;
if (node === null) return result as V

const stack: Array<Nullable<Node<K, V>>> = [node.root]

while (stack.length > 0) {
const node = stack.pop()
if (!node) {
continue;
continue
}

if (inclusive && node.k >= key) {
Expand All @@ -149,14 +149,14 @@ export function greaterThan<K, V>(node: RootNode<K, V>, key: K, inclusive = fals
export function lessThan<K, V>(node: RootNode<K, V>, key: K, inclusive = false): V {
const result: V[] = []

if (node === null) return result as V;
if (node === null) return result as V

const stack: Array<Nullable<Node<K, V>>> = [node.root]

while (stack.length > 0) {
const node = stack.pop()
if (!node) {
continue;
continue
}

if (inclusive && node.k <= key) {
Expand Down Expand Up @@ -198,9 +198,12 @@ export function create<K, V>(key: K, value: V): RootNode<K, V> {
}
}

export function insert<K, V>(rootNode: RootNode<K, V[]>, key: K, newValue: V[]): void {
let insertCount = 0

export function insert<K, V>(rootNode: RootNode<K, V[]>, key: K, newValue: V[], rebalanceThreshold = 500): void {
function insertNode(node: Nullable<Node<K, V[]>>, key: K, newValue: V[]): Node<K, V[]> {
if (node === null) {
insertCount++
return {
k: key,
v: newValue,
Expand All @@ -215,38 +218,49 @@ export function insert<K, V>(rootNode: RootNode<K, V[]>, key: K, newValue: V[]):
} else if (key > node.k) {
node.r = insertNode(node.r, key, newValue)
} else {
for (const value of newValue) {
node.v.push(value)
}
node.v.push(...newValue)
return node
}

node.h = 1 + Math.max(getHeight(node.l), getHeight(node.r))
// Rebalance the tree if the insert count reaches the threshold.
// This will improve insertion performance since we won't be rebalancing the tree on every insert.
// When inserting docs using `insertMultiple`, the threshold will be set to the number of docs being inserted.
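// Rebalancing on every insert can be forced by setting the threshold to 1, which is the value the index layer
// passes when no `avlRebalanceThreshold` option is given. Calling this function directly defaults to 500.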
if (insertCount % rebalanceThreshold === 0) {
console.log(`Rebalancing tree after ${insertCount} inserts...`)
return rebalanceNode(node, key)
}

const balanceFactor = getHeight(node.l) - getHeight(node.r)
return node
}

if (balanceFactor > 1 && key < node.l!.k) {
return rotateRight(node)
}
rootNode.root = insertNode(rootNode.root, key, newValue)
}

if (balanceFactor < -1 && key > node.r!.k) {
return rotateLeft(node)
}
function rebalanceNode<K, V>(node: Node<K, V[]>, key: K): Node<K, V[]> {
node.h = 1 + Math.max(getHeight(node.l), getHeight(node.r))

if (balanceFactor > 1 && key > node.l!.k) {
node.l = rotateLeft(node.l!)
return rotateRight(node)
}
const balanceFactor = getHeight(node.l) - getHeight(node.r)

if (balanceFactor < -1 && key < node.r!.k) {
node.r = rotateRight(node.r!)
return rotateLeft(node)
}
if (balanceFactor > 1 && key < node.l!.k) {
return rotateRight(node)
}

return node
if (balanceFactor < -1 && key > node.r!.k) {
return rotateLeft(node)
}

rootNode.root = insertNode(rootNode.root, key, newValue)
if (balanceFactor > 1 && key > node.l!.k) {
node.l = rotateLeft(node.l!)
return rotateRight(node)
}

if (balanceFactor < -1 && key < node.r!.k) {
node.r = rotateRight(node.r!)
return rotateLeft(node)
}

return node
}

function getHeight<K, V>(node: Nullable<Node<K, V>>): number {
Expand Down
4 changes: 3 additions & 1 deletion packages/orama/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import type { InsertOptions } from './methods/insert.js'
import { MODE_FULLTEXT_SEARCH, MODE_HYBRID_SEARCH, MODE_VECTOR_SEARCH } from './constants.js'
import { DocumentsStore } from './components/documents-store.js'
import { Index } from './components/index.js'
Expand Down Expand Up @@ -924,7 +925,8 @@ export interface IIndex<I extends AnyIndexStore> {
schemaType: SearchableType,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number
docsCount: number,
options?: InsertOptions
) => SyncOrAsyncValue
afterInsert?: IIndexInsertOrRemoveHookFunction

Expand Down
2 changes: 1 addition & 1 deletion packages/orama/tests/distinct.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ async function createDb() {
color: 'string',
rank: 'number',
isPromoted: 'boolean'
}
} as const
})

const ids = await insertMultiple(db, [
Expand Down
Loading

0 comments on commit 71702f1

Please sign in to comment.