Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: speeds up insertion for number properties #756

Merged
merged 3 commits into from
Jul 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions packages/orama/src/components/facets.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,11 @@ export async function getFacets<T extends AnyOrama>(
case 'string[]': {
const alreadyInsertedValues = new Set<string>()
const innerType = propertyType === 'boolean[]' ? 'boolean' : 'string'
const calculateBooleanStringOrEnumFacet = calculateBooleanStringOrEnumFacetBuilder(facetValues, innerType, alreadyInsertedValues)
const calculateBooleanStringOrEnumFacet = calculateBooleanStringOrEnumFacetBuilder(
facetValues,
innerType,
alreadyInsertedValues
)
for (const v of facetValue as Array<FacetValue>) {
calculateBooleanStringOrEnumFacet(v)
}
Expand Down Expand Up @@ -140,13 +144,13 @@ function calculateNumberFacetBuilder(
if (alreadyInsertedValues?.has(value)) {
continue
}

if (facetValue >= range.from && facetValue <= range.to) {
if (values[value] === undefined) {
values[value] = 1
} else {
values[value]++

alreadyInsertedValues?.add(value)
}
}
Expand All @@ -159,7 +163,7 @@ function calculateBooleanStringOrEnumFacetBuilder(
propertyType: 'string' | 'boolean' | 'enum',
alreadyInsertedValues?: Set<string>
) {
const defaultValue = (propertyType === 'boolean' ? 'false' : '')
const defaultValue = propertyType === 'boolean' ? 'false' : ''
return (facetValue: FacetValue) => {
// String or boolean based facets
const value = facetValue?.toString() ?? defaultValue
Expand Down
27 changes: 20 additions & 7 deletions packages/orama/src/components/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import type {
VectorType,
WhereCondition
} from '../types.js'
import type { InsertOptions } from '../methods/insert.js'
import { createError } from '../errors.js'
import {
create as avlCreate,
Expand Down Expand Up @@ -293,7 +294,8 @@ function insertScalarBuilder(
id: DocumentID,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number
docsCount: number,
options?: InsertOptions
) {
return async (value: SearchableValue): Promise<void> => {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)
Expand All @@ -305,7 +307,8 @@ function insertScalarBuilder(
break
}
case 'AVL': {
avlInsert(node, value as number, [internalId])
const avlRebalanceThreshold = options?.avlRebalanceThreshold ?? 1
avlInsert(node, value as number, [internalId], avlRebalanceThreshold)
break
}
case 'Radix': {
Expand Down Expand Up @@ -341,13 +344,14 @@ export async function insert(
schemaType: SearchableType,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number
docsCount: number,
options?: InsertOptions
): Promise<void> {
if (isVectorType(schemaType)) {
return insertVector(index, prop, value as number[] | Float32Array, id)
}

const insertScalar = insertScalarBuilder(implementation, index, prop, id, language, tokenizer, docsCount)
const insertScalar = insertScalarBuilder(implementation, index, prop, id, language, tokenizer, docsCount, options)

if (!isArrayType(schemaType)) {
return insertScalar(value)
Expand Down Expand Up @@ -553,7 +557,10 @@ export async function searchByWhereClause<T extends AnyOrama, ResultDocument = T
highPrecision
)
// @todo: convert this into a for loop
safeArrayPush(filtersMap[param], ids.flatMap(({ docIDs }) => docIDs))
safeArrayPush(
filtersMap[param],
ids.flatMap(({ docIDs }) => docIDs)
)
} else {
const {
coordinates,
Expand All @@ -562,7 +569,10 @@ export async function searchByWhereClause<T extends AnyOrama, ResultDocument = T
} = operation[reqOperation] as GeosearchPolygonOperator['polygon']
const ids = searchByPolygon(node.root, coordinates as BKDGeoPoint[], inside, undefined, highPrecision)
// @todo: convert this into a for loop
safeArrayPush(filtersMap[param], ids.flatMap(({ docIDs }) => docIDs))
safeArrayPush(
filtersMap[param],
ids.flatMap(({ docIDs }) => docIDs)
)
}

continue
Expand All @@ -588,7 +598,10 @@ export async function searchByWhereClause<T extends AnyOrama, ResultDocument = T

if (type === 'Flat') {
const flatOperation = isArray ? flatFilterArr : flatFilter
safeArrayPush(filtersMap[param], flatOperation(node, operation as EnumComparisonOperator & EnumArrComparisonOperator))
safeArrayPush(
filtersMap[param],
flatOperation(node, operation as EnumComparisonOperator & EnumArrComparisonOperator)
)

continue
}
Expand Down
18 changes: 13 additions & 5 deletions packages/orama/src/methods/insert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,23 @@ import { createError } from '../errors.js'
import { Point } from '../trees/bkd.js'
import { AnyOrama, PartialSchemaDeep, SortValue, TypedDocument } from '../types.js'

/**
 * Optional settings that can be passed to `insert` and are threaded down to the index layer.
 */
export type InsertOptions = {
// Threshold handed to the AVL tree insert for number properties.
// When omitted, the index layer falls back to 1; `innerInsertMultiple` sets it to the batch length.
avlRebalanceThreshold?: number
}

/**
 * Validates `doc` against the instance schema and, when validation passes, delegates to `innerInsert`.
 *
 * @param orama - Instance whose `validateSchema` function and `schema` are used for validation.
 * @param doc - Document to validate and insert.
 * @param language - Forwarded unchanged to `innerInsert`.
 * @param skipHooks - Forwarded unchanged to `innerInsert`.
 * @param options - Optional insert settings (`InsertOptions`), forwarded unchanged to `innerInsert`.
 * @returns The string that `innerInsert` resolves to.
 * @throws The error built by `createError('SCHEMA_VALIDATION_FAILURE', ...)` when `validateSchema` returns a truthy value.
 */
export async function insert<T extends AnyOrama>(
orama: T,
doc: PartialSchemaDeep<TypedDocument<T>>,
language?: string,
skipHooks?: boolean
skipHooks?: boolean,
options?: InsertOptions
): Promise<string> {
const errorProperty = await orama.validateSchema(doc, orama.schema)
// A truthy result means validation failed; the value is passed along as the error argument.
if (errorProperty) {
throw createError('SCHEMA_VALIDATION_FAILURE', errorProperty)
}

return innerInsert(orama, doc, language, skipHooks)
return innerInsert(orama, doc, language, skipHooks, options)
}

const ENUM_TYPE = new Set(['enum', 'enum[]'])
Expand All @@ -26,7 +31,8 @@ async function innerInsert<T extends AnyOrama>(
orama: T,
doc: PartialSchemaDeep<TypedDocument<T>>,
language?: string,
skipHooks?: boolean
skipHooks?: boolean,
options?: InsertOptions
): Promise<string> {
const { index, docs } = orama.data

Expand Down Expand Up @@ -111,7 +117,8 @@ async function innerInsert<T extends AnyOrama>(
expectedType,
language,
orama.tokenizer,
docsCount
docsCount,
options
)
await orama.index.afterInsert?.(
orama.data.index,
Expand Down Expand Up @@ -199,7 +206,8 @@ export async function innerInsertMultiple<T extends AnyOrama>(

for (const doc of batch) {
try {
const id = await insert(orama, doc, language, skipHooks)
const options = { avlRebalanceThreshold: batch.length }
const id = await insert(orama, doc, language, skipHooks, options)
ids.push(id)
} catch (err) {
reject(err)
Expand Down
66 changes: 40 additions & 26 deletions packages/orama/src/trees/avl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,14 @@ export function rangeSearch<K, V>(node: RootNode<K, V>, min: K, max: K): V {
export function greaterThan<K, V>(node: RootNode<K, V>, key: K, inclusive = false): V {
const result: V[] = []

if (node === null) return result as V;
if (node === null) return result as V

const stack: Array<Nullable<Node<K, V>>> = [node.root]

while (stack.length > 0) {
const node = stack.pop()
if (!node) {
continue;
continue
}

if (inclusive && node.k >= key) {
Expand All @@ -149,14 +149,14 @@ export function greaterThan<K, V>(node: RootNode<K, V>, key: K, inclusive = fals
export function lessThan<K, V>(node: RootNode<K, V>, key: K, inclusive = false): V {
const result: V[] = []

if (node === null) return result as V;
if (node === null) return result as V

const stack: Array<Nullable<Node<K, V>>> = [node.root]

while (stack.length > 0) {
const node = stack.pop()
if (!node) {
continue;
continue
}

if (inclusive && node.k <= key) {
Expand Down Expand Up @@ -198,9 +198,12 @@ export function create<K, V>(key: K, value: V): RootNode<K, V> {
}
}

export function insert<K, V>(rootNode: RootNode<K, V[]>, key: K, newValue: V[]): void {
let insertCount = 0

export function insert<K, V>(rootNode: RootNode<K, V[]>, key: K, newValue: V[], rebalanceThreshold = 500): void {
function insertNode(node: Nullable<Node<K, V[]>>, key: K, newValue: V[]): Node<K, V[]> {
if (node === null) {
insertCount++
return {
k: key,
v: newValue,
Expand All @@ -215,38 +218,49 @@ export function insert<K, V>(rootNode: RootNode<K, V[]>, key: K, newValue: V[]):
} else if (key > node.k) {
node.r = insertNode(node.r, key, newValue)
} else {
for (const value of newValue) {
node.v.push(value)
}
node.v.push(...newValue)
return node
}

node.h = 1 + Math.max(getHeight(node.l), getHeight(node.r))
// Rebalance the tree only when the insert count is a multiple of the threshold.
// This improves insertion performance, since the tree is not rebalanced on every insert.
// When inserting docs using `insertMultiple`, the threshold is set to the number of docs in the batch.
// A threshold of 1 forces a rebalance on every insert: the index layer passes 1 when no option is given, while this function's own default is 500.
if (insertCount % rebalanceThreshold === 0) {
console.log(`Rebalancing tree after ${insertCount} inserts...`)
return rebalanceNode(node, key)
}

const balanceFactor = getHeight(node.l) - getHeight(node.r)
return node
}

if (balanceFactor > 1 && key < node.l!.k) {
return rotateRight(node)
}
rootNode.root = insertNode(rootNode.root, key, newValue)
}

if (balanceFactor < -1 && key > node.r!.k) {
return rotateLeft(node)
}
function rebalanceNode<K, V>(node: Node<K, V[]>, key: K): Node<K, V[]> {
node.h = 1 + Math.max(getHeight(node.l), getHeight(node.r))

if (balanceFactor > 1 && key > node.l!.k) {
node.l = rotateLeft(node.l!)
return rotateRight(node)
}
const balanceFactor = getHeight(node.l) - getHeight(node.r)

if (balanceFactor < -1 && key < node.r!.k) {
node.r = rotateRight(node.r!)
return rotateLeft(node)
}
if (balanceFactor > 1 && key < node.l!.k) {
return rotateRight(node)
}

return node
if (balanceFactor < -1 && key > node.r!.k) {
return rotateLeft(node)
}

rootNode.root = insertNode(rootNode.root, key, newValue)
if (balanceFactor > 1 && key > node.l!.k) {
node.l = rotateLeft(node.l!)
return rotateRight(node)
}

if (balanceFactor < -1 && key < node.r!.k) {
node.r = rotateRight(node.r!)
return rotateLeft(node)
}

return node
}

function getHeight<K, V>(node: Nullable<Node<K, V>>): number {
Expand Down
4 changes: 3 additions & 1 deletion packages/orama/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import type { InsertOptions } from './methods/insert.js'
import { MODE_FULLTEXT_SEARCH, MODE_HYBRID_SEARCH, MODE_VECTOR_SEARCH } from './constants.js'
import { DocumentsStore } from './components/documents-store.js'
import { Index } from './components/index.js'
Expand Down Expand Up @@ -924,7 +925,8 @@ export interface IIndex<I extends AnyIndexStore> {
schemaType: SearchableType,
language: string | undefined,
tokenizer: Tokenizer,
docsCount: number
docsCount: number,
options?: InsertOptions
) => SyncOrAsyncValue
afterInsert?: IIndexInsertOrRemoveHookFunction

Expand Down
2 changes: 1 addition & 1 deletion packages/orama/tests/distinct.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ async function createDb() {
color: 'string',
rank: 'number',
isPromoted: 'boolean'
}
} as const
})

const ids = await insertMultiple(db, [
Expand Down
Loading
Loading