Skip to content

Commit

Permalink
Specify Weaviate schema (#49)
Browse files Browse the repository at this point in the history
* Specify Weaviate schema in code
  • Loading branch information
arihanv authored Nov 29, 2023
1 parent 9af71d4 commit 48df5f7
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 47 deletions.
46 changes: 0 additions & 46 deletions lib/utils/embeddings/setup.ts

This file was deleted.

21 changes: 21 additions & 0 deletions lib/utils/embeddings/setup/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import weaviate from 'weaviate-ts-client'
import {schema} from './schema'

/**
 * Creates the Weaviate class described by `schema` on the instance whose
 * host name is read from the `WEAVIATE_HOST` environment variable.
 *
 * Failures (missing host, client construction, or the create request) are
 * reported on stderr instead of being thrown, so the returned promise does
 * not reject.
 */
export async function main(): Promise<void> {
  console.log('Grabbing Weaviate schema template...')

  // Without a host the client would be pointed at the literal string
  // "undefined", so stop early with an actionable message instead.
  const host = process.env.WEAVIATE_HOST
  if (!host) {
    console.error('WEAVIATE_HOST is not set; cannot create Weaviate schema.')
    return
  }

  console.log('Creating Weaviate schema...')
  try {
    // Client construction lives inside the try so a synchronous failure is
    // reported the same way as a failed request.
    const client = weaviate.client({
      scheme: 'https',
      host
    })

    await client.schema.classCreator().withClass(schema).do()
    console.log('Schema created!')
  } catch (e: unknown) {
    // Non-Error throwables are logged as-is rather than collapsing to `false`.
    console.error(
      'Failed to create Weaviate schema:',
      e instanceof Error ? e.message : e
    )
  }
}

// Fire-and-forget: main() handles its own failures.
void main()
113 changes: 113 additions & 0 deletions lib/utils/embeddings/setup/schema.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/**
 * Builds a `text` property definition with the filterable and searchable
 * indexes enabled and `word` tokenization.
 */
function textProperty(name: string) {
  return {
    dataType: ['text'],
    indexFilterable: true,
    indexSearchable: true,
    name,
    tokenization: 'word'
  }
}

/**
 * Builds a `number` property definition that is filterable but not
 * searchable.
 */
function numberProperty(name: string) {
  return {
    dataType: ['number'],
    indexFilterable: true,
    indexSearchable: false,
    name
  }
}

// Inverted-index settings: BM25 parameters, cleanup interval and stopwords.
const invertedIndexConfig = {
  bm25: {b: 0.75, k1: 1.2},
  cleanupIntervalSeconds: 60,
  stopwords: {additions: null, preset: 'en', removals: null}
}

// Sharding settings: one shard, 128 virtual shards, hashed on `_id`.
const shardingConfig = {
  actualCount: 1,
  actualVirtualCount: 128,
  desiredCount: 1,
  desiredVirtualCount: 128,
  function: 'murmur3',
  key: '_id',
  strategy: 'hash',
  virtualPerPhysical: 128
}

// Product-quantization settings; compression is switched off.
const pq = {
  bitCompression: false,
  centroids: 256,
  enabled: false,
  encoder: {distribution: 'log-normal', type: 'kmeans'},
  segments: 0,
  trainingLimit: 100000 // 100k
}

// Vector index settings, using cosine distance.
const vectorIndexConfig = {
  cleanupIntervalSeconds: 300,
  distance: 'cosine',
  dynamicEfFactor: 8,
  dynamicEfMax: 500,
  dynamicEfMin: 100,
  ef: -1,
  efConstruction: 128,
  flatSearchCutoff: 40000, // 40k
  maxConnections: 64,
  pq,
  skip: false,
  vectorCacheMaxObjects: 1e12 // 1 trillion
}

/**
 * Weaviate class definition for the code-search collection.
 *
 * Key order is kept stable so the serialized payload is unchanged.
 */
export const schema = {
  class: 'CodeSearch_TestTed',
  invertedIndexConfig,
  multiTenancyConfig: {enabled: false},
  properties: [
    textProperty('text'),
    textProperty('source'),
    textProperty('repository'),
    textProperty('branch'),
    numberProperty('loc_lines_from'),
    numberProperty('loc_lines_to'),
    textProperty('ext'),
    textProperty('userId')
  ],
  replicationConfig: {factor: 1},
  shardingConfig,
  vectorIndexConfig,
  vectorIndexType: 'hnsw',
  vectorizer: 'none'
}
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
"lint:fix": "eslint . --ext .js,.jsx,.cjs,.mjs,.json,.ts,.tsx --fix",
"postinstall": "prisma generate",
"start": "next start",
"vector:schema:push": "bun run ./lib/utils/embeddings/setup.ts"
"vector:schema:push": "bun run ./lib/utils/embeddings/setup"
},
"version": "0.1.0"
}

1 comment on commit 48df5f7

@vercel
Copy link

@vercel vercel bot commented on 48df5f7 Nov 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.