From 3febb4258ef0a2b0b08d30061f812f9a8c2b014a Mon Sep 17 00:00:00 2001 From: arihanv Date: Tue, 5 Dec 2023 23:02:19 -0600 Subject: [PATCH 1/6] Add summary function for code snippets --- lib/utils/embeddings/summary.ts | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 lib/utils/embeddings/summary.ts diff --git a/lib/utils/embeddings/summary.ts b/lib/utils/embeddings/summary.ts new file mode 100644 index 00000000..933ad088 --- /dev/null +++ b/lib/utils/embeddings/summary.ts @@ -0,0 +1,21 @@ +import {LLMChain} from 'langchain/chains' +import {OpenAI} from 'langchain/llms/openai' +import {PromptTemplate} from 'langchain/prompts' + +const prompt = PromptTemplate.fromTemplate( + `You are a helpful assistant that writes a description of the given code snippet. My ultimate goal is to use your produced summary as a key in a key value store vectordb. Your summary is the key and the value is the code. + 1) Don't mention "code snippet" or "summary" in your response. Just produce the summary of just the code without fluff. + 2) Try to summarize the code in no more than 6 sentences and use as many keywords as possible. + 3) Your response should be a single line of text. + + Example Code: #!/bin/sh\ngroovyc src/*.groovy\ngroovy src/Main.groovy --cp src/ + Good Summary: Compiles and runs a Groovy program using the source files in the \"src\" directory. + + {code}` +) + +export async function AISummary(code: string, modelName: string = 'gpt-3.5-turbo', temperature: number = 0) { + const model = new OpenAI({temperature, modelName}) + const codeChain = new LLMChain({llm: model, prompt}) + return await codeChain.call({code}) +} From effab0305970bb678458b0fc1b2d830dd8246d33 Mon Sep 17 00:00:00 2001 From: arihanv Date: Tue, 5 Dec 2023 23:02:27 -0600 Subject: [PATCH 2/6] Add AI summary to document metadata --- lib/utils/embeddings/embed.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/utils/embeddings/embed.ts b/lib/utils/embeddings/embed.ts index b9894219..3372f98d 100644 --- a/lib/utils/embeddings/embed.ts +++ b/lib/utils/embeddings/embed.ts @@ -2,9 +2,10 @@ import {OpenAIEmbeddings} from 'langchain/embeddings/openai' import {RecursiveCharacterTextSplitter} from 'langchain/text_splitter' import {WeaviateStore} from 'langchain/vectorstores/weaviate' import {cloneRepo} from './cloneRepo' -import {type WeaviateConfig} from './db' import deleteRepo from './delete' import {checkRepoExists} from './exists' +import {type WeaviateConfig} from './db' +import { AISummary } from './summary' export default async function addRepo( weaviateConfig: WeaviateConfig, @@ -42,16 +43,17 @@ export default async function addRepo( process.env.GITHUB_ACCESS_TOKEN || '' ) - const docs = repo.map(doc => { + const docs = Promise.all(repo.map(async doc => { return { ...doc, metadata: { ...doc.metadata, userId: weaviateConfig.userId, + summary: await AISummary(doc.pageContent), ext: doc.metadata.source.split('.')[1] || '' } } - }) + })) const embeddings = new OpenAIEmbeddings({ openAIApiKey: process.env.OPENAI_API_KEY @@ -64,9 +66,9 @@ export default async function addRepo( }) // returns the Weaviate ids of the added documents - return await store.addDocuments(docs) + return await store.addDocuments(await docs) } catch (e) { console.error(e) return } -} +} \ No newline at end of file From d8490e9661ce7d43b97ff52f52d0d81933627c47 Mon Sep 17 00:00:00 2001 From: arihanv Date: Wed, 6 Dec 2023 13:31:59 -0600 Subject: [PATCH 3/6] feat: specify summary attribute for weaviate --- lib/utils/embeddings/query.ts | 1 + lib/utils/embeddings/setup/schema.ts | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/lib/utils/embeddings/query.ts b/lib/utils/embeddings/query.ts index 0343ceda..f5a9cbec 100644 --- a/lib/utils/embeddings/query.ts +++ b/lib/utils/embeddings/query.ts @@ -5,6 +5,7 @@ const keys = [ 'source', 'text', 'ext', + 'summary', 'repository', 'branch', 'userId', diff --git a/lib/utils/embeddings/setup/schema.ts b/lib/utils/embeddings/setup/schema.ts index 92d9128c..d9b03e80 100644 --- a/lib/utils/embeddings/setup/schema.ts +++ b/lib/utils/embeddings/setup/schema.ts @@ -37,6 +37,13 @@ export const schema = { name: 'repository', tokenization: 'word' }, + { + dataType: ['text'], + indexFilterable: true, + indexSearchable: true, + name: 'summary', + tokenization: 'word' + }, { dataType: ['text'], indexFilterable: true, From 099b91d0f1e4b580b66c5f46d8c9330f5fefecc4 Mon Sep 17 00:00:00 2001 From: arihanv Date: Wed, 13 Dec 2023 11:10:16 -0600 Subject: [PATCH 4/6] Change position of await --- lib/utils/embeddings/embed.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/utils/embeddings/embed.ts b/lib/utils/embeddings/embed.ts index 3372f98d..698347d9 100644 --- a/lib/utils/embeddings/embed.ts +++ b/lib/utils/embeddings/embed.ts @@ -43,7 +43,7 @@ export default async function addRepo( process.env.GITHUB_ACCESS_TOKEN || '' ) - const docs = Promise.all(repo.map(async doc => { + const docs = await Promise.all(repo.map(async doc => { return { ...doc, metadata: { @@ -66,7 +66,7 @@ export default async function addRepo( }) // returns the Weaviate ids of the added documents - return await store.addDocuments(await docs) + return await store.addDocuments(docs) } catch (e) { console.error(e) return From 198bc6e34d07f8ed039bb1637ee3ca4231e888e8 Mon Sep 17 00:00:00 2001 From: arihanv Date: Wed, 13 Dec 2023 11:28:18 -0600 Subject: [PATCH 5/6] Shorten prompt --- lib/utils/embeddings/summary.ts | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/utils/embeddings/summary.ts b/lib/utils/embeddings/summary.ts index 933ad088..03ba295e 100644 --- a/lib/utils/embeddings/summary.ts +++ b/lib/utils/embeddings/summary.ts @@ -3,15 +3,13 @@ import {OpenAI} from 'langchain/llms/openai' import {PromptTemplate} from 'langchain/prompts' const prompt = PromptTemplate.fromTemplate( - `You are a helpful assistant that writes a description of the given code snippet. My ultimate goal is to use your produced summary as a key in a key value store vectordb. Your summary is the key and the value is the code. - 1) Don't mention "code snippet" or "summary" in your response. Just produce the summary of just the code without fluff. - 2) Try to summarize the code in no more than 6 sentences and use as many keywords as possible. - 3) Your response should be a single line of text. + `You are AI that writes description of code snippet. Follow format of example below. Lots of keywords, straight to the point, one line. - Example Code: #!/bin/sh\ngroovyc src/*.groovy\ngroovy src/Main.groovy --cp src/ - Good Summary: Compiles and runs a Groovy program using the source files in the \"src\" directory. + Code:#!/bin/sh\ngroovyc src/*.groovy\ngroovy src/Main.groovy --cp src/ + AI:Compiles and runs a Groovy program using the source files in the \"src\" directory. - {code}` + Code:{code} + AI:` ) export async function AISummary(code: string, modelName: string = 'gpt-3.5-turbo', temperature: number = 0) { From da2ffb2df8104c25c8dbb31c6d36b0b66154d5d8 Mon Sep 17 00:00:00 2001 From: arihanv Date: Wed, 13 Dec 2023 11:36:03 -0600 Subject: [PATCH 6/6] Even shorter --- lib/utils/embeddings/summary.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/utils/embeddings/summary.ts b/lib/utils/embeddings/summary.ts index 03ba295e..7b915ec8 100644 --- a/lib/utils/embeddings/summary.ts +++ b/lib/utils/embeddings/summary.ts @@ -3,7 +3,7 @@ import {OpenAI} from 'langchain/llms/openai' import {PromptTemplate} from 'langchain/prompts' const prompt = PromptTemplate.fromTemplate( - `You are AI that writes description of code snippet. Follow format of example below. Lots of keywords, straight to the point, one line. + `You are AI that make desc of code snippet. Many keywords, straight to point, 1 line. Code:#!/bin/sh\ngroovyc src/*.groovy\ngroovy src/Main.groovy --cp src/ AI:Compiles and runs a Groovy program using the source files in the \"src\" directory.