Code Change to improve data ingest #8

Open · wants to merge 2 commits into main
3 changes: 3 additions & 0 deletions .gitignore
@@ -160,3 +160,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.lnk
error.txt
*.bat
3 changes: 3 additions & 0 deletions CUDA.py
@@ -0,0 +1,3 @@
import torch
print("is CUDA available?",torch.cuda.is_available())
print(torch.version.cuda)
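
If the point of this check is to decide whether api.py can safely switch to 'cuda', a fallback-aware variant (a sketch, not part of this PR) avoids hardcoding the device:

import torch

# Sketch: pick the device dynamically instead of hardcoding 'cuda'
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)

The embeddings below could then use model_kwargs = {'device': device} instead of a fixed string.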
18 changes: 14 additions & 4 deletions api.py
@@ -10,6 +10,7 @@
import environment_var
import os
from openai import OpenAI
#from langchain_community.llms import Ollama
#from langgraph.graph import END, MessageGraph

class Item(BaseModel):
@@ -19,7 +20,8 @@ def __init__(self, query: str) -> None:

#model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
model_name = "sentence-transformers/msmarco-bert-base-dot-v5"
model_kwargs = {'device': 'cpu'}
model_kwargs = {'device': 'cuda'} # changed by pdchristian to 'cuda'
#model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
model_name=model_name,
@@ -34,10 +36,15 @@ def __init__(self, query: str) -> None:
client_ai = OpenAI(
base_url="https://integrate.api.nvidia.com/v1",
api_key=environment_var.nvidia_key
# base_url="http://localhost:11434", #pdchristian tried to connect to local ollama server
# api_key="ollama" #pdchristian tried to connect to local ollama server
# base_url="http://localhost:1234", #pdchristian tried to connect to local LM-Studio server
# api_key=environment_var.nvidia_key #pdchristian tried to connect to local LM-Studio server
)
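# Editor's sketch: the commented-out attempts above target the servers'
# root URLs, but both Ollama and LM Studio expose their OpenAI-compatible
# API under a /v1 path. Assuming default ports, a working local
# configuration would plausibly be:
#
#   client_ai = OpenAI(
#       base_url="http://localhost:11434/v1",  # default Ollama port
#       api_key="ollama",  # Ollama ignores the key, but the client needs one
#   )
#
# For LM Studio the analogous base_url would be "http://localhost:1234/v1".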
use_nvidia_api = True
elif use_quantized:
model_id = "Kameshr/LLAMA-3-Quantized"
# model_id = "llama3.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
Expand All @@ -46,6 +53,8 @@ def __init__(self, query: str) -> None:
)
else:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "llama3.1"
# model_id = "lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
@@ -70,7 +79,7 @@ async def root():
def search(Item:Item):
query = Item.query
search_result = qdrant.similarity_search(
query=query, k=10
query=query, k=20
)
i = 0
list_res = []
@@ -82,7 +91,7 @@ def search(Item:Item):
async def ask_localai(Item:Item):
query = Item.query
search_result = qdrant.similarity_search(
query=query, k=10
query=query, k=20
)
i = 0
list_res = []
@@ -96,7 +105,8 @@ async def ask_localai(Item:Item):
i = i +1

rolemsg = {"role": "system",
"content": "Answer user's question using documents given in the context. In the context are documents that should contain an answer. Please always reference document id (in squere brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer question."}
"content": "Answer user's question using documents given in the context. Formulate all answers in German. In the context are documents that should contain an answer. Please always reference document id (in squere brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer question."}
# "content": "Answer user's question using documents given in the context. In the context are documents that should contain an answer. Please always reference document id (in squere brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer question."}
messages = [
rolemsg,
{"role": "user", "content": "Documents:\n"+context+"\n\nQuestion: "+query},
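The diff is truncated here, but the messages list presumably feeds a chat-completion call. With the NVIDIA endpoint configured above, that step would look roughly like this (a sketch; the model name and parameters are assumptions, not taken from this PR):

# Sketch of the truncated completion step (model name is an assumption)
completion = client_ai.chat.completions.create(
    model="meta/llama3-8b-instruct",
    messages=messages,
    temperature=0.5,
    max_tokens=1024,
)
answer = completion.choices[0].message.content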
44 changes: 32 additions & 12 deletions index.py
@@ -1,6 +1,7 @@
import PyPDF2
from os import listdir
from os.path import isfile, join,isdir
from os.path import isfile, join, isdir
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
import sys
@@ -10,13 +11,20 @@
from qdrant_client.models import Distance, VectorParams
import docx

#def get_files(dir):
# file_list = []
# for f in listdir(dir):
# if isfile(join(dir,f)):
# file_list.append(join(dir,f))
# elif isdir(join(dir,f)):
# file_list= file_list + get_files(join(dir,f))
# return file_list

def get_files(dir):
file_list = []
for f in listdir(dir):
if isfile(join(dir,f)):
file_list.append(join(dir,f))
elif isdir(join(dir,f)):
file_list= file_list + get_files(join(dir,f))
for dirpath, _, filenames in os.walk(dir):
for f in filenames:
file_list.append(os.path.join(dirpath, f))
return file_list
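
# Editor's sketch (hypothetical usage): os.walk already recurses into
# subfolders, so a quick sanity check of the rewritten walker is:
#   files = get_files("TestFolder")
#   print(len(files), "files found")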

def getTextFromWord(filename):
@@ -37,6 +45,7 @@ def getTextFromPPTX(filename):
def main_indexing(mypath):
#model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
model_name = "sentence-transformers/msmarco-bert-base-dot-v5"
#model_kwargs = {'device': 'cuda'}
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
@@ -56,14 +65,21 @@ def main_indexing(mypath):
file_content = ""
for file in onlyfiles:
file_content = ""
if file.endswith(".pdf"):
if file.find("~") > 0: #added by pdchristian to catch files with "~" in file name
file_content = "Empty due to ~ in file name." #added by pdchristian to catch files with "~" in file name
print("Document title with ~: "+file) #added by pdchristian to catch files with "~" in file name
elif file.endswith(".pdf"): #added by pdchristian to catch files with "~" in file name
#if file.endswith(".pdf"):
print("indexing "+file)
reader = PyPDF2.PdfReader(file)
for i in range(0,len(reader.pages)):
file_content = file_content + " "+reader.pages[i].extract_text()
try: #added by pdchristian to catch decryption error
reader = PyPDF2.PdfReader(file)
for i in range(0,len(reader.pages)):
file_content = file_content + " "+reader.pages[i].extract_text()
except Exception as exc: #added by pdchristian to catch decryption error
file_content = "Empty due to extraction error." #added by pdchristian to catch decryption error
elif file.endswith(".txt") or file.endswith(".md") or file.endswith(".markdown"):
print("indexing " + file)
f = open(file,'r')
f = open(file,'r',encoding='utf-8',errors='ignore') #added by pdchristian encoding='utf-8',errors='ignore'
file_content = f.read()
f.close()
elif file.endswith(".docx"):
@@ -81,9 +97,13 @@ def main_indexing(mypath):
metadata.append({"path":file})
qdrant.add_texts(texts,metadatas=metadata)
len(texts)
print(onlyfiles)
#print(onlyfiles)
print("Finished indexing!")

#Fixed Folder
#main_indexing("TestFolder")

#Folder from command line: python index.py path/to/folder or python index.py "X:/Christian/SV"
if __name__ == "__main__":
arguments = sys.argv
if len(arguments)>1:
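requirements.txt below adds PyCryptodome, which PyPDF2 needs to open AES-encrypted PDFs, and that is likely the extraction error the new try/except catches. A sketch of handling the encrypted case explicitly (assuming an empty user password):

import PyPDF2

reader = PyPDF2.PdfReader("example.pdf")  # hypothetical file
if reader.is_encrypted:
    try:
        # Many "encrypted" PDFs only set an empty user password
        reader.decrypt("")
    except Exception:
        pass  # fall back to "Empty due to extraction error."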
10 changes: 9 additions & 1 deletion requirements.txt
@@ -17,4 +17,12 @@ accelerate
streamlit
python-docx
python-pptx
openai
openai

#Added by pdchristian
PyCryptodome
torchvision
torchaudio #--index-url https://download.pytorch.org/whl/cu118

#langchain-huggingface #will replace HuggingFaceEmbeddings
#QdrantVectorStore #will replace qdrant
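
The commented #--index-url hint suggests CUDA 11.8 wheels are intended; if so, the usual install command (an assumption about the target CUDA version) would be:

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118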
5 changes: 4 additions & 1 deletion user_interface.py
@@ -3,8 +3,11 @@
import requests
import json
st.title('_:blue[Local GenAI Search]_ :sunglasses:')

question = st.text_input("Ask a question based on your local files", "")
if st.button("Ask a question"):

if not question=="":
#if question("Frage abschicken"):
st.write("The current question is \"", question+"\"")
url = "http://127.0.0.1:8000/ask_localai"

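The diff is truncated again, but the imports and URL above imply the question is posted to the FastAPI service. A self-contained sketch of that flow (the response shape is an assumption, not taken from this PR):

import requests
import streamlit as st

question = st.text_input("Ask a question based on your local files", "")
if question != "":
    url = "http://127.0.0.1:8000/ask_localai"
    # api.py's Item model expects a JSON body with a "query" field
    response = requests.post(url, json={"query": question})
    st.write(response.json())  # assumption: the endpoint returns JSON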