Code Change to improve data ingest #8

Open · wants to merge 2 commits into main
3 changes: 3 additions & 0 deletions .gitignore
@@ -160,3 +160,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.lnk
error.txt
*.bat
3 changes: 3 additions & 0 deletions CUDA.py
@@ -0,0 +1,3 @@
import torch
print("is CUDA available?",torch.cuda.is_available())
print(torch.version.cuda)
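
If the point of this check is to decide whether api.py can safely switch to 'cuda', a fallback-aware variant (a sketch, not part of this PR) avoids hardcoding the device:

import torch

# Sketch: pick the device dynamically instead of hardcoding 'cuda'
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))
    print("CUDA version:", torch.version.cuda)

The embeddings below could then use model_kwargs = {'device': device} instead of a fixed string.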
18 changes: 14 additions & 4 deletions api.py
@@ -10,6 +10,7 @@
import environment_var
import os
from openai import OpenAI
#from langchain_community.llms import Ollama
#from langgraph.graph import END, MessageGraph

class Item(BaseModel):
@@ -19,7 +20,8 @@ def __init__(self, query: str) -> None:

#model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
model_name = "sentence-transformers/msmarco-bert-base-dot-v5"
model_kwargs = {'device': 'cpu'}
model_kwargs = {'device': 'cuda'} # changed by pdchristian to 'cuda'
#model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
model_name=model_name,
@@ -34,10 +36,15 @@ def __init__(self, query: str) -> None:
client_ai = OpenAI(
base_url="https://integrate.api.nvidia.com/v1",
api_key=environment_var.nvidia_key
# base_url="http://localhost:11434", #pdchristian tried to connect to local ollama server
# api_key="ollama" #pdchristian tried to connect to local ollama server
# base_url="http://localhost:1234", #pdchristian tried to connect to local LM-Studio server
# api_key=environment_var.nvidia_key #pdchristian tried to connect to local LM-Studio server
)
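# Editor's sketch: the commented-out attempts above target the servers'
# root URLs, but both Ollama and LM Studio expose their OpenAI-compatible
# API under a /v1 path. Assuming default ports, a working local
# configuration would plausibly be:
#
#   client_ai = OpenAI(
#       base_url="http://localhost:11434/v1",  # default Ollama port
#       api_key="ollama",  # Ollama ignores the key, but the client needs one
#   )
#
# For LM Studio the analogous base_url would be "http://localhost:1234/v1".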
use_nvidia_api = True
elif use_quantized:
model_id = "Kameshr/LLAMA-3-Quantized"
# model_id = "llama3.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
Expand All @@ -46,6 +53,8 @@ def __init__(self, query: str) -> None:
)
else:
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "llama3.1"
# model_id = "lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
@@ -70,7 +79,7 @@ async def root():
def search(Item:Item):
query = Item.query
search_result = qdrant.similarity_search(
query=query, k=10
query=query, k=20
)
i = 0
list_res = []
@@ -82,7 +91,7 @@ def search(Item:Item):
async def ask_localai(Item:Item):
query = Item.query
search_result = qdrant.similarity_search(
query=query, k=10
query=query, k=20
)
i = 0
list_res = []
@@ -96,7 +105,8 @@ async def ask_localai(Item:Item):
i = i +1

rolemsg = {"role": "system",
"content": "Answer user's question using documents given in the context. In the context are documents that should contain an answer. Please always reference document id (in squere brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer question."}
"content": "Answer user's question using documents given in the context. Formulate all answers in German. In the context are documents that should contain an answer. Please always reference document id (in squere brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer question."}
# "content": "Answer user's question using documents given in the context. In the context are documents that should contain an answer. Please always reference document id (in squere brackets, for example [0],[1]) of the document that was used to make a claim. Use as many citations and documents as it is necessary to answer question."}
messages = [
rolemsg,
{"role": "user", "content": "Documents:\n"+context+"\n\nQuestion: "+query},
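The diff is truncated here, but the messages list presumably feeds a chat-completion call. With the NVIDIA endpoint configured above, that step would look roughly like this (a sketch; the model name and parameters are assumptions, not taken from this PR):

# Sketch of the truncated completion step (model name is an assumption)
completion = client_ai.chat.completions.create(
    model="meta/llama3-8b-instruct",
    messages=messages,
    temperature=0.5,
    max_tokens=1024,
)
answer = completion.choices[0].message.content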
44 changes: 32 additions & 12 deletions index.py
@@ -1,6 +1,7 @@
import PyPDF2
from os import listdir
from os.path import isfile, join,isdir
from os.path import isfile, join, isdir
import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import Qdrant
import sys
@@ -10,13 +11,20 @@
from qdrant_client.models import Distance, VectorParams
import docx

#def get_files(dir):
# file_list = []
# for f in listdir(dir):
# if isfile(join(dir,f)):
# file_list.append(join(dir,f))
# elif isdir(join(dir,f)):
# file_list= file_list + get_files(join(dir,f))
# return file_list

def get_files(dir):
file_list = []
for f in listdir(dir):
if isfile(join(dir,f)):
file_list.append(join(dir,f))
elif isdir(join(dir,f)):
file_list= file_list + get_files(join(dir,f))
for dirpath, _, filenames in os.walk(dir):
for f in filenames:
file_list.append(os.path.join(dirpath, f))
return file_list
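
# Editor's sketch (hypothetical usage): os.walk already recurses into
# subfolders, so a quick sanity check of the rewritten walker is:
#   files = get_files("TestFolder")
#   print(len(files), "files found")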

def getTextFromWord(filename):
@@ -37,6 +45,7 @@ def getTextFromPPTX(filename):
def main_indexing(mypath):
#model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
model_name = "sentence-transformers/msmarco-bert-base-dot-v5"
#model_kwargs = {'device': 'cuda'}
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
@@ -56,14 +65,21 @@ def main_indexing(mypath):
file_content = ""
for file in onlyfiles:
file_content = ""
if file.endswith(".pdf"):
if file.find("~") > 0: #added by pdchristian to catch files with "~" in file name
file_content = "Empty due to ~ in file name." #added by pdchristian to catch files with "~" in file name
print("Document title with ~: "+file) #added by pdchristian to catch files with "~" in file name
elif file.endswith(".pdf"): #added by pdchristian to catch files with "~" in file name
#if file.endswith(".pdf"):
print("indexing "+file)
reader = PyPDF2.PdfReader(file)
for i in range(0,len(reader.pages)):
file_content = file_content + " "+reader.pages[i].extract_text()
try: #added by pdchristian to catch decryption error
reader = PyPDF2.PdfReader(file)
for i in range(0,len(reader.pages)):
file_content = file_content + " "+reader.pages[i].extract_text()
except Exception as exc: #added by pdchristian to catch decryption error
file_content = "Empty due to extraction error." #added by pdchristian to catch decryption error
elif file.endswith(".txt") or file.endswith(".md") or file.endswith(".markdown"):
print("indexing " + file)
f = open(file,'r')
f = open(file,'r',encoding='utf-8',errors='ignore') #added by pdchristian encoding='utf-8',errors='ignore'
file_content = f.read()
f.close()
elif file.endswith(".docx"):
@@ -81,9 +97,13 @@ def main_indexing(mypath):
metadata.append({"path":file})
qdrant.add_texts(texts,metadatas=metadata)
len(texts)
print(onlyfiles)
#print(onlyfiles)
print("Finished indexing!")

#Fixed Folder
#main_indexing("TestFolder")

#Folder from command line: python index.py path/to/folder or python index.py "X:/Christian/SV"
if __name__ == "__main__":
arguments = sys.argv
if len(arguments)>1:
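requirements.txt below adds PyCryptodome, which PyPDF2 needs to open AES-encrypted PDFs, and that is likely the extraction error the new try/except catches. A sketch of handling the encrypted case explicitly (assuming an empty user password):

import PyPDF2

reader = PyPDF2.PdfReader("example.pdf")  # hypothetical file
if reader.is_encrypted:
    try:
        # Many "encrypted" PDFs only set an empty user password
        reader.decrypt("")
    except Exception:
        pass  # fall back to "Empty due to extraction error."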
10 changes: 9 additions & 1 deletion requirements.txt
@@ -17,4 +17,12 @@ accelerate
streamlit
python-docx
python-pptx
openai
openai

#Added by pdchristian
PyCryptodome
torchvision
torchaudio #--index-url https://download.pytorch.org/whl/cu118

#langchain-huggingface #will replace HuggingFaceEmbeddings
#QdrantVectorStore #will replace qdrant
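
The commented #--index-url hint suggests CUDA 11.8 wheels are intended; if so, the usual install command (an assumption about the target CUDA version) would be:

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118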
5 changes: 4 additions & 1 deletion user_interface.py
@@ -3,8 +3,11 @@
import requests
import json
st.title('_:blue[Local GenAI Search]_ :sunglasses:')

question = st.text_input("Ask a question based on your local files", "")
if st.button("Ask a question"):

if not question=="":
#if question("Frage abschicken"):
st.write("The current question is \"", question+"\"")
url = "http://127.0.0.1:8000/ask_localai"

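The diff is truncated again, but the imports and URL above imply the question is posted to the FastAPI service. A self-contained sketch of that flow (the response shape is an assumption, not taken from this PR):

import requests
import streamlit as st

question = st.text_input("Ask a question based on your local files", "")
if question != "":
    url = "http://127.0.0.1:8000/ask_localai"
    # api.py's Item model expects a JSON body with a "query" field
    response = requests.post(url, json={"query": question})
    st.write(response.json())  # assumption: the endpoint returns JSON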