context_docs.py
import os
import argparse
import pdfplumber
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

# Load environment variables from a local .env file (expects MODEL_EMBEDDING)
load_dotenv()
MODEL_EMBEDDING = os.getenv('MODEL_EMBEDDING')
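
# Example .env entry (the model name below is an illustrative choice, not
# mandated by this script; any OpenAI embedding model name works):
#   MODEL_EMBEDDING=text-embedding-3-small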


def extract_elements_from_pdf(filepaths, collection_name):
    """
    Build a vector database from the PDF files in a directory.

    Parameters
    ----------
    filepaths : str
        Path of the directory containing the PDF files to embed.
    collection_name : str
        Name identifying the Chroma collection.

    Returns
    -------
    None
    """
    documents = []
    pdf_files = [os.path.join(filepaths, file) for file in os.listdir(filepaths) if file.endswith('.pdf')]
    print(pdf_files)

    # Split text using LangChain's RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    for pdf_path in pdf_files:
        # Extract text page by page using pdfplumber
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                # extract_text() returns None for pages with no text layer (e.g. scans)
                page_text = page.extract_text() or ""
                chunks = text_splitter.split_text(page_text)

                # Wrap each chunk in a Document object with page and source metadata
                for chunk in chunks:
                    doc = Document(
                        page_content=chunk,
                        metadata={"page": i + 1, "source": pdf_path}
                    )
                    documents.append(doc)
                    print(doc)
                    print('\n\n')
    embeddings = OpenAIEmbeddings(model=MODEL_EMBEDDING)
    vectorstore = Chroma(
        collection_name=collection_name,
        embedding_function=embeddings,
        persist_directory=f"./chroma_db/{collection_name}"
    )

    # Add documents to the vector store
    vectorstore.add_documents(documents)

    # Optionally, check the number of documents added
    print(f"Number of documents in the vector store: {len(documents)}")


def main():
    """
    Command-line wrapper around extract_elements_from_pdf.

    Arguments
    ---------
    --filepath : str
        Path of the directory containing the PDF files to embed.
    --collection_name : str
        Name identifying the Chroma collection.
    """
    # Create an argument parser
    parser = argparse.ArgumentParser(description='Fill a Chroma DB with the content of PDF documents.')
    parser.add_argument('--filepath', type=str, required=True, help='Directory containing the PDF documents')
    parser.add_argument('--collection_name', type=str, required=True, help='Collection name for the Chroma DB')

    # Parse the command-line arguments
    args = parser.parse_args()

    # Build the vector store from the parsed arguments
    extract_elements_from_pdf(args.filepath, args.collection_name)


# Run the CLI when the script is executed directly
if __name__ == "__main__":
    main()
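
# Example invocation (directory and collection names are placeholders):
#   python context_docs.py --filepath ./pdfs --collection_name my_manuals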