# document_processing2.py
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from dotenv import load_dotenv
import os
load_dotenv()
endpoint = os.getenv('AZURE_DOC_INT_ENDPOINT')
key = os.getenv('AZURE_DOC_INT_ENDPOINT_KEY')
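
# Fail fast if the Azure credentials are missing (a small sketch; assumes both
# variables are defined in your .env file).
if not endpoint or not key:
    raise ValueError("Set AZURE_DOC_INT_ENDPOINT and AZURE_DOC_INT_ENDPOINT_KEY before running")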
url_path = "https://arxiv.org/pdf/2312.06648.pdf"
# Parse the PDF with Azure AI Document Intelligence. "markdown" mode keeps the
# document's heading structure, which the header splitter below relies on
# ("markdown-page" is not a valid mode for this loader).
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint,
    api_key=key,
    url_path=url_path,
    api_model="prebuilt-layout",
    mode="markdown",
)
documents = loader.load()

# Split the markdown on H1/H2 headers; matched header text is recorded in each
# chunk's metadata under the names given here.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
chunks = []
for doc in documents:
    chunks.extend(markdown_splitter.split_text(doc.page_content))
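
# Hedged sketch: peek at the first chunk to confirm the header metadata landed.
if chunks:
    print(f"{len(chunks)} chunks; first chunk metadata: {chunks[0].metadata}")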

# Embed the chunks and index them in a local Qdrant instance.
embeddings = OpenAIEmbeddings()
# chunk_embeddings = [embeddings.embed_documents([chunk.page_content]) for chunk in chunks]
qdrant = Qdrant.from_documents(
    chunks,
    embeddings,
    # location="/data",
    # location="R:/Work/Asoft/doc-intelligence/langchain",
    url="http://localhost:6333",
    collection_name="markdown_chunks",
)
print("Qdrant vector store created successfully")

# Better chunking logic? A hierarchical variant: split on top-level headers
# first, then split each section on subheaders, carrying section metadata down.
# Note that split_text takes no headers_to_split_on argument; headers are fixed
# when the splitter is constructed, so this uses two splitters.
# section_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "Section")])
# subsection_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("##", "Subsection")])
# chunks = []
# for doc in documents:
#     for section_chunk in section_splitter.split_text(doc.page_content):
#         subsection_chunks = subsection_splitter.split_text(section_chunk.page_content)
#         # Propagate the section metadata onto each subsection chunk
#         for subsection_chunk in subsection_chunks:
#             subsection_chunk.metadata.update(section_chunk.metadata)
#         chunks.extend(subsection_chunks)

# Inspect everything stored in the collection. The LangChain wrapper has no
# "get all documents" helper, but the underlying Qdrant client can scroll:
# records, _ = qdrant.client.scroll(
#     collection_name="markdown_chunks", limit=100, with_payload=True
# )
# for record in records:
#     print(f"Payload: {record.payload}")
#     print("-" * 20)

# Similarity search. similarity_search takes the raw query text and embeds it
# itself, so there is no need to call embed_query first:
# similar_chunks = qdrant.similarity_search("your query text here", k=5)
# With a precomputed embedding, use similarity_search_by_vector instead:
# query_embedding = embeddings.embed_query("your query text here")
# similar_chunks = qdrant.similarity_search_by_vector(query_embedding, k=5)
# for chunk in similar_chunks:
#     print(f"Content: {chunk.page_content}")
#     print(f"Metadata: {chunk.metadata}")
#     print("-" * 20)