Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat: Add URL embedding for the new submit-url feature in LibreChat #44

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ uploads/
myenv/
venv/
*.pyc
env/

74 changes: 47 additions & 27 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import aiofiles.os
from typing import Iterable, List
from shutil import copyfileobj

import re
import uvicorn
from langchain.schema import Document
from contextlib import asynccontextmanager
Expand All @@ -25,6 +25,7 @@
)
from langchain_community.document_loaders import (
WebBaseLoader,
YoutubeLoader,
TextLoader,
PyPDFLoader,
CSVLoader,
Expand All @@ -34,6 +35,7 @@
UnstructuredXMLLoader,
UnstructuredRSTLoader,
UnstructuredExcelLoader,
ArxivLoader,
)

from models import (
Expand Down Expand Up @@ -363,38 +365,60 @@ async def embed_local_file(document: StoreDocument, request: Request):

@app.post("/embed")
async def embed_file(
request: Request, file_id: str = Form(...), file: UploadFile = File(...)
request: Request, file_id: str = Form(None), file: UploadFile = File(None), url: str = Form(None)
):
response_status = True
response_message = "File processed successfully."
known_type = None
known_type = None
if not hasattr(request.state, "user"):
user_id = "public"
else:
user_id = request.state.user.get("id")

if url is not None:
if "youtube.com" in url:
loader = YoutubeLoader.from_youtube_url(url)
data = loader.load()
elif "arxiv.org" in url:
pattern = r"(\d+\.\d+)"
match = re.search(pattern, url)
if match:
doc = match.group(1)
loader = ArxivLoader(doc)
data = loader.load()
else:
loader = WebBaseLoader(url)
data = loader.load()
else:
temp_base_path = os.path.join(RAG_UPLOAD_DIR, user_id)
os.makedirs(temp_base_path, exist_ok=True)
temp_file_path = os.path.join(RAG_UPLOAD_DIR, user_id, file.filename)

temp_base_path = os.path.join(RAG_UPLOAD_DIR, user_id)
os.makedirs(temp_base_path, exist_ok=True)
temp_file_path = os.path.join(RAG_UPLOAD_DIR, user_id, file.filename)
try:
async with aiofiles.open(temp_file_path, "wb") as temp_file:
chunk_size = 64 * 1024 # 64 KB
while content := await file.read(chunk_size):
await temp_file.write(content)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to save the uploaded file. Error: {str(e)}",
)

try:
async with aiofiles.open(temp_file_path, "wb") as temp_file:
chunk_size = 64 * 1024 # 64 KB
while content := await file.read(chunk_size):
await temp_file.write(content)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to save the uploaded file. Error: {str(e)}",
)
try:
loader, known_type, file_ext = get_loader(
file.filename, file.content_type, temp_file_path
)
data = loader.load()
finally:
try:
await aiofiles.os.remove(temp_file_path)
except Exception as e:
logger.info(f"Failed to remove temporary file: {str(e)}")

try:
loader, known_type, file_ext = get_loader(
file.filename, file.content_type, temp_file_path
)
data = loader.load()
result = await store_data_in_vector_db(
data=data, file_id=file_id, user_id=user_id, clean_content=file_ext == "pdf"
data=data, file_id=file_id, user_id=user_id, clean_content=False
)

if not result:
Expand All @@ -421,21 +445,17 @@ async def embed_file(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Error during file processing: {str(e)}",
)
finally:
try:
await aiofiles.os.remove(temp_file_path)
except Exception as e:
logger.info(f"Failed to remove temporary file: {str(e)}")

return {
"status": response_status,
"message": response_message,
"file_id": file_id,
"filename": file.filename,
"filename": file.filename if file else url,
"known_type": known_type,
}



@app.get("/documents/{id}/context")
async def load_document_context(id: str):
ids = [id]
Expand Down
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,8 @@ opencv-python-headless==4.9.0.80
pymongo==4.6.3
langchain-mongodb==0.1.3
cryptography==42.0.7
pytube==15.0.0
youtube-transcript-api==0.6.2
PyMuPDFb==1.24.3
PyMuPDF==1.24.5
arxiv==2.1.0