Skip to content

Commit

Permalink
Add unidecode and string manipulation for filename
Browse files Browse the repository at this point in the history
  • Loading branch information
StanGirard committed Apr 17, 2024
1 parent cb0051f commit 8921eb9
Show file tree
Hide file tree
Showing 10 changed files with 98 additions and 13 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ ragas = "*"
datasets = "*"
pytest-dotenv = "*"
fpdf2 = "*"
unidecode = "*"

[dev-packages]
black = "*"
Expand Down
21 changes: 15 additions & 6 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions backend/modules/assistant/ito/ito.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import random
import re
import string
from abc import abstractmethod
from io import BytesIO
from tempfile import NamedTemporaryFile
Expand All @@ -16,6 +18,7 @@
from modules.user.entity.user_identity import UserIdentity
from packages.emails.send_email import send_email
from pydantic import BaseModel
from unidecode import unidecode

logger = get_logger(__name__)

Expand Down Expand Up @@ -129,6 +132,7 @@ async def create_and_upload_processed_file(
self, processed_content: str, original_filename: str, file_description: str
) -> dict:
"""Handles creation and uploading of the processed file."""
# remove any special characters from the filename that aren't http safe

new_filename = (
original_filename.split(".")[0]
Expand All @@ -138,6 +142,11 @@ async def create_and_upload_processed_file(
+ str(random.randint(1000, 9999))
+ ".pdf"
)
new_filename = unidecode(new_filename)
new_filename = re.sub(
"[^{}0-9a-zA-Z]".format(re.escape(string.punctuation)), "", new_filename
)

self.generate_pdf(
new_filename,
f"{file_description} of {original_filename}",
Expand Down
12 changes: 12 additions & 0 deletions backend/modules/assistant/ito/utils/Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
unidecode = "*"

[dev-packages]

[requires]
python_version = "3.12"
30 changes: 30 additions & 0 deletions backend/modules/assistant/ito/utils/Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file not shown.
Binary file not shown.
Binary file not shown.
31 changes: 27 additions & 4 deletions backend/modules/assistant/ito/utils/pdf_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,27 @@ class PDFGenerator(FPDF):
def __init__(self, pdf_model: PDFModel, *args, **kwargs):
    """Create the generator and register the DejaVu Sans Condensed faces.

    Args:
        pdf_model: Model whose ``title`` and ``content`` are rendered by
            the header and body methods.
        *args: Forwarded unchanged to ``FPDF.__init__``.
        **kwargs: Forwarded unchanged to ``FPDF.__init__``.
    """
    super().__init__(*args, **kwargs)
    self.pdf_model = pdf_model

    # The TrueType files live in a "font" directory next to this module.
    font_dir = os.path.join(os.path.dirname(__file__), "font")

    # Style code -> font file. All three faces are registered the same way;
    # the ``uni`` argument is intentionally not passed because fpdf2 treats
    # it as deprecated and unused (passing it only emits a warning).
    faces = (
        ("", "DejaVuSansCondensed.ttf"),
        ("B", "DejaVuSansCondensed-Bold.ttf"),
        ("I", "DejaVuSansCondensed-Oblique.ttf"),
    )
    for style, filename in faces:
        self.add_font("DejaVu", style, os.path.join(font_dir, filename))

def header(self):
# Logo
Expand All @@ -23,22 +44,24 @@ def header(self):
self.set_xy(20, 15)

# Title
self.set_font("Arial", "B", 12)
self.set_font("DejaVu", "B", 12)
self.multi_cell(0, 10, self.pdf_model.title, align="C")
self.ln(5) # Padding after title

def footer(self):
    """Draw the page footer: an attribution line followed by two links."""
    self.set_y(-15)

    # Attribution, grey italic. Only the DejaVu font is selected; an earlier
    # "helvetica" selection was overridden before any text was drawn.
    self.set_font("DejaVu", "I", 8)
    self.set_text_color(169, 169, 169)
    self.cell(80, 10, "Generated by Quivr", 0, 0, "C")

    # Links, blue and underlined.
    self.set_font("DejaVu", "U", 8)
    self.set_text_color(0, 0, 255)
    self.cell(30, 10, "quivr.app", 0, 0, "C", link="https://quivr.app")
    self.cell(0, 10, "Github", 0, 1, "C", link="https://github.com/quivrhq/quivr")

def chapter_body(self):
    """Write ``pdf_model.content`` as the body text with markdown enabled."""
    # Only the DejaVu font is selected; an earlier "Arial" selection was
    # overridden before any text was drawn.
    self.set_font("DejaVu", "", 12)
    self.multi_cell(0, 10, self.pdf_model.content, markdown=True)
    self.ln()

Expand Down
7 changes: 4 additions & 3 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ langchain-core==0.1.43; python_version < '4.0' and python_full_version >= '3.8.1
langchain-openai==0.1.3; python_version < '4.0' and python_full_version >= '3.8.1'
langchain-text-splitters==0.0.1; python_version < '4.0' and python_full_version >= '3.8.1'
langdetect==1.0.9
langfuse==2.26.2; python_version < '4.0' and python_full_version >= '3.8.1'
langfuse==2.26.3; python_version < '4.0' and python_full_version >= '3.8.1'
langsmith==0.1.48; python_version < '4.0' and python_full_version >= '3.8.1'
layoutparser[layoutmodels,tesseract]==0.3.4; python_version >= '3.6'
litellm==1.35.8; python_version not in '2.7, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7' and python_version >= '3.8'
Expand Down Expand Up @@ -134,7 +134,7 @@ networkx==3.3
newspaper3k==0.2.8
nltk==3.8.1; python_version >= '3.7'
nodeenv==1.8.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6'
numpy==1.26.4; python_version >= '3.10'
numpy==1.26.4; python_version >= '3.9'
olefile==0.47; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'
omegaconf==2.3.0; python_version >= '3.6'
onnx==1.16.0
Expand Down Expand Up @@ -241,11 +241,12 @@ transformers==4.39.3; python_full_version >= '3.8.0'
typing-extensions==4.11.0; python_version >= '3.8'
typing-inspect==0.9.0
tzdata==2024.1; python_version >= '2'
unidecode==1.3.8; python_version >= '3.5'
unstructured[all-docs]==0.13.2; python_version < '3.12' and python_full_version >= '3.9.0'
unstructured-client==0.18.0; python_version >= '3.8'
unstructured-inference==0.7.25
unstructured.pytesseract==0.3.12
urllib3==2.2.1; python_version >= '3.10'
urllib3==2.2.1; python_version >= '3.8'
uvicorn==0.29.0; python_version >= '3.8'
vine==5.1.0; python_version >= '3.6'
watchdog==4.0.0; python_version >= '3.8'
Expand Down

0 comments on commit 8921eb9

Please sign in to comment.