Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refine pdf parser, add time zone to userinfo #112

Merged
merged 1 commit into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/db/db_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ class User(DataBaseModel, UserMixin):
avatar = TextField(null=True, help_text="avatar base64 string")
language = CharField(max_length=32, null=True, help_text="English|Chinese", default="Chinese")
color_schema = CharField(max_length=32, null=True, help_text="Bright|Dark", default="Dark")
timezone = CharField(max_length=64, null=True, help_text="Timezone", default="UTC+8\tAsia/Shanghai")
last_login_time = DateTimeField(null=True)
is_authenticated = CharField(max_length=1, null=False, default="1")
is_active = CharField(max_length=1, null=False, default="1")
Expand Down
25 changes: 15 additions & 10 deletions deepdoc/parser/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,9 +313,19 @@ def start_with(b, txts):
while i < len(bxs) - 1:
b = bxs[i]
b_ = bxs[i + 1]
if b.get("layoutno", "0") != b_.get("layoutno", "1"):
if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure", "equation"]:
i += 1
continue
if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
# merge
bxs[i]["x1"] = b_["x1"]
bxs[i]["top"] = (b["top"] + b_["top"]) / 2
bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
bxs[i]["text"] += b_["text"]
bxs.pop(i + 1)
continue
i += 1
continue

dis_thr = 1
dis = b["x1"] - b_["x0"]
Expand Down Expand Up @@ -642,9 +652,9 @@ def nearest(tbls):

tk, tv = nearest(tables)
fk, fv = nearest(figures)
if min(tv, fv) > 2000:
i += 1
continue
#if min(tv, fv) > 2000:
# i += 1
# continue
if tv < fv:
tables[tk].insert(0, c)
logging.debug(
Expand Down Expand Up @@ -711,12 +721,7 @@ def cropout(bxs, ltype, poss):

# crop figure out and add caption
for k, bxs in figures.items():
txt = "\n".join(
[b["text"] for b in bxs
if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
and len(b["text"].strip()) >= 4
]
)
txt = "\n".join([b["text"] for b in bxs])
if not txt:
continue

Expand Down
4 changes: 2 additions & 2 deletions deepdoc/vision/layout_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def findLayout(ty):
continue

bxs[i]["layoutno"] = f"{ty}-{ii}"
bxs[i]["layout_type"] = lts_[ii]["type"]
bxs[i]["layout_type"] = lts_[ii]["type"] if lts_[ii]["type"]!="equation" else "figure"
i += 1

for lt in ["footer", "header", "reference", "figure caption",
Expand All @@ -105,7 +105,7 @@ def findLayout(ty):

# add box to figure layouts which has not text box
for i, lt in enumerate(
[lt for lt in lts if lt["type"] == "figure"]):
[lt for lt in lts if lt["type"] in ["figure","equation"]]):
if lt.get("visited"):
continue
lt = deepcopy(lt)
Expand Down
1 change: 0 additions & 1 deletion deepdoc/vision/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import numpy as np
import onnxruntime as ort

from api.utils.file_utils import get_project_base_directory
from .postprocess import build_post_process
from rag.settings import cron_logger

Expand Down
12 changes: 6 additions & 6 deletions deepdoc/vision/recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,18 +276,18 @@ def find_horizontally_tightest_fit(box, boxes):
def find_overlapped_with_threashold(box, boxes, thr=0.3):
if not boxes:
return
max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
max_overlapped_i, max_overlapped, _max_overlapped = None, thr, 0
s, e = 0, len(boxes)
for i in range(s, e):
ov = Recognizer.overlapped_area(box, boxes[i])
_ov = Recognizer.overlapped_area(boxes[i], box)
if (ov, _ov) < (max_overlaped, _max_overlaped):
if (ov, _ov) < (max_overlapped, _max_overlapped):
continue
max_overlaped_i = i
max_overlaped = ov
_max_overlaped = _ov
max_overlapped_i = i
max_overlapped = ov
_max_overlapped = _ov

return max_overlaped_i
return max_overlapped_i

def preprocess(self, image_list):
inputs = []
Expand Down
4 changes: 2 additions & 2 deletions rag/app/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
d = copy.deepcopy(doc)
if pdf_parser:
d["image"], poss = pdf_parser.crop(ck, need_position=True)
add_positions(d, poss)
add_positions(d, poss, from_page)
ck = pdf_parser.remove_tag(ck)
tokenize(d, ck, eng)
res.append(d)
Expand All @@ -112,7 +112,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
import sys


def dummy(a, b):
def dummy(prog=None, msg=""):
pass


Expand Down
4 changes: 2 additions & 2 deletions rag/nlp/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ def search(self, req, idxnm, emb_mdl=None):
)
else:
s = s.sort(
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
{"page_num_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg", "numeric_type": "double"}},
{"create_time": {"order": "desc", "unmapped_type": "date"}},
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
)
Expand Down