From d2298b8d7b9be454703eae985e1220a2f12b8715 Mon Sep 17 00:00:00 2001 From: kuschzzp <38914005+kuschzzp@users.noreply.github.com> Date: Fri, 8 Nov 2024 09:21:42 +0800 Subject: [PATCH] Fix:#3230 When parsing a docx file using the Book parsing method, to_page is always -1, resulting in a block count of 0 even if parsing is successful (#3249) ### What problem does this PR solve? When parsing a docx file using the Book parsing method, to_page is always -1, resulting in a block count of 0 even if parsing is successful Fix:#3230 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Co-authored-by: Kevin Hu --- api/db/db_models.py | 2 +- deepdoc/parser/docx_parser.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/api/db/db_models.py b/api/db/db_models.py index fbf0f3cde10..1ce8b51283d 100644 --- a/api/db/db_models.py +++ b/api/db/db_models.py @@ -840,7 +840,7 @@ class Task(DataBaseModel): doc_id = CharField(max_length=32, null=False, index=True) from_page = IntegerField(default=0) - to_page = IntegerField(default=-1) + to_page = IntegerField(default=100000000) begin_at = DateTimeField(null=True, index=True) process_duation = FloatField(default=0) diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index 57804109938..1c1c14d3041 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -110,7 +110,7 @@ def blockType(b): return lines return ["\n".join(lines)] - def __call__(self, fnm, from_page=0, to_page=100000): + def __call__(self, fnm, from_page=0, to_page=100000000): self.doc = Document(fnm) if isinstance( fnm, str) else Document(BytesIO(fnm)) pn = 0 # parsed page @@ -130,7 +130,7 @@ def __call__(self, fnm, from_page=0, to_page=100000): if 'lastRenderedPageBreak' in run._element.xml: pn += 1 - secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph + secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph tbls = [self.__extract_table_content(tb) for tb in self.doc.tables] return secs, tbls