Fix: infiniflow#3230 When parsing a docx file using the Book parsing method, to_page is always -1, resulting in a block count of 0 even if parsing is successful (infiniflow#3249)

### What problem does this PR solve?

When parsing a docx file using the Book parsing method, to_page is always -1, resulting in a block count of 0 even if parsing is successful.

Fix: infiniflow#3230

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
jhaiq · Nov 30, 2024 · d2298b8 · d2298b8
1 parent 5355ae5
commit d2298b8
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 3 deletions.
diff --git a/api/db/db_models.py b/api/db/db_models.py
@@ -840,7 +840,7 @@ class Task(DataBaseModel):
     doc_id = CharField(max_length=32, null=False, index=True)
     from_page = IntegerField(default=0)
 
-    to_page = IntegerField(default=-1)
+    to_page = IntegerField(default=100000000)
 
     begin_at = DateTimeField(null=True, index=True)
     process_duation = FloatField(default=0)

diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py
@@ -110,7 +110,7 @@ def blockType(b):
             return lines
         return ["\n".join(lines)]
 
-    def __call__(self, fnm, from_page=0, to_page=100000):
+    def __call__(self, fnm, from_page=0, to_page=100000000):
         self.doc = Document(fnm) if isinstance(
             fnm, str) else Document(BytesIO(fnm))
         pn = 0 # parsed page
@@ -130,7 +130,7 @@ def __call__(self, fnm, from_page=0, to_page=100000):
                 if 'lastRenderedPageBreak' in run._element.xml:
                     pn += 1
 
-            secs.append(("".join(runs_within_single_paragraph), p.style.name)) # then concat run.text as part of the paragraph
+            secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
 
         tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
         return secs, tbls