chore: reduce excessive logging (#3095)

badGarnet · web-flow · commit 809c7e515aad · 2024-05-24T14:58:47.000Z
- change some info level logging for per page processing into detail
level logging on trace logger
- replace the try block in `document_to_element_list` to use `getattr`
instead and add comment on the reason why sometimes `type` attribute may
not exist for an element
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.14.3-dev3
+## 0.14.3-dev4
 
 ### Enhancements
 
@@ -13,8 +13,9 @@
 * **Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call**.
 * **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml`
   to avoid text being dynamically injected into the XML document.
-
 * **Chromadb change from Add to Upsert using element_id to make idempotent**
+* **Reduce excessive logging** Change per-page OCR info-level logging into detail-level trace logging
+* **Replace try block in `document_to_element_list` for handling HTMLDocument** Use `getattr(element, "type", "")` to get the `type` attribute of an element when it exists. This is a more explicit way to handle the special case for HTML documents and prevents other kinds of `AttributeError` from being silenced by the try block
 
 ## 0.14.2
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.3-dev3"  # pragma: no cover
+__version__ = "0.14.3-dev4"  # pragma: no cover
diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py
@@ -537,6 +537,9 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
     }
 
 
+# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
+# unstructured.documents.html, which imports this module so we can't import the class for type
+# hints. Moreover, those two types of documents have different lists of attributes
 def document_to_element_list(
     document: "DocumentLayout",
     sortable: bool = False,
@@ -550,7 +553,7 @@ def document_to_element_list(
     starting_page_number: int = 1,
     **kwargs: Any,
 ) -> list[Element]:
-    """Converts a DocumentLayout object to a list of unstructured elements."""
+    """Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements."""
     elements: list[Element] = []
 
     num_pages = len(document.pages)
@@ -588,13 +591,16 @@ def document_to_element_list(
                     element.metadata.last_modified = last_modification_date
                 element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
                 element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
-                try:
-                    if (
-                        isinstance(element, Title) and element.metadata.category_depth is None
-                    ) and any(el.type in ["Headline", "Subheadline"] for el in page.elements):
-                        element.metadata.category_depth = 0
-                except AttributeError:
-                    logger.info("HTML element instance has no attribute type")
+                # FIXME: here the elements in a page can be either:
+                # 1. LayoutElement if the document is DocumentLayout (if the partition is on a
+                #   pdf/image);
+                # 2. Element if the document is HTMLDocument (if the partition is on an html file)
+                # this discrepancy exists because Element (defined in unstructured) and
+                # LayoutElement (defined in unstructured_inference) do not share the same attributes
+                if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
+                    getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
+                ):
+                    element.metadata.category_depth = 0
 
                 page_elements.append(element)
                 translation_mapping.append((layout_element, element))
diff --git a/unstructured/partition/utils/ocr_models/paddle_ocr.py b/unstructured/partition/utils/ocr_models/paddle_ocr.py
@@ -6,7 +6,7 @@
 from PIL import Image as PILImage
 
 from unstructured.documents.elements import ElementType
-from unstructured.logger import logger
+from unstructured.logger import logger, trace_logger
 from unstructured.partition.utils.constants import DEFAULT_PADDLE_LANG, Source
 from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
 from unstructured.utils import requires_dependencies
@@ -71,7 +71,7 @@ def get_layout_from_image(
     ) -> list[TextRegion]:
         """Get the OCR regions from image as a list of text regions with paddle."""
 
-        logger.info("Processing entire page OCR with paddle...")
+        trace_logger.detail("Processing entire page OCR with paddle...")
 
         # TODO(yuming): pass in language parameter once we
         # have the mapping for paddle lang code
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -10,7 +10,7 @@
 from PIL import Image as PILImage
 from unstructured_pytesseract import Output
 
-from unstructured.logger import logger
+from unstructured.logger import trace_logger
 from unstructured.partition.utils.config import env_config
 from unstructured.partition.utils.constants import (
     IMAGE_COLOR_DEPTH,
@@ -44,7 +44,7 @@ def get_layout_from_image(
     ) -> List[TextRegion]:
         """Get the OCR regions from image as a list of text regions with tesseract."""
 
-        logger.info("Processing entire page OCR with tesseract...")
+        trace_logger.detail("Processing entire page OCR with tesseract...")
         zoom = 1
         ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
             np.array(image),

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.14.3-dev3" # pragma: no cover`
	`1`	`+__version__ = "0.14.3-dev4" # pragma: no cover`