Skip to content

Commit 809c7e5

Browse files
authored
chore: reduce excessive logging (#3095)
- change some info-level logging for per-page processing into detail-level logging on the trace logger - replace the try block in `document_to_element_list` with `getattr` and add a comment explaining why the `type` attribute may sometimes not exist for an element
1 parent 26d403d commit 809c7e5

File tree

5 files changed

+22
-15
lines changed

5 files changed

+22
-15
lines changed

CHANGELOG.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.14.3-dev3
1+
## 0.14.3-dev4
22

33
### Enhancements
44

@@ -13,8 +13,9 @@
1313
* **Add the missing `form_extraction_skip_tables` argument to the `partition_pdf_or_image` call**.
1414
* **Turn off XML resolve entities** Sets `resolve_entities=False` for XML parsing with `lxml`
1515
to avoid text being dynamically injected into the XML document.
16-
1716
* **Chromadb change from Add to Upsert using element_id to make idempotent**
17+
* **Reduce excessive logging** Change per-page OCR info-level logging into detail-level trace logging
18+
* **Replace try block in `document_to_element_list` for handling HTMLDocument** Use `getattr(element, "type", "")` to get the `type` attribute of an element when it exists. This is a more explicit way to handle the special case for HTML documents, and it prevents other kinds of attribute errors from being silenced by the try block
1819

1920
## 0.14.2
2021

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.3-dev3" # pragma: no cover
1+
__version__ = "0.14.3-dev4" # pragma: no cover

unstructured/partition/common.py

+14-8
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,9 @@ def _get_page_image_metadata(page: PageLayout) -> dict[str, Any]:
537537
}
538538

539539

540+
# FIXME: document here can be either DocumentLayout or HTMLDocument; HTMLDocument is defined in
541+
# unstructured.documents.html, which imports this module so we can't import the class for type
542+
# hints. Moreover, those two types of documents have different lists of attributes
540543
def document_to_element_list(
541544
document: "DocumentLayout",
542545
sortable: bool = False,
@@ -550,7 +553,7 @@ def document_to_element_list(
550553
starting_page_number: int = 1,
551554
**kwargs: Any,
552555
) -> list[Element]:
553-
"""Converts a DocumentLayout object to a list of unstructured elements."""
556+
"""Converts a DocumentLayout or HTMLDocument object to a list of unstructured elements."""
554557
elements: list[Element] = []
555558

556559
num_pages = len(document.pages)
@@ -588,13 +591,16 @@ def document_to_element_list(
588591
element.metadata.last_modified = last_modification_date
589592
element.metadata.text_as_html = getattr(layout_element, "text_as_html", None)
590593
element.metadata.table_as_cells = getattr(layout_element, "table_as_cells", None)
591-
try:
592-
if (
593-
isinstance(element, Title) and element.metadata.category_depth is None
594-
) and any(el.type in ["Headline", "Subheadline"] for el in page.elements):
595-
element.metadata.category_depth = 0
596-
except AttributeError:
597-
logger.info("HTML element instance has no attribute type")
594+
# FIXME: here the elements in a page can be either:
595+
# 1. LayoutElement if the document is DocumentLayout (if the partition is on a
596+
# pdf/image);
597+
# 2. Element if the document is HTMLDocument (if the partition is on an html file)
598+
# this discrepancy exists because the Element class defined in unstructured and LayoutElement
599+
# class defined in unstructured_inference do not have the same list of attributes
600+
if (isinstance(element, Title) and element.metadata.category_depth is None) and any(
601+
getattr(el, "type", "") in ["Headline", "Subheadline"] for el in page.elements
602+
):
603+
element.metadata.category_depth = 0
598604

599605
page_elements.append(element)
600606
translation_mapping.append((layout_element, element))

unstructured/partition/utils/ocr_models/paddle_ocr.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from PIL import Image as PILImage
77

88
from unstructured.documents.elements import ElementType
9-
from unstructured.logger import logger
9+
from unstructured.logger import logger, trace_logger
1010
from unstructured.partition.utils.constants import DEFAULT_PADDLE_LANG, Source
1111
from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
1212
from unstructured.utils import requires_dependencies
@@ -71,7 +71,7 @@ def get_layout_from_image(
7171
) -> list[TextRegion]:
7272
"""Get the OCR regions from image as a list of text regions with paddle."""
7373

74-
logger.info("Processing entire page OCR with paddle...")
74+
trace_logger.detail("Processing entire page OCR with paddle...")
7575

7676
# TODO(yuming): pass in language parameter once we
7777
# have the mapping for paddle lang code

unstructured/partition/utils/ocr_models/tesseract_ocr.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from PIL import Image as PILImage
1111
from unstructured_pytesseract import Output
1212

13-
from unstructured.logger import logger
13+
from unstructured.logger import trace_logger
1414
from unstructured.partition.utils.config import env_config
1515
from unstructured.partition.utils.constants import (
1616
IMAGE_COLOR_DEPTH,
@@ -44,7 +44,7 @@ def get_layout_from_image(
4444
) -> List[TextRegion]:
4545
"""Get the OCR regions from image as a list of text regions with tesseract."""
4646

47-
logger.info("Processing entire page OCR with tesseract...")
47+
trace_logger.detail("Processing entire page OCR with tesseract...")
4848
zoom = 1
4949
ocr_df: pd.DataFrame = unstructured_pytesseract.image_to_data(
5050
np.array(image),

0 commit comments

Comments
 (0)