chore: deprecate dataframe in AzureOCRDocumentConverter #8885

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
43 changes: 8 additions & 35 deletions haystack/components/converters/azure.py
@@ -3,14 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0

 import copy
-import hashlib
 import os
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Literal, Optional, Union

 import networkx as nx
-import pandas as pd

 from haystack import Document, component, default_from_dict, default_to_dict, logging
 from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
@@ -24,6 +22,9 @@
 from azure.ai.formrecognizer import AnalyzeResult, DocumentAnalysisClient, DocumentLine, DocumentParagraph
 from azure.core.credentials import AzureKeyCredential

+with LazyImport(message="Run 'pip install pandas'") as pandas_import:
+    import pandas as pd
+

 @component
 class AzureOCRDocumentConverter:
@@ -90,6 +91,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
             If False, only the file name is stored.
         """
         azure_import.check()
+        pandas_import.check()

         self.document_analysis_client = DocumentAnalysisClient(
             endpoint=endpoint, credential=AzureKeyCredential(api_key.resolve_value() or "")
@@ -303,13 +305,10 @@ def _convert_tables(self, result: "AnalyzeResult", meta: Optional[Dict[str, Any]
             if table.bounding_regions:
                 table_meta["page"] = table.bounding_regions[0].page_number

-            table_df = pd.DataFrame(columns=table_list[0], data=table_list[1:])
-
-            # Use custom ID for tables, as columns might not be unique and thus failing in the default ID generation
-            pd_hashes = self._hash_dataframe(table_df)
-            data = f"{pd_hashes}{table_meta}"
-            doc_id = hashlib.sha256(data.encode()).hexdigest()
-            converted_tables.append(Document(id=doc_id, dataframe=table_df, meta=table_meta))
+            # Convert table to CSV
+            table_df = pd.DataFrame(data=table_list)
+            table_content = table_df.to_csv(header=False, index=False, lineterminator="\n")
+            converted_tables.append(Document(content=table_content, meta=table_meta))

         return converted_tables

@@ -479,29 +478,3 @@ def _check_if_in_table(
                 in_table = True
                 break
         return in_table
-
-    def _hash_dataframe(self, df: pd.DataFrame, desired_samples=5, hash_length=4) -> str:
-        """
-        Returns a hash of the DataFrame content.
-
-        The hash is based on the content of the DataFrame.
-        :param df: The DataFrame to hash.
-        :param desired_samples: The desired number of samples to hash.
-        :param hash_length: The length of the hash for each sample.
-
-        :returns: A hash of the DataFrame content.
-        """
-        # take adaptive sample of rows to hash because we can have very large dataframes
-        hasher = hashlib.md5()
-        total_rows = len(df)
-        # sample rate based on DataFrame size and desired number of samples
-        sample_rate = max(1, total_rows // desired_samples)
-
-        hashes = pd.util.hash_pandas_object(df, index=True)
-        sampled_hashes = hashes[::sample_rate]
-
-        for hash_value in sampled_hashes:
-            partial_hash = str(hash_value)[:hash_length].encode("utf-8")
-            hasher.update(partial_hash)
-
-        return hasher.hexdigest()
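
For illustration, a minimal sketch of what the new table handling in _convert_tables produces, assuming a small hand-written table_list (in the component it is built from the cells of the Azure AnalyzeResult):

import pandas as pd

# Hypothetical table cells; in the converter these come from the Azure AnalyzeResult.
table_list = [
    ["", "Column 1", "Column 2", "Column 3"],
    ["A", "324", "55 million units", "2022"],
]

# Mirrors the new code path: header=False and index=False mean every row,
# including the original header row, is serialized as plain CSV data.
table_df = pd.DataFrame(data=table_list)
table_content = table_df.to_csv(header=False, index=False, lineterminator="\n")

print(table_content)
# ,Column 1,Column 2,Column 3
# A,324,55 million units,2022

Since the CSV string is ordinary Document content, duplicate column names no longer affect ID generation, which is why the custom hash-based ID and _hash_dataframe are removed.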
72 changes: 29 additions & 43 deletions test/components/converters/test_azure_ocr_doc_converter.py
@@ -8,7 +8,6 @@
 from typing import Literal
 from unittest.mock import patch

-import pandas as pd
 import pytest
 from azure.ai.formrecognizer import AnalyzeResult

@@ -148,11 +147,15 @@ def result(self) -> AnalyzeResult:
         docs = out["documents"]
         assert len(docs) == 2
         # Checking the table doc extracted
-        assert docs[0].content_type == "table"
-        assert docs[0].dataframe.shape[0] == 4  # number of rows
-        assert docs[0].dataframe.shape[1] == 4  # number of columns
-        assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
-        assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
+        assert (
+            docs[0].content
+            == """,Column 1,Column 2,Column 3
+A,324,55 million units,2022
+B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
+C,23.53%,A short string.,
+D,$54.35,$6345.,
+"""
+        )
         assert (
             docs[0].meta["preceding_context"] == "specification. These proprietary technologies are not "
             "standardized and their\nspecification is published only on "
@@ -191,13 +194,21 @@ def result(self) -> AnalyzeResult:
         docs = out["documents"]
         assert len(docs) == 2
         # Checking the table doc extracted that is missing bounding info
-        assert docs[0].content_type == "table"
-        assert docs[0].dataframe.shape[0] == 4  # number of rows
-        assert docs[0].dataframe.shape[1] == 4  # number of columns
-        assert list(docs[0].dataframe.columns) == ["", "Column 1", "Column 2", "Column 3"]
-        assert list(docs[0].dataframe.iloc[3]) == ["D", "$54.35", "$6345.", ""]
-        # TODO below assert fails
-        # assert docs[0].meta["preceding_context"] == ""
+        assert (
+            docs[0].content
+            == """,Column 1,Column 2,Column 3
+A,324,55 million units,2022
+B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
+C,23.53%,A short string.,
+D,$54.35,$6345.,
+"""
+        )
+
+        assert docs[0].meta["preceding_context"] == (
+            "specification. These proprietary technologies are not standardized and their\nspecification is published "
+            "only on Adobe's website. Many of them are also not\nsupported by popular third-party implementations of "
+            "PDF."
+        )
         assert docs[0].meta["following_context"] == ""

     @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
     def test_azure_converter_with_multicolumn_header_table(self, mock_resolve_value, test_files_path) -> None:
@@ -213,20 +224,17 @@ def result(self) -> AnalyzeResult:
             azure_mock.return_value = MockPoller()
             ocr_node = AzureOCRDocumentConverter(endpoint="")

-            # TODO: fails because of non-unique column names, azure_sample_pdf_3.json has duplicate column names
             out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_3.pdf"])

             docs = out["documents"]
             assert len(docs) == 2
-            assert docs[0].content_type == "table"
-            assert docs[0].dataframe.shape[0] == 1  # number of rows
-            assert docs[0].dataframe.shape[1] == 3  # number of columns
-            assert list(docs[0].dataframe.columns) == ["This is a subheader", "This is a subheader", "This is a subheader"]
-            assert list(docs[0].dataframe.iloc[0]) == ["Value 1", "Value 2", "Val 3"]
+            assert docs[0].content == (
+                "This is a subheader,This is a subheader,This is a subheader\nValue 1,Value 2,Val 3\n"
+            )
             assert (
                 docs[0].meta["preceding_context"]
                 == "Table 1. This is an example table with two multicolumn headers\nHeader 1"
             )
             assert docs[0].meta["following_context"] == ""
             assert docs[0].meta["page"] == 1

     @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
     def test_table_pdf_with_non_empty_meta(self, mock_resolve_value, test_files_path) -> None:
@@ -244,7 +252,6 @@ def result(self) -> AnalyzeResult:
             out = ocr_node.run(sources=[test_files_path / "pdf" / "sample_pdf_1.pdf"], meta=[{"test": "value_1"}])

             docs = out["documents"]
-            # TODO assert below changed from the original test
             assert docs[1].meta["test"] == "value_1"

     @pytest.mark.integration
@@ -307,27 +314,6 @@ def test_run_with_store_full_path_false(self, test_files_path):
         assert "Sample Docx File" in documents[0].content
         assert documents[0].meta["file_path"] == "sample_docx.docx"

-    @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
-    def test_hashing_dataframe(self, mock_resolve_value):
-        mock_resolve_value.return_value = "test_api_key"
-        component = AzureOCRDocumentConverter(endpoint="")
-        hash_length = 32
-
-        df = pd.DataFrame({"A": [1, 2, 3]})
-        hash_string_1 = component._hash_dataframe(df)
-        assert len(hash_string_1) == hash_length
-
-        df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
-        hash_string_2 = component._hash_dataframe(df)
-        assert len(hash_string_2) == hash_length
-
-        df = pd.DataFrame({"B": [4, 5, 6], "A": [1, 2, 3], "D": [7, 8, 9]})
-        hash_string_3 = component._hash_dataframe(df)
-        assert len(hash_string_3) == hash_length
-
-        # doesn't mean much, more for sanity check
-        assert hash_string_1 != hash_string_2 != hash_string_3
-
     @patch("haystack.utils.auth.EnvVarSecret.resolve_value")
     def test_meta_from_byte_stream(self, mock_resolve_value, test_files_path) -> None:
         mock_resolve_value.return_value = "test_api_key"
@@ -341,8 +327,8 @@ def result(self) -> AnalyzeResult:
         with patch("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document") as azure_mock:
             azure_mock.return_value = MockPoller()
             ocr_node = AzureOCRDocumentConverter(endpoint="")
-            bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
-            byte_stream = ByteStream(data=bytes, meta={"test_from": "byte_stream"})
+            bytes_ = (test_files_path / "pdf" / "sample_pdf_1.pdf").read_bytes()
+            byte_stream = ByteStream(data=bytes_, meta={"test_from": "byte_stream"})
             out = ocr_node.run(sources=[byte_stream], meta=[{"test": "value_1"}])

             docs = out["documents"]
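
As a usage sketch (not part of this PR), a consumer could load the CSV stored in Document.content back into a DataFrame; keep_default_na=False is one way to keep empty cells as empty strings rather than NaN:

import io

import pandas as pd

# CSV as stored in Document.content by the converter above.
csv_content = ',Column 1,Column 2,Column 3\nA,324,55 million units,2022\n'

# header=None because the converter wrote the table with header=False,
# so the original header row arrives as the first data row.
df = pd.read_csv(io.StringIO(csv_content), header=None, keep_default_na=False)
print(df.shape)  # (2, 4)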