Add coco staging brick to unstructured base (#2180)

Unstructured-IO · Nov 29, 2023 · 341f0f4 · 341f0f4
1 parent c028a14
commit 341f0f4
Show file tree

Hide file tree

Showing 4 changed files with 199 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.11.1-dev5
+## 0.11.1
 
 ### Enhancements
 * **Use `pikepdf` to repair invalid PDF structure** for PDFminer when we see error `PSSyntaxError` when PDFminer opens the document and creates the PDFminer pages object or processes a single PDF page.
@@ -7,6 +7,7 @@
 
 ### Features
 
+* **Staging Brick for Coco Format** Adds a staging brick that converts a list of Elements into COCO format.
 * **Adds HubSpot connector** Adds connector to retrieve call, communications, emails, notes, products and tickets from HubSpot
 
 ### Fixes

diff --git a/test_unstructured/staging/test_base_staging.py b/test_unstructured/staging/test_base_staging.py
@@ -292,3 +292,104 @@ def test_filter_element_types_with_exclude_and_include_element_type(
             exclude_element_types=element_types,
             include_element_types=element_types,
         )
+
+
+def test_convert_to_coco():
+    """Check the COCO bbox and area for elements with and without coordinates."""
+
+    def make_element(coordinates=None):
+        # Build a fresh, fully populated element on every call so the two cases
+        # differ only in whether coordinates are present.
+        return Text(
+            text="some text",
+            element_id="123",
+            detection_origin="some origin",
+            embeddings=[1.1, 2.2, 3.3, 4.4],
+            metadata=ElementMetadata(
+                coordinates=coordinates,
+                data_source=DataSourceMetadata(
+                    url="http://mysite.com",
+                    version="123",
+                    record_locator={"some": "data", "value": 3},
+                    date_created="then",
+                    date_processed="now",
+                    date_modified="before",
+                    permissions_data=[{"data": 1}, {"data": 2}],
+                ),
+                filename="filename",
+                file_directory="file_directory",
+                last_modified="last_modified",
+                filetype="filetype",
+                attached_to_filename="attached_to_filename",
+                parent_id="parent_id",
+                category_depth=1,
+                image_path="image_path",
+                languages=["eng", "spa"],
+                page_number=1,
+                page_name="page_name",
+                url="url",
+                link_urls=["links", "url"],
+                link_texts=["links", "texts"],
+                links=[Link(text="text", url="url", start_index=1)],
+                sent_from=["sent", "from"],
+                sent_to=["sent", "to"],
+                subject="subject",
+                section="section",
+                header_footer_type="header_footer_type",
+                emphasized_text_contents=["emphasized", "text", "contents"],
+                emphasized_text_tags=["emphasized", "text", "tags"],
+                text_as_html="text_as_html",
+                regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
+                is_continuation=True,
+                detection_class_prob=0.5,
+            ),
+        )
+
+    elements = [
+        make_element(
+            coordinates=CoordinatesMetadata(
+                points=((1, 2), (1, 4), (3, 4), (3, 2)),
+                system=CoordinateSystem(width=12.3, height=99.4),
+            ),
+        )
+    ]
+    missing_elements = [make_element()]
+
+    full_coco = base.convert_to_coco(elements)
+    limited_coco = base.convert_to_coco(missing_elements)
+    full_annotation = full_coco["annotations"][0]
+    limited_annotation = limited_coco["annotations"][0]
+
+    # The box spans x 1..3 and y 2..4, so it is 2 wide and 2 high.
+    assert full_annotation["bbox"] == [1.0, 2.0, 2.0, 2.0]
+    assert full_annotation["area"] == 4.0
+    # Without coordinates the converter falls back to an empty bbox and no area.
+    assert limited_annotation["bbox"] == []
+    assert limited_annotation["area"] is None
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.1-dev5"  # pragma: no cover
+__version__ = "0.11.1"  # pragma: no cover
diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py
@@ -2,7 +2,8 @@
 import io
 import json
 from copy import deepcopy
-from typing import Any, Dict, List, Optional
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
 
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import (
@@ -337,3 +338,96 @@ def filter_element_types(
         return filtered_elements
 
     return elements
+
+
+def convert_to_coco(
+    elements: List[Element],
+    dataset_description: Optional[str] = None,
+    dataset_version: str = "1.0",
+    contributors: Tuple[str, ...] = ("Unstructured Developers",),
+) -> Dict[str, Any]:
+    """Convert a list of elements into a COCO-style dataset dictionary.
+
+    Args:
+        elements: Elements to convert; they are serialized with ``convert_to_dict``.
+        dataset_description: Text for ``info["description"]``. When empty or None, a
+            default description containing today's date is used.
+        dataset_version: Value stored in ``info["version"]``.
+        contributors: Contributor names, comma-joined into ``info["contributors"]``.
+            A single bare string is treated as one contributor.
+
+    Returns:
+        A dict with the keys ``"info"``, ``"images"``, ``"categories"`` and
+        ``"annotations"``. ``images`` holds one entry per distinct combination of
+        layout size, file directory, file name and page number, with ids starting
+        at 1. Each annotation carries the element id, the id of its image, its
+        category id (None when the element type is not a known category), a
+        ``[x, y, width, height]`` bbox and the bbox area. Elements without
+        coordinates get ``bbox == []`` and ``area is None``.
+    """
+    # Take a single timestamp so the description, year and creation date cannot
+    # disagree when the call straddles midnight or a year boundary.
+    now = datetime.now()
+    # A bare string would otherwise be joined character by character.
+    if isinstance(contributors, str):
+        contributors = (contributors,)
+
+    coco_dataset: Dict[str, Any] = {}
+    # Handle Info
+    coco_dataset["info"] = {
+        "description": (
+            dataset_description or f"Unstructured COCO Dataset {now.strftime('%Y-%m-%d')}"
+        ),
+        "version": dataset_version,
+        "year": now.year,
+        "contributors": ",".join(contributors),
+        "date_created": now.date().isoformat(),
+    }
+    elements_dict = convert_to_dict(elements)
+
+    # Handle Images: keep the first occurrence of each distinct image record and
+    # remember which image every element belongs to.
+    image_ids: Dict[Tuple[Any, ...], int] = {}
+    images: List[Dict[str, Any]] = []
+    element_image_ids: List[int] = []
+    for el in elements_dict:
+        image = _coco_image_record(el["metadata"])
+        image_key = tuple(sorted(image.items()))
+        if image_key not in image_ids:
+            image_ids[image_key] = len(images) + 1
+            images.append({**image, "id": image_ids[image_key]})
+        element_image_ids.append(image_ids[image_key])
+    coco_dataset["images"] = images
+
+    # Handle Categories
+    categories = [
+        {"id": index + 1, "name": name}
+        for index, name in enumerate(sorted(set(TYPE_TO_TEXT_ELEMENT_MAP.keys())))
+    ]
+    coco_dataset["categories"] = categories
+    # One dict lookup per element instead of rescanning the category list.
+    category_ids = {category["name"]: category["id"] for category in categories}
+
+    # Handle Annotations
+    annotations = []
+    for el, image_id in zip(elements_dict, element_image_ids):
+        bbox = _coco_bbox(el["metadata"].get("coordinates"))
+        annotations.append(
+            {
+                "id": el["element_id"],
+                "image_id": image_id,
+                # Element types outside the category list map to None rather
+                # than aborting the whole conversion.
+                "category_id": category_ids.get(el["type"]),
+                "bbox": bbox,
+                "area": bbox[2] * bbox[3] if bbox else None,
+            }
+        )
+    coco_dataset["annotations"] = annotations
+    return coco_dataset
+
+
+def _coco_image_record(metadata: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the COCO image entry (without its id) from one element's metadata dict."""
+    coordinates = metadata.get("coordinates")
+    return {
+        "width": coordinates["layout_width"] if coordinates else None,
+        "height": coordinates["layout_height"] if coordinates else None,
+        "file_directory": metadata.get("file_directory", ""),
+        "file_name": metadata.get("filename", ""),
+        "page_number": metadata.get("page_number", ""),
+    }
+
+
+def _coco_bbox(coordinates: Optional[Dict[str, Any]]) -> List[float]:
+    """Return ``[x, y, width, height]`` for a coordinates dict, or ``[]`` when it is missing."""
+    if not coordinates:
+        return []
+    points = coordinates["points"]
+    # Assumes points[0] is the box origin, points[1] shares its x and points[2] is
+    # the opposite corner -- TODO confirm against the coordinates producer.
+    x, y = points[0][0], points[0][1]
+    return [
+        float(x),
+        float(y),
+        float(abs(x - points[2][0])),
+        float(abs(y - points[1][1])),
+    ]
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.11.1-dev5" # pragma: no cover
		__version__ = "0.11.1" # pragma: no cover