Skip to content

Commit

Permalink
Add coco staging brick to unstructured base (#2180)
Browse files Browse the repository at this point in the history
  • Loading branch information
pravin-unstructured authored Nov 29, 2023
1 parent c028a14 commit 341f0f4
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 3 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.11.1-dev5
## 0.11.1

### Enhancements
* **Use `pikepdf` to repair invalid PDF structure** for PDFminer when we see error `PSSyntaxError` when PDFminer opens the document and creates the PDFminer pages object or processes a single PDF page.
Expand All @@ -7,6 +7,7 @@

### Features

* **Staging Brick for COCO Format** Adds a staging brick that converts a list of Elements into the COCO dataset format.
* **Adds HubSpot connector** Adds a connector to retrieve calls, communications, emails, notes, products and tickets from HubSpot.

### Fixes
Expand Down
101 changes: 101 additions & 0 deletions test_unstructured/staging/test_base_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,3 +292,104 @@ def test_filter_element_types_with_exclude_and_include_element_type(
exclude_element_types=element_types,
include_element_types=element_types,
)


def test_convert_to_coco():
    """Check the geometry `convert_to_coco` emits with and without coordinates.

    Two otherwise identical elements are converted: one carrying coordinate
    metadata and one without. The exact bounding box, area and image size are
    pinned for the first; the second must fall back to an empty bbox and
    ``None`` for area and image dimensions.
    """

    def _make_element(coordinates=None):
        # Build a fresh, fully populated element on every call so the two
        # fixtures share no mutable state; only `coordinates` differs.
        return Text(
            text="some text",
            element_id="123",
            detection_origin="some origin",
            embeddings=[1.1, 2.2, 3.3, 4.4],
            metadata=ElementMetadata(
                coordinates=coordinates,
                data_source=DataSourceMetadata(
                    url="http://mysite.com",
                    version="123",
                    record_locator={"some": "data", "value": 3},
                    date_created="then",
                    date_processed="now",
                    date_modified="before",
                    permissions_data=[{"data": 1}, {"data": 2}],
                ),
                filename="filename",
                file_directory="file_directory",
                last_modified="last_modified",
                filetype="filetype",
                attached_to_filename="attached_to_filename",
                parent_id="parent_id",
                category_depth=1,
                image_path="image_path",
                languages=["eng", "spa"],
                page_number=1,
                page_name="page_name",
                url="url",
                link_urls=["links", "url"],
                link_texts=["links", "texts"],
                links=[Link(text="text", url="url", start_index=1)],
                sent_from=["sent", "from"],
                sent_to=["sent", "to"],
                subject="subject",
                section="section",
                header_footer_type="header_footer_type",
                emphasized_text_contents=["emphasized", "text", "contents"],
                emphasized_text_tags=["emphasized", "text", "tags"],
                text_as_html="text_as_html",
                regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
                is_continuation=True,
                detection_class_prob=0.5,
            ),
        )

    elements = [
        _make_element(
            CoordinatesMetadata(
                points=((1, 2), (1, 4), (3, 4), (3, 2)),
                system=CoordinateSystem(width=12.3, height=99.4),
            ),
        ),
    ]
    missing_elements = [_make_element()]

    full_coco = base.convert_to_coco(elements)
    limited_coco = base.convert_to_coco(missing_elements)

    full_annotation = full_coco["annotations"][0]
    limited_annotation = limited_coco["annotations"][0]

    # The points span x in [1, 3] and y in [2, 4]: a 2 x 2 box anchored at (1, 2).
    assert full_annotation["bbox"] == [1.0, 2.0, 2.0, 2.0]
    assert full_annotation["area"] == 4.0
    assert full_coco["images"][0]["width"] == 12.3
    assert full_coco["images"][0]["height"] == 99.4

    # Without coordinates there is no geometry to report.
    assert limited_annotation["bbox"] == []
    assert limited_annotation["area"] is None
    assert limited_coco["images"][0]["width"] is None
    assert limited_coco["images"][0]["height"] is None
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.11.1-dev5" # pragma: no cover
__version__ = "0.11.1" # pragma: no cover
96 changes: 95 additions & 1 deletion unstructured/staging/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import io
import json
from copy import deepcopy
from typing import Any, Dict, List, Optional
from datetime import datetime
from typing import Any, Dict, List, Optional, Tuple

from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import (
Expand Down Expand Up @@ -337,3 +338,96 @@ def filter_element_types(
return filtered_elements

return elements


def _coco_bbox(coordinates: Optional[Dict[str, Any]]) -> Optional[List[float]]:
    """Return ``[x, y, width, height]`` for serialized coordinates, or ``None`` if absent."""
    if not coordinates:
        return None
    points = coordinates["points"]
    # x/y come from the first point; width is measured against the third point
    # and height against the second.
    # NOTE(review): this presumes a point order such as top-left, bottom-left,
    # bottom-right, top-right -- confirm against the coordinate producers.
    return [
        float(points[0][0]),
        float(points[0][1]),
        float(abs(points[0][0] - points[2][0])),
        float(abs(points[0][1] - points[1][1])),
    ]


def convert_to_coco(
    elements: List[Element],
    dataset_description: Optional[str] = None,
    dataset_version: str = "1.0",
    contributors: Tuple[str, ...] = ("Unstructured Developers",),
) -> Dict[str, Any]:
    """Convert a list of elements into a COCO-style dataset dictionary.

    Args:
        elements: Elements to convert; they are serialized with ``convert_to_dict``.
        dataset_description: Text for ``info["description"]``. When empty or
            ``None`` a default description containing today's date is used.
        dataset_version: Value stored in ``info["version"]``.
        contributors: Names joined with ``","`` into ``info["contributors"]``.

    Returns:
        A dict with the keys ``"info"``, ``"images"``, ``"categories"`` and
        ``"annotations"``:

        * ``images`` holds one entry per distinct combination of layout width,
          layout height, file directory, file name and page number, with
          1-based ``id`` values assigned in first-seen order.
        * ``categories`` holds the sorted keys of ``TYPE_TO_TEXT_ELEMENT_MAP``
          with 1-based ``id`` values.
        * ``annotations`` holds one entry per element, using the element id as
          ``id``. Elements without coordinates get ``bbox == []`` and
          ``area is None``; elements whose type is not a known category get
          ``category_id is None``.
    """
    # Read the clock once so description, year and creation date always agree,
    # even when the call straddles midnight or a new year.
    now = datetime.now()

    coco_dataset: Dict[str, Any] = {}

    # Handle Info
    coco_dataset["info"] = {
        "description": (
            dataset_description
            if dataset_description
            else f"Unstructured COCO Dataset {now.strftime('%Y-%m-%d')}"
        ),
        "version": dataset_version,
        "year": now.year,
        "contributors": ",".join(contributors),
        "date_created": now.date().isoformat(),
    }

    elements_dict = convert_to_dict(elements)

    # Handle Images
    images = []
    for el in elements_dict:
        metadata = el["metadata"]
        coordinates = metadata.get("coordinates")
        images.append(
            {
                "width": coordinates["layout_width"] if coordinates else None,
                "height": coordinates["layout_height"] if coordinates else None,
                "file_directory": metadata.get("file_directory", ""),
                "file_name": metadata.get("filename", ""),
                "page_number": metadata.get("page_number", ""),
            },
        )
    # De-duplicate while keeping first-seen order; the sorted item tuple serves
    # as a hashable identity for each image record.
    images = list({tuple(sorted(d.items())): d for d in images}.values())
    for index, d in enumerate(images):
        d["id"] = index + 1
    coco_dataset["images"] = images

    # Handle Categories
    category_names = sorted(set(TYPE_TO_TEXT_ELEMENT_MAP.keys()))
    categories = [{"id": i + 1, "name": cat} for i, cat in enumerate(category_names)]
    coco_dataset["categories"] = categories
    # Name -> id lookup so each element resolves its category in constant time.
    category_ids = {cat["name"]: cat["id"] for cat in categories}

    # Handle Annotations
    annotations = []
    for el in elements_dict:
        bbox = _coco_bbox(el["metadata"].get("coordinates"))
        annotations.append(
            {
                "id": el["element_id"],
                # Unknown element types yield None, mirroring how missing
                # geometry is reported, instead of aborting the whole export.
                "category_id": category_ids.get(el["type"]),
                "bbox": bbox if bbox is not None else [],
                "area": bbox[2] * bbox[3] if bbox is not None else None,
            },
        )
    coco_dataset["annotations"] = annotations
    return coco_dataset

0 comments on commit 341f0f4

Please sign in to comment.