diff --git a/lib/sycamore/sycamore/schema.py b/lib/sycamore/sycamore/schema.py index 40a49a5ba..6f4c623b2 100644 --- a/lib/sycamore/sycamore/schema.py +++ b/lib/sycamore/sycamore/schema.py @@ -7,11 +7,13 @@ class SchemaField(BaseModel): """Represents a field in a DocSet schema.""" name: str + """The name of the field.""" field_type: str """The type of the field.""" default: Optional[Any] = None + """The default value for the field.""" description: Optional[str] = None """A natural language description of the field.""" diff --git a/lib/sycamore/sycamore/tests/integration/transforms/test_data_extraction.py b/lib/sycamore/sycamore/tests/integration/transforms/test_data_extraction.py index 883ef8df3..e682fdef6 100644 --- a/lib/sycamore/sycamore/tests/integration/transforms/test_data_extraction.py +++ b/lib/sycamore/sycamore/tests/integration/transforms/test_data_extraction.py @@ -11,13 +11,15 @@ def test_extract_properties_from_schema(): Document( { "doc_id": "doc_1", - "text_representation": "My name is Vinayak & I'm a 74 year old software engineer from Honolulu Hawaii", + "text_representation": "My name is Vinayak & I'm a 74 year old software engineer from Honolulu Hawaii. " + "This information was written on feb 24, 1923", } ), Document( { "doc_id": "doc_2", - "text_representation": "is a strange case of anti-viral research found in New Delhi.", + "text_representation": "is a strange case of anti-viral research found in New Delhi.\n " + "info date: jan eleven 2014", } ), ] @@ -31,6 +33,7 @@ def test_extract_properties_from_schema(): examples=["Mark", "Ollie", "Winston"], ), SchemaField(name="age", field_type="int", default=999), + SchemaField(name="date", field_type="str", description="Any date in the doc in YYYY-MM-DD format"), SchemaField( name="from_location", field_type="str", @@ -52,7 +55,9 @@ def test_extract_properties_from_schema(): assert taken[0].properties["entity"]["name"] == "Vinayak" assert taken[0].properties["entity"]["age"] == 74 assert taken[0].properties["entity"]["from_location"] == "Honolulu, HI", "Invalid location extracted or formatted" + assert taken[0].properties["entity"]["date"] == "1923-02-24" assert taken[1].properties["entity"]["name"] is None, "Default None value not being used correctly" assert taken[1].properties["entity"]["age"] == 999, "Default value not being used correctly" assert taken[1].properties["entity"]["from_location"] == "New Delhi" + assert taken[1].properties["entity"]["date"] == "2014-01-11"