diff --git a/CHANGELOG.md b/CHANGELOG.md index 68d9d0ff58..2b6e99c743 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Enhancements - **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes. +- **Add NDJSON file type support** ### Features diff --git a/example-docs/simple.ndjson b/example-docs/simple.ndjson new file mode 100644 index 0000000000..4e4fcee8ce --- /dev/null +++ b/example-docs/simple.ndjson @@ -0,0 +1,8 @@ +{"element_id": "a06d2d9e65212d4aa955c3ab32950ffa", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "These are a few of my favorite things:", "type": "Title"} +{"element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Parrots", "type": "ListItem"} +{"element_id": "76469ecb9f1459943c8d8cca1a550b5a", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Hockey", "type": "ListItem"} +{"element_id": "261fac731945a138415adc2dd4434b17", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": 
["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "Analysis", "type": "Title"} +{"element_id": "95f392d32c5271bfdb30eaef45921e59", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my first thought. This is my second thought.", "type": "NarrativeText"} +{"element_id": "0de25bd6f0d74bc4f909f2678f385736", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my third thought.", "type": "NarrativeText"} +{"element_id": "f296a3bc8a901f19199fda1da92829b6", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "2023", "type": "UncategorizedText"} +{"element_id": "78c62edbc674fdca0f6a0e3ffb459f86", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "DOYLESTOWN, PA 18901", "type": "Address"} \ No newline at end of file diff --git a/example-docs/spring-weather.html.ndjson b/example-docs/spring-weather.html.ndjson new file mode 100644 index 0000000000..b2880a19c7 --- /dev/null +++ b/example-docs/spring-weather.html.ndjson @@ -0,0 +1,35 @@ +{"type": "Title", 
"element_id": "fb902c5b26b38e2d35a70a55d43a5de6", "text": "News Around NOAA", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "100233c72890df3d216e2bc2c36f7153", "text": "National Program", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "88f0bebe7a9cca77675bd8a5db823092", "text": "Are You Weather-Ready for the Spring?", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "568c824acda361cfc270a75e2eca7a23", "text": "Weather.gov >", "metadata": {"link_texts": ["Weather.gov"], "link_urls": ["https://www.weather.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "767e68cdb3d891322eb8b65489f53b4c", "text": "News Around NOAA > Are You Weather-Ready for the Spring?", "metadata": {"link_texts": ["News Around NOAA"], "link_urls": 
["https://www.weather.gov/news"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": "79fb885317b2666481d0a1c31970400d", "text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter", "metadata": {"link_texts": ["Weather Safety", "Air Quality", "Beach Hazards", "Cold", "Cold Water", "Drought", "Floods", "Fog", "Heat", " Hurricanes", " Lightning Safety", "Rip Currents", "Safe Boating", "Space Weather", "Sun (Ultraviolet Radiation)", " Thunderstorms & Tornadoes", "Tornado", "Tsunami", "Wildfire", "Wind", "Winter"], "link_urls": ["http://www.weather.gov/safetycampaign", "https://www.weather.gov/safety/airquality", "https://www.weather.gov/safety/beachhazards", "https://www.weather.gov/safety/cold", "https://www.weather.gov/safety/coldwater", "https://www.weather.gov/safety/drought", "https://www.weather.gov/safety/flood", "https://www.weather.gov/safety/fog", "https://www.weather.gov/safety/heat", "https://www.weather.gov/safety/hurricane", "https://www.weather.gov/safety/lightning", "https://www.weather.gov/safety/ripcurrent", "https://www.weather.gov/safety/safeboating", "https://www.weather.gov/safety/space", "https://www.weather.gov/safety/heat-uv", "https://www.weather.gov/safety/thunderstorm", "https://www.weather.gov/safety/tornado", "https://www.weather.gov/safety/tsunami", "https://www.weather.gov/safety/wildfire", "https://www.weather.gov/safety/wind", "https://www.weather.gov/safety/winter "], "link_start_indexes": [0, 14, 25, 38, 42, 52, 59, 65, 68, 
72, 83, 100, 112, 124, 137, 164, 190, 197, 204, 212, 216], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": "512e6a00cacb0ab139ede6b0145f441d", "text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors", "metadata": {"link_texts": ["Safety Campaigns", "Seasonal Safety Campaigns", "#SafePlaceSelfie", "Deaf & Hard of Hearing", "Intellectual Disabilities", "Spanish-language Content", "The Great Outdoors"], "link_urls": ["https://www.weather.gov/safetycampaign", "https://www.weather.gov/safetycampaign", "https://www.weather.gov/wrn/safeplaceselfie", "https://www.weather.gov/wrn/dhh-safety", "https://www.weather.gov/wrn/intellectualdisabilities", "https://www.weather.gov/wrn/fall2020-espanol-sm", "https://www.noaa.gov/explainers/great-outdoors-weather-safety"], "link_start_indexes": [0, 16, 41, 57, 79, 104, 128], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": "d4145282089e41261300a9bcf440edb9", "text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Espa\u00f1ol", "metadata": {"link_texts": ["Ambassador", "About WRN Ambassadors", "Become an Ambassador", "Ambassadors of Excellence", "People of WRN", " FAQS", "Tell Your Success Story", " 
Success Stories", "Tri-fold", "Aviation", " Current Ambassadors", "Brochure", "En Espa\u00f1ol"], "link_urls": ["https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/wrn/amb-tou", "https://www.weather.gov/wrn/ambassador_recognition", "https://www.weather.gov/people/", "https://www.weather.gov/wrn/amb-faqs", "https://docs.google.com/forms/d/e/1FAIpQLScPHee5WAyC5K1LZ3pWLa2zjaM1HZSKN4_AxGUc6RaCy_gxLA/viewform", " https://www.weather.gov/wrn/success-stories", "http://www.weather.gov/media/wrn/WRN_Ambassador_Trifold.pdf", "https://www.weather.gov/wrn/aviation", " http://www.weather.gov/wrn/current-ambassadors", "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", "https://www.weather.gov/wrn/en-espanol"], "link_start_indexes": [0, 10, 31, 51, 76, 89, 94, 117, 133, 141, 149, 169, 177], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": "aeee9b1d3904eda123d21c851ce4747d", "text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities", "metadata": {"link_texts": ["Education", "NWS Education Home", "Be A Force Of Nature", "WRN Kids Flyer", "Wireless Emergency Alerts", "NOAA Weather Radio", "Mobile Weather", "Brochures", "Hourly Weather Forecast", "Citizen Science", "Intellectual Disabilities"], "link_urls": ["http://www.weather.gov/owlie/", "http://www.weather.gov/owlie/", "https://www.weather.gov/wrn/force", " http://www.weather.gov/media/owlie/nws_kids_fact_sheet2.pdf", "https://www.weather.gov/wrn/wea", "http://www.nws.noaa.gov/nwr/", 
"https://www.weather.gov/wrn/mobile-phone", "http://www.weather.gov/owlie/publication_brochures", "https://www.weather.gov/wrn/hourly-weather-graph", "http://www.weather.gov/media/wrn/citizen_science_page.pdf", "https://www.weather.gov/wrn/intellectualdisabilities"], "link_start_indexes": [0, 9, 27, 47, 61, 86, 104, 118, 127, 150, 165], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": "752f5b846e4a24df6d62d9dc014e5aec", "text": "Collaboration Get Involved Social Media WRN Ambassadors \u200b Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)\u200b SKYWARN", "metadata": {"link_texts": ["Collaboration", "Get Involved ", "Social Media", "WRN Ambassadors \u200b", "Enterprise Resources", "StormReady", "TsunamiReady", "NWSChat (core partners only)", "InteractiveNWS (iNWS) (core partners only)\u200b", "SKYWARN"], "link_urls": ["https://www.weather.gov/wrn/collaborate", "https://www.weather.gov/wrn/get-involved", "http://www.weather.gov/socialmedia", "https://www.weather.gov/wrn/ambassadors", "https://www.weather.gov/enterprise/", "http://www.weather.gov/stormready/", "https://www.weather.gov/tsunamiready/", "https://nwschat.weather.gov/", "https://inws.ncep.noaa.gov/", "https://www.weather.gov/SKYWARN"], "link_start_indexes": [0, 13, 26, 38, 55, 75, 85, 97, 125, 168], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": 
"8729b5380b0f442c0512948bd18de66b", "text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter", "metadata": {"link_texts": [" News & Events", "Latest News", "Calendar", "Meetings & Workshops", "NWS Aware Newsletter"], "link_urls": ["http://www.weather.gov/news/", " http://www.weather.gov/news/", "https://www.weather.gov/wrn/calendar", " https://www.weather.gov/wrn/workshops", "https://www.weather.gov/publications/aware"], "link_start_indexes": [0, 14, 25, 33, 53], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": "ec0f9efa0e7de0d7bbf11f3b8fb2a1ca", "text": "International", "metadata": {"link_texts": ["International"], "link_urls": ["https://www.weather.gov/wrn/wrns"], "link_start_indexes": [0], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "ListItem", "element_id": "f17da617a620de011003a204ecf48752", "text": "About Contact Us What is WRN? 
WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science", "metadata": {"link_texts": ["About", "Contact Us", " What is WRN?", " WRN FAQ", "WRN Brochure", "Hazard Simplification", "IDSS Brochure", "Roadmap", "Strategic Plan", "WRN International", "Social Science"], "link_urls": ["https://www.weather.gov/wrn/about", " https://www.weather.gov/wrn/contact", "https://www.weather.gov/wrn/about", "https://www.weather.gov/wrn/faqs", "http://www.weather.gov/media/wrn/WRN_Ambassador_Flyer.pdf", "https://www.weather.gov/hazardsimplification/", "https://www.weather.gov/media/wrn/2018-IDSS2-Pager.pdf", "http://www.weather.gov/media/wrn/nws_wrn_roadmap_final_april17.pdf", "https://www.weather.gov/media/wrn/NWS_Weather-Ready-Nation_Strategic_Plan_2019-2022.pdf", " https://www.weather.gov/wrn/international", "https://vlab.noaa.gov/web/nws-social-science"], "link_start_indexes": [0, 5, 15, 28, 36, 48, 69, 82, 89, 103, 120], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "NarrativeText", "element_id": "623c25f2247b125d6df5138a7c5ee153", "text": "The spring season is all about change \u2013 a rebirth both literally and figuratively. 
Even though the spring season doesn\u2019t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "NarrativeText", "element_id": "c8c953bd87e4571df8e6486e9c467861", "text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "NarrativeText", "element_id": "b6553aef4dc61e5d31e2e28426e56f0b", "text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.", "metadata": {"emphasized_text_contents": ["First, take steps to better prepare for the seasonal hazards weather can throw at you."], "emphasized_text_tags": ["strong"], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "NarrativeText", "element_id": "ac246c4693669d08d274f628c3293a78", "text": "This could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. 
Take a look at our infographics and social media posts to help you become \u201cweather-ready.\u201d", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "NarrativeText", "element_id": "d1fa2a66a4df9759bdf01f6f1ec51d8e", "text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content \u2013 everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.", "metadata": {"emphasized_text_contents": ["Second, encourage others to become Weather-Ready as well."], "emphasized_text_tags": ["strong"], "link_texts": ["Spring Safety website"], "link_urls": ["https://www.weather.gov/wrn/spring-safety"], "link_start_indexes": [167], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "NarrativeText", "element_id": "996f1b86d1cb5a02028bd3816f5790f1", "text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring\u2019s moderately warm weather. 
The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.", "metadata": {"link_texts": ["infographics"], "link_urls": ["https://www.weather.gov/wrn/spring-infographics"], "link_start_indexes": [303], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "NarrativeText", "element_id": "90b31790a9b5fd903e6dbaea50e05f45", "text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "9dcf311a7e6225af9333100c709b7f23", "text": "US Dept of Commerce", "metadata": {"link_texts": ["US Dept of Commerce"], "link_urls": ["http://www.commerce.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "60711b68cb732ecb10f4c05f0f784647", "text": "National Oceanic and Atmospheric Administration", "metadata": {"link_texts": ["National Oceanic and Atmospheric Administration"], "link_urls": ["http://www.noaa.gov"], "link_start_indexes": [-1], "languages": 
["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "55ca4bf03b04ffacb8ea8cb528c22a6f", "text": "National Weather Service", "metadata": {"link_texts": ["National Weather Service"], "link_urls": ["https://www.weather.gov"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "3ebaebb5791662dfa6d2e2b8af436f9d", "text": "News Around NOAA", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "ccf5cdb2984d2ac2d934010960d32aca", "text": "1325 East West Highway", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Address", "element_id": "64a081cb854ff90dbc668c2b334d0ae8", "text": "Silver Spring, MD 20910", "metadata": {"languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", 
"record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "6af532045e3aa6fe3764590594dc0dd7", "text": "Comments? Questions? Please Contact Us.", "metadata": {"link_texts": ["Comments? Questions? Please Contact Us."], "link_urls": ["https://www.weather.gov/news/contact"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "a63c69dcc655b1b32bc6157427e9ca8e", "text": "Disclaimer", "metadata": {"link_texts": ["Disclaimer"], "link_urls": ["https://www.weather.gov/disclaimer"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "95054785187bcc0cf98cdb17c135ca1d", "text": "Information Quality", "metadata": {"link_texts": ["Information Quality"], "link_urls": ["http://www.cio.noaa.gov/services_programs/info_quality.html"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "800d660faa52732cd4d361b187bbd6e2", "text": "Help", "metadata": {"link_texts": ["Help"], "link_urls": 
["https://www.weather.gov/help"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "718284e0cdf275514b6aa8fb8976a7cc", "text": "Glossary", "metadata": {"link_texts": ["Glossary"], "link_urls": ["http://www.weather.gov/glossary"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "678ef3e5cd635ba851d2dfd7f6f20d0f", "text": "Privacy Policy", "metadata": {"link_texts": ["Privacy Policy"], "link_urls": ["https://www.weather.gov/privacy"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "f66ad83bfffccef0afe60d0aaba55b54", "text": "Freedom of Information Act (FOIA)", "metadata": {"link_texts": ["Freedom of Information Act (FOIA)"], "link_urls": ["https://www.noaa.gov/foia-freedom-of-information-act"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", 
"date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "f50c4a988c7336b9d1100227fa7f03a3", "text": "About Us", "metadata": {"link_texts": ["About Us"], "link_urls": ["https://www.weather.gov/about"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} +{"type": "Title", "element_id": "a9a5f8ac29adb68999173b4e65a189bd", "text": "Career Opportunities", "metadata": {"link_texts": ["Career Opportunities"], "link_urls": ["https://www.weather.gov/careers"], "link_start_indexes": [-1], "languages": ["eng"], "filetype": "text/html", "data_source": {"url": "abfs://container1/spring-weather.html", "version": "162215905222974206637545574128436022861", "record_locator": {"protocol": "abfs", "remote_file_path": "abfs://container1/"}, "date_created": "1678441216.0", "date_modified": "1678441216.0"}}} \ No newline at end of file diff --git a/requirements/base.in b/requirements/base.in index cc2b27d8ad..ff38c57bb6 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -22,3 +22,4 @@ tqdm psutil python-oxmsg html5lib +ndjson diff --git a/requirements/base.txt b/requirements/base.txt index 7117e30a8a..2eb934db6c 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,13 +4,13 @@ # # pip-compile ./base.in # -anyio==4.6.2.post1 +anyio==4.7.0 # via httpx backoff==2.2.1 # via -r ./base.in beautifulsoup4==4.12.3 # via -r ./base.in -certifi==2024.8.30 +certifi==2024.12.14 # via # httpcore # httpx @@ -28,13 +28,13 @@ click==8.1.7 # via # nltk # python-oxmsg -cryptography==43.0.3 +cryptography==44.0.0 # via unstructured-client dataclasses-json==0.6.7 # via # -r ./base.in # unstructured-client -deepdiff==8.0.1 +deepdiff==8.1.1 # via unstructured-client emoji==2.14.0 # via 
-r ./base.in @@ -46,9 +46,9 @@ h11==0.14.0 # via httpcore html5lib==1.1 # via -r ./base.in -httpcore==1.0.6 +httpcore==1.0.7 # via httpx -httpx==0.27.2 +httpx==0.28.1 # via unstructured-client idna==3.10 # via @@ -64,7 +64,7 @@ langdetect==1.0.9 # via -r ./base.in lxml==5.3.0 # via -r ./base.in -marshmallow==3.23.0 +marshmallow==3.23.1 # via # dataclasses-json # unstructured-client @@ -72,6 +72,8 @@ mypy-extensions==1.0.0 # via # typing-inspect # unstructured-client +ndjson==0.3.1 + # via -r ./base.in nest-asyncio==1.6.0 # via unstructured-client nltk==3.9.1 @@ -80,9 +82,9 @@ numpy==1.26.4 # via -r ./base.in olefile==0.47 # via python-oxmsg -orderly-set==5.2.2 +orderly-set==5.2.3 # via deepdiff -packaging==24.1 +packaging==24.2 # via # marshmallow # unstructured-client @@ -90,7 +92,7 @@ psutil==6.1.0 # via -r ./base.in pycparser==2.22 # via cffi -pypdf==5.0.1 +pypdf==5.1.0 # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client @@ -100,9 +102,9 @@ python-magic==0.4.27 # via -r ./base.in python-oxmsg==0.0.1 # via -r ./base.in -rapidfuzz==3.10.1 +rapidfuzz==3.11.0 # via -r ./base.in -regex==2024.9.11 +regex==2024.11.6 # via nltk requests==2.32.3 # via @@ -111,19 +113,17 @@ requests==2.32.3 # unstructured-client requests-toolbelt==1.0.0 # via unstructured-client -six==1.16.0 +six==1.17.0 # via # html5lib # langdetect # python-dateutil # unstructured-client sniffio==1.3.1 - # via - # anyio - # httpx + # via anyio soupsieve==2.6 # via beautifulsoup4 -tqdm==4.66.5 +tqdm==4.67.1 # via # -r ./base.in # nltk @@ -150,5 +150,5 @@ urllib3==1.26.20 # unstructured-client webencodings==0.5.1 # via html5lib -wrapt==1.16.0 +wrapt==1.17.0 # via -r ./base.in diff --git a/requirements/dev.txt b/requirements/dev.txt index bd90364012..8f60a228db 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -17,7 +17,7 @@ distlib==0.3.9 # via virtualenv filelock==3.16.1 # via virtualenv -identify==2.6.1 +identify==2.6.3 # via pre-commit 
importlib-metadata==8.5.0 # via @@ -25,7 +25,7 @@ importlib-metadata==8.5.0 # build nodeenv==1.9.1 # via pre-commit -packaging==24.1 +packaging==24.2 # via # -c ./base.txt # -c ./test.txt @@ -46,16 +46,16 @@ pyyaml==6.0.2 # via # -c ./test.txt # pre-commit -tomli==2.0.2 +tomli==2.2.1 # via # -c ./test.txt # build # pip-tools -virtualenv==20.27.0 +virtualenv==20.28.0 # via pre-commit -wheel==0.44.0 +wheel==0.45.1 # via pip-tools -zipp==3.20.2 +zipp==3.21.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: diff --git a/requirements/extra-csv.txt b/requirements/extra-csv.txt index 1896204fbd..496cd42fc1 100644 --- a/requirements/extra-csv.txt +++ b/requirements/extra-csv.txt @@ -16,7 +16,7 @@ python-dateutil==2.9.0.post0 # pandas pytz==2024.2 # via pandas -six==1.16.0 +six==1.17.0 # via # -c ./base.txt # python-dateutil diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index 8da7349455..243fd0b0da 100644 --- a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -10,5 +10,5 @@ importlib-metadata==8.5.0 # markdown markdown==3.7 # via -r ./extra-markdown.in -zipp==3.20.2 +zipp==3.21.0 # via importlib-metadata diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index e14c2985ad..2a0a0ec835 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -4,13 +4,13 @@ # # pip-compile ./extra-paddleocr.in # -anyio==4.6.2.post1 +anyio==4.7.0 # via # -c ./base.txt # httpx astor==0.8.1 # via paddlepaddle -certifi==2024.8.30 +certifi==2024.12.14 # via # -c ./base.txt # httpcore @@ -32,17 +32,17 @@ exceptiongroup==1.2.2 # via # -c ./base.txt # anyio -fonttools==4.54.1 +fonttools==4.55.3 # via matplotlib h11==0.14.0 # via # -c ./base.txt # httpcore -httpcore==1.0.6 +httpcore==1.0.7 # via # -c ./base.txt # httpx -httpx==0.27.2 +httpx==0.28.1 # via # -c ./base.txt # paddlepaddle @@ -52,7 +52,7 @@ idna==3.10 # anyio # httpx # 
requests -imageio==2.36.0 +imageio==2.36.1 # via # imgaug # scikit-image @@ -64,7 +64,7 @@ kiwisolver==1.4.7 # via matplotlib lazy-loader==0.4 # via scikit-image -matplotlib==3.9.2 +matplotlib==3.9.4 # via imgaug networkx==3.2.1 # via @@ -94,7 +94,7 @@ opencv-python==4.10.0.84 # unstructured-paddleocr opt-einsum==3.3.0 # via paddlepaddle -packaging==24.1 +packaging==24.2 # via # -c ./base.txt # lazy-loader @@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0 # matplotlib pyyaml==6.0.2 # via unstructured-paddleocr -rapidfuzz==3.10.1 +rapidfuzz==3.11.0 # via # -c ./base.txt # unstructured-paddleocr @@ -147,7 +147,7 @@ shapely==2.0.6 # via # imgaug # unstructured-paddleocr -six==1.16.0 +six==1.17.0 # via # -c ./base.txt # imgaug @@ -156,10 +156,9 @@ sniffio==1.3.1 # via # -c ./base.txt # anyio - # httpx tifffile==2024.8.30 # via scikit-image -tqdm==4.66.5 +tqdm==4.67.1 # via # -c ./base.txt # unstructured-paddleocr @@ -175,5 +174,5 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests -zipp==3.20.2 +zipp==3.21.0 # via importlib-resources diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index ff34f2dedc..eb0b478502 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -8,7 +8,7 @@ antlr4-python3-runtime==4.9.3 # via omegaconf cachetools==5.5.0 # via google-auth -certifi==2024.8.30 +certifi==2024.12.14 # via # -c ./base.txt # requests @@ -25,13 +25,13 @@ coloredlogs==15.0.1 # via onnxruntime contourpy==1.3.0 # via matplotlib -cryptography==43.0.3 +cryptography==44.0.0 # via # -c ./base.txt # pdfminer-six cycler==0.12.1 # via matplotlib -deprecated==1.2.14 +deprecated==1.2.15 # via pikepdf effdet==0.4.1 # via -r ./extra-pdf-image.in @@ -42,32 +42,32 @@ filelock==3.16.1 # transformers flatbuffers==24.3.25 # via onnxruntime -fonttools==4.54.1 +fonttools==4.55.3 # via matplotlib fsspec==2024.10.0 # via # huggingface-hub # torch -google-api-core[grpc]==2.21.0 +google-api-core[grpc]==2.24.0 # 
via google-cloud-vision -google-auth==2.35.0 +google-auth==2.37.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.8.0 +google-cloud-vision==3.9.0 # via -r ./extra-pdf-image.in -googleapis-common-protos==1.65.0 +googleapis-common-protos==1.66.0 # via # google-api-core # grpcio-status -grpcio==1.67.0 +grpcio==1.68.1 # via # -c ././deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.26.1 +huggingface-hub==0.27.0 # via # timm # tokenizers @@ -95,7 +95,7 @@ lxml==5.3.0 # pikepdf markupsafe==3.0.2 # via jinja2 -matplotlib==3.9.2 +matplotlib==3.9.4 # via # pycocotools # unstructured-inference @@ -130,7 +130,7 @@ opencv-python==4.10.0.84 # via # layoutparser # unstructured-inference -packaging==24.1 +packaging==24.2 # via # -c ./base.txt # huggingface-hub @@ -151,9 +151,9 @@ pdfminer-six==20231228 # pdfplumber pdfplumber==0.11.4 # via layoutparser -pi-heif==0.20.0 +pi-heif==0.21.0 # via -r ./extra-pdf-image.in -pikepdf==9.3.0 +pikepdf==9.4.2 # via -r ./extra-pdf-image.in pillow==11.0.0 # via @@ -165,7 +165,7 @@ pillow==11.0.0 # pikepdf # torchvision # unstructured-pytesseract -portalocker==2.10.1 +portalocker==3.0.0 # via iopath proto-plus==1.25.0 # via @@ -195,7 +195,7 @@ pycparser==2.22 # cffi pyparsing==3.2.0 # via matplotlib -pypdf==5.0.1 +pypdf==5.1.0 # via # -c ./base.txt # -r ./extra-pdf-image.in @@ -206,7 +206,7 @@ python-dateutil==2.9.0.post0 # -c ./base.txt # matplotlib # pandas -python-multipart==0.0.12 +python-multipart==0.0.20 # via unstructured-inference pytz==2024.2 # via pandas @@ -217,11 +217,11 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.10.1 +rapidfuzz==3.11.0 # via # -c ./base.txt # unstructured-inference -regex==2024.9.11 +regex==2024.11.6 # via # -c ./base.txt # transformers @@ -239,7 +239,7 @@ safetensors==0.4.5 # transformers scipy==1.13.1 # via layoutparser -six==1.16.0 +six==1.17.0 # via # -c ./base.txt # python-dateutil @@ -247,7 +247,7 
@@ sympy==1.13.1 # via # onnxruntime # torch -timm==1.0.11 +timm==1.0.12 # via # effdet # unstructured-inference @@ -255,17 +255,17 @@ tokenizers==0.19.1 # via # -c ././deps/constraints.txt # transformers -torch==2.5.0 +torch==2.5.1 # via # effdet # timm # torchvision # unstructured-inference -torchvision==0.20.0 +torchvision==0.20.1 # via # effdet # timm -tqdm==4.66.5 +tqdm==4.67.1 # via # -c ./base.txt # huggingface-hub @@ -291,9 +291,9 @@ urllib3==1.26.20 # -c ././deps/constraints.txt # -c ./base.txt # requests -wrapt==1.16.0 +wrapt==1.17.0 # via # -c ./base.txt # deprecated -zipp==3.20.2 +zipp==3.21.0 # via importlib-resources diff --git a/requirements/extra-xlsx.txt b/requirements/extra-xlsx.txt index a4faf2f5f4..7f00c057a2 100644 --- a/requirements/extra-xlsx.txt +++ b/requirements/extra-xlsx.txt @@ -22,7 +22,7 @@ python-dateutil==2.9.0.post0 # pandas pytz==2024.2 # via pandas -six==1.16.0 +six==1.17.0 # via # -c ./base.txt # python-dateutil diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index afa5e28a46..3d662ea51b 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -4,7 +4,7 @@ # # pip-compile ./huggingface.in # -certifi==2024.8.30 +certifi==2024.12.14 # via # -c ./base.txt # requests @@ -25,7 +25,7 @@ fsspec==2024.10.0 # via # huggingface-hub # torch -huggingface-hub==0.26.1 +huggingface-hub==0.27.0 # via # tokenizers # transformers @@ -53,7 +53,7 @@ numpy==1.26.4 # via # -c ./base.txt # transformers -packaging==24.1 +packaging==24.2 # via # -c ./base.txt # huggingface-hub @@ -62,7 +62,7 @@ pyyaml==6.0.2 # via # huggingface-hub # transformers -regex==2024.9.11 +regex==2024.11.6 # via # -c ./base.txt # sacremoses @@ -78,7 +78,7 @@ safetensors==0.4.5 # via transformers sentencepiece==0.2.0 # via -r ./huggingface.in -six==1.16.0 +six==1.17.0 # via # -c ./base.txt # langdetect @@ -88,9 +88,9 @@ tokenizers==0.19.1 # via # -c ././deps/constraints.txt # transformers -torch==2.5.0 +torch==2.5.1 # via -r 
./huggingface.in -tqdm==4.66.5 +tqdm==4.67.1 # via # -c ./base.txt # huggingface-hub diff --git a/requirements/test.txt b/requirements/test.txt index e3762557da..1bc2a98271 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -6,19 +6,25 @@ # annotated-types==0.7.0 # via pydantic -anyio==4.6.2.post1 +anyio==4.7.0 # via # -c ./base.txt # httpx appdirs==1.4.4 # via label-studio-sdk -attrs==24.2.0 - # via jsonschema +argcomplete==3.5.2 + # via datamodel-code-generator +attrs==24.3.0 + # via + # jsonschema + # referencing autoflake==2.3.1 # via -r ./test.in black==24.10.0 - # via -r ./test.in -certifi==2024.8.30 + # via + # -r ./test.in + # datamodel-code-generator +certifi==2024.12.14 # via # -c ./base.txt # httpcore @@ -33,15 +39,23 @@ click==8.1.7 # -c ./base.txt # black # nltk -coverage[toml]==7.6.4 +coverage[toml]==7.6.9 # via # -r ./test.in # pytest-cov +datamodel-code-generator==0.26.1 + # via label-studio-sdk +dnspython==2.7.0 + # via email-validator +email-validator==2.2.0 + # via pydantic exceptiongroup==1.2.2 # via # -c ./base.txt # anyio # pytest +faker==33.1.0 + # via jsf flake8==7.1.1 # via # -r ./test.in @@ -50,7 +64,9 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.1 # via -r ./test.in -grpcio==1.67.0 +genson==1.3.0 + # via datamodel-code-generator +grpcio==1.68.1 # via # -c ././deps/constraints.txt # -r ./test.in @@ -58,11 +74,11 @@ h11==0.14.0 # via # -c ./base.txt # httpcore -httpcore==1.0.6 +httpcore==1.0.7 # via # -c ./base.txt # httpx -httpx==0.27.2 +httpx==0.28.1 # via # -c ./base.txt # label-studio-sdk @@ -70,20 +86,33 @@ idna==3.10 # via # -c ./base.txt # anyio + # email-validator # httpx # requests # yarl ijson==3.3.0 # via label-studio-sdk +inflect==5.6.2 + # via datamodel-code-generator iniconfig==2.0.0 # via pytest +isort==5.13.2 + # via datamodel-code-generator +jinja2==3.1.4 + # via datamodel-code-generator joblib==1.4.2 # via # -c ./base.txt # nltk -jsonschema==3.2.0 +jsf==0.11.2 # via label-studio-sdk 
-label-studio-sdk==1.0.5 +jsonschema==4.23.0 + # via + # jsf + # label-studio-sdk +jsonschema-specifications==2024.10.1 + # via jsonschema +label-studio-sdk==1.0.8 # via -r ./test.in liccheck==0.9.2 # via -r ./test.in @@ -91,6 +120,8 @@ lxml==5.3.0 # via # -c ./base.txt # label-studio-sdk +markupsafe==3.0.2 + # via jinja2 mccabe==0.7.0 # via flake8 multidict==6.1.0 @@ -109,11 +140,13 @@ nltk==3.9.1 numpy==1.26.4 # via # -c ./base.txt + # label-studio-sdk # pandas -packaging==24.1 +packaging==24.2 # via # -c ./base.txt # black + # datamodel-code-generator # pytest pandas==2.2.3 # via label-studio-sdk @@ -125,42 +158,49 @@ platformdirs==4.3.6 # via black pluggy==1.5.0 # via pytest -propcache==0.2.0 +propcache==0.2.1 # via yarl pycodestyle==2.12.1 # via # flake8 # flake8-print -pydantic==2.9.2 +pydantic[email]==2.10.3 # via # -r ./test.in + # datamodel-code-generator + # jsf # label-studio-sdk -pydantic-core==2.23.4 +pydantic-core==2.27.1 # via pydantic pyflakes==3.2.0 # via # autoflake # flake8 -pyrsistent==0.20.0 - # via jsonschema -pytest==8.3.3 +pytest==8.3.4 # via # pytest-cov # pytest-mock -pytest-cov==5.0.0 +pytest-cov==6.0.0 # via -r ./test.in pytest-mock==3.14.0 # via -r ./test.in python-dateutil==2.9.0.post0 # via # -c ./base.txt + # faker # freezegun # pandas pytz==2024.2 # via pandas pyyaml==6.0.2 - # via vcrpy -regex==2024.9.11 + # via + # datamodel-code-generator + # vcrpy +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.11.6 # via # -c ./base.txt # nltk @@ -169,42 +209,51 @@ requests==2.32.3 # -c ./base.txt # label-studio-sdk # requests-mock + # smart-open requests-mock==1.12.1 # via label-studio-sdk +rpds-py==0.22.3 + # via + # jsonschema + # referencing +rstr==3.2.2 + # via jsf ruff==0.8.3 # via -r ./test.in semantic-version==2.10.0 # via liccheck -six==1.16.0 +six==1.17.0 # via # -c ./base.txt - # jsonschema # python-dateutil +smart-open[http]==7.1.0 + # via jsf sniffio==1.3.1 # via # -c ./base.txt # anyio - # 
httpx toml==0.10.2 - # via liccheck -tomli==2.0.2 + # via + # datamodel-code-generator + # liccheck +tomli==2.2.1 # via # autoflake # black # coverage # mypy # pytest -tqdm==4.66.5 +tqdm==4.67.1 # via # -c ./base.txt # nltk types-click==7.1.8 # via -r ./test.in -types-markdown==3.7.0.20240822 +types-markdown==3.7.0.20241204 # via -r ./test.in types-requests==2.31.0.6 # via -r ./test.in -types-tabulate==0.9.0.20240106 +types-tabulate==0.9.0.20241207 # via -r ./test.in types-urllib3==1.26.25.14 # via types-requests @@ -213,6 +262,8 @@ typing-extensions==4.12.2 # -c ./base.txt # anyio # black + # faker + # jsf # label-studio-sdk # multidict # mypy @@ -230,14 +281,12 @@ urllib3==1.26.20 # vcrpy vcrpy==6.0.2 # via -r ./test.in -wrapt==1.16.0 +wrapt==1.17.0 # via # -c ./base.txt + # smart-open # vcrpy xmljson==0.2.1 # via label-studio-sdk -yarl==1.16.0 +yarl==1.18.3 # via vcrpy - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index c1f7ad1f8d..5169b54a11 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -90,6 +90,7 @@ def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direc (FileType.WAV, "CantinaBand3.wav", "audio/wav"), (FileType.XML, "factbook.xml", "application/xml"), (FileType.ZIP, "simple.zip", "application/zip"), + (FileType.NDJSON, "spring-weather.html.ndjson", "application/x-ndjson"), ], ) def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type( @@ -147,6 +148,17 @@ def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_co assert file_type is expected_value +def test_it_identifies_NDJSON_for_file_like_object_with_no_name_but_NDJSON_content_type(): + with open(example_doc_path("simple.ndjson"), "rb") as f: + file = io.BytesIO(f.read()) + assert detect_filetype(file=file, 
content_type=FileType.NDJSON.mime_type) == FileType.NDJSON + + +# TODO: ideally this test should pass, currently fails +# def test_it_identifies_NDJSON_for_file_with_ndjson_extension_but_JSON_content_type(): +# file_path = example_doc_path("simple.ndjson") +# assert detect_filetype(file_path, content_type=FileType.JSON.mime_type) == FileType.NDJSON + # ================================================================================================ # STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY # ================================================================================================ diff --git a/test_unstructured/partition/test_ndjson.py b/test_unstructured/partition/test_ndjson.py new file mode 100644 index 0000000000..c86ce1c8e6 --- /dev/null +++ b/test_unstructured/partition/test_ndjson.py @@ -0,0 +1,299 @@ +"""Test-suite for `unstructured.partition.ndjson` module.""" + +from __future__ import annotations + +import os +import pathlib +import tempfile + +import pytest +from pytest_mock import MockFixture + +from test_unstructured.unit_utils import example_doc_path +from unstructured.documents.elements import CompositeElement +from unstructured.file_utils.model import FileType +from unstructured.partition.email import partition_email +from unstructured.partition.html import partition_html +from unstructured.partition.ndjson import partition_ndjson +from unstructured.partition.text import partition_text +from unstructured.partition.xml import partition_xml +from unstructured.staging.base import elements_to_ndjson + +DIRECTORY = pathlib.Path(__file__).parent.resolve() + +is_in_docker = os.path.exists("/.dockerenv") + +test_files = [ + "fake-text.txt", + "fake-html.html", + "eml/fake-email.eml", +] + +is_in_docker = os.path.exists("/.dockerenv") + + +def test_it_chunks_elements_when_a_chunking_strategy_is_specified(): + chunks = partition_ndjson( + example_doc_path("spring-weather.html.ndjson"), + chunking_strategy="basic", + 
max_characters=1500, + ) + + assert len(chunks) == 9 + assert all(isinstance(ch, CompositeElement) for ch in chunks) + + +@pytest.mark.parametrize("filename", test_files) +def test_partition_ndjson_from_filename(filename: str): + path = example_doc_path(filename) + elements = [] + filetype = FileType.from_extension(os.path.splitext(path)[1]) + if filetype == FileType.TXT: + elements = partition_text(filename=path) + if filetype == FileType.HTML: + elements = partition_html(filename=path) + if filetype == FileType.XML: + elements = partition_xml(filename=path) + if filetype == FileType.EML: + elements = partition_email(filename=path) + + with tempfile.TemporaryDirectory() as tmpdir: + _filename = os.path.basename(filename) + test_path = os.path.join(tmpdir, _filename + ".ndjson") + elements_to_ndjson(elements, filename=test_path) + test_elements = partition_ndjson(filename=test_path) + + assert len(elements) > 0 + assert len(str(elements[0])) > 0 + + assert len(elements) == len(test_elements) + for i in range(len(elements)): + assert elements[i] == test_elements[i] + assert elements[i].metadata.filename == filename.split("/")[-1] + + +@pytest.mark.parametrize("filename", test_files) +def test_partition_ndjson_from_filename_with_metadata_filename(filename: str): + path = example_doc_path(filename) + elements = [] + filetype = FileType.from_extension(os.path.splitext(path)[1]) + if filetype == FileType.TXT: + elements = partition_text(filename=path) + if filetype == FileType.HTML: + elements = partition_html(filename=path) + if filetype == FileType.XML: + elements = partition_xml(filename=path) + if filetype == FileType.EML: + elements = partition_email(filename=path) + + with tempfile.TemporaryDirectory() as tmpdir: + _filename = os.path.basename(filename) + test_path = os.path.join(tmpdir, _filename + ".ndjson") + elements_to_ndjson(elements, filename=test_path) + test_elements = partition_ndjson(filename=test_path, metadata_filename="test") + + assert 
len(test_elements) > 0 + assert len(str(test_elements[0])) > 0 + assert all(element.metadata.filename == "test" for element in test_elements) + + +@pytest.mark.parametrize("filename", test_files) +def test_partition_ndjson_from_file(filename: str): + path = example_doc_path(filename) + elements = [] + filetype = FileType.from_extension(os.path.splitext(path)[1]) + if filetype == FileType.TXT: + elements = partition_text(filename=path) + if filetype == FileType.HTML: + elements = partition_html(filename=path) + if filetype == FileType.XML: + elements = partition_xml(filename=path) + if filetype == FileType.EML: + elements = partition_email(filename=path) + + with tempfile.TemporaryDirectory() as tmpdir: + _filename = os.path.basename(filename) + test_path = os.path.join(tmpdir, _filename + ".ndjson") + elements_to_ndjson(elements, filename=test_path) + with open(test_path, "rb") as f: + test_elements = partition_ndjson(file=f) + + assert len(elements) > 0 + assert len(str(elements[0])) > 0 + assert len(elements) == len(test_elements) + for i in range(len(elements)): + assert elements[i] == test_elements[i] + assert elements[i].metadata.filename == filename.split("/")[-1] + + +@pytest.mark.parametrize("filename", test_files) +def test_partition_ndjson_from_file_with_metadata_filename(filename: str): + path = example_doc_path(filename) + elements = [] + filetype = FileType.from_extension(os.path.splitext(path)[1]) + if filetype == FileType.TXT: + elements = partition_text(filename=path) + if filetype == FileType.HTML: + elements = partition_html(filename=path) + if filetype == FileType.XML: + elements = partition_xml(filename=path) + if filetype == FileType.EML: + elements = partition_email(filename=path) + with tempfile.TemporaryDirectory() as tmpdir: + _filename = os.path.basename(filename) + test_path = os.path.join(tmpdir, _filename + ".ndjson") + elements_to_ndjson(elements, filename=test_path) + with open(test_path, "rb") as f: + test_elements = 
partition_ndjson(file=f, metadata_filename="test") + + for i in range(len(test_elements)): + assert test_elements[i].metadata.filename == "test" + + +@pytest.mark.parametrize("filename", test_files) +def test_partition_ndjson_from_text(filename: str): + path = example_doc_path(filename) + elements = [] + filetype = FileType.from_extension(os.path.splitext(path)[1]) + if filetype == FileType.TXT: + elements = partition_text(filename=path) + if filetype == FileType.HTML: + elements = partition_html(filename=path) + if filetype == FileType.XML: + elements = partition_xml(filename=path) + if filetype == FileType.EML: + elements = partition_email(filename=path) + + with tempfile.TemporaryDirectory() as tmpdir: + _filename = os.path.basename(filename) + test_path = os.path.join(tmpdir, _filename + ".ndjson") + elements_to_ndjson(elements, filename=test_path) + with open(test_path) as f: + text = f.read() + test_elements = partition_ndjson(text=text) + + assert len(elements) > 0 + assert len(str(elements[0])) > 0 + assert len(elements) == len(test_elements) + for i in range(len(elements)): + assert elements[i] == test_elements[i] + assert elements[i].metadata.filename == filename.split("/")[-1] + + +def test_partition_json_raises_with_none_specified(): + with pytest.raises(ValueError): + partition_ndjson() + + +def test_partition_ndjson_works_with_empty_string(): + assert partition_ndjson(text="") == [] + + +def test_partition_ndjson_works_with_empty_list(): + assert partition_ndjson(text="{}") == [] + + +def test_partition_ndjson_raises_with_too_many_specified(): + path = example_doc_path("fake-text.txt") + elements = [] + filetype = FileType.from_extension(os.path.splitext(path)[1]) + if filetype == FileType.TXT: + elements = partition_text(filename=path) + if filetype == FileType.HTML: + elements = partition_html(filename=path) + if filetype == FileType.XML: + elements = partition_xml(filename=path) + if filetype == FileType.EML: + elements = 
partition_email(filename=path) + + with tempfile.TemporaryDirectory() as tmpdir: + test_path = os.path.join(tmpdir, "fake-text.txt.ndjson") + elements_to_ndjson(elements, filename=test_path) + with open(test_path, "rb") as f: + text = f.read().decode("utf-8") + + with pytest.raises(ValueError): + partition_ndjson(filename=test_path, file=f) + + with pytest.raises(ValueError): + partition_ndjson(filename=test_path, text=text) + + with pytest.raises(ValueError): + partition_ndjson(file=f, text=text) + + with pytest.raises(ValueError): + partition_ndjson(filename=test_path, file=f, text=text) + + +# -- .metadata.last_modified --------------------------------------------------------------------- + + +def test_partition_ndjson_from_file_path_gets_last_modified_from_filesystem(mocker: MockFixture): + filesystem_last_modified = "2029-07-05T09:24:28" + mocker.patch( + "unstructured.partition.ndjson.get_last_modified_date", + return_value=filesystem_last_modified, + ) + + elements = partition_ndjson(example_doc_path("spring-weather.html.ndjson")) + + assert all(e.metadata.last_modified == filesystem_last_modified for e in elements) + + +def test_partition_ndjson_from_file_gets_last_modified_None(): + with open(example_doc_path("spring-weather.html.ndjson"), "rb") as f: + elements = partition_ndjson(file=f) + + assert all(e.metadata.last_modified is None for e in elements) + + +def test_partition_ndjson_from_text_gets_last_modified_None(): + with open(example_doc_path("spring-weather.html.ndjson")) as f: + text = f.read() + + elements = partition_ndjson(text=text) + + assert all(e.metadata.last_modified is None for e in elements) + + +def test_partition_ndjson_from_file_path_prefers_metadata_last_modified(mocker: MockFixture): + filesystem_last_modified = "2029-07-05T09:24:28" + metadata_last_modified = "2020-07-05T09:24:28" + mocker.patch( + "unstructured.partition.ndjson.get_last_modified_date", + return_value=filesystem_last_modified, + ) + + elements = partition_ndjson( 
+ example_doc_path("spring-weather.html.ndjson"), + metadata_last_modified=metadata_last_modified, + ) + + assert all(e.metadata.last_modified == metadata_last_modified for e in elements) + + +def test_partition_ndjson_from_file_prefers_metadata_last_modified(): + metadata_last_modified = "2020-07-05T09:24:28" + with open(example_doc_path("spring-weather.html.ndjson"), "rb") as f: + elements = partition_ndjson(file=f, metadata_last_modified=metadata_last_modified) + + assert all(e.metadata.last_modified == metadata_last_modified for e in elements) + + +def test_partition_ndjson_from_text_prefers_metadata_last_modified(): + metadata_last_modified = "2020-07-05T09:24:28" + with open(example_doc_path("spring-weather.html.ndjson")) as f: + text = f.read() + + elements = partition_ndjson(text=text, metadata_last_modified=metadata_last_modified) + + assert all(e.metadata.last_modified == metadata_last_modified for e in elements) + + +# ------------------------------------------------------------------------------------------------ + + +def test_partition_json_raises_with_invalid_json(): + text = '[{"hi": "there"}]]' + with pytest.raises(ValueError): + partition_ndjson(text=text) diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 4c8e4d2be8..9fa42ca1f8 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -46,7 +46,7 @@ from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str from unstructured.file_utils.model import FileType from unstructured.logger import logger -from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN +from unstructured.nlp.patterns import DICT_PATTERN, EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN from unstructured.partition.common.common import add_element_metadata, exactly_one from unstructured.partition.common.metadata import set_element_hierarchy from unstructured.utils import get_call_args_applying_defaults, lazyproperty @@ 
-89,7 +89,7 @@ def detect_filetype( Raises: ValueError: when: - `file_path` is specified but does not correspond to a file on the - fileesystem. + filesystem. - Neither `file_path` nor `file` were specified. """ ctx = _FileTypeDetectionContext.new( @@ -123,6 +123,27 @@ def is_json_processable( return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None +def is_ndjson_processable( + filename: Optional[str] = None, + file: Optional[IO[bytes]] = None, + file_text: Optional[str] = None, + encoding: Optional[str] = "utf-8", +) -> bool: + """True when file looks like newline-delimited JSON objects (NDJSON). + + Uses regex on a file prefix, so not entirely reliable but good enough if you already know the + file is NDJSON. + """ + exactly_one(filename=filename, file=file, file_text=file_text) + + if file_text is None: + file_text = _FileTypeDetectionContext.new( + file_path=filename, file=file, encoding=encoding + ).text_head + + return re.match(DICT_PATTERN, file_text) is not None + + class _FileTypeDetector: """Determines file type from a variety of possible inputs.""" diff --git a/unstructured/file_utils/model.py b/unstructured/file_utils/model.py index e4c567975f..4e032954b3 100644 --- a/unstructured/file_utils/model.py +++ b/unstructured/file_utils/model.py @@ -288,6 +288,15 @@ def partitioner_shortname(self) -> str | None: "application/vnd.ms-outlook", cast(list[str], []), ) + NDJSON = ( + "ndjson", + "ndjson", + ["ndjson"], + None, + [".ndjson"], + "application/x-ndjson", + cast(list[str], []), + ) ODT = ( "odt", "odt", diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index b3a77f77a8..faf0ef0b72 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -120,6 +120,8 @@ # format for document elements LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?" +DICT_PATTERN = r"\A\s*{?" + # (?s) dot all (including newline characters) # \{(?=.*:) opening brace and at least one colon # .*? 
any characters (non-greedy) diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 02ae5219f1..4a6ae7f345 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -11,7 +11,11 @@ from typing_extensions import TypeAlias from unstructured.documents.elements import DataSourceMetadata, Element -from unstructured.file_utils.filetype import detect_filetype, is_json_processable +from unstructured.file_utils.filetype import ( + detect_filetype, + is_json_processable, + is_ndjson_processable, +) from unstructured.file_utils.model import FileType from unstructured.logger import logger from unstructured.partition.common import UnsupportedFileFormatError @@ -244,6 +248,16 @@ def augment_metadata(elements: list[Element]) -> list[Element]: elements = partition_json(filename=filename, file=file, **kwargs) return augment_metadata(elements) + if file_type == FileType.NDJSON: + if not is_ndjson_processable(filename=filename, file=file): + raise ValueError( + "Detected an NDJSON file that does not conform to the Unstructured schema. " + "partition_json currently only processes serialized Unstructured output.", + ) + partition_ndjson = partitioner_loader.get(file_type) + elements = partition_ndjson(filename=filename, file=file, **kwargs) + return augment_metadata(elements) + # -- EMPTY is also a special case because while we can't determine the file type, we can be # -- sure it doesn't contain any elements. if file_type == FileType.EMPTY: diff --git a/unstructured/partition/ndjson.py b/unstructured/partition/ndjson.py new file mode 100644 index 0000000000..c9f3f96d84 --- /dev/null +++ b/unstructured/partition/ndjson.py @@ -0,0 +1,85 @@ +"""Provides `partition_ndjson()`. + +Note this does not partition arbitrary NDJSON. 
Its only use-case is to "rehydrate" unstructured +document elements serialized to JSON, essentially the same function as `elements_from_json()`, but +this allows a document of already-partitioned elements to be combined transparently with other +documents in a partitioning run. It also allows multiple (low-cost) chunking runs to be performed on +a document while only incurring partitioning cost once. +""" + +from __future__ import annotations + +import json +from typing import IO, Any, Optional + +import ndjson + +from unstructured.chunking import add_chunking_strategy +from unstructured.documents.elements import Element, process_metadata +from unstructured.file_utils.filetype import ( + FileType, + add_metadata_with_filetype, + is_ndjson_processable, +) +from unstructured.partition.common.common import exactly_one +from unstructured.partition.common.metadata import get_last_modified_date +from unstructured.staging.base import elements_from_dicts + + +@process_metadata() +@add_metadata_with_filetype(FileType.NDJSON) +@add_chunking_strategy +def partition_ndjson( + filename: Optional[str] = None, + file: Optional[IO[bytes]] = None, + text: Optional[str] = None, + metadata_last_modified: Optional[str] = None, + **kwargs: Any, +) -> list[Element]: + """Partitions serialized Unstructured output into its constituent elements. + + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object as bytes --> open(filename, "rb"). + text + The string representation of the .ndjson document. + metadata_last_modified + The last modified date for the document. 
+ """ + if text is not None and text.strip() == "" and not file and not filename: + return [] + + exactly_one(filename=filename, file=file, text=text) + + last_modified = get_last_modified_date(filename) if filename else None + file_text = "" + if filename is not None: + with open(filename, encoding="utf8") as f: + file_text = f.read() + + elif file is not None: + file_content = file.read() + file_text = file_content if isinstance(file_content, str) else file_content.decode() + file.seek(0) + + elif text is not None: + file_text = str(text) + + if not is_ndjson_processable(file_text=file_text): + raise ValueError( + "NDJSON cannot be partitioned. Schema does not match the Unstructured schema.", + ) + + try: + element_dicts = ndjson.loads(file_text) + elements = elements_from_dicts(element_dicts) + except json.JSONDecodeError: + raise ValueError("Not a valid ndjson") + + for element in elements: + element.metadata.last_modified = metadata_last_modified or last_modified + + return elements diff --git a/unstructured/staging/base.py b/unstructured/staging/base.py index 1dbedecb70..1e98f47d29 100644 --- a/unstructured/staging/base.py +++ b/unstructured/staging/base.py @@ -9,6 +9,8 @@ from datetime import datetime from typing import Any, Iterable, Optional, Sequence, cast +import ndjson + from unstructured.documents.coordinates import PixelSpace from unstructured.documents.elements import ( TYPE_TO_TEXT_ELEMENT_MAP, @@ -152,6 +154,29 @@ def elements_to_json( return json_str +def elements_to_ndjson( + elements: Iterable[Element], + filename: Optional[str] = None, + encoding: str = "utf-8", +) -> str: + """Serialize `elements` to newline-delimited JSON (NDJSON). + + Also writes the NDJSON to `filename` if it is provided, encoded using `encoding`. + + The NDJSON is returned as a string. 
+ """ + # -- serialize `elements` as newline-delimited JSON (str) -- + precision_adjusted_elements = _fix_metadata_field_precision(elements) + element_dicts = elements_to_dicts(precision_adjusted_elements) + ndjson_str = ndjson.dumps(element_dicts, sort_keys=True) + + if filename is not None: + with open(filename, "w", encoding=encoding) as f: + f.write(ndjson_str) + + return ndjson_str + + def _fix_metadata_field_precision(elements: Iterable[Element]) -> list[Element]: out_elements: list[Element] = [] for element in elements: