Skip to content

Commit

Permalink
community: add init for UnstructuredHTMLLoader to solve pathlib pat…
Browse files Browse the repository at this point in the history
…hs (#29091)

## Description
Add an `__init__` method to `UnstructuredHTMLLoader` that accepts
`file_path` as either a `str` or a `Path` and converts it to `str` before
passing it to the parent loader, just as `UnstructuredXMLLoader` does.

## Issue
Fix #29090 

## Dependencies
No changes.
  • Loading branch information
Marsman1996 authored Jan 8, 2025
1 parent c8ca1cd commit 2b09f79
Showing 1 changed file with 19 additions and 1 deletion.
20 changes: 19 additions & 1 deletion libs/community/langchain_community/document_loaders/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List
from pathlib import Path
from typing import Any, List, Union

from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

Expand Down Expand Up @@ -27,6 +28,23 @@ class UnstructuredHTMLLoader(UnstructuredFileLoader):
https://unstructured-io.github.io/unstructured/bricks.html#partition-html
"""

def __init__(
    self,
    file_path: Union[str, Path],
    mode: str = "single",
    **unstructured_kwargs: Any,
):
    """Initialize the loader for a single HTML file.

    Args:
        file_path: Location of the HTML file, given as a ``str`` or a
            ``Path``.
        mode: Loading mode, forwarded unchanged to the parent loader.
            Defaults to "single".
        **unstructured_kwargs: Extra keyword arguments, forwarded unchanged
            to the parent loader.
    """
    # Normalize to a plain string before delegating, so that a ``Path``
    # argument reaches the parent initializer as ``str``.
    super().__init__(
        file_path=str(file_path),
        mode=mode,
        **unstructured_kwargs,
    )

def _get_elements(self) -> List:
from unstructured.partition.html import partition_html

Expand Down

0 comments on commit 2b09f79

Please sign in to comment.