feature: update firecrawl components #4458

Merged (2 commits) on Nov 12, 2024
pyproject.toml (2 changes: 1 addition & 1 deletion)
@@ -75,7 +75,7 @@ dependencies = [
"upstash-vector>=0.5.0",
"gitpython>=3.1.43",
"kubernetes>=30.1.0",
"firecrawl-py>=0.0.16",
"firecrawl-py==1.5.0",
"json-repair>=0.25.2",
"langchain-google-calendar-tools>=0.0.1",
"langchain-milvus>=0.1.1",
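The pinned dependency is the substance of this file's change: firecrawl-py moves from the pre-1.0 range (>=0.0.16) to an exact ==1.5.0, the 1.x client whose FirecrawlApp methods the reworked components below call with a single params dict. A minimal sketch of that call shape, using placeholder key, URL, and option values that are not taken from the diff:

# Sketch, not part of the PR diff: the firecrawl-py 1.x call shape the updated
# components rely on. API key, URL, and option values are placeholders.
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-...")  # placeholder API key

# Crawl: one params dict plus an optional idempotency key.
crawl_result = app.crawl_url(
    "https://example.com",  # placeholder URL
    params={"limit": 5},    # illustrative crawler option
    idempotency_key="11111111-2222-3333-4444-555555555555",  # placeholder key
)

# Scrape: one params dict.
scrape_result = app.scrape_url("https://example.com", params={"formats": ["markdown"]})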
src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py (121 changes: 60 additions & 61 deletions)
@@ -1,81 +1,80 @@
import uuid

from langflow.custom import CustomComponent
from langflow.custom import Component
from langflow.io import (
DataInput,
IntInput,
Output,
SecretStrInput,
StrInput,
)
from langflow.schema import Data


class FirecrawlCrawlApi(CustomComponent):
class FirecrawlCrawlApi(Component):
display_name: str = "FirecrawlCrawlApi"
description: str = "Firecrawl Crawl API."
name = "FirecrawlCrawlApi"

output_types: list[str] = ["Document"]
documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl"
field_config = {
"api_key": {
"display_name": "API Key",
"field_type": "str",
"required": True,
"password": True,
"info": "The API key to use Firecrawl API.",
},
"url": {
"display_name": "URL",
"field_type": "str",
"required": True,
"info": "The base URL to start crawling from.",
},
"timeout": {
"display_name": "Timeout",
"field_type": "int",
"info": "The timeout in milliseconds.",
},
"crawlerOptions": {
"display_name": "Crawler Options",
"info": "Options for the crawler behavior.",
},
"pageOptions": {
"display_name": "Page Options",
"info": "The page options to send with the request.",
},
"idempotency_key": {
"display_name": "Idempotency Key",
"field_type": "str",
"info": "Optional idempotency key to ensure unique requests.",
},
}
documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl-post"

def build(
self,
api_key: str,
url: str,
timeout: int = 30000,
crawlerOptions: Data | None = None, # noqa: N803
pageOptions: Data | None = None, # noqa: N803
idempotency_key: str | None = None,
) -> Data:
inputs = [
SecretStrInput(
name="api_key",
display_name="API Key",
required=True,
password=True,
info="The API key to use Firecrawl API.",
),
StrInput(
name="url",
display_name="URL",
required=True,
info="The URL to scrape.",
),
IntInput(
name="timeout",
display_name="Timeout",
info="Timeout in milliseconds for the request.",
),
StrInput(
name="idempotency_key",
display_name="Idempotency Key",
info="Optional idempotency key to ensure unique requests.",
),
DataInput(
name="crawlerOptions",
display_name="Crawler Options",
info="The crawler options to send with the request.",
),
DataInput(
name="scrapeOptions",
display_name="Scrape Options",
info="The page options to send with the request.",
),
]

outputs = [
Output(display_name="Data", name="data", method="crawl"),
]
idempotency_key: str | None = None

def crawl(self) -> Data:
try:
from firecrawl.firecrawl import FirecrawlApp
except ImportError as e:
msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
raise ImportError(msg) from e
crawler_options_dict = crawlerOptions.__dict__["data"]["text"] if crawlerOptions else {}

page_options_dict = pageOptions.__dict__["data"]["text"] if pageOptions else {}

if not idempotency_key:
idempotency_key = str(uuid.uuid4())
params = self.crawlerOptions.__dict__["data"] if self.crawlerOptions else {}
scrape_options_dict = self.scrapeOptions.__dict__["data"] if self.scrapeOptions else {}
if scrape_options_dict:
params["scrapeOptions"] = scrape_options_dict

app = FirecrawlApp(api_key=api_key)
crawl_result = app.crawl_url(
url,
params={
"crawlerOptions": crawler_options_dict,
"pageOptions": page_options_dict,
},
wait_until_done=True,
poll_interval=int(timeout / 1000),
idempotency_key=idempotency_key,
)
if not self.idempotency_key:
self.idempotency_key = str(uuid.uuid4())

app = FirecrawlApp(api_key=self.api_key)
crawl_result = app.crawl_url(self.url, params=params, idempotency_key=self.idempotency_key)
return Data(data={"results": crawl_result})
src/backend/base/langflow/components/firecrawl/firecrawl_scrape_api.py (102 changes: 51 additions & 51 deletions)
@@ -1,69 +1,69 @@
from langflow.custom import CustomComponent
from langflow.custom import Component
from langflow.io import (
DataInput,
IntInput,
Output,
SecretStrInput,
StrInput,
)
from langflow.schema import Data


class FirecrawlScrapeApi(CustomComponent):
class FirecrawlScrapeApi(Component):
display_name: str = "FirecrawlScrapeApi"
description: str = "Firecrawl Scrape API."
name = "FirecrawlScrapeApi"

output_types: list[str] = ["Document"]
documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
field_config = {
"api_key": {
"display_name": "API Key",
"field_type": "str",
"required": True,
"password": True,
"info": "The API key to use Firecrawl API.",
},
"url": {
"display_name": "URL",
"field_type": "str",
"required": True,
"info": "The URL to scrape.",
},
"timeout": {
"display_name": "Timeout",
"info": "Timeout in milliseconds for the request.",
"field_type": "int",
"default_value": 10000,
},
"pageOptions": {
"display_name": "Page Options",
"info": "The page options to send with the request.",
},
"extractorOptions": {
"display_name": "Extractor Options",
"info": "The extractor options to send with the request.",
},
}

def build(
self,
api_key: str,
url: str,
timeout: int = 10000,
pageOptions: Data | None = None, # noqa: N803
extractorOptions: Data | None = None, # noqa: N803
) -> Data:
inputs = [
SecretStrInput(
name="api_key",
display_name="API Key",
required=True,
password=True,
info="The API key to use Firecrawl API.",
),
StrInput(
name="url",
display_name="URL",
required=True,
info="The URL to scrape.",
),
IntInput(
name="timeout",
display_name="Timeout",
info="Timeout in milliseconds for the request.",
),
DataInput(
name="scrapeOptions",
display_name="Scrape Options",
info="The page options to send with the request.",
),
DataInput( # https://docs.firecrawl.dev/features/extract
name="extractorOptions",
display_name="Extractor Options",
info="The extractor options to send with the request.",
),
]

outputs = [
Output(display_name="Data", name="data", method="crawl"),
]

def crawl(self) -> list[Data]:
try:
from firecrawl.firecrawl import FirecrawlApp
except ImportError as e:
msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
raise ImportError(msg) from e
extractor_options_dict = extractorOptions.__dict__["data"]["text"] if extractorOptions else {}

page_options_dict = pageOptions.__dict__["data"]["text"] if pageOptions else {}

app = FirecrawlApp(api_key=api_key)
results = app.scrape_url(
url,
{
"timeout": str(timeout),
"extractorOptions": extractor_options_dict,
"pageOptions": page_options_dict,
},
)
params = self.scrapeOptions.__dict__["data"] if self.scrapeOptions else {}
extractor_options_dict = self.extractorOptions.__dict__["data"] if self.extractorOptions else {}
if extractor_options_dict:
params["extract"] = extractor_options_dict

app = FirecrawlApp(api_key=self.api_key)
results = app.scrape_url(self.url, params=params)
return Data(data=results)
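The scrape component follows the same pattern as the crawl component: the old pageOptions/extractorOptions pair handled inside build() becomes scrapeOptions/extractorOptions Data inputs, and the extractor options are nested under an "extract" key in the single params dict passed to scrape_url (the extract feature referenced in the inline comment above). A minimal sketch of that assembly, with illustrative option payloads that are not taken from the diff:

# Sketch, not part of the PR diff: the shape of the params dict the updated
# scrape component builds before calling scrape_url. Keys are illustrative.
from firecrawl.firecrawl import FirecrawlApp

scrape_options = {"formats": ["markdown"], "onlyMainContent": True}   # hypothetical scrapeOptions payload
extractor_options = {"prompt": "Extract the page title and author."}  # hypothetical extract payload

params = dict(scrape_options)
if extractor_options:
    params["extract"] = extractor_options

app = FirecrawlApp(api_key="fc-...")  # placeholder API key
results = app.scrape_url("https://example.com", params=params)  # placeholder URL

As in the crawl component, option payloads are read from Data.__dict__["data"] rather than the old ["data"]["text"], and the timeout value that build() used to inject into the request body is still declared as an input but is no longer forwarded in params.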