Commit f61ff7d

chore: update Firecrawl components
italojohnny committed Nov 12, 2024
1 parent ab71e2f commit f61ff7d
Showing 2 changed files with 111 additions and 112 deletions.
121 changes: 60 additions & 61 deletions src/backend/base/langflow/components/firecrawl/firecrawl_crawl_api.py
@@ -1,81 +1,80 @@
 import uuid
 
-from langflow.custom import CustomComponent
+from langflow.custom import Component
+from langflow.io import (
+    DataInput,
+    IntInput,
+    Output,
+    SecretStrInput,
+    StrInput,
+)
 from langflow.schema import Data
 
 
-class FirecrawlCrawlApi(CustomComponent):
+class FirecrawlCrawlApi(Component):
     display_name: str = "FirecrawlCrawlApi"
     description: str = "Firecrawl Crawl API."
     name = "FirecrawlCrawlApi"
 
     output_types: list[str] = ["Document"]
-    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl"
-    field_config = {
-        "api_key": {
-            "display_name": "API Key",
-            "field_type": "str",
-            "required": True,
-            "password": True,
-            "info": "The API key to use Firecrawl API.",
-        },
-        "url": {
-            "display_name": "URL",
-            "field_type": "str",
-            "required": True,
-            "info": "The base URL to start crawling from.",
-        },
-        "timeout": {
-            "display_name": "Timeout",
-            "field_type": "int",
-            "info": "The timeout in milliseconds.",
-        },
-        "crawlerOptions": {
-            "display_name": "Crawler Options",
-            "info": "Options for the crawler behavior.",
-        },
-        "pageOptions": {
-            "display_name": "Page Options",
-            "info": "The page options to send with the request.",
-        },
-        "idempotency_key": {
-            "display_name": "Idempotency Key",
-            "field_type": "str",
-            "info": "Optional idempotency key to ensure unique requests.",
-        },
-    }
-
-    def build(
-        self,
-        api_key: str,
-        url: str,
-        timeout: int = 30000,
-        crawlerOptions: Data | None = None,  # noqa: N803
-        pageOptions: Data | None = None,  # noqa: N803
-        idempotency_key: str | None = None,
-    ) -> Data:
+    documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/crawl-post"
+
+    inputs = [
+        SecretStrInput(
+            name="api_key",
+            display_name="API Key",
+            required=True,
+            password=True,
+            info="The API key to use Firecrawl API.",
+        ),
+        StrInput(
+            name="url",
+            display_name="URL",
+            required=True,
+            info="The URL to scrape.",
+        ),
+        IntInput(
+            name="timeout",
+            display_name="Timeout",
+            info="Timeout in milliseconds for the request.",
+        ),
+        StrInput(
+            name="idempotency_key",
+            display_name="Idempotency Key",
+            info="Optional idempotency key to ensure unique requests.",
+        ),
+        DataInput(
+            name="crawlerOptions",
+            display_name="Crawler Options",
+            info="The crawler options to send with the request.",
+        ),
+        DataInput(
+            name="scrapeOptions",
+            display_name="Scrape Options",
+            info="The page options to send with the request.",
+        ),
+    ]
+
+    outputs = [
+        Output(display_name="Data", name="data", method="crawl"),
+    ]
+
+    idempotency_key: str | None = None
+
+    def crawl(self) -> Data:
         try:
             from firecrawl.firecrawl import FirecrawlApp
         except ImportError as e:
             msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
             raise ImportError(msg) from e
-        crawler_options_dict = crawlerOptions.__dict__["data"]["text"] if crawlerOptions else {}
-
-        page_options_dict = pageOptions.__dict__["data"]["text"] if pageOptions else {}
-
-        if not idempotency_key:
-            idempotency_key = str(uuid.uuid4())
+
+        params = self.crawlerOptions.__dict__["data"] if self.crawlerOptions else {}
+        scrape_options_dict = self.scrapeOptions.__dict__["data"] if self.scrapeOptions else {}
+        if scrape_options_dict:
+            params["scrapeOptions"] = scrape_options_dict
 
-        app = FirecrawlApp(api_key=api_key)
-        crawl_result = app.crawl_url(
-            url,
-            params={
-                "crawlerOptions": crawler_options_dict,
-                "pageOptions": page_options_dict,
-            },
-            wait_until_done=True,
-            poll_interval=int(timeout / 1000),
-            idempotency_key=idempotency_key,
-        )
+        if not self.idempotency_key:
+            self.idempotency_key = str(uuid.uuid4())
+
+        app = FirecrawlApp(api_key=self.api_key)
+        crawl_result = app.crawl_url(self.url, params=params, idempotency_key=self.idempotency_key)
         return Data(data={"results": crawl_result})
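
For reference, a minimal standalone sketch of the crawl_url call the updated component now issues, assuming firecrawl-py is installed and a valid key is available in a FIRECRAWL_API_KEY environment variable; the URL and option values below are illustrative placeholders, not part of this commit:

# Hedged sketch, not part of the commit: reproduces the FirecrawlApp call made
# by the new crawl() method, outside of Langflow.
import os
import uuid

from firecrawl.firecrawl import FirecrawlApp

# Illustrative option values; options supplied through the component's
# crawlerOptions / scrapeOptions DataInputs end up in this dict the same way.
params = {"limit": 5}
params["scrapeOptions"] = {"formats": ["markdown"]}

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
crawl_result = app.crawl_url(
    "https://docs.firecrawl.dev",  # placeholder URL
    params=params,
    idempotency_key=str(uuid.uuid4()),
)
print(crawl_result)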
102 changes: 51 additions & 51 deletions src/backend/base/langflow/components/firecrawl/firecrawl_scrape_api.py
@@ -1,69 +1,69 @@
-from langflow.custom import CustomComponent
+from langflow.custom import Component
+from langflow.io import (
+    DataInput,
+    IntInput,
+    Output,
+    SecretStrInput,
+    StrInput,
+)
 from langflow.schema import Data
 
 
-class FirecrawlScrapeApi(CustomComponent):
+class FirecrawlScrapeApi(Component):
     display_name: str = "FirecrawlScrapeApi"
     description: str = "Firecrawl Scrape API."
     name = "FirecrawlScrapeApi"
 
     output_types: list[str] = ["Document"]
     documentation: str = "https://docs.firecrawl.dev/api-reference/endpoint/scrape"
-    field_config = {
-        "api_key": {
-            "display_name": "API Key",
-            "field_type": "str",
-            "required": True,
-            "password": True,
-            "info": "The API key to use Firecrawl API.",
-        },
-        "url": {
-            "display_name": "URL",
-            "field_type": "str",
-            "required": True,
-            "info": "The URL to scrape.",
-        },
-        "timeout": {
-            "display_name": "Timeout",
-            "info": "Timeout in milliseconds for the request.",
-            "field_type": "int",
-            "default_value": 10000,
-        },
-        "pageOptions": {
-            "display_name": "Page Options",
-            "info": "The page options to send with the request.",
-        },
-        "extractorOptions": {
-            "display_name": "Extractor Options",
-            "info": "The extractor options to send with the request.",
-        },
-    }
-
-    def build(
-        self,
-        api_key: str,
-        url: str,
-        timeout: int = 10000,
-        pageOptions: Data | None = None,  # noqa: N803
-        extractorOptions: Data | None = None,  # noqa: N803
-    ) -> Data:
+
+    inputs = [
+        SecretStrInput(
+            name="api_key",
+            display_name="API Key",
+            required=True,
+            password=True,
+            info="The API key to use Firecrawl API.",
+        ),
+        StrInput(
+            name="url",
+            display_name="URL",
+            required=True,
+            info="The URL to scrape.",
+        ),
+        IntInput(
+            name="timeout",
+            display_name="Timeout",
+            info="Timeout in milliseconds for the request.",
+        ),
+        DataInput(
+            name="scrapeOptions",
+            display_name="Scrape Options",
+            info="The page options to send with the request.",
+        ),
+        DataInput(  # https://docs.firecrawl.dev/features/extract
+            name="extractorOptions",
+            display_name="Extractor Options",
+            info="The extractor options to send with the request.",
+        ),
+    ]
+
+    outputs = [
+        Output(display_name="Data", name="data", method="crawl"),
+    ]
+
+    def crawl(self) -> list[Data]:
         try:
             from firecrawl.firecrawl import FirecrawlApp
         except ImportError as e:
             msg = "Could not import firecrawl integration package. Please install it with `pip install firecrawl-py`."
             raise ImportError(msg) from e
-        extractor_options_dict = extractorOptions.__dict__["data"]["text"] if extractorOptions else {}
-
-        page_options_dict = pageOptions.__dict__["data"]["text"] if pageOptions else {}
-
-        app = FirecrawlApp(api_key=api_key)
-        results = app.scrape_url(
-            url,
-            {
-                "timeout": str(timeout),
-                "extractorOptions": extractor_options_dict,
-                "pageOptions": page_options_dict,
-            },
-        )
+
+        params = self.scrapeOptions.__dict__["data"] if self.scrapeOptions else {}
+        extractor_options_dict = self.extractorOptions.__dict__["data"] if self.extractorOptions else {}
+        if extractor_options_dict:
+            params["extract"] = extractor_options_dict
+
+        app = FirecrawlApp(api_key=self.api_key)
+        results = app.scrape_url(self.url, params=params)
+        return Data(data=results)
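
Likewise, a minimal standalone sketch of the scrape_url call the updated FirecrawlScrapeApi now issues, under the same assumptions (firecrawl-py installed, key in FIRECRAWL_API_KEY); the URL, formats, and extract schema below are illustrative placeholders:

# Hedged sketch, not part of the commit: reproduces the scrape_url call made
# by the new crawl() method of FirecrawlScrapeApi, outside of Langflow.
import os

from firecrawl.firecrawl import FirecrawlApp

# Illustrative options; the component builds this dict from its scrapeOptions
# and extractorOptions DataInputs.
params = {"formats": ["markdown"]}
params["extract"] = {
    "schema": {"type": "object", "properties": {"title": {"type": "string"}}},
}

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
results = app.scrape_url("https://example.com", params=params)  # placeholder URL
print(results)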
