From d0a21086bd8d164c518f286674a64bc40104ce3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adem=C3=ADlson=20Tonato?= Date: Wed, 29 Jan 2025 03:21:05 +0000 Subject: [PATCH] refactor: Update Firecrawl API parameters and default settings (#13082) --- .../rag/extractor/firecrawl/firecrawl_web_extractor.py | 3 ++- api/services/auth/firecrawl/firecrawl.py | 4 ++-- api/services/website_service.py | 10 ++++------ .../core/rag/extractor/firecrawl/test_firecrawl.py | 5 ++--- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py index b33ce167c21c82..355a2fb2048983 100644 --- a/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py +++ b/api/core/rag/extractor/firecrawl/firecrawl_web_extractor.py @@ -13,9 +13,10 @@ class FirecrawlWebExtractor(BaseExtractor): api_key: The API key for Firecrawl. base_url: The base URL for the Firecrawl API. Defaults to 'https://api.firecrawl.dev'. mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'. + only_main_content: Only return the main content of the page excluding headers, navs, footers, etc. """ - def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False): + def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True): """Initialize with url, api_key, base_url and mode.""" self._url = url self.job_id = job_id diff --git a/api/services/auth/firecrawl/firecrawl.py b/api/services/auth/firecrawl/firecrawl.py index cc6eaaa42a0611..6ef034f2920a6f 100644 --- a/api/services/auth/firecrawl/firecrawl.py +++ b/api/services/auth/firecrawl/firecrawl.py @@ -21,8 +21,8 @@ def validate_credentials(self): headers = self._prepare_headers() options = { "url": "https://example.com", - "excludes": [], - "includes": [], + "includePaths": [], + "excludePaths": [], "limit": 1, "scrapeOptions": {"onlyMainContent": True}, } diff --git a/api/services/website_service.py b/api/services/website_service.py index b30e2205f7db20..85d32c9e8aed32 100644 --- a/api/services/website_service.py +++ b/api/services/website_service.py @@ -38,9 +38,8 @@ def crawl_url(cls, args: dict) -> dict: only_main_content = options.get("only_main_content", False) if not crawl_sub_pages: params = { - "includes": [], - "excludes": [], - "generateImgAltText": True, + "includePaths": [], + "excludePaths": [], "limit": 1, "scrapeOptions": {"onlyMainContent": only_main_content}, } @@ -48,9 +47,8 @@ def crawl_url(cls, args: dict) -> dict: includes = options.get("includes").split(",") if options.get("includes") else [] excludes = options.get("excludes").split(",") if options.get("excludes") else [] params = { - "includes": includes, - "excludes": excludes, - "generateImgAltText": True, + "includePaths": includes, + "excludePaths": excludes, "limit": options.get("limit", 1), "scrapeOptions": {"onlyMainContent": only_main_content}, } diff --git a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py index 120ca9c8ea9845..607728efd8e28a 100644 --- a/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py +++ b/api/tests/unit_tests/core/rag/extractor/firecrawl/test_firecrawl.py @@ -10,9 +10,8 @@ def test_firecrawl_web_extractor_crawl_mode(mocker): base_url = "https://api.firecrawl.dev" firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url) params = { - "includes": [], - "excludes": [], - "generateImgAltText": True, + "includePaths": [], + "excludePaths": [], "maxDepth": 1, "limit": 1, }