chore: update Firecrawl version and add FirecrawlExtractTool #229

Open · wants to merge 1 commit into main
1 change: 1 addition & 0 deletions crewai_tools/__init__.py
@@ -14,6 +14,7 @@
     FileReadTool,
     FileWriterTool,
     FirecrawlCrawlWebsiteTool,
+    FirecrawlExtractTool,
     FirecrawlScrapeWebsiteTool,
     FirecrawlSearchTool,
     GithubSearchTool,
3 changes: 3 additions & 0 deletions crewai_tools/tools/__init__.py
@@ -15,6 +15,9 @@
 from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
     FirecrawlCrawlWebsiteTool,
 )
+from .firecrawl_extract_tool.firecrawl_extract_tool import (
+    FirecrawlExtractTool,
+)
 from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
     FirecrawlScrapeWebsiteTool,
 )
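With both `__init__.py` exports in place, the new tool becomes importable from the package root, e.g.:

```python
# Assumes firecrawl-py is installed; FirecrawlExtractTool is re-exported at the root.
from crewai_tools import FirecrawlExtractTool
```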
43 changes: 24 additions & 19 deletions crewai_tools/tools/firecrawl_crawl_website_tool/README.md
@@ -2,20 +2,23 @@

 ## Description

-[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.

 ## Installation

-- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
-- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the environment variable
+  (`FIRECRAWL_API_KEY`).
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with the `crewai[tools]`
+  package:

 ```
 pip install firecrawl-py 'crewai[tools]'
 ```

 ## Example

-Utilize the FirecrawlScrapeFromWebsiteTool as follows to allow your agent to load websites:
+Utilize the `FirecrawlCrawlWebsiteTool` as follows to allow your agent to load websites:

 ```python
 from crewai_tools import FirecrawlCrawlWebsiteTool
@@ -25,18 +28,20 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')

 ## Arguments

-- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `url`: The base URL to start crawling from.
-- `page_options`: Optional.
-  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `crawler_options`: Optional. Options for controlling the crawling behavior.
-  - `includes`: Optional. URL patterns to include in the crawl.
-  - `exclude`: Optional. URL patterns to exclude from the crawl.
-  - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
-  - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
-  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
-  - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
-  - `limit`: Optional. Maximum number of pages to crawl.
-  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
+> Documentation for the parameters can be found
+> [here](https://docs.firecrawl.dev/api-reference/endpoint/crawl-post).
+
+| Parameter                 | Required | Default                     | Description                                                               |
+| ------------------------- | -------- | --------------------------- | ------------------------------------------------------------------------- |
+| `api_key`                 | ✅        | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                                               |
+| `url`                     | ✅        | -                           | The base URL to start crawling from                                       |
+| `exclude_paths`           | ❌        | -                           | URL patterns to exclude from the crawl                                    |
+| `include_paths`           | ❌        | -                           | URL patterns to include in the crawl                                      |
+| `max_depth`               | ❌        | `2`                         | Maximum depth to crawl relative to the entered URL                        |
+| `ignore_sitemap`          | ❌        | `false`                     | Ignore the website sitemap when crawling                                  |
+| `ignore_query_parameters` | ❌        | `false`                     | Do not re-scrape the same path with different (or none) query parameters  |
+| `limit`                   | ❌        | `10000`                     | Maximum number of pages to crawl                                          |
+| `allow_backward_links`    | ❌        | `false`                     | Enables crawling previously linked pages                                  |
+| `allow_external_links`    | ❌        | `false`                     | Allows crawling external websites                                         |
+| `webhook`                 | ❌        | -                           | Webhook configuration for crawl notifications                             |
+| `scrape_options`          | ❌        | -                           | Options for scraping pages during crawl                                   |
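For reference, a crawl exercising several of these parameters might look like the following sketch (the URL and path pattern are placeholders; the API key is assumed to come from the `FIRECRAWL_API_KEY` environment variable):

```python
from crewai_tools import FirecrawlCrawlWebsiteTool

# No api_key argument: the FIRECRAWL_API_KEY environment variable is used.
tool = FirecrawlCrawlWebsiteTool()

result = tool.run(
    url="https://firecrawl.dev",
    include_paths=["/blog/.*"],  # placeholder pattern: crawl only blog pages
    max_depth=2,                 # the entered URL plus pages linked from it
    limit=100,                   # cap the crawl at 100 pages
    allow_external_links=False,  # stay on the entered site
)
```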
crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, Optional, Type, List, Union

 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -12,12 +12,45 @@
 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
-    crawler_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for crawling"
-    )
-    timeout: Optional[int] = Field(
-        default=30000,
-        description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
-    )
+    exclude_paths: Optional[List[str]] = Field(
+        default=None,
+        description="URL patterns to exclude from the crawl",
+    )
+    include_paths: Optional[List[str]] = Field(
+        default=None,
+        description="URL patterns to include in the crawl",
+    )
+    max_depth: Optional[int] = Field(
+        default=2,
+        description="Maximum depth to crawl relative to the entered URL",
+    )
+    ignore_sitemap: Optional[bool] = Field(
+        default=False,
+        description="Ignore the website sitemap when crawling",
+    )
+    ignore_query_parameters: Optional[bool] = Field(
+        default=False,
+        description="Do not re-scrape the same path with different (or none) query parameters",
+    )
+    limit: Optional[int] = Field(
+        default=10000,
+        description="Maximum number of pages to crawl",
+    )
+    allow_backward_links: Optional[bool] = Field(
+        default=False,
+        description="Enables the crawler to navigate from a specific URL to previously linked pages",
+    )
+    allow_external_links: Optional[bool] = Field(
+        default=False,
+        description="Allows the crawler to follow links to external websites",
+    )
+    webhook: Optional[Union[str, Dict[str, Any]]] = Field(
+        default=None,
+        description="Webhook configuration for crawl notifications",
+    )
+    scrape_options: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Options for scraping pages during crawl",
+    )
@@ -64,15 +97,28 @@ def _initialize_firecrawl(self) -> None:
     def _run(
         self,
         url: str,
-        crawler_options: Optional[Dict[str, Any]] = None,
-        timeout: Optional[int] = 30000,
+        exclude_paths: Optional[List[str]] = None,
+        include_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = 2,
+        ignore_sitemap: Optional[bool] = False,
+        ignore_query_parameters: Optional[bool] = False,
+        limit: Optional[int] = 10000,
+        allow_backward_links: Optional[bool] = False,
+        allow_external_links: Optional[bool] = False,
+        webhook: Optional[Union[str, Dict[str, Any]]] = None,
+        scrape_options: Optional[Dict[str, Any]] = None
     ):
-        if crawler_options is None:
-            crawler_options = {}
-
         options = {
-            "crawlerOptions": crawler_options,
-            "timeout": timeout,
+            "excludePaths": exclude_paths or [],
+            "includePaths": include_paths or [],
+            "maxDepth": max_depth,
+            "ignoreSitemap": ignore_sitemap,
+            "ignoreQueryParameters": ignore_query_parameters,
+            "limit": limit,
+            "allowBackwardLinks": allow_backward_links,
+            "allowExternalLinks": allow_external_links,
+            "webhook": webhook,
+            "scrapeOptions": scrape_options or {},
         }
         return self._firecrawl.crawl_url(url, options)
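To make the argument mapping concrete: `_run` forwards its snake_case keyword arguments as the camelCase options that Firecrawl's crawl endpoint expects. A hypothetical call and the payload it produces:

```python
tool = FirecrawlCrawlWebsiteTool()

# This call...
tool.run(url="https://firecrawl.dev", max_depth=1, limit=50, ignore_sitemap=True)

# ...builds the following options dict and passes it to crawl_url(url, options):
# {
#     "excludePaths": [],
#     "includePaths": [],
#     "maxDepth": 1,
#     "ignoreSitemap": True,
#     "ignoreQueryParameters": False,
#     "limit": 50,
#     "allowBackwardLinks": False,
#     "allowExternalLinks": False,
#     "webhook": None,
#     "scrapeOptions": {},
# }
```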
56 changes: 56 additions & 0 deletions crewai_tools/tools/firecrawl_extract_tool/README.md
@@ -0,0 +1,56 @@
+# FirecrawlExtractTool
+
+## Description
+
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.
+
+## Installation
+
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the environment variable
+  (`FIRECRAWL_API_KEY`).
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with the `crewai[tools]`
+  package:
+
+```
+pip install firecrawl-py 'crewai[tools]'
+```
+
+## Example
+
+Utilize the `FirecrawlExtractTool` as follows to extract structured data from websites:
+
+```python
+from crewai_tools import FirecrawlExtractTool
+
+# Example schema for product information
+schema = {
+    "name": {"type": "string", "description": "Product name"},
+    "price": {"type": "number", "description": "Product price"},
+    "description": {"type": "string", "description": "Product description"}
+}
+
+tool = FirecrawlExtractTool(
+    urls=['https://example.com/products/*'],
+    prompt="Extract product information from these pages",
+    schema=schema,
+    enable_web_search=True,
+    include_subdomains=False,
+    show_sources=True,
+    scrape_options={"formats": ["markdown", "html"]}
+)
+```
+
+## Arguments
+
+| Parameter            | Required | Default                     | Description                                                                                         |
+| -------------------- | -------- | --------------------------- | --------------------------------------------------------------------------------------------------- |
+| `api_key`            | ✅        | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                                                                         |
+| `urls`               | ✅        | -                           | List of URLs to extract data from. URLs can include glob patterns                                   |
+| `prompt`             | ❌        | -                           | The prompt describing what information to extract from the pages                                    |
+| `schema`             | ❌        | -                           | JSON schema defining the structure of the data to extract                                           |
+| `enable_web_search`  | ❌        | `false`                     | When true, the extraction will use web search to find additional data                               |
+| `ignore_site_map`    | ❌        | `false`                     | When true, the extraction will not use the _sitemap.xml_ to find additional data                    |
+| `include_subdomains` | ❌        | `true`                      | When true, subdomains of the provided URLs will also be scanned                                     |
+| `show_sources`       | ❌        | `false`                     | When true, the sources used to extract the data will be included in the response as `sources` key   |
+| `scrape_options`     | ❌        | `{}`                        | Additional options for the crawl request                                                            |
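For reference, the same extraction can also be requested at call time via `run` (a sketch; assumes `FIRECRAWL_API_KEY` is set and firecrawl-py is installed):

```python
from crewai_tools import FirecrawlExtractTool

tool = FirecrawlExtractTool()

result = tool.run(
    urls=["https://example.com/products/*"],  # placeholder URL pattern
    prompt="Extract product information from these pages",
    schema={
        "name": {"type": "string", "description": "Product name"},
        "price": {"type": "number", "description": "Product price"},
    },
    show_sources=True,  # include a `sources` key in the response
)
print(result)
```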
120 changes: 120 additions & 0 deletions crewai_tools/tools/firecrawl_extract_tool/firecrawl_extract_tool.py
@@ -0,0 +1,120 @@
+from typing import Any, Dict, List, Optional, Type, Union
+
+from crewai.tools import BaseTool
+from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
+
+try:
+    from firecrawl import FirecrawlApp
+except ImportError:
+    FirecrawlApp = Any
+
+
+class FirecrawlExtractToolSchema(BaseModel):
+    urls: List[str] = Field(
+        description="List of URLs to extract data from. URLs can include glob patterns"
+    )
+    prompt: Optional[str] = Field(
+        default=None,
+        description="The prompt describing what information to extract from the pages"
+    )
+    schema: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="JSON schema defining the structure of the data to extract",
+    )
+    enable_web_search: Optional[bool] = Field(
+        default=False,
+        description="When true, the extraction will use web search to find additional data",
+    )
+    ignore_site_map: Optional[bool] = Field(
+        default=False,
+        description="When true, the extraction will not use the sitemap.xml to find additional data",
+    )
+    include_subdomains: Optional[bool] = Field(
+        default=True,
+        description="When true, subdomains of the provided URLs will also be scanned",
+    )
+    show_sources: Optional[bool] = Field(
+        default=False,
+        description="When true, the sources used to extract the data will be included in the response as sources key",
+    )
+    scrape_options: Optional[Dict[str, Any]] = Field(
+        default={},
+        description="Additional options for the crawl request",
+    )
+
+
+class FirecrawlExtractTool(BaseTool):
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True, validate_assignment=True, frozen=False
+    )
+    name: str = "Firecrawl extract tool"
+    description: str = "Extract structured data from webpages using Firecrawl and LLMs"
+    args_schema: Type[BaseModel] = FirecrawlExtractToolSchema
+    api_key: Optional[str] = None
+    _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
+
+    def __init__(self, api_key: Optional[str] = None, **kwargs):
+        super().__init__(**kwargs)
+        self.api_key = api_key
+        self._initialize_firecrawl()
+
+    def _initialize_firecrawl(self) -> None:
+        try:
+            from firecrawl import FirecrawlApp  # type: ignore
+
+            self._firecrawl = FirecrawlApp(api_key=self.api_key)
+        except ImportError:
+            import click
+
+            if click.confirm(
+                "You are missing the 'firecrawl-py' package. Would you like to install it?"
+            ):
+                import subprocess
+
+                try:
+                    subprocess.run(["uv", "add", "firecrawl-py"], check=True)
+                    from firecrawl import FirecrawlApp
+
+                    self._firecrawl = FirecrawlApp(api_key=self.api_key)
+                except subprocess.CalledProcessError:
+                    raise ImportError("Failed to install firecrawl-py package")
+            else:
+                raise ImportError(
+                    "`firecrawl-py` package not found, please run `uv add firecrawl-py`"
+                )
+
+    def _run(
+        self,
+        urls: List[str],
+        prompt: Optional[str] = None,
+        schema: Optional[Dict[str, Any]] = None,
+        enable_web_search: Optional[bool] = False,
+        ignore_site_map: Optional[bool] = False,
+        include_subdomains: Optional[bool] = True,
+        show_sources: Optional[bool] = False,
+        scrape_options: Optional[Dict[str, Any]] = None,
+    ) -> Any:
+        options = {
+            "urls": urls,
+            "prompt": prompt,
+            "schema": schema,
+            "enableWebSearch": enable_web_search,
+            "ignoreSiteMap": ignore_site_map,
+            "includeSubdomains": include_subdomains,
+            "showSources": show_sources,
+            "scrapeOptions": scrape_options or {},
+        }
+        return self._firecrawl.extract(**options)
+
+
+try:
+    from firecrawl import FirecrawlApp
+
+    # Must rebuild model after class is defined
+    if not hasattr(FirecrawlExtractTool, "_model_rebuilt"):
+        FirecrawlExtractTool.model_rebuild()
+        FirecrawlExtractTool._model_rebuilt = True
+except ImportError:
+    # When this tool is not used, the missing dependency can be ignored.
+    pass
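Finally, a sketch of wiring the new tool into an agent (the role, goal, and backstory strings are placeholders):

```python
from crewai import Agent
from crewai_tools import FirecrawlExtractTool

extract_tool = FirecrawlExtractTool()

researcher = Agent(
    role="Product researcher",
    goal="Collect structured product data from vendor sites",
    backstory="An analyst who relies on Firecrawl for clean web data.",
    tools=[extract_tool],  # the agent can now call the Firecrawl extract tool
)
```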