diff --git a/crewai_tools/__init__.py b/crewai_tools/__init__.py
index 4d2ea7e1..03aa9d6c 100644
--- a/crewai_tools/__init__.py
+++ b/crewai_tools/__init__.py
@@ -14,6 +14,7 @@
FileReadTool,
FileWriterTool,
FirecrawlCrawlWebsiteTool,
+ FirecrawlExtractTool,
FirecrawlScrapeWebsiteTool,
FirecrawlSearchTool,
GithubSearchTool,
diff --git a/crewai_tools/tools/__init__.py b/crewai_tools/tools/__init__.py
index 4a9786fe..7382fd80 100644
--- a/crewai_tools/tools/__init__.py
+++ b/crewai_tools/tools/__init__.py
@@ -15,6 +15,9 @@
from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
FirecrawlCrawlWebsiteTool,
)
+from .firecrawl_extract_tool.firecrawl_extract_tool import (
+ FirecrawlExtractTool,
+)
from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
FirecrawlScrapeWebsiteTool,
)
diff --git a/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
index 46d01160..13e2b861 100644
--- a/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
+++ b/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
@@ -2,12 +2,15 @@
## Description
-[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.
## Installation
-- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
-- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the
+  `FIRECRAWL_API_KEY` environment variable.
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]`
+ package:
```
pip install firecrawl-py 'crewai[tools]'
@@ -15,7 +18,7 @@ pip install firecrawl-py 'crewai[tools]'
## Example
-Utilize the FirecrawlScrapeFromWebsiteTool as follows to allow your agent to load websites:
+Utilize the `FirecrawlCrawlWebsiteTool` as follows to allow your agent to crawl websites:
```python
from crewai_tools import FirecrawlCrawlWebsiteTool
@@ -25,18 +28,20 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
## Arguments
-- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `url`: The base URL to start crawling from.
-- `page_options`: Optional.
- - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
- - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `crawler_options`: Optional. Options for controlling the crawling behavior.
- - `includes`: Optional. URL patterns to include in the crawl.
- - `exclude`: Optional. URL patterns to exclude from the crawl.
- - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
- - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
- - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
- - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
- - `limit`: Optional. Maximum number of pages to crawl.
- - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
-
+> Documentation for the parameters can be found
+> [here](https://docs.firecrawl.dev/api-reference/endpoint/crawl-post).
+
+| Parameter | Required | Default | Description |
+| ------------------------- | -------- | --------------------------- | ------------------------------------------------------------------------ |
+| `api_key`                 | ❌       | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                                                |
+| `url` | ✅ | - | The base URL to start crawling from |
+| `exclude_paths` | ❌ | - | URL patterns to exclude from the crawl |
+| `include_paths` | ❌ | - | URL patterns to include in the crawl |
+| `max_depth` | ❌ | `2` | Maximum depth to crawl relative to the entered URL |
+| `ignore_sitemap` | ❌ | `false` | Ignore the website sitemap when crawling |
+| `ignore_query_parameters` | ❌       | `false`                     | Do not re-scrape the same path with different (or no) query parameters    |
+| `limit` | ❌ | `10000` | Maximum number of pages to crawl |
+| `allow_backward_links` | ❌ | `false` | Enables crawling previously linked pages |
+| `allow_external_links` | ❌ | `false` | Allows crawling external websites |
+| `webhook` | ❌ | - | Webhook configuration for crawl notifications |
+| `scrape_options` | ❌ | - | Options for scraping pages during crawl |
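+
+For instance, a focused crawl might combine several of these parameters. The sketch below is
+illustrative (the paths and limits shown are not defaults):
+
+```python
+from crewai_tools import FirecrawlCrawlWebsiteTool
+
+tool = FirecrawlCrawlWebsiteTool()
+
+# Crawl only blog pages, at most 50 of them, up to three levels deep
+result = tool.run(
+    url='https://firecrawl.dev',
+    include_paths=['blog/*'],
+    max_depth=3,
+    limit=50,
+)
+```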
diff --git a/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index b95199c8..bdcc2218 100644
--- a/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, Optional, Type, List, Union
from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -12,12 +12,45 @@
class FirecrawlCrawlWebsiteToolSchema(BaseModel):
url: str = Field(description="Website URL")
- crawler_options: Optional[Dict[str, Any]] = Field(
- default=None, description="Options for crawling"
+ exclude_paths: Optional[List[str]] = Field(
+ default=None,
+ description="URL patterns to exclude from the crawl",
)
- timeout: Optional[int] = Field(
- default=30000,
- description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
+ include_paths: Optional[List[str]] = Field(
+ default=None,
+ description="URL patterns to include in the crawl",
+ )
+ max_depth: Optional[int] = Field(
+ default=2,
+ description="Maximum depth to crawl relative to the entered URL",
+ )
+ ignore_sitemap: Optional[bool] = Field(
+ default=False,
+ description="Ignore the website sitemap when crawling",
+ )
+ ignore_query_parameters: Optional[bool] = Field(
+ default=False,
+ description="Do not re-scrape the same path with different (or none) query parameters",
+ )
+ limit: Optional[int] = Field(
+ default=10000,
+ description="Maximum number of pages to crawl",
+ )
+ allow_backward_links: Optional[bool] = Field(
+ default=False,
+ description="Enables the crawler to navigate from a specific URL to previously linked pages",
+ )
+ allow_external_links: Optional[bool] = Field(
+ default=False,
+ description="Allows the crawler to follow links to external websites",
+ )
+ webhook: Optional[Union[str, Dict[str, Any]]] = Field(
+ default=None,
+ description="Webhook configuration for crawl notifications",
+ )
+ scrape_options: Optional[Dict[str, Any]] = Field(
+ default=None,
+ description="Options for scraping pages during crawl",
)
@@ -64,15 +97,28 @@ def _initialize_firecrawl(self) -> None:
def _run(
self,
url: str,
- crawler_options: Optional[Dict[str, Any]] = None,
- timeout: Optional[int] = 30000,
+ exclude_paths: Optional[List[str]] = None,
+ include_paths: Optional[List[str]] = None,
+ max_depth: Optional[int] = 2,
+ ignore_sitemap: Optional[bool] = False,
+ ignore_query_parameters: Optional[bool] = False,
+ limit: Optional[int] = 10000,
+ allow_backward_links: Optional[bool] = False,
+ allow_external_links: Optional[bool] = False,
+ webhook: Optional[Union[str, Dict[str, Any]]] = None,
+ scrape_options: Optional[Dict[str, Any]] = None
):
- if crawler_options is None:
- crawler_options = {}
-
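+        # Firecrawl's crawl endpoint expects camelCase option keys, so the
+        # snake_case arguments are remapped here.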
options = {
- "crawlerOptions": crawler_options,
- "timeout": timeout,
+ "excludePaths": exclude_paths or [],
+ "includePaths": include_paths or [],
+ "maxDepth": max_depth,
+ "ignoreSitemap": ignore_sitemap,
+ "ignoreQueryParameters": ignore_query_parameters,
+ "limit": limit,
+ "allowBackwardLinks": allow_backward_links,
+ "allowExternalLinks": allow_external_links,
+ "webhook": webhook,
+ "scrapeOptions": scrape_options or {},
}
return self._firecrawl.crawl_url(url, options)
diff --git a/crewai_tools/tools/firecrawl_extract_tool/README.md b/crewai_tools/tools/firecrawl_extract_tool/README.md
new file mode 100644
index 00000000..e5b3fced
--- /dev/null
+++ b/crewai_tools/tools/firecrawl_extract_tool/README.md
@@ -0,0 +1,56 @@
+# FirecrawlExtractTool
+
+## Description
+
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.
+
+## Installation
+
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the
+  `FIRECRAWL_API_KEY` environment variable.
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]`
+ package:
+
+```
+pip install firecrawl-py 'crewai[tools]'
+```
+
+## Example
+
+Utilize the `FirecrawlExtractTool` as follows to extract structured data from websites:
+
+```python
+from crewai_tools import FirecrawlExtractTool
+
+# Example schema for product information
+schema = {
+ "name": {"type": "string", "description": "Product name"},
+ "price": {"type": "number", "description": "Product price"},
+ "description": {"type": "string", "description": "Product description"}
+}
+
+tool = FirecrawlExtractTool(
+ urls=['https://example.com/products/*'],
+ prompt="Extract product information from these pages",
+ schema=schema,
+ enable_web_search=True,
+ include_subdomains=False,
+ show_sources=True,
+ scrape_options={"formats": ["markdown", "html"]}
+)
+```
+
+## Arguments
+
+| Parameter | Required | Default | Description |
+| -------------------- | -------- | --------------------------- | ------------------------------------------------------------------------------------------------- |
+| `api_key`            | ❌       | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                                                                         |
+| `urls` | ✅ | - | List of URLs to extract data from. URLs can include glob patterns |
+| `prompt` | ❌ | - | The prompt describing what information to extract from the pages |
+| `schema` | ❌ | - | JSON schema defining the structure of the data to extract |
+| `enable_web_search` | ❌ | `false` | When true, the extraction will use web search to find additional data |
+| `ignore_site_map` | ❌ | `false` | When true, the extraction will not use the _sitemap.xml_ to find additional data |
+| `include_subdomains` | ❌ | `true` | When true, subdomains of the provided URLs will also be scanned |
+| `show_sources` | ❌ | `false` | When true, the sources used to extract the data will be included in the response as `sources` key |
+| `scrape_options`     | ❌       | `{}`                        | Additional scrape options for pages fetched during extraction                                       |
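+
+The configured tool can then be handed to an agent like any other CrewAI tool; the agent below is
+purely illustrative:
+
+```python
+from crewai import Agent
+
+researcher = Agent(
+    role="Product researcher",
+    goal="Compile structured product data from vendor sites",
+    backstory="An analyst who turns product pages into clean datasets.",
+    tools=[tool],  # the FirecrawlExtractTool instance from the example above
+)
+```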
diff --git a/crewai_tools/tools/firecrawl_extract_tool/firecrawl_extract_tool.py b/crewai_tools/tools/firecrawl_extract_tool/firecrawl_extract_tool.py
new file mode 100644
index 00000000..da30c345
--- /dev/null
+++ b/crewai_tools/tools/firecrawl_extract_tool/firecrawl_extract_tool.py
@@ -0,0 +1,120 @@
+from typing import Any, Dict, List, Optional, Type, Union
+
+from crewai.tools import BaseTool
+from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
+
+try:
+ from firecrawl import FirecrawlApp
+except ImportError:
+ FirecrawlApp = Any
+
+
+class FirecrawlExtractToolSchema(BaseModel):
+ urls: List[str] = Field(
+ description="List of URLs to extract data from. URLs can include glob patterns"
+ )
+ prompt: Optional[str] = Field(
+ default=None,
+ description="The prompt describing what information to extract from the pages"
+ )
+ schema: Optional[Dict[str, Any]] = Field(
+ default=None,
+ description="JSON schema defining the structure of the data to extract",
+ )
+ enable_web_search: Optional[bool] = Field(
+ default=False,
+ description="When true, the extraction will use web search to find additional data",
+ )
+ ignore_site_map: Optional[bool] = Field(
+ default=False,
+ description="When true, the extraction will not use the sitemap.xml to find additional data",
+ )
+ include_subdomains: Optional[bool] = Field(
+ default=True,
+ description="When true, subdomains of the provided URLs will also be scanned",
+ )
+ show_sources: Optional[bool] = Field(
+ default=False,
+ description="When true, the sources used to extract the data will be included in the response as sources key",
+ )
+    scrape_options: Optional[Dict[str, Any]] = Field(
+        default_factory=dict,
+        description="Additional scrape options for pages fetched during extraction",
+    )
+
+
+class FirecrawlExtractTool(BaseTool):
+ model_config = ConfigDict(
+ arbitrary_types_allowed=True, validate_assignment=True, frozen=False
+ )
+ name: str = "Firecrawl extract tool"
+ description: str = "Extract structured data from webpages using Firecrawl and LLMs"
+ args_schema: Type[BaseModel] = FirecrawlExtractToolSchema
+ api_key: Optional[str] = None
+ _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
+
+ def __init__(self, api_key: Optional[str] = None, **kwargs):
+ super().__init__(**kwargs)
+ self.api_key = api_key
+ self._initialize_firecrawl()
+
+ def _initialize_firecrawl(self) -> None:
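+        # Import lazily so firecrawl-py stays an optional dependency; if it is
+        # missing, offer an interactive install before failing.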
+ try:
+ from firecrawl import FirecrawlApp # type: ignore
+
+ self._firecrawl = FirecrawlApp(api_key=self.api_key)
+ except ImportError:
+ import click
+
+ if click.confirm(
+ "You are missing the 'firecrawl-py' package. Would you like to install it?"
+ ):
+ import subprocess
+
+ try:
+ subprocess.run(["uv", "add", "firecrawl-py"], check=True)
+ from firecrawl import FirecrawlApp
+
+ self._firecrawl = FirecrawlApp(api_key=self.api_key)
+ except subprocess.CalledProcessError:
+ raise ImportError("Failed to install firecrawl-py package")
+ else:
+ raise ImportError(
+ "`firecrawl-py` package not found, please run `uv add firecrawl-py`"
+ )
+
+ def _run(
+ self,
+ urls: List[str],
+ prompt: Optional[str] = None,
+ schema: Optional[Dict[str, Any]] = None,
+ enable_web_search: Optional[bool] = False,
+ ignore_site_map: Optional[bool] = False,
+ include_subdomains: Optional[bool] = True,
+ show_sources: Optional[bool] = False,
+ scrape_options: Optional[Dict[str, Any]] = None,
+ ) -> Any:
+ options = {
+ "urls": urls,
+ "prompt": prompt,
+ "schema": schema,
+ "enableWebSearch": enable_web_search,
+ "ignoreSiteMap": ignore_site_map,
+ "includeSubdomains": include_subdomains,
+ "showSources": show_sources,
+ "scrapeOptions": scrape_options or {},
+ }
+ return self._firecrawl.extract(**options)
+
+
+try:
+ from firecrawl import FirecrawlApp
+
+ # Must rebuild model after class is defined
+ if not hasattr(FirecrawlExtractTool, "_model_rebuilt"):
+ FirecrawlExtractTool.model_rebuild()
+ FirecrawlExtractTool._model_rebuilt = True
+except ImportError:
+ """
+    When this tool is not used, the import error can be ignored.
+ """
\ No newline at end of file
diff --git a/crewai_tools/tools/firecrawl_scrape_website_tool/README.md b/crewai_tools/tools/firecrawl_scrape_website_tool/README.md
index 93570f06..9ebad5c6 100644
--- a/crewai_tools/tools/firecrawl_scrape_website_tool/README.md
+++ b/crewai_tools/tools/firecrawl_scrape_website_tool/README.md
@@ -2,12 +2,15 @@
## Description
-[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.
## Installation
-- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
-- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the
+  `FIRECRAWL_API_KEY` environment variable.
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]`
+ package:
```
pip install firecrawl-py 'crewai[tools]'
@@ -15,7 +18,7 @@ pip install firecrawl-py 'crewai[tools]'
## Example
-Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load websites:
+Utilize the `FirecrawlScrapeWebsiteTool` as follows to allow your agent to load websites:
```python
from crewai_tools import FirecrawlScrapeWebsiteTool
@@ -25,14 +28,28 @@ tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev')
## Arguments
-- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `url`: The URL to scrape.
-- `page_options`: Optional.
- - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
- - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content
- - `mode`: The extraction mode to use, currently supports 'llm-extraction'
- - `extractionPrompt`: Optional. A prompt describing what information to extract from the page
- - `extractionSchema`: Optional. The schema for the data to be extracted
-- `timeout`: Optional. Timeout in milliseconds for the request
-
+> Documentation for the parameters can be found
+> [here](https://docs.firecrawl.dev/api-reference/endpoint/scrape).
+
+| Parameter | Required | Default | Description |
+| ---------------------------- | -------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `api_key`                    | ❌       | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key |
+| `url` | ✅ | - | The URL to scrape |
+| `formats` | ❌ | `["markdown"]` | List of formats to include in the output (`markdown`, `html`, `rawHtml`, `links`, `screenshot`, `screenshot@fullPage`, `json`) |
+| `only_main_content` | ❌ | `true` | Only return the main content of the page excluding headers, navs, footers, etc. |
+| `include_tags` | ❌ | - | List of HTML tags to include in the output |
+| `exclude_tags` | ❌ | - | List of HTML tags to exclude from the output |
+| `headers` | ❌ | - | Headers to send with the request (e.g., cookies, user-agent) |
+| `wait_for` | ❌ | `0` | Delay in milliseconds before fetching content |
+| `mobile` | ❌ | `false` | Set to true to emulate scraping from a mobile device |
+| `skip_tls_verification` | ❌ | `false` | Skip TLS certificate verification when making requests |
+| `timeout` | ❌ | `30000` | Timeout in milliseconds for the request |
+| `json_options` | ❌ | - | Options for JSON extraction from the page |
+| `json_options.schema` | ❌ | - | The schema to use for the extraction |
+| `json_options.system_prompt` | ❌ | - | The system prompt to use for the extraction |
+| `json_options.prompt` | ❌ | - | The prompt to use for the extraction without a schema |
+| `location` | ❌ | - | Location settings for the request (country code and languages) |
+| `remove_base64_images`       | ❌       | `false`                     | Remove base64 encoded images from output |
+| `block_ads`                  | ❌       | `true`                      | Enables ad-blocking and cookie popup blocking |
+| `actions` | ❌ | - | List of actions to perform on the page before scraping (e.g., click, scroll, wait) |
+| `proxy`                      | ❌       | -                           | Specifies the type of proxy to use (`basic`, `stealth`). **basic:** proxies for scraping sites with none to basic anti-bot solutions; fast and usually works. **stealth:** stealth proxies for scraping sites with advanced anti-bot solutions; slower, but more reliable on certain sites. |
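+
+As a sketch of the richer options (all values below are illustrative), a scrape that also collects
+links and waits for dynamic content might look like:
+
+```python
+from crewai_tools import FirecrawlScrapeWebsiteTool
+
+tool = FirecrawlScrapeWebsiteTool()
+
+result = tool.run(
+    url='https://firecrawl.dev',
+    formats=['markdown', 'links'],
+    wait_for=1000,  # give client-side rendering a second to finish
+)
+```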
diff --git a/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py b/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
index 8530aa71..3d72b9d0 100644
--- a/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
+++ b/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
@@ -1,4 +1,4 @@
-from typing import Any, Optional, Type
+from typing import Any, Optional, Type, List, Dict
from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -11,9 +11,65 @@
class FirecrawlScrapeWebsiteToolSchema(BaseModel):
url: str = Field(description="Website URL")
+ formats: Optional[List[str]] = Field(
+ default=["markdown"],
+ description="Formats to include in the output (markdown, html, rawHtml, links, screenshot)",
+ )
+ only_main_content: Optional[bool] = Field(
+ default=True,
+ description="Only return the main content of the page excluding headers, navs, footers, etc.",
+ )
+ include_tags: Optional[List[str]] = Field(
+ default=None,
+ description="Tags to include in the output",
+ )
+ exclude_tags: Optional[List[str]] = Field(
+ default=None,
+ description="Tags to exclude from the output",
+ )
+ headers: Optional[Dict[str, str]] = Field(
+ default=None,
+ description="Headers to send with the request",
+ )
+ wait_for: Optional[int] = Field(
+ default=0,
+ description="Specify a delay in milliseconds before fetching the content",
+ )
+ mobile: Optional[bool] = Field(
+ default=False,
+ description="Set to true if you want to emulate scraping from a mobile device",
+ )
+ skip_tls_verification: Optional[bool] = Field(
+ default=False,
+ description="Skip TLS certificate verification when making requests",
+ )
timeout: Optional[int] = Field(
default=30000,
- description="Timeout in milliseconds for the scraping operation. The default value is 30000.",
+ description="Timeout in milliseconds for the scraping operation",
+ )
+ json_options: Optional[Dict[str, Any]] = Field(
+ default=None,
+ description="Options for JSON extraction from the page",
+ )
+ location: Optional[Dict[str, Any]] = Field(
+ default=None,
+ description="Location settings for the request",
+ )
+ remove_base64_images: Optional[bool] = Field(
+ default=False,
+ description="Removes all base 64 images from the output",
+ )
+ block_ads: Optional[bool] = Field(
+ default=True,
+ description="Enables ad-blocking and cookie popup blocking",
+ )
+ actions: Optional[List[Dict[str, Any]]] = Field(
+ default=None,
+ description="Actions to perform on the page before grabbing the content",
+ )
+ proxy: Optional[str] = Field(
+ default=None,
+ description="Specifies the type of proxy to use",
)
@@ -53,16 +109,38 @@ def __init__(self, api_key: Optional[str] = None, **kwargs):
def _run(
self,
url: str,
+        formats: Optional[List[str]] = None,
+ only_main_content: Optional[bool] = True,
+ include_tags: Optional[List[str]] = None,
+ exclude_tags: Optional[List[str]] = None,
+ headers: Optional[Dict[str, str]] = None,
+ wait_for: Optional[int] = 0,
+ mobile: Optional[bool] = False,
+ skip_tls_verification: Optional[bool] = False,
timeout: Optional[int] = 30000,
+ json_options: Optional[Dict[str, Any]] = None,
+ location: Optional[Dict[str, Any]] = None,
+ remove_base64_images: Optional[bool] = False,
+ block_ads: Optional[bool] = True,
+ actions: Optional[List[Dict[str, Any]]] = None,
+ proxy: Optional[str] = None,
):
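+        # Firecrawl's scrape endpoint expects camelCase keys; remap the snake_case arguments.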
options = {
- "formats": ["markdown"],
- "onlyMainContent": True,
- "includeTags": [],
- "excludeTags": [],
- "headers": {},
- "waitFor": 0,
+ "formats": formats,
+ "onlyMainContent": only_main_content,
+ "includeTags": include_tags or [],
+ "excludeTags": exclude_tags or [],
+ "headers": headers or {},
+ "waitFor": wait_for,
+ "mobile": mobile,
+ "skipTlsVerification": skip_tls_verification,
"timeout": timeout,
+ "jsonOptions": json_options,
+ "location": location,
+ "removeBase64Images": remove_base64_images,
+ "blockAds": block_ads,
+ "actions": actions or [],
+ "proxy": proxy,
}
return self._firecrawl.scrape_url(url, options)
diff --git a/crewai_tools/tools/firecrawl_search_tool/README.md b/crewai_tools/tools/firecrawl_search_tool/README.md
index effb3f3d..6c14a1ac 100644
--- a/crewai_tools/tools/firecrawl_search_tool/README.md
+++ b/crewai_tools/tools/firecrawl_search_tool/README.md
@@ -2,12 +2,15 @@
## Description
-[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.
## Installation
-- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
-- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in the
+  `FIRECRAWL_API_KEY` environment variable.
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]`
+ package:
```
pip install firecrawl-py 'crewai[tools]'
@@ -15,7 +18,7 @@ pip install firecrawl-py 'crewai[tools]'
## Example
-Utilize the FirecrawlSearchTool as follows to allow your agent to load websites:
+Utilize the `FirecrawlSearchTool` as follows to allow your agent to search the web:
```python
from crewai_tools import FirecrawlSearchTool
@@ -25,11 +28,14 @@ tool = FirecrawlSearchTool(query='what is firecrawl?')
## Arguments
-- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `query`: The search query string to be used for searching.
-- `page_options`: Optional. Options for result formatting.
- - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
- - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
- - `fetchPageContent`: Optional. Fetch the full content of the page.
-- `search_options`: Optional. Options for controlling the crawling behavior.
- - `limit`: Optional. Maximum number of pages to crawl.
\ No newline at end of file
+| Parameter | Required | Default | Description |
+| ---------------- | -------- | --------------------------- | -------------------------------------------------- |
+| `api_key`        | ❌       | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                         |
+| `query`          | ✅       | -                           | The search query string to be used for searching    |
+| `limit`          | ❌       | `5`                         | Maximum number of results to return (1 to 10)       |
+| `tbs` | ❌ | - | Time-based search parameter |
+| `lang` | ❌ | `"en"` | Language code for search results |
+| `country` | ❌ | `"us"` | Country code for search results |
+| `location` | ❌ | - | Location parameter for search results |
+| `timeout` | ❌ | `60000` | Timeout in milliseconds |
+| `scrape_options` | ❌ | - | Options for scraping search results |
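+
+For example, a search that also scrapes each result into markdown could look like the following
+sketch (parameter values are illustrative):
+
+```python
+from crewai_tools import FirecrawlSearchTool
+
+tool = FirecrawlSearchTool()
+
+result = tool.run(
+    query='what is firecrawl?',
+    limit=3,
+    scrape_options={"formats": ["markdown"]},
+)
+```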
diff --git a/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py b/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py
index f7f4f367..4efc7d26 100644
--- a/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py
+++ b/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, Optional, Type
+from typing import TYPE_CHECKING, Any, Dict, Optional, Type, List, Union
from crewai.tools import BaseTool
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -27,19 +27,18 @@ class FirecrawlSearchToolSchema(BaseModel):
country: Optional[str] = Field(
default="us", description="Country code for search results"
)
- location: Optional[str] = Field(
- default=None, description="Location parameter for search results"
+ location: Optional[Dict[str, Any]] = Field(
+ default=None,
+ description="Location settings for the request",
)
timeout: Optional[int] = Field(default=60000, description="Timeout in milliseconds")
scrape_options: Optional[Dict[str, Any]] = Field(
- default=None, description="Options for scraping search results"
+ default=None,
+ description="Options for scraping search results",
)
class FirecrawlSearchTool(BaseTool):
- model_config = ConfigDict(
- arbitrary_types_allowed=True, validate_assignment=True, frozen=False
- )
model_config = ConfigDict(
arbitrary_types_allowed=True, validate_assignment=True, frozen=False
)
@@ -87,21 +86,22 @@ def _run(
tbs: Optional[str] = None,
lang: Optional[str] = "en",
country: Optional[str] = "us",
- location: Optional[str] = None,
+ location: Optional[Dict[str, Any]] = None,
timeout: Optional[int] = 60000,
- scrape_options: Optional[Dict[str, Any]] = None,
+ scrape_options: Optional[Dict[str, Any]] = None
) -> Any:
if not self.firecrawl:
raise RuntimeError("FirecrawlApp not properly initialized")
options = {
+ "query": query,
"limit": limit,
"tbs": tbs,
"lang": lang,
"country": country,
"location": location,
"timeout": timeout,
- "scrapeOptions": scrape_options or {},
+ "scrapeOptions": scrape_options or {}
}
return self.firecrawl.search(**options)
diff --git a/pyproject.toml b/pyproject.toml
index 24749368..f6633c9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ multion = [
"multion>=1.1.0",
]
firecrawl-py = [
- "firecrawl-py>=1.8.0",
+ "firecrawl-py>=1.12.0",
]
composio-core = [
"composio-core>=0.6.11.post1",