diff --git a/crewai_tools/__init__.py b/crewai_tools/__init__.py
index 4d2ea7e1..03aa9d6c 100644
--- a/crewai_tools/__init__.py
+++ b/crewai_tools/__init__.py
@@ -14,6 +14,7 @@
     FileReadTool,
     FileWriterTool,
     FirecrawlCrawlWebsiteTool,
+    FirecrawlExtractTool,
     FirecrawlScrapeWebsiteTool,
     FirecrawlSearchTool,
     GithubSearchTool,
diff --git a/crewai_tools/tools/__init__.py b/crewai_tools/tools/__init__.py
index 4a9786fe..7382fd80 100644
--- a/crewai_tools/tools/__init__.py
+++ b/crewai_tools/tools/__init__.py
@@ -15,6 +15,9 @@
 from .firecrawl_crawl_website_tool.firecrawl_crawl_website_tool import (
     FirecrawlCrawlWebsiteTool,
 )
+from .firecrawl_extract_tool.firecrawl_extract_tool import (
+    FirecrawlExtractTool,
+)
 from .firecrawl_scrape_website_tool.firecrawl_scrape_website_tool import (
     FirecrawlScrapeWebsiteTool,
 )
diff --git a/crewai_tools/tools/firecrawl_crawl_website_tool/README.md b/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
index 46d01160..13e2b861 100644
--- a/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
+++ b/crewai_tools/tools/firecrawl_crawl_website_tool/README.md
@@ -2,12 +2,15 @@

 ## Description

-[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.

 ## Installation

-- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
-- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables
+  (`FIRECRAWL_API_KEY`).
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]`
+  package:

 ```
 pip install firecrawl-py 'crewai[tools]'
@@ -15,7 +18,7 @@ pip install firecrawl-py 'crewai[tools]'
 ```

 ## Example

-Utilize the FirecrawlScrapeFromWebsiteTool as follows to allow your agent to load websites:
+Utilize the `FirecrawlCrawlWebsiteTool` as follows to allow your agent to load websites:

 ```python
 from crewai_tools import FirecrawlCrawlWebsiteTool
@@ -25,18 +28,20 @@ tool = FirecrawlCrawlWebsiteTool(url='firecrawl.dev')
 ```

 ## Arguments

-- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `url`: The base URL to start crawling from.
-- `page_options`: Optional.
-  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-- `crawler_options`: Optional. Options for controlling the crawling behavior.
-  - `includes`: Optional. URL patterns to include in the crawl.
-  - `exclude`: Optional. URL patterns to exclude from the crawl.
-  - `generateImgAltText`: Optional. Generate alt text for images using LLMs (requires a paid plan).
-  - `returnOnlyUrls`: Optional. If true, returns only the URLs as a list in the crawl status. Note: the response will be a list of URLs inside the data, not a list of documents.
-  - `maxDepth`: Optional. Maximum depth to crawl. Depth 1 is the base URL, depth 2 includes the base URL and its direct children, and so on.
-  - `mode`: Optional. The crawling mode to use. Fast mode crawls 4x faster on websites without a sitemap but may not be as accurate and shouldn't be used on heavily JavaScript-rendered websites.
-  - `limit`: Optional. Maximum number of pages to crawl.
-  - `timeout`: Optional. Timeout in milliseconds for the crawling operation.
-
+> Documentation for the parameters can be found
+> [here](https://docs.firecrawl.dev/api-reference/endpoint/crawl-post).
+
+| Parameter                 | Required | Default                     | Description                                                               |
+| ------------------------- | -------- | --------------------------- | ------------------------------------------------------------------------- |
+| `api_key`                 | ❌       | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                                               |
+| `url`                     | ✅       | -                           | The base URL to start crawling from                                       |
+| `exclude_paths`           | ❌       | -                           | URL patterns to exclude from the crawl                                    |
+| `include_paths`           | ❌       | -                           | URL patterns to include in the crawl                                      |
+| `max_depth`               | ❌       | `2`                         | Maximum depth to crawl relative to the entered URL                        |
+| `ignore_sitemap`          | ❌       | `false`                     | Ignore the website sitemap when crawling                                  |
+| `ignore_query_parameters` | ❌       | `false`                     | Do not re-scrape the same path with different (or none) query parameters  |
+| `limit`                   | ❌       | `10000`                     | Maximum number of pages to crawl                                          |
+| `allow_backward_links`    | ❌       | `false`                     | Enables crawling previously linked pages                                  |
+| `allow_external_links`    | ❌       | `false`                     | Allows crawling external websites                                         |
+| `webhook`                 | ❌       | -                           | Webhook configuration for crawl notifications                             |
+| `scrape_options`          | ❌       | -                           | Options for scraping pages during crawl                                   |
diff --git a/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py b/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
index b95199c8..bdcc2218 100644
--- a/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
+++ b/crewai_tools/tools/firecrawl_crawl_website_tool/firecrawl_crawl_website_tool.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, Optional, Type, List, Union

 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -12,12 +12,45 @@
 class FirecrawlCrawlWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
-    crawler_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for crawling"
+    exclude_paths: Optional[List[str]] = Field(
+        default=None,
+        description="URL patterns to exclude from the crawl",
     )
-    timeout: Optional[int] = Field(
-        default=30000,
-        description="Timeout in milliseconds for the crawling operation. The default value is 30000.",
+    include_paths: Optional[List[str]] = Field(
+        default=None,
+        description="URL patterns to include in the crawl",
+    )
+    max_depth: Optional[int] = Field(
+        default=2,
+        description="Maximum depth to crawl relative to the entered URL",
+    )
+    ignore_sitemap: Optional[bool] = Field(
+        default=False,
+        description="Ignore the website sitemap when crawling",
+    )
+    ignore_query_parameters: Optional[bool] = Field(
+        default=False,
+        description="Do not re-scrape the same path with different (or none) query parameters",
+    )
+    limit: Optional[int] = Field(
+        default=10000,
+        description="Maximum number of pages to crawl",
+    )
+    allow_backward_links: Optional[bool] = Field(
+        default=False,
+        description="Enables the crawler to navigate from a specific URL to previously linked pages",
+    )
+    allow_external_links: Optional[bool] = Field(
+        default=False,
+        description="Allows the crawler to follow links to external websites",
+    )
+    webhook: Optional[Union[str, Dict[str, Any]]] = Field(
+        default=None,
+        description="Webhook configuration for crawl notifications",
+    )
+    scrape_options: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Options for scraping pages during crawl",
     )
@@ -64,15 +97,28 @@ def _initialize_firecrawl(self) -> None:
     def _run(
         self,
         url: str,
-        crawler_options: Optional[Dict[str, Any]] = None,
-        timeout: Optional[int] = 30000,
+        exclude_paths: Optional[List[str]] = None,
+        include_paths: Optional[List[str]] = None,
+        max_depth: Optional[int] = 2,
+        ignore_sitemap: Optional[bool] = False,
+        ignore_query_parameters: Optional[bool] = False,
+        limit: Optional[int] = 10000,
+        allow_backward_links: Optional[bool] = False,
+        allow_external_links: Optional[bool] = False,
+        webhook: Optional[Union[str, Dict[str, Any]]] = None,
+        scrape_options: Optional[Dict[str, Any]] = None
     ):
-        if crawler_options is None:
-            crawler_options = {}
-
         options = {
-            "crawlerOptions": crawler_options,
-            "timeout": timeout,
+            "excludePaths": exclude_paths or [],
+            "includePaths": include_paths or [],
+            "maxDepth": max_depth,
+            "ignoreSitemap": ignore_sitemap,
+            "ignoreQueryParameters": ignore_query_parameters,
+            "limit": limit,
+            "allowBackwardLinks": allow_backward_links,
+            "allowExternalLinks": allow_external_links,
+            "webhook": webhook,
+            "scrapeOptions": scrape_options or {},
         }
         return self._firecrawl.crawl_url(url, options)
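Note for reviewers: a minimal usage sketch of the reworked crawl arguments (the URL, path pattern, and limits are illustrative; this assumes `BaseTool.run` dispatches keyword arguments to `_run`, as elsewhere in crewai-tools):

```python
from crewai_tools import FirecrawlCrawlWebsiteTool

tool = FirecrawlCrawlWebsiteTool(api_key="fc-...")  # or rely on FIRECRAWL_API_KEY
result = tool.run(
    url="https://firecrawl.dev",
    include_paths=["blog/.*"],  # only crawl the blog section
    max_depth=2,
    limit=50,
    scrape_options={"formats": ["markdown"]},
)
```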
diff --git a/crewai_tools/tools/firecrawl_extract_tool/README.md b/crewai_tools/tools/firecrawl_extract_tool/README.md
new file mode 100644
index 00000000..e5b3fced
--- /dev/null
+++ b/crewai_tools/tools/firecrawl_extract_tool/README.md
@@ -0,0 +1,59 @@
+# FirecrawlExtractTool
+
+## Description
+
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.
+
+## Installation
+
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables
+  (`FIRECRAWL_API_KEY`).
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]`
+  package:
+
+```
+pip install firecrawl-py 'crewai[tools]'
+```
+
+## Example
+
+Utilize the `FirecrawlExtractTool` as follows to extract structured data from websites:
+
+```python
+from crewai_tools import FirecrawlExtractTool
+
+# Example JSON Schema for product information
+schema = {
+    "type": "object",
+    "properties": {
+        "name": {"type": "string", "description": "Product name"},
+        "price": {"type": "number", "description": "Product price"},
+        "description": {"type": "string", "description": "Product description"}
+    }
+}
+
+tool = FirecrawlExtractTool(
+    urls=['https://example.com/products/*'],
+    prompt="Extract product information from these pages",
+    schema=schema,
+    enable_web_search=True,
+    include_subdomains=False,
+    show_sources=True,
+    scrape_options={"formats": ["markdown", "html"]}
+)
+```
+
+## Arguments
+
+| Parameter            | Required | Default                     | Description                                                                                        |
+| -------------------- | -------- | --------------------------- | -------------------------------------------------------------------------------------------------- |
+| `api_key`            | ❌       | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                                                                        |
+| `urls`               | ✅       | -                           | List of URLs to extract data from. URLs can include glob patterns                                  |
+| `prompt`             | ❌       | -                           | The prompt describing what information to extract from the pages                                   |
+| `schema`             | ❌       | -                           | JSON schema defining the structure of the data to extract                                          |
+| `enable_web_search`  | ❌       | `false`                     | When true, the extraction will use web search to find additional data                              |
+| `ignore_site_map`    | ❌       | `false`                     | When true, the extraction will not use the _sitemap.xml_ to find additional data                   |
+| `include_subdomains` | ❌       | `true`                      | When true, subdomains of the provided URLs will also be scanned                                    |
+| `show_sources`       | ❌       | `false`                     | When true, the sources used to extract the data will be included in the response as `sources` key  |
+| `scrape_options`     | ❌       | `{}`                        | Additional options for the crawl request                                                           |
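Reviewer note: if maintainers would rather not have users hand-write JSON Schema in the example above, a Pydantic model can generate an equivalent dict. A sketch, not part of this patch (`model_json_schema` is the Pydantic v2 API; the `Product` model is hypothetical):

```python
from pydantic import BaseModel, Field

from crewai_tools import FirecrawlExtractTool


class Product(BaseModel):
    name: str = Field(description="Product name")
    price: float = Field(description="Product price")
    description: str = Field(description="Product description")


tool = FirecrawlExtractTool()
result = tool.run(
    urls=["https://example.com/products/*"],
    prompt="Extract product information from these pages",
    schema=Product.model_json_schema(),  # plain JSON-Schema dict
)
```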
diff --git a/crewai_tools/tools/firecrawl_extract_tool/firecrawl_extract_tool.py b/crewai_tools/tools/firecrawl_extract_tool/firecrawl_extract_tool.py
new file mode 100644
index 00000000..da30c345
--- /dev/null
+++ b/crewai_tools/tools/firecrawl_extract_tool/firecrawl_extract_tool.py
@@ -0,0 +1,120 @@
+from typing import Any, Dict, List, Optional, Type, Union
+
+from crewai.tools import BaseTool
+from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
+
+try:
+    from firecrawl import FirecrawlApp
+except ImportError:
+    FirecrawlApp = Any
+
+
+class FirecrawlExtractToolSchema(BaseModel):
+    urls: List[str] = Field(
+        description="List of URLs to extract data from. URLs can include glob patterns"
+    )
+    prompt: Optional[str] = Field(
+        default=None,
+        description="The prompt describing what information to extract from the pages"
+    )
+    schema: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="JSON schema defining the structure of the data to extract",
+    )
+    enable_web_search: Optional[bool] = Field(
+        default=False,
+        description="When true, the extraction will use web search to find additional data",
+    )
+    ignore_site_map: Optional[bool] = Field(
+        default=False,
+        description="When true, the extraction will not use the sitemap.xml to find additional data",
+    )
+    include_subdomains: Optional[bool] = Field(
+        default=True,
+        description="When true, subdomains of the provided URLs will also be scanned",
+    )
+    show_sources: Optional[bool] = Field(
+        default=False,
+        description="When true, the sources used to extract the data will be included in the response as sources key",
+    )
+    scrape_options: Optional[Dict[str, Any]] = Field(
+        default={},
+        description="Additional options for the crawl request",
+    )
+
+
+class FirecrawlExtractTool(BaseTool):
+    model_config = ConfigDict(
+        arbitrary_types_allowed=True, validate_assignment=True, frozen=False
+    )
+    name: str = "Firecrawl extract tool"
+    description: str = "Extract structured data from webpages using Firecrawl and LLMs"
+    args_schema: Type[BaseModel] = FirecrawlExtractToolSchema
+    api_key: Optional[str] = None
+    _firecrawl: Optional["FirecrawlApp"] = PrivateAttr(None)
+
+    def __init__(self, api_key: Optional[str] = None, **kwargs):
+        super().__init__(**kwargs)
+        self.api_key = api_key
+        self._initialize_firecrawl()
+
+    def _initialize_firecrawl(self) -> None:
+        try:
+            from firecrawl import FirecrawlApp  # type: ignore
+
+            self._firecrawl = FirecrawlApp(api_key=self.api_key)
+        except ImportError:
+            import click
+
+            if click.confirm(
+                "You are missing the 'firecrawl-py' package. Would you like to install it?"
+            ):
+                import subprocess
+
+                try:
+                    subprocess.run(["uv", "add", "firecrawl-py"], check=True)
+                    from firecrawl import FirecrawlApp
+
+                    self._firecrawl = FirecrawlApp(api_key=self.api_key)
+                except subprocess.CalledProcessError:
+                    raise ImportError("Failed to install firecrawl-py package")
+            else:
+                raise ImportError(
+                    "`firecrawl-py` package not found, please run `uv add firecrawl-py`"
+                )
+
+    def _run(
+        self,
+        urls: List[str],
+        prompt: Optional[str] = None,
+        schema: Optional[Dict[str, Any]] = None,
+        enable_web_search: Optional[bool] = False,
+        ignore_site_map: Optional[bool] = False,
+        include_subdomains: Optional[bool] = True,
+        show_sources: Optional[bool] = False,
+        scrape_options: Optional[Dict[str, Any]] = None,
+    ) -> Any:
+        options = {
+            "urls": urls,
+            "prompt": prompt,
+            "schema": schema,
+            "enableWebSearch": enable_web_search,
+            "ignoreSiteMap": ignore_site_map,
+            "includeSubdomains": include_subdomains,
+            "showSources": show_sources,
+            "scrapeOptions": scrape_options or {},
+        }
+        return self._firecrawl.extract(**options)
+
+
+try:
+    from firecrawl import FirecrawlApp
+
+    # Must rebuild model after class is defined
+    if not hasattr(FirecrawlExtractTool, "_model_rebuilt"):
+        FirecrawlExtractTool.model_rebuild()
+        FirecrawlExtractTool._model_rebuilt = True
+except ImportError:
+    """
+    When this tool is not used, this exception can safely be ignored.
+    """
\ No newline at end of file
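Since `_run` hand-builds the camelCase payload for `FirecrawlApp.extract`, the key mapping can be checked offline by stubbing the client. A test sketch, assuming `run` forwards keyword arguments to `_run`:

```python
from unittest.mock import MagicMock

from crewai_tools import FirecrawlExtractTool

tool = FirecrawlExtractTool(api_key="fc-test")
tool._firecrawl = MagicMock()  # stub out the SDK client; no network traffic

tool.run(urls=["https://example.com/*"], prompt="Extract page titles")

# _run should have translated snake_case kwargs into camelCase API keys.
kwargs = tool._firecrawl.extract.call_args.kwargs
assert kwargs["enableWebSearch"] is False
assert kwargs["includeSubdomains"] is True
assert kwargs["scrapeOptions"] == {}
```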
+ """ \ No newline at end of file diff --git a/crewai_tools/tools/firecrawl_scrape_website_tool/README.md b/crewai_tools/tools/firecrawl_scrape_website_tool/README.md index 93570f06..9ebad5c6 100644 --- a/crewai_tools/tools/firecrawl_scrape_website_tool/README.md +++ b/crewai_tools/tools/firecrawl_scrape_website_tool/README.md @@ -2,12 +2,15 @@ ## Description -[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data. +[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean +markdown or structured data. ## Installation -- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`). -- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package: +- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables + (`FIRECRAWL_API_KEY`). +- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` + package: ``` pip install firecrawl-py 'crewai[tools]' @@ -15,7 +18,7 @@ pip install firecrawl-py 'crewai[tools]' ## Example -Utilize the FirecrawlScrapeWebsiteTool as follows to allow your agent to load websites: +Utilize the `FirecrawlScrapeWebsiteTool` as follows to allow your agent to load websites: ```python from crewai_tools import FirecrawlScrapeWebsiteTool @@ -25,14 +28,28 @@ tool = FirecrawlScrapeWebsiteTool(url='firecrawl.dev') ## Arguments -- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable. -- `url`: The URL to scrape. -- `page_options`: Optional. - - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc. - - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response. -- `extractor_options`: Optional. Options for LLM-based extraction of structured information from the page content - - `mode`: The extraction mode to use, currently supports 'llm-extraction' - - `extractionPrompt`: Optional. A prompt describing what information to extract from the page - - `extractionSchema`: Optional. The schema for the data to be extracted -- `timeout`: Optional. Timeout in milliseconds for the request - +> Documentation for the parameters can be found +> [here](https://docs.firecrawl.dev/api-reference/endpoint/scrape). + +| Parameter | Required | Default | Description | +| ---------------------------- | -------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `api_key` | ✅ | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key | +| `url` | ✅ | - | The URL to scrape | +| `formats` | ❌ | `["markdown"]` | List of formats to include in the output (`markdown`, `html`, `rawHtml`, `links`, `screenshot`, `screenshot@fullPage`, `json`) | +| `only_main_content` | ❌ | `true` | Only return the main content of the page excluding headers, navs, footers, etc. 
| +| `include_tags` | ❌ | - | List of HTML tags to include in the output | +| `exclude_tags` | ❌ | - | List of HTML tags to exclude from the output | +| `headers` | ❌ | - | Headers to send with the request (e.g., cookies, user-agent) | +| `wait_for` | ❌ | `0` | Delay in milliseconds before fetching content | +| `mobile` | ❌ | `false` | Set to true to emulate scraping from a mobile device | +| `skip_tls_verification` | ❌ | `false` | Skip TLS certificate verification when making requests | +| `timeout` | ❌ | `30000` | Timeout in milliseconds for the request | +| `json_options` | ❌ | - | Options for JSON extraction from the page | +| `json_options.schema` | ❌ | - | The schema to use for the extraction | +| `json_options.system_prompt` | ❌ | - | The system prompt to use for the extraction | +| `json_options.prompt` | ❌ | - | The prompt to use for the extraction without a schema | +| `location` | ❌ | - | Location settings for the request (country code and languages) | +| `remove_base64_images` | ❌ | - | Remove base64 encoded images from output | +| `block_ads` | ❌ | `true` | Enables ad-blocking and cookie popup blocking. | +| `actions` | ❌ | - | List of actions to perform on the page before scraping (e.g., click, scroll, wait) | +| `proxy` | ❌ | - | Specifies the type of proxy to use (`basic`, `stealth`).
**basic:** Proxies for scraping sites with none to basic anti-bot solutions. Fast and usually works.
**stealth:** Stealth proxies for scraping sites with advanced anti-bot solutions. Slower, but more reliable on certain sites. | diff --git a/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py b/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py index 8530aa71..3d72b9d0 100644 --- a/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py +++ b/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Type +from typing import Any, Optional, Type, List, Dict from crewai.tools import BaseTool from pydantic import BaseModel, ConfigDict, Field, PrivateAttr @@ -11,9 +11,65 @@ class FirecrawlScrapeWebsiteToolSchema(BaseModel): url: str = Field(description="Website URL") + formats: Optional[List[str]] = Field( + default=["markdown"], + description="Formats to include in the output (markdown, html, rawHtml, links, screenshot)", + ) + only_main_content: Optional[bool] = Field( + default=True, + description="Only return the main content of the page excluding headers, navs, footers, etc.", + ) + include_tags: Optional[List[str]] = Field( + default=None, + description="Tags to include in the output", + ) + exclude_tags: Optional[List[str]] = Field( + default=None, + description="Tags to exclude from the output", + ) + headers: Optional[Dict[str, str]] = Field( + default=None, + description="Headers to send with the request", + ) + wait_for: Optional[int] = Field( + default=0, + description="Specify a delay in milliseconds before fetching the content", + ) + mobile: Optional[bool] = Field( + default=False, + description="Set to true if you want to emulate scraping from a mobile device", + ) + skip_tls_verification: Optional[bool] = Field( + default=False, + description="Skip TLS certificate verification when making requests", + ) timeout: Optional[int] = Field( default=30000, - description="Timeout in milliseconds for the scraping operation. 
diff --git a/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py b/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
index 8530aa71..3d72b9d0 100644
--- a/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
+++ b/crewai_tools/tools/firecrawl_scrape_website_tool/firecrawl_scrape_website_tool.py
@@ -1,4 +1,4 @@
-from typing import Any, Optional, Type
+from typing import Any, Optional, Type, List, Dict

 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -11,9 +11,65 @@
 class FirecrawlScrapeWebsiteToolSchema(BaseModel):
     url: str = Field(description="Website URL")
+    formats: Optional[List[str]] = Field(
+        default=["markdown"],
+        description="Formats to include in the output (markdown, html, rawHtml, links, screenshot)",
+    )
+    only_main_content: Optional[bool] = Field(
+        default=True,
+        description="Only return the main content of the page excluding headers, navs, footers, etc.",
+    )
+    include_tags: Optional[List[str]] = Field(
+        default=None,
+        description="Tags to include in the output",
+    )
+    exclude_tags: Optional[List[str]] = Field(
+        default=None,
+        description="Tags to exclude from the output",
+    )
+    headers: Optional[Dict[str, str]] = Field(
+        default=None,
+        description="Headers to send with the request",
+    )
+    wait_for: Optional[int] = Field(
+        default=0,
+        description="Specify a delay in milliseconds before fetching the content",
+    )
+    mobile: Optional[bool] = Field(
+        default=False,
+        description="Set to true if you want to emulate scraping from a mobile device",
+    )
+    skip_tls_verification: Optional[bool] = Field(
+        default=False,
+        description="Skip TLS certificate verification when making requests",
+    )
     timeout: Optional[int] = Field(
         default=30000,
-        description="Timeout in milliseconds for the scraping operation. The default value is 30000.",
+        description="Timeout in milliseconds for the scraping operation",
+    )
+    json_options: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Options for JSON extraction from the page",
+    )
+    location: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Location settings for the request",
+    )
+    remove_base64_images: Optional[bool] = Field(
+        default=False,
+        description="Removes all base 64 images from the output",
+    )
+    block_ads: Optional[bool] = Field(
+        default=True,
+        description="Enables ad-blocking and cookie popup blocking",
+    )
+    actions: Optional[List[Dict[str, Any]]] = Field(
+        default=None,
+        description="Actions to perform on the page before grabbing the content",
+    )
+    proxy: Optional[str] = Field(
+        default=None,
+        description="Specifies the type of proxy to use",
     )
@@ -53,16 +109,38 @@ def __init__(self, api_key: Optional[str] = None, **kwargs):
     def _run(
         self,
         url: str,
+        formats: Optional[List[str]] = None,
+        only_main_content: Optional[bool] = True,
+        include_tags: Optional[List[str]] = None,
+        exclude_tags: Optional[List[str]] = None,
+        headers: Optional[Dict[str, str]] = None,
+        wait_for: Optional[int] = 0,
+        mobile: Optional[bool] = False,
+        skip_tls_verification: Optional[bool] = False,
         timeout: Optional[int] = 30000,
+        json_options: Optional[Dict[str, Any]] = None,
+        location: Optional[Dict[str, Any]] = None,
+        remove_base64_images: Optional[bool] = False,
+        block_ads: Optional[bool] = True,
+        actions: Optional[List[Dict[str, Any]]] = None,
+        proxy: Optional[str] = None,
     ):
         options = {
-            "formats": ["markdown"],
-            "onlyMainContent": True,
-            "includeTags": [],
-            "excludeTags": [],
-            "headers": {},
-            "waitFor": 0,
+            "formats": formats or ["markdown"],
+            "onlyMainContent": only_main_content,
+            "includeTags": include_tags or [],
+            "excludeTags": exclude_tags or [],
+            "headers": headers or {},
+            "waitFor": wait_for,
+            "mobile": mobile,
+            "skipTlsVerification": skip_tls_verification,
             "timeout": timeout,
+            "jsonOptions": json_options,
+            "location": location,
+            "removeBase64Images": remove_base64_images,
+            "blockAds": block_ads,
+            "actions": actions or [],
+            "proxy": proxy,
         }
         return self._firecrawl.scrape_url(url, options)
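Note for reviewers: a sketch exercising the newly exposed scrape parameters (values are illustrative; per the format list documented above, the `json` format is what pairs with `json_options`, so it is included here):

```python
from crewai_tools import FirecrawlScrapeWebsiteTool

tool = FirecrawlScrapeWebsiteTool()  # picks up FIRECRAWL_API_KEY from the env
result = tool.run(
    url="https://firecrawl.dev",
    formats=["markdown", "json"],  # "json" activates json_options below
    only_main_content=True,
    wait_for=1000,  # give client-side rendering a second to settle
    json_options={"prompt": "List the product names mentioned on the page"},
)
```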
diff --git a/crewai_tools/tools/firecrawl_search_tool/README.md b/crewai_tools/tools/firecrawl_search_tool/README.md
index effb3f3d..6c14a1ac 100644
--- a/crewai_tools/tools/firecrawl_search_tool/README.md
+++ b/crewai_tools/tools/firecrawl_search_tool/README.md
@@ -2,12 +2,15 @@

 ## Description

-[Firecrawl](https://firecrawl.dev) is a platform for crawling and convert any website into clean markdown or structured data.
+[Firecrawl](https://firecrawl.dev) is a platform for crawling and converting any website into clean
+markdown or structured data.

 ## Installation

-- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables (`FIRECRAWL_API_KEY`).
-- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]` package:
+- Get an API key from [firecrawl.dev](https://firecrawl.dev) and set it in environment variables
+  (`FIRECRAWL_API_KEY`).
+- Install the [Firecrawl SDK](https://github.com/mendableai/firecrawl) along with `crewai[tools]`
+  package:

 ```
 pip install firecrawl-py 'crewai[tools]'
@@ -15,7 +18,7 @@ pip install firecrawl-py 'crewai[tools]'
 ```

 ## Example

-Utilize the FirecrawlSearchTool as follows to allow your agent to load websites:
+Utilize the `FirecrawlSearchTool` as follows to allow your agent to search the web:

 ```python
 from crewai_tools import FirecrawlSearchTool
@@ -25,11 +28,14 @@ tool = FirecrawlSearchTool(query='what is firecrawl?')
 ```

 ## Arguments

-- `api_key`: Optional. Specifies Firecrawl API key. Defaults is the `FIRECRAWL_API_KEY` environment variable.
-- `query`: The search query string to be used for searching.
-- `page_options`: Optional. Options for result formatting.
-  - `onlyMainContent`: Optional. Only return the main content of the page excluding headers, navs, footers, etc.
-  - `includeHtml`: Optional. Include the raw HTML content of the page. Will output a html key in the response.
-  - `fetchPageContent`: Optional. Fetch the full content of the page.
-- `search_options`: Optional. Options for controlling the crawling behavior.
-  - `limit`: Optional. Maximum number of pages to crawl.
\ No newline at end of file
+| Parameter        | Required | Default                     | Description                                         |
+| ---------------- | -------- | --------------------------- | --------------------------------------------------- |
+| `api_key`        | ❌       | `FIRECRAWL_API_KEY` env var | Specifies Firecrawl API key                         |
+| `query`          | ✅       | -                           | The search query string to be used for searching    |
+| `limit`          | ❌       | `5`                         | Maximum number of results to return (between 1-10)  |
+| `tbs`            | ❌       | -                           | Time-based search parameter                         |
+| `lang`           | ❌       | `"en"`                      | Language code for search results                    |
+| `country`        | ❌       | `"us"`                      | Country code for search results                     |
+| `location`       | ❌       | -                           | Location parameter for search results               |
+| `timeout`        | ❌       | `60000`                     | Timeout in milliseconds                             |
+| `scrape_options` | ❌       | -                           | Options for scraping search results                 |
diff --git a/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py b/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py
index f7f4f367..4efc7d26 100644
--- a/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py
+++ b/crewai_tools/tools/firecrawl_search_tool/firecrawl_search_tool.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, Dict, Optional, Type
+from typing import TYPE_CHECKING, Any, Dict, Optional, Type, List, Union

 from crewai.tools import BaseTool
 from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
@@ -27,19 +27,18 @@ class FirecrawlSearchToolSchema(BaseModel):
     country: Optional[str] = Field(
         default="us", description="Country code for search results"
     )
-    location: Optional[str] = Field(
-        default=None, description="Location parameter for search results"
+    location: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Location settings for the request",
     )
     timeout: Optional[int] = Field(default=60000, description="Timeout in milliseconds")
     scrape_options: Optional[Dict[str, Any]] = Field(
-        default=None, description="Options for scraping search results"
+        default=None,
+        description="Options for scraping search results",
     )


 class FirecrawlSearchTool(BaseTool):
-    model_config = ConfigDict(
-        arbitrary_types_allowed=True, validate_assignment=True, frozen=False
-    )
     model_config = ConfigDict(
         arbitrary_types_allowed=True, validate_assignment=True, frozen=False
     )
@@ -87,21 +86,22 @@ def _run(
         self,
         query: str,
         limit: Optional[int] = 5,
         tbs: Optional[str] = None,
         lang: Optional[str] = "en",
         country: Optional[str] = "us",
-        location: Optional[str] = None,
+        location: Optional[Dict[str, Any]] = None,
         timeout: Optional[int] = 60000,
-        scrape_options: Optional[Dict[str, Any]] = None,
+        scrape_options: Optional[Dict[str, Any]] = None
     ) -> Any:
         if not self.firecrawl:
             raise RuntimeError("FirecrawlApp not properly initialized")

         options = {
+            "query": query,
             "limit": limit,
             "tbs": tbs,
             "lang": lang,
             "country": country,
             "location": location,
             "timeout": timeout,
-            "scrapeOptions": scrape_options or {},
+            "scrapeOptions": scrape_options or {}
         }

         return self.firecrawl.search(**options)
diff --git a/pyproject.toml b/pyproject.toml
index 24749368..f6633c9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,7 +41,7 @@ multion = [
     "multion>=1.1.0",
 ]
 firecrawl-py = [
-    "firecrawl-py>=1.8.0",
+    "firecrawl-py>=1.12.0",
 ]
 composio-core = [
     "composio-core>=0.6.11.post1",
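Closing note: a sketch of the updated search call against the raised SDK floor (`firecrawl-py>=1.12.0`). Values are illustrative, and `tbs="qdr:w"` is the Google-style past-week filter:

```python
from crewai_tools import FirecrawlSearchTool

tool = FirecrawlSearchTool()  # picks up FIRECRAWL_API_KEY from the env
results = tool.run(
    query="what is firecrawl?",
    limit=5,
    tbs="qdr:w",  # time-based filter: results from the past week
    scrape_options={"formats": ["markdown"]},
)
```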