From 476a9efbcf3ee3aef6991e96bfbe997065396e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20F=2E=20Tonato?=
Date: Tue, 25 Mar 2025 23:04:19 +0000
Subject: [PATCH 1/3] feat: add Firecrawl tool and example script

- Introduced the Firecrawl tool for web scraping, crawling, searching, and
  information extraction using the Firecrawl API.
- Added an example script, `firecrawl_example.py`, demonstrating end-to-end usage.
- Implemented unit tests for the Firecrawl tool covering core functionality
  and error handling.
---
 examples/firecrawl_example.py              |  70 +++++
 metagpt/tools/libs/firecrawl.py            | 296 +++++++++++++++++++++
 tests/metagpt/tools/libs/test_firecrawl.py | 176 ++++++++++++
 tests/metagpt/tools/test_firecrawl_tool.py | 177 ++++++++++++
 4 files changed, 719 insertions(+)
 create mode 100644 examples/firecrawl_example.py
 create mode 100644 metagpt/tools/libs/firecrawl.py
 create mode 100644 tests/metagpt/tools/libs/test_firecrawl.py
 create mode 100644 tests/metagpt/tools/test_firecrawl_tool.py

diff --git a/examples/firecrawl_example.py b/examples/firecrawl_example.py
new file mode 100644
index 0000000000..6b44d65b5f
--- /dev/null
+++ b/examples/firecrawl_example.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Example usage of the Firecrawl tool with MetaGPT."""
+
+import asyncio
+import os
+import sys
+from pathlib import Path
+import time
+
+# Add the project root to Python path
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+
+from metagpt.tools.libs.firecrawl import Firecrawl
+
+async def main():
+    """Run example usage of Firecrawl tool."""
+    # Set up environment variables if not already set
+    if "FIRECRAWL_API_KEY" not in os.environ:
+        os.environ["FIRECRAWL_API_KEY"] = "YOUR-FIRECRAWL-API-KEY"
+
+    # Create Firecrawl instance
+    firecrawl = Firecrawl()
+
+    # Example 1: Search for information
+    print("\nExample 1: Search for Therapist in Portugal by name")
+    search_results = firecrawl.search("Psicologa Clínica Mairí Stumpf")
+    print("Search Results:", search_results)
+
+    # Example 2: Map and crawl a website
+    print("\nExample 2: Map and crawl a website")
+    map_results = firecrawl.map_url("https://docs.firecrawl.dev")
+    print("Map Results:", map_results)
+
+    if map_results.get("links"):
+        crawl_job = firecrawl.crawl_url(map_results["links"][0])
+        print("Crawl Job:", crawl_job)
+
+        if crawl_job.get("id"):
+            status = firecrawl.get_crawl_status(crawl_job["id"])
+            print("Crawl Status:", status)
+            # While the status is not "completed" we can loop and print the status
+            while status != "completed":
+                status = firecrawl.get_crawl_status(crawl_job["id"])
+                print("Crawl Status:", status)
+                time.sleep(5)
+
+    # Example 3: Scrape a specific URL
+    print("\nExample 3: Scrape a URL")
+    scrape_results = firecrawl.scrape_url("https://example.com")
+    print("Scrape Results:", scrape_results)
+
+    # Example 4: Extract information from URLs
+    print("\nExample 4: Extract information")
+    extract_job = firecrawl.extract(["https://www.imdb.com/chart/starmeter/"], params ={"prompt":"Extract the top five most popular celebs names and their popularity score if available"})
+    print("Extract Job:", extract_job)
+
+    if extract_job.get("id"):
+        extract_status = firecrawl.get_extract_status(extract_job["id"])
+        print("\nExtract Status:", extract_status)
+
+        # While the status is not "completed" we can loop and print the status
+        while extract_status.get("status") != "completed":
+            extract_status = firecrawl.get_extract_status(extract_job["id"])
+            print("\nUpdated Status:", extract_status)
+            time.sleep(10)
+
+if __name__ == "__main__":
+    asyncio.run(main())
\ No newline at end of file
diff --git a/metagpt/tools/libs/firecrawl.py b/metagpt/tools/libs/firecrawl.py
new file mode 100644
index 0000000000..c52ce6c462
--- /dev/null
+++ b/metagpt/tools/libs/firecrawl.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Firecrawl Tool for MetaGPT.
+
+This module provides a tool for interacting with the Firecrawl API, enabling web scraping,
+crawling, searching, and information extraction capabilities.
+
+Author: Ademílson Tonato
+"""
+
+import os
+from typing import Any, Dict, List, Optional, Union
+
+import requests
+from metagpt.tools.tool_registry import register_tool
+
+
+@register_tool(tags=["web", "scraping", "search"], include_functions=["map_url", "scrape_url", "search", "crawl_url", "extract"])
+class Firecrawl:
+    """A tool for web scraping, crawling, searching and information extraction using Firecrawl API.
+
+    This tool provides methods to interact with the Firecrawl API for various web data collection
+    and processing tasks. It supports URL mapping, scraping, searching, crawling, and information
+    extraction.
+
+    Attributes:
+        api_key (str): The API key for authenticating with Firecrawl API.
+        api_url (str): The base URL for the Firecrawl API.
+    """
+
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None):
+        """Initialize the Firecrawl tool.
+
+        Args:
+            api_key (Optional[str]): API key for Firecrawl. Defaults to environment variable.
+            api_url (Optional[str]): Base URL for Firecrawl API. Defaults to production URL.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        if not self.api_key:
+            raise ValueError('No API key provided')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        self.request_timeout = 60
+
+    def _prepare_headers(self) -> Dict[str, str]:
+        """Prepare headers for API requests.
+
+        Returns:
+            Dict[str, str]: Headers including content type and authorization.
+        """
+        return {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {self.api_key}',
+            'X-Origin': 'metagpt',
+            'X-Origin-Type': 'integration',
+        }
+
+    def _handle_error(self, response: requests.Response, action: str) -> None:
+        """Handle API errors.
+
+        Args:
+            response (requests.Response): The response from the API.
+            action (str): Description of the action being performed.
+
+        Raises:
+            requests.exceptions.HTTPError: If the API request fails.
+        """
+        try:
+            error_message = response.json().get('error', 'No error message provided.')
+            error_details = response.json().get('details', 'No additional error details provided.')
+        except:
+            raise requests.exceptions.HTTPError(
+                f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}',
+                response=response
+            )
+
+        message = f"Error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
+        raise requests.exceptions.HTTPError(message, response=response)
+
+    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """Map a URL to discover all available links.
+
+        Args:
+            url (str): The URL to map.
+            params (Optional[Dict[str, Any]]): Additional parameters for the mapping operation.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the mapped URLs and related information.
+
+        Raises:
+            requests.exceptions.HTTPError: If the API request fails.
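+
+        Example (illustrative sketch; assumes a valid API key and network access):
+            >>> tool = Firecrawl(api_key="YOUR-FIRECRAWL-API-KEY")
+            >>> result = tool.map_url("https://example.com")
+            >>> result.get("links", [])  # list of discovered URLs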
+ """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/map', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'map URL') + + def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Scrape content from a specific URL. + + Args: + url (str): The URL to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraping operation. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/scrape', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'scrape URL') + + def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Perform a web search using Firecrawl. + + Args: + query (str): The search query. + params (Optional[Dict[str, Any]]): Additional parameters for the search operation. + + Returns: + Dict[str, Any]: A dictionary containing search results and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'query': query} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/search', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'search') + + def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Start a crawl job for a given URL. + + Args: + url (str): The URL to crawl. + params (Optional[Dict[str, Any]]): Additional parameters for the crawl operation. + + Returns: + Dict[str, Any]: A dictionary containing the crawl results and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/crawl', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'start crawl job') + + def get_crawl_status(self, job_id: str) -> Dict[str, Any]: + """Get the status of a crawl job. + + Args: + job_id (str): The ID of the crawl job. + + Returns: + Dict[str, Any]: A dictionary containing the crawl job status and results. + + Raises: + requests.exceptions.HTTPError: If the API request fails. 
+ """ + headers = self._prepare_headers() + response = requests.get( + f'{self.api_url}/v1/crawl/{job_id}', + headers=headers, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'check crawl status') + + def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Extract structured information from URLs. + + Args: + urls (List[str]): List of URLs to extract information from. + params (Optional[Dict[str, Any]]): Additional parameters for the extraction operation. + + Returns: + Dict[str, Any]: A dictionary containing the extracted information and metadata. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'urls': urls} + if params: + json_data.update(params) + + response = requests.post( + f'{self.api_url}/v1/extract', + headers=headers, + json=json_data, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'extract') + + def get_extract_status(self, job_id: str) -> Dict[str, Any]: + """Get the status of an extract job. + + Args: + job_id (str): The ID of the extract job. + + Returns: + Dict[str, Any]: A dictionary containing the extract job status and results. + + Raises: + requests.exceptions.HTTPError: If the API request fails. + """ + headers = self._prepare_headers() + response = requests.get( + f'{self.api_url}/v1/extract/{job_id}', + headers=headers, + timeout=self.request_timeout + ) + + if response.status_code == 200: + try: + return response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + self._handle_error(response, 'check extract status') \ No newline at end of file diff --git a/tests/metagpt/tools/libs/test_firecrawl.py b/tests/metagpt/tools/libs/test_firecrawl.py new file mode 100644 index 0000000000..ff97dc89f9 --- /dev/null +++ b/tests/metagpt/tools/libs/test_firecrawl.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Test module for the Firecrawl tool.""" + +import os +import pytest +from unittest.mock import MagicMock, patch +import requests + +from metagpt.tools.libs.firecrawl import Firecrawl + +API_KEY = "YOUR-FIRECRAWL-API-KEY" +API_URL = "https://api.firecrawl.dev" + +EXPECTED_HEADERS = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {API_KEY}', + 'X-Origin': 'metagpt', + 'X-Origin-Type': 'integration', +} + +@pytest.fixture +def mock_response(): + """Create a mock response object.""" + response = MagicMock() + response.status_code = 200 + response.json.return_value = {"success": True} + return response + +@pytest.fixture +def firecrawl(): + """Create a Firecrawl instance for testing.""" + return Firecrawl(api_key=API_KEY, api_url=API_URL) + +def test_initialization(): + """Test initialization with direct parameters.""" + tool = Firecrawl(api_key=API_KEY, api_url=API_URL) + assert tool.api_key == API_KEY + assert tool.api_url == API_URL + +def test_initialization_with_env_vars(): + """Test initialization with environment variables.""" + os.environ["FIRECRAWL_API_KEY"] = API_KEY + os.environ["FIRECRAWL_API_URL"] = API_URL + + tool = Firecrawl() + assert tool.api_key == API_KEY + assert tool.api_url == API_URL + + # Clean up environment 
+    del os.environ["FIRECRAWL_API_KEY"]
+    del os.environ["FIRECRAWL_API_URL"]
+
+def test_initialization_without_api_key():
+    """Test initialization without API key raises error."""
+    with pytest.raises(ValueError, match="No API key provided"):
+        Firecrawl()
+
+def test_map_url(firecrawl, mock_response):
+    """Test the map_url method."""
+    mock_response.json.return_value = {"success": True, "links": ["http://example.com/page1"]}
+
+    with patch('requests.post', return_value=mock_response) as mock_post:
+        result = firecrawl.map_url("http://example.com")
+
+    assert result == {"success": True, "links": ["http://example.com/page1"]}
+    mock_post.assert_called_once_with(
+        f'{API_URL}/v1/map',
+        headers=EXPECTED_HEADERS,
+        json={'url': 'http://example.com'},
+        timeout=60
+    )
+
+def test_scrape_url(firecrawl, mock_response):
+    """Test the scrape_url method."""
+    mock_response.json.return_value = {"success": True, "data": {"title": "Example"}}
+
+    with patch('requests.post', return_value=mock_response) as mock_post:
+        result = firecrawl.scrape_url("http://example.com")
+
+    assert result == {"success": True, "data": {"title": "Example"}}
+    mock_post.assert_called_once_with(
+        f'{API_URL}/v1/scrape',
+        headers=EXPECTED_HEADERS,
+        json={'url': 'http://example.com'},
+        timeout=60
+    )
+
+def test_search(firecrawl, mock_response):
+    """Test the search method."""
+    mock_response.json.return_value = {"success": True, "results": [{"title": "Test Result"}]}
+
+    with patch('requests.post', return_value=mock_response) as mock_post:
+        result = firecrawl.search("test query")
+
+    assert result == {"success": True, "results": [{"title": "Test Result"}]}
+    mock_post.assert_called_once_with(
+        f'{API_URL}/v1/search',
+        headers=EXPECTED_HEADERS,
+        json={'query': 'test query'},
+        timeout=60
+    )
+
+def test_crawl_url(firecrawl, mock_response):
+    """Test the crawl_url method."""
+    mock_response.json.return_value = {"success": True, "id": "test_job_id"}
+
+    with patch('requests.post', return_value=mock_response) as mock_post:
+        result = firecrawl.crawl_url("http://example.com")
+
+    assert result == {"success": True, "id": "test_job_id"}
+    mock_post.assert_called_once_with(
+        f'{API_URL}/v1/crawl',
+        headers=EXPECTED_HEADERS,
+        json={'url': 'http://example.com'},
+        timeout=60
+    )
+
+def test_get_crawl_status(firecrawl, mock_response):
+    """Test the get_crawl_status method."""
+    mock_response.json.return_value = {"success": True, "status": "completed"}
+
+    with patch('requests.get', return_value=mock_response) as mock_get:
+        result = firecrawl.get_crawl_status("test_job_id")
+
+    assert result == {"success": True, "status": "completed"}
+    mock_get.assert_called_once_with(
+        f'{API_URL}/v1/crawl/test_job_id',
+        headers=EXPECTED_HEADERS,
+        timeout=60
+    )
+
+def test_extract(firecrawl, mock_response):
+    """Test the extract method."""
+    mock_response.json.return_value = {"success": True, "data": {"extracted": "content"}}
+
+    with patch('requests.post', return_value=mock_response) as mock_post:
+        result = firecrawl.extract(["http://example.com"])
+
+    assert result == {"success": True, "data": {"extracted": "content"}}
+    mock_post.assert_called_once_with(
+        f'{API_URL}/v1/extract',
+        headers=EXPECTED_HEADERS,
+        json={'urls': ['http://example.com']},
+        timeout=60
+    )
+
+def test_get_extract_status(firecrawl, mock_response):
+    """Test the get_extract_status method."""
+    mock_response.json.return_value = {"success": True, "status": "completed"}
+
+    with patch('requests.get', return_value=mock_response) as mock_get:
+        result = firecrawl.get_extract_status("test_job_id")
firecrawl.get_extract_status("test_job_id") + + assert result == {"success": True, "status": "completed"} + mock_get.assert_called_once_with( + f'{API_URL}/v1/extract/test_job_id', + headers=EXPECTED_HEADERS, + timeout=60 + ) + +def test_error_handling(firecrawl): + """Test error handling.""" + mock_error_response = MagicMock() + mock_error_response.status_code = 400 + mock_error_response.json.return_value = { + "error": "Test error", + "details": "Test error details" + } + + with patch('requests.post', return_value=mock_error_response): + with pytest.raises(requests.exceptions.HTTPError) as exc_info: + firecrawl.map_url("http://example.com") + + assert "Test error" in str(exc_info.value) + assert "Test error details" in str(exc_info.value) \ No newline at end of file diff --git a/tests/metagpt/tools/test_firecrawl_tool.py b/tests/metagpt/tools/test_firecrawl_tool.py new file mode 100644 index 0000000000..97af4bbc82 --- /dev/null +++ b/tests/metagpt/tools/test_firecrawl_tool.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Desc : Tests for Firecrawl Tool + +import os +import pytest +from unittest.mock import patch, MagicMock + +from metagpt.tools.firecrawl_tool import FirecrawlTool +from metagpt.tools.firecrawl_env import FirecrawlEnv + + +@pytest.fixture +def firecrawl_tool(): + """Create a FirecrawlTool instance for testing.""" + with patch.dict(os.environ, {'FIRECRAWL_API_KEY': 'test_api_key'}): + return FirecrawlTool() + + +@pytest.mark.asyncio +async def test_map_url(firecrawl_tool): + """Test the map_url method.""" + mock_response = { + "success": True, + "links": ["http://example.com/1", "http://example.com/2"] + } + + with patch('requests.post') as mock_post: + mock_post.return_value.status_code = 200 + mock_post.return_value.json.return_value = mock_response + + result = await firecrawl_tool.map_url("http://example.com") + assert result == mock_response + assert firecrawl_tool.env.current_operation == "map_url" + assert firecrawl_tool.env.operation_status == "completed" + + +@pytest.mark.asyncio +async def test_scrape_url(firecrawl_tool): + """Test the scrape_url method.""" + mock_response = { + "success": True, + "data": {"title": "Test Page", "content": "Test Content"} + } + + with patch('requests.post') as mock_post: + mock_post.return_value.status_code = 200 + mock_post.return_value.json.return_value = mock_response + + result = await firecrawl_tool.scrape_url("http://example.com") + assert result == mock_response + assert firecrawl_tool.env.current_operation == "scrape_url" + assert firecrawl_tool.env.operation_status == "completed" + + +@pytest.mark.asyncio +async def test_search(firecrawl_tool): + """Test the search method.""" + mock_response = { + "success": True, + "data": [ + {"title": "Result 1", "url": "http://example.com/1"}, + {"title": "Result 2", "url": "http://example.com/2"} + ] + } + + with patch('requests.post') as mock_post: + mock_post.return_value.status_code = 200 + mock_post.return_value.json.return_value = mock_response + + result = await firecrawl_tool.search("test query") + assert result == mock_response + assert firecrawl_tool.env.current_operation == "search" + assert firecrawl_tool.env.operation_status == "completed" + + +@pytest.mark.asyncio +async def test_crawl_url(firecrawl_tool): + """Test the crawl_url method.""" + mock_response = { + "success": True, + "id": "test_job_id" + } + + with patch('requests.post') as mock_post: + mock_post.return_value.status_code = 200 + mock_post.return_value.json.return_value = 
+
+        result = await firecrawl_tool.crawl_url("http://example.com")
+        assert result == mock_response
+        assert firecrawl_tool.env.current_operation == "crawl_url"
+        assert firecrawl_tool.env.operation_status == "completed"
+        assert "test_job_id" in firecrawl_tool.env.active_jobs
+        assert firecrawl_tool.env.active_jobs["test_job_id"] == "crawl"
+
+
+@pytest.mark.asyncio
+async def test_get_crawl_status(firecrawl_tool):
+    """Test the get_crawl_status method."""
+    mock_response = {
+        "success": True,
+        "status": "completed",
+        "data": {"pages": 10}
+    }
+
+    firecrawl_tool.env.track_job("test_job_id", "crawl")
+
+    with patch('requests.get') as mock_get:
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = mock_response
+
+        result = await firecrawl_tool.get_crawl_status("test_job_id")
+        assert result == mock_response
+        assert firecrawl_tool.env.current_operation == "get_crawl_status"
+        assert firecrawl_tool.env.operation_status == "completed"
+        assert "test_job_id" not in firecrawl_tool.env.active_jobs
+
+
+@pytest.mark.asyncio
+async def test_extract(firecrawl_tool):
+    """Test the extract method."""
+    mock_response = {
+        "success": True,
+        "id": "test_job_id"
+    }
+
+    with patch('requests.post') as mock_post:
+        mock_post.return_value.status_code = 200
+        mock_post.return_value.json.return_value = mock_response
+
+        result = await firecrawl_tool.extract(["http://example.com"])
+        assert result == mock_response
+        assert firecrawl_tool.env.current_operation == "extract"
+        assert firecrawl_tool.env.operation_status == "completed"
+        assert "test_job_id" in firecrawl_tool.env.active_jobs
+        assert firecrawl_tool.env.active_jobs["test_job_id"] == "extract"
+
+
+@pytest.mark.asyncio
+async def test_get_extract_status(firecrawl_tool):
+    """Test the get_extract_status method."""
+    mock_response = {
+        "success": True,
+        "status": "completed",
+        "data": {"extracted": 5}
+    }
+
+    firecrawl_tool.env.track_job("test_job_id", "extract")
+
+    with patch('requests.get') as mock_get:
+        mock_get.return_value.status_code = 200
+        mock_get.return_value.json.return_value = mock_response
+
+        result = await firecrawl_tool.get_extract_status("test_job_id")
+        assert result == mock_response
+        assert firecrawl_tool.env.current_operation == "get_extract_status"
+        assert firecrawl_tool.env.operation_status == "completed"
+        assert "test_job_id" not in firecrawl_tool.env.active_jobs
+
+
+@pytest.mark.asyncio
+async def test_error_handling(firecrawl_tool):
+    """Test error handling in the tool."""
+    with patch('requests.post') as mock_post:
+        mock_post.return_value.status_code = 500
+        mock_post.return_value.json.return_value = {"error": "Internal Server Error"}
+
+        with pytest.raises(Exception):
+            await firecrawl_tool.map_url("http://example.com")
+
+        assert firecrawl_tool.env.operation_status == "failed"
+
+
+def test_missing_api_key():
+    """Test initialization without API key."""
+    with pytest.raises(ValueError):
+        FirecrawlTool()
\ No newline at end of file

From 190ad52aa2c4d8aa1d27fec112b4daf38f217b6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20Tonato?=
Date: Wed, 4 Jun 2025 20:33:59 +0100
Subject: [PATCH 2/3] feat: enhance Firecrawl tool with request data
 preparation

- Added a `_prepare_request_data` helper that injects the `integration`
  parameter into every outgoing request body, replacing the former
  `X-Origin`/`X-Origin-Type` headers.
- Routed all request-building methods through the new helper so outbound
  requests are tagged consistently.
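
The helper leaves caller-supplied fields untouched and only tags the
payload, e.g. (an illustrative sketch; the exact fields vary by endpoint):

    {"url": "https://example.com", "integration": "metagpt"}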
---
 metagpt/tools/libs/firecrawl.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/metagpt/tools/libs/firecrawl.py b/metagpt/tools/libs/firecrawl.py
index c52ce6c462..5a987e3008 100644
--- a/metagpt/tools/libs/firecrawl.py
+++ b/metagpt/tools/libs/firecrawl.py
@@ -50,10 +50,20 @@ def _prepare_headers(self) -> Dict[str, str]:
         return {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}',
-            'X-Origin': 'metagpt',
-            'X-Origin-Type': 'integration',
         }
 
+    def _prepare_request_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Prepare request data with integration parameter.
+
+        Args:
+            data (Dict[str, Any]): The original request data.
+
+        Returns:
+            Dict[str, Any]: Request data with integration parameter added.
+        """
+        data['integration'] = 'metagpt'
+        return data
+
     def _handle_error(self, response: requests.Response, action: str) -> None:
         """Handle API errors.
 
@@ -93,6 +103,7 @@ def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str
         json_data = {'url': url}
         if params:
             json_data.update(params)
+        json_data = self._prepare_request_data(json_data)
 
         response = requests.post(
             f'{self.api_url}/v1/map',
@@ -126,6 +137,7 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[
         json_data = {'url': url}
         if params:
             json_data.update(params)
+        json_data = self._prepare_request_data(json_data)
 
         response = requests.post(
             f'{self.api_url}/v1/scrape',
@@ -159,6 +171,7 @@ def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[st
         json_data = {'query': query}
         if params:
             json_data.update(params)
+        json_data = self._prepare_request_data(json_data)
 
         response = requests.post(
             f'{self.api_url}/v1/search',
@@ -192,6 +205,7 @@ def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[s
         json_data = {'url': url}
         if params:
             json_data.update(params)
+        json_data = self._prepare_request_data(json_data)
 
         response = requests.post(
             f'{self.api_url}/v1/crawl',
@@ -252,6 +266,7 @@ def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> D
         json_data = {'urls': urls}
         if params:
             json_data.update(params)
+        json_data = self._prepare_request_data(json_data)
 
         response = requests.post(
             f'{self.api_url}/v1/extract',

From b3856c8b1e68535632daa98366f855f2603e8c6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adem=C3=ADlson=20Tonato?=
Date: Tue, 1 Jul 2025 17:01:20 +0100
Subject: [PATCH 3/3] refactor: convert Firecrawl tool to use aiohttp for
 asynchronous requests

---
 examples/firecrawl_example.py              |  37 +--
 metagpt/tools/libs/firecrawl.py            | 255 +++++++++++----------
 tests/metagpt/tools/libs/test_firecrawl.py | 218 +++++++-----------
 tests/metagpt/tools/test_firecrawl_tool.py | 177 --------------
 4 files changed, 238 insertions(+), 449 deletions(-)
 delete mode 100644 tests/metagpt/tools/test_firecrawl_tool.py

diff --git a/examples/firecrawl_example.py b/examples/firecrawl_example.py
index 6b44d65b5f..76797be2a5 100644
--- a/examples/firecrawl_example.py
+++ b/examples/firecrawl_example.py
@@ -25,46 +25,49 @@ async def main():
 
     # Example 1: Search for information
     print("\nExample 1: Search for Therapist in Portugal by name")
-    search_results = firecrawl.search("Psicologa Clínica Mairí Stumpf")
+    search_results = await firecrawl.search("Psicologa Clínica Mairí Stumpf")
     print("Search Results:", search_results)
 
     # Example 2: Map and crawl a website
     print("\nExample 2: Map and crawl a website")
-    map_results = firecrawl.map_url("https://docs.firecrawl.dev")
+    map_results = await firecrawl.map_url("https://docs.firecrawl.dev")
await firecrawl.map_url("https://docs.firecrawl.dev") print("Map Results:", map_results) if map_results.get("links"): - crawl_job = firecrawl.crawl_url(map_results["links"][0]) + crawl_job = await firecrawl.crawl_url(map_results["links"][0]) print("Crawl Job:", crawl_job) if crawl_job.get("id"): - status = firecrawl.get_crawl_status(crawl_job["id"]) + status = await firecrawl.get_crawl_status(crawl_job["id"]) print("Crawl Status:", status) - # While the status is not "completed" we can loop and print the status - while status != "completed": - status = firecrawl.get_crawl_status(crawl_job["id"]) - print("Crawl Status:", status) - time.sleep(5) + # While the status is not "completed" we can loop and print the status + while status.get("status") != "completed": + status = await firecrawl.get_crawl_status(crawl_job["id"]) + print("Crawl Status:", status) + await asyncio.sleep(5) # Example 3: Scrape a specific URL print("\nExample 3: Scrape a URL") - scrape_results = firecrawl.scrape_url("https://example.com") + scrape_results = await firecrawl.scrape_url("https://example.com") print("Scrape Results:", scrape_results) # Example 4: Extract information from URLs print("\nExample 4: Extract information") - extract_job = firecrawl.extract(["https://www.imdb.com/chart/starmeter/"], params ={"prompt":"Extract the top five most popular celebs names and their popularity score if available"}) + extract_job = await firecrawl.extract( + ["https://www.imdb.com/chart/starmeter/"], + params={"prompt": "Extract the top five most popular celebs names and their popularity score if available"} + ) print("Extract Job:", extract_job) if extract_job.get("id"): - extract_status = firecrawl.get_extract_status(extract_job["id"]) + extract_status = await firecrawl.get_extract_status(extract_job["id"]) print("\nExtract Status:", extract_status) - # While the status is not "completed" we can loop and print the status - while extract_status.get("status") != "completed": - extract_status = firecrawl.get_extract_status(extract_job["id"]) - print("\nUpdated Status:", extract_status) - time.sleep(10) + # While the status is not "completed" we can loop and print the status + while extract_status.get("status") != "completed": + extract_status = await firecrawl.get_extract_status(extract_job["id"]) + print("\nUpdated Status:", extract_status) + await asyncio.sleep(10) if __name__ == "__main__": asyncio.run(main()) \ No newline at end of file diff --git a/metagpt/tools/libs/firecrawl.py b/metagpt/tools/libs/firecrawl.py index 5a987e3008..f67ac0e388 100644 --- a/metagpt/tools/libs/firecrawl.py +++ b/metagpt/tools/libs/firecrawl.py @@ -11,7 +11,7 @@ import os from typing import Any, Dict, List, Optional, Union -import requests +import aiohttp from metagpt.tools.tool_registry import register_tool @@ -39,7 +39,7 @@ def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) if not self.api_key: raise ValueError('No API key provided') self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev') - self.request_timeout = 60 + self.request_timeout = aiohttp.ClientTimeout(total=60) def _prepare_headers(self) -> Dict[str, str]: """Prepare headers for API requests. @@ -64,29 +64,37 @@ def _prepare_request_data(self, data: Dict[str, Any]) -> Dict[str, Any]: data['integration'] = 'metagpt' return data - def _handle_error(self, response: requests.Response, action: str) -> None: + async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None: """Handle API errors. 
 
         Args:
-            response (requests.Response): The response from the API.
+            response (aiohttp.ClientResponse): The response from the API.
             action (str): Description of the action being performed.
 
         Raises:
-            requests.exceptions.HTTPError: If the API request fails.
+            aiohttp.ClientResponseError: If the API request fails.
         """
         try:
-            error_message = response.json().get('error', 'No error message provided.')
-            error_details = response.json().get('details', 'No additional error details provided.')
+            error_data = await response.json()
+            error_message = error_data.get('error', 'No error message provided.')
+            error_details = error_data.get('details', 'No additional error details provided.')
         except:
-            raise requests.exceptions.HTTPError(
-                f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}',
-                response=response
+            raise aiohttp.ClientResponseError(
+                response.request_info,
+                response.history,
+                status=response.status,
+                message=f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}'
             )
 
-        message = f"Error during {action}: Status code {response.status_code}. {error_message} - {error_details}"
-        raise requests.exceptions.HTTPError(message, response=response)
+        message = f"Error during {action}: Status code {response.status}. {error_message} - {error_details}"
+        raise aiohttp.ClientResponseError(
+            response.request_info,
+            response.history,
+            status=response.status,
+            message=message
+        )
 
-    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+    async def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         """Map a URL to discover all available links.
 
         Args:
@@ -97,7 +105,7 @@ def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str
             Dict[str, Any]: A dictionary containing the mapped URLs and related information.
 
         Raises:
-            requests.exceptions.HTTPError: If the API request fails.
+            aiohttp.ClientResponseError: If the API request fails.
         """
         headers = self._prepare_headers()
         json_data = {'url': url}
@@ -105,22 +113,21 @@ def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str
             json_data.update(params)
         json_data = self._prepare_request_data(json_data)
 
-        response = requests.post(
-            f'{self.api_url}/v1/map',
-            headers=headers,
-            json=json_data,
-            timeout=self.request_timeout
-        )
-
-        if response.status_code == 200:
-            try:
-                return response.json()
-            except:
-                raise Exception('Failed to parse Firecrawl response as JSON.')
-        else:
-            self._handle_error(response, 'map URL')
-
-    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        async with aiohttp.ClientSession(timeout=self.request_timeout) as session:
+            async with session.post(
+                f'{self.api_url}/v1/map',
+                headers=headers,
+                json=json_data
+            ) as response:
+                if response.status == 200:
+                    try:
+                        return await response.json()
+                    except:
+                        raise Exception('Failed to parse Firecrawl response as JSON.')
+                else:
+                    await self._handle_error(response, 'map URL')
+
+    async def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
         """Scrape content from a specific URL.
 
         Args:
@@ -131,7 +138,7 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[
             Dict[str, Any]: A dictionary containing the scraped content and metadata.
 
         Raises:
-            requests.exceptions.HTTPError: If the API request fails.
+            aiohttp.ClientResponseError: If the API request fails.
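+
+        Example (illustrative; must be awaited from a running event loop):
+            >>> data = await tool.scrape_url("https://example.com")
+            >>> data.get("data", {}).get("title")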
""" headers = self._prepare_headers() json_data = {'url': url} @@ -139,22 +146,21 @@ def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[ json_data.update(params) json_data = self._prepare_request_data(json_data) - response = requests.post( - f'{self.api_url}/v1/scrape', - headers=headers, - json=json_data, - timeout=self.request_timeout - ) - - if response.status_code == 200: - try: - return response.json() - except: - raise Exception('Failed to parse Firecrawl response as JSON.') - else: - self._handle_error(response, 'scrape URL') - - def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/scrape', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'scrape URL') + + async def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Perform a web search using Firecrawl. Args: @@ -165,7 +171,7 @@ def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[st Dict[str, Any]: A dictionary containing search results and metadata. Raises: - requests.exceptions.HTTPError: If the API request fails. + aiohttp.ClientResponseError: If the API request fails. """ headers = self._prepare_headers() json_data = {'query': query} @@ -173,22 +179,21 @@ def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[st json_data.update(params) json_data = self._prepare_request_data(json_data) - response = requests.post( - f'{self.api_url}/v1/search', - headers=headers, - json=json_data, - timeout=self.request_timeout - ) - - if response.status_code == 200: - try: - return response.json() - except: - raise Exception('Failed to parse Firecrawl response as JSON.') - else: - self._handle_error(response, 'search') - - def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/search', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'search') + + async def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Start a crawl job for a given URL. Args: @@ -199,7 +204,7 @@ def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[s Dict[str, Any]: A dictionary containing the crawl results and metadata. Raises: - requests.exceptions.HTTPError: If the API request fails. + aiohttp.ClientResponseError: If the API request fails. 
""" headers = self._prepare_headers() json_data = {'url': url} @@ -207,22 +212,21 @@ def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[s json_data.update(params) json_data = self._prepare_request_data(json_data) - response = requests.post( - f'{self.api_url}/v1/crawl', - headers=headers, - json=json_data, - timeout=self.request_timeout - ) - - if response.status_code == 200: - try: - return response.json() - except: - raise Exception('Failed to parse Firecrawl response as JSON.') - else: - self._handle_error(response, 'start crawl job') - - def get_crawl_status(self, job_id: str) -> Dict[str, Any]: + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/crawl', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'start crawl job') + + async def get_crawl_status(self, job_id: str) -> Dict[str, Any]: """Get the status of a crawl job. Args: @@ -232,24 +236,24 @@ def get_crawl_status(self, job_id: str) -> Dict[str, Any]: Dict[str, Any]: A dictionary containing the crawl job status and results. Raises: - requests.exceptions.HTTPError: If the API request fails. + aiohttp.ClientResponseError: If the API request fails. """ headers = self._prepare_headers() - response = requests.get( - f'{self.api_url}/v1/crawl/{job_id}', - headers=headers, - timeout=self.request_timeout - ) - - if response.status_code == 200: - try: - return response.json() - except: - raise Exception('Failed to parse Firecrawl response as JSON.') - else: - self._handle_error(response, 'check crawl status') - def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.get( + f'{self.api_url}/v1/crawl/{job_id}', + headers=headers + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'check crawl status') + + async def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: """Extract structured information from URLs. Args: @@ -260,7 +264,7 @@ def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> D Dict[str, Any]: A dictionary containing the extracted information and metadata. Raises: - requests.exceptions.HTTPError: If the API request fails. + aiohttp.ClientResponseError: If the API request fails. 
""" headers = self._prepare_headers() json_data = {'urls': urls} @@ -268,22 +272,21 @@ def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> D json_data.update(params) json_data = self._prepare_request_data(json_data) - response = requests.post( - f'{self.api_url}/v1/extract', - headers=headers, - json=json_data, - timeout=self.request_timeout - ) - - if response.status_code == 200: - try: - return response.json() - except: - raise Exception('Failed to parse Firecrawl response as JSON.') - else: - self._handle_error(response, 'extract') - - def get_extract_status(self, job_id: str) -> Dict[str, Any]: + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/extract', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'extract') + + async def get_extract_status(self, job_id: str) -> Dict[str, Any]: """Get the status of an extract job. Args: @@ -293,19 +296,19 @@ def get_extract_status(self, job_id: str) -> Dict[str, Any]: Dict[str, Any]: A dictionary containing the extract job status and results. Raises: - requests.exceptions.HTTPError: If the API request fails. + aiohttp.ClientResponseError: If the API request fails. """ headers = self._prepare_headers() - response = requests.get( - f'{self.api_url}/v1/extract/{job_id}', - headers=headers, - timeout=self.request_timeout - ) - if response.status_code == 200: - try: - return response.json() - except: - raise Exception('Failed to parse Firecrawl response as JSON.') - else: - self._handle_error(response, 'check extract status') \ No newline at end of file + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.get( + f'{self.api_url}/v1/extract/{job_id}', + headers=headers + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'check extract status') \ No newline at end of file diff --git a/tests/metagpt/tools/libs/test_firecrawl.py b/tests/metagpt/tools/libs/test_firecrawl.py index ff97dc89f9..7226628017 100644 --- a/tests/metagpt/tools/libs/test_firecrawl.py +++ b/tests/metagpt/tools/libs/test_firecrawl.py @@ -4,8 +4,8 @@ import os import pytest -from unittest.mock import MagicMock, patch -import requests +from unittest.mock import MagicMock, patch, AsyncMock +import aiohttp from metagpt.tools.libs.firecrawl import Firecrawl @@ -15,162 +15,122 @@ EXPECTED_HEADERS = { 'Content-Type': 'application/json', 'Authorization': f'Bearer {API_KEY}', - 'X-Origin': 'metagpt', - 'X-Origin-Type': 'integration', } -@pytest.fixture -def mock_response(): - """Create a mock response object.""" - response = MagicMock() - response.status_code = 200 - response.json.return_value = {"success": True} - return response - @pytest.fixture def firecrawl(): """Create a Firecrawl instance for testing.""" return Firecrawl(api_key=API_KEY, api_url=API_URL) def test_initialization(): - """Test initialization with direct parameters.""" tool = Firecrawl(api_key=API_KEY, api_url=API_URL) assert tool.api_key == API_KEY assert tool.api_url == API_URL def test_initialization_with_env_vars(): - """Test initialization with environment variables.""" os.environ["FIRECRAWL_API_KEY"] = API_KEY 
os.environ["FIRECRAWL_API_URL"] = API_URL - tool = Firecrawl() assert tool.api_key == API_KEY assert tool.api_url == API_URL - - # Clean up environment variables del os.environ["FIRECRAWL_API_KEY"] del os.environ["FIRECRAWL_API_URL"] def test_initialization_without_api_key(): - """Test initialization without API key raises error.""" with pytest.raises(ValueError, match="No API key provided"): Firecrawl() -def test_map_url(firecrawl, mock_response): - """Test the map_url method.""" - mock_response.json.return_value = {"success": True, "links": ["http://example.com/page1"]} - - with patch('requests.post', return_value=mock_response) as mock_post: - result = firecrawl.map_url("http://example.com") - +def mock_aiohttp_session(method: str, mock_response_data: dict, status: int = 200): + mock_response = AsyncMock() + mock_response.status = status + mock_response.json = AsyncMock(return_value=mock_response_data) + + mock_cm_response = MagicMock() + mock_cm_response.__aenter__ = AsyncMock(return_value=mock_response) + mock_cm_response.__aexit__ = AsyncMock(return_value=None) + + mock_method = MagicMock(return_value=mock_cm_response) + + mock_session = MagicMock() + setattr(mock_session, method, mock_method) + + mock_session_cm = MagicMock() + mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session) + mock_session_cm.__aexit__ = AsyncMock(return_value=None) + + return mock_session_cm + +@pytest.mark.asyncio +async def test_map_url(firecrawl): + with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "links": ["http://example.com/page1"]})): + result = await firecrawl.map_url("http://example.com") assert result == {"success": True, "links": ["http://example.com/page1"]} - mock_post.assert_called_once_with( - f'{API_URL}/v1/map', - headers=EXPECTED_HEADERS, - json={'url': 'http://example.com'}, - timeout=60 - ) - -def test_scrape_url(firecrawl, mock_response): - """Test the scrape_url method.""" - mock_response.json.return_value = {"success": True, "data": {"title": "Example"}} - - with patch('requests.post', return_value=mock_response) as mock_post: - result = firecrawl.scrape_url("http://example.com") - + +@pytest.mark.asyncio +async def test_scrape_url(firecrawl): + with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "data": {"title": "Example"}})): + result = await firecrawl.scrape_url("http://example.com") assert result == {"success": True, "data": {"title": "Example"}} - mock_post.assert_called_once_with( - f'{API_URL}/v1/scrape', - headers=EXPECTED_HEADERS, - json={'url': 'http://example.com'}, - timeout=60 - ) - -def test_search(firecrawl, mock_response): - """Test the search method.""" - mock_response.json.return_value = {"success": True, "results": [{"title": "Test Result"}]} - - with patch('requests.post', return_value=mock_response) as mock_post: - result = firecrawl.search("test query") - + +@pytest.mark.asyncio +async def test_search(firecrawl): + with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "results": [{"title": "Test Result"}]})): + result = await firecrawl.search("test query") assert result == {"success": True, "results": [{"title": "Test Result"}]} - mock_post.assert_called_once_with( - f'{API_URL}/v1/search', - headers=EXPECTED_HEADERS, - json={'query': 'test query'}, - timeout=60 - ) - -def test_crawl_url(firecrawl, mock_response): - """Test the crawl_url method.""" - mock_response.json.return_value = {"success": True, "id": "test_job_id"} 
-
-    with patch('requests.post', return_value=mock_response) as mock_post:
-        result = firecrawl.crawl_url("http://example.com")
-
+
+@pytest.mark.asyncio
+async def test_crawl_url(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "id": "test_job_id"})):
+        result = await firecrawl.crawl_url("http://example.com")
     assert result == {"success": True, "id": "test_job_id"}
-    mock_post.assert_called_once_with(
-        f'{API_URL}/v1/crawl',
-        headers=EXPECTED_HEADERS,
-        json={'url': 'http://example.com'},
-        timeout=60
-    )
-
-def test_get_crawl_status(firecrawl, mock_response):
-    """Test the get_crawl_status method."""
-    mock_response.json.return_value = {"success": True, "status": "completed"}
-
-    with patch('requests.get', return_value=mock_response) as mock_get:
-        result = firecrawl.get_crawl_status("test_job_id")
-
+
+@pytest.mark.asyncio
+async def test_get_crawl_status(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("get", {"success": True, "status": "completed"})):
+        result = await firecrawl.get_crawl_status("test_job_id")
     assert result == {"success": True, "status": "completed"}
-    mock_get.assert_called_once_with(
-        f'{API_URL}/v1/crawl/test_job_id',
-        headers=EXPECTED_HEADERS,
-        timeout=60
-    )
-
-def test_extract(firecrawl, mock_response):
-    """Test the extract method."""
-    mock_response.json.return_value = {"success": True, "data": {"extracted": "content"}}
-
-    with patch('requests.post', return_value=mock_response) as mock_post:
-        result = firecrawl.extract(["http://example.com"])
-
+
+@pytest.mark.asyncio
+async def test_extract(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "data": {"extracted": "content"}})):
+        result = await firecrawl.extract(["http://example.com"])
     assert result == {"success": True, "data": {"extracted": "content"}}
-    mock_post.assert_called_once_with(
-        f'{API_URL}/v1/extract',
-        headers=EXPECTED_HEADERS,
-        json={'urls': ['http://example.com']},
-        timeout=60
-    )
-
-def test_get_extract_status(firecrawl, mock_response):
-    """Test the get_extract_status method."""
-    mock_response.json.return_value = {"success": True, "status": "completed"}
-
-    with patch('requests.get', return_value=mock_response) as mock_get:
-        result = firecrawl.get_extract_status("test_job_id")
-
+
+@pytest.mark.asyncio
+async def test_get_extract_status(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("get", {"success": True, "status": "completed"})):
+        result = await firecrawl.get_extract_status("test_job_id")
     assert result == {"success": True, "status": "completed"}
-    mock_get.assert_called_once_with(
-        f'{API_URL}/v1/extract/test_job_id',
-        headers=EXPECTED_HEADERS,
-        timeout=60
-    )
-
-def test_error_handling(firecrawl):
-    """Test error handling."""
-    mock_error_response = MagicMock()
-    mock_error_response.status_code = 400
-    mock_error_response.json.return_value = {
-        "error": "Test error",
-        "details": "Test error details"
-    }
-
-    with patch('requests.post', return_value=mock_error_response):
-        with pytest.raises(requests.exceptions.HTTPError) as exc_info:
-            firecrawl.map_url("http://example.com")
-
-    assert "Test error" in str(exc_info.value)
-    assert "Test error details" in str(exc_info.value)
\ No newline at end of file
+
+@pytest.mark.asyncio
+async def test_error_handling(firecrawl):
+    mock_response = AsyncMock()
+    mock_response.status = 400
+    mock_response.json = AsyncMock(return_value={"error": "Test error", "details": "Test error details"})
error", "details": "Test error details"}) + mock_response.request_info = MagicMock() + mock_response.history = [] + + mock_context = MagicMock() + mock_context.__aenter__ = AsyncMock(return_value=mock_response) + mock_context.__aexit__ = AsyncMock(return_value=None) + + def mock_post(*args, **kwargs): + return mock_context + + mock_session = MagicMock() + mock_session.post = mock_post + + mock_session_cm = MagicMock() + mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session) + mock_session_cm.__aexit__ = AsyncMock(return_value=None) + + with patch("aiohttp.ClientSession", return_value=mock_session_cm): + with pytest.raises(aiohttp.ClientResponseError): + await firecrawl.map_url("http://example.com") + +@pytest.mark.asyncio +async def test_params_integration(firecrawl): + with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True})): + params = {"param1": "value1", "param2": "value2"} + result = await firecrawl.map_url("http://example.com", params=params) + assert result == {"success": True} diff --git a/tests/metagpt/tools/test_firecrawl_tool.py b/tests/metagpt/tools/test_firecrawl_tool.py deleted file mode 100644 index 97af4bbc82..0000000000 --- a/tests/metagpt/tools/test_firecrawl_tool.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# @Desc : Tests for Firecrawl Tool - -import os -import pytest -from unittest.mock import patch, MagicMock - -from metagpt.tools.firecrawl_tool import FirecrawlTool -from metagpt.tools.firecrawl_env import FirecrawlEnv - - -@pytest.fixture -def firecrawl_tool(): - """Create a FirecrawlTool instance for testing.""" - with patch.dict(os.environ, {'FIRECRAWL_API_KEY': 'test_api_key'}): - return FirecrawlTool() - - -@pytest.mark.asyncio -async def test_map_url(firecrawl_tool): - """Test the map_url method.""" - mock_response = { - "success": True, - "links": ["http://example.com/1", "http://example.com/2"] - } - - with patch('requests.post') as mock_post: - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = mock_response - - result = await firecrawl_tool.map_url("http://example.com") - assert result == mock_response - assert firecrawl_tool.env.current_operation == "map_url" - assert firecrawl_tool.env.operation_status == "completed" - - -@pytest.mark.asyncio -async def test_scrape_url(firecrawl_tool): - """Test the scrape_url method.""" - mock_response = { - "success": True, - "data": {"title": "Test Page", "content": "Test Content"} - } - - with patch('requests.post') as mock_post: - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = mock_response - - result = await firecrawl_tool.scrape_url("http://example.com") - assert result == mock_response - assert firecrawl_tool.env.current_operation == "scrape_url" - assert firecrawl_tool.env.operation_status == "completed" - - -@pytest.mark.asyncio -async def test_search(firecrawl_tool): - """Test the search method.""" - mock_response = { - "success": True, - "data": [ - {"title": "Result 1", "url": "http://example.com/1"}, - {"title": "Result 2", "url": "http://example.com/2"} - ] - } - - with patch('requests.post') as mock_post: - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = mock_response - - result = await firecrawl_tool.search("test query") - assert result == mock_response - assert firecrawl_tool.env.current_operation == "search" - assert firecrawl_tool.env.operation_status == "completed" - - -@pytest.mark.asyncio -async def 
-    """Test the crawl_url method."""
-    mock_response = {
-        "success": True,
-        "id": "test_job_id"
-    }
-
-    with patch('requests.post') as mock_post:
-        mock_post.return_value.status_code = 200
-        mock_post.return_value.json.return_value = mock_response
-
-        result = await firecrawl_tool.crawl_url("http://example.com")
-        assert result == mock_response
-        assert firecrawl_tool.env.current_operation == "crawl_url"
-        assert firecrawl_tool.env.operation_status == "completed"
-        assert "test_job_id" in firecrawl_tool.env.active_jobs
-        assert firecrawl_tool.env.active_jobs["test_job_id"] == "crawl"
-
-
-@pytest.mark.asyncio
-async def test_get_crawl_status(firecrawl_tool):
-    """Test the get_crawl_status method."""
-    mock_response = {
-        "success": True,
-        "status": "completed",
-        "data": {"pages": 10}
-    }
-
-    firecrawl_tool.env.track_job("test_job_id", "crawl")
-
-    with patch('requests.get') as mock_get:
-        mock_get.return_value.status_code = 200
-        mock_get.return_value.json.return_value = mock_response
-
-        result = await firecrawl_tool.get_crawl_status("test_job_id")
-        assert result == mock_response
-        assert firecrawl_tool.env.current_operation == "get_crawl_status"
-        assert firecrawl_tool.env.operation_status == "completed"
-        assert "test_job_id" not in firecrawl_tool.env.active_jobs
-
-
-@pytest.mark.asyncio
-async def test_extract(firecrawl_tool):
-    """Test the extract method."""
-    mock_response = {
-        "success": True,
-        "id": "test_job_id"
-    }
-
-    with patch('requests.post') as mock_post:
-        mock_post.return_value.status_code = 200
-        mock_post.return_value.json.return_value = mock_response
-
-        result = await firecrawl_tool.extract(["http://example.com"])
-        assert result == mock_response
-        assert firecrawl_tool.env.current_operation == "extract"
-        assert firecrawl_tool.env.operation_status == "completed"
-        assert "test_job_id" in firecrawl_tool.env.active_jobs
-        assert firecrawl_tool.env.active_jobs["test_job_id"] == "extract"
-
-
-@pytest.mark.asyncio
-async def test_get_extract_status(firecrawl_tool):
-    """Test the get_extract_status method."""
-    mock_response = {
-        "success": True,
-        "status": "completed",
-        "data": {"extracted": 5}
-    }
-
-    firecrawl_tool.env.track_job("test_job_id", "extract")
-
-    with patch('requests.get') as mock_get:
-        mock_get.return_value.status_code = 200
-        mock_get.return_value.json.return_value = mock_response
-
-        result = await firecrawl_tool.get_extract_status("test_job_id")
-        assert result == mock_response
-        assert firecrawl_tool.env.current_operation == "get_extract_status"
-        assert firecrawl_tool.env.operation_status == "completed"
-        assert "test_job_id" not in firecrawl_tool.env.active_jobs
-
-
-@pytest.mark.asyncio
-async def test_error_handling(firecrawl_tool):
-    """Test error handling in the tool."""
-    with patch('requests.post') as mock_post:
-        mock_post.return_value.status_code = 500
-        mock_post.return_value.json.return_value = {"error": "Internal Server Error"}
-
-        with pytest.raises(Exception):
-            await firecrawl_tool.map_url("http://example.com")
-
-        assert firecrawl_tool.env.operation_status == "failed"
-
-
-def test_missing_api_key():
-    """Test initialization without API key."""
-    with pytest.raises(ValueError):
-        FirecrawlTool()
\ No newline at end of file