from typing import Annotated, Any, Optional

from pydantic import BaseModel

# AG2 internals (the library itself uses relative imports; absolute paths
# are shown here for readability)
from autogen.doc_utils import export_module
from autogen.import_utils import optional_import_block, require_optional_import
from autogen.llm_config import LLMConfig
from autogen.tools import Tool
from autogen.tools.dependency_injection import Depends, on

# Imports from third-party packages are wrapped in this context manager
with optional_import_block():
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
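# If crawl4ai is not installed, the block above suppresses the ImportError
# instead of raising; the @require_optional_import decorator below then makes
# any attempt to use the class fail with a message pointing at the missing extra.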
__all__ = ["Crawl4AITool"]
# Declare that this class requires the third-party "crawl4ai" module;
# the second argument names the matching AG2 'extra' ("pip install ag2[crawl4ai]")
@require_optional_import(["crawl4ai"], "crawl4ai")
@export_module("autogen.tools.experimental")
# Indicates where this appears in the API Reference documentation
# autogen > tools > experimental > Crawl4AITool
class Crawl4AITool(Tool): # Built on the Tool class
"""
Crawl a website and extract information using the crawl4ai library.
"""
    # Always give the tool a docstring; it is picked up for the documentation
def __init__(
self,
llm_config: Optional[dict[str, Any]] = None,
extraction_model: Optional[type[BaseModel]] = None,
llm_strategy_kwargs: Optional[dict[str, Any]] = None,
) -> None:
"""
Initialize the Crawl4AITool.
Args:
            llm_config: The config dictionary for the LLM model. If None, the tool will run without an LLM.
extraction_model: The Pydantic model to use for extraction. If None, the tool will use the default schema.
llm_strategy_kwargs: The keyword arguments to pass to the LLM extraction strategy.
""" # Follow this docstring format
llm_config = LLMConfig.get_current_llm_config(llm_config)
Crawl4AITool._validate_llm_strategy_kwargs(llm_strategy_kwargs, llm_config_provided=(llm_config is not None))
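        # _validate_llm_strategy_kwargs is a static helper on this class (not
        # shown here); it checks that the strategy kwargs are consistent with
        # whether an LLM config was actually provided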
        # Shared helper, defined inside __init__ and used by both variants below
async def crawl4ai_helper( # type: ignore[no-any-unimported]
url: str,
browser_cfg: Optional["BrowserConfig"] = None,
crawl_config: Optional["CrawlerRunConfig"] = None,
) -> Any:
async with AsyncWebCrawler(config=browser_cfg) as crawler:
result = await crawler.arun(
url=url,
config=crawl_config,
)
if crawl_config is None:
response = result.markdown
else:
response = result.extracted_content if result.success else result.error_message
return response
# Crawl without an LLM
async def crawl4ai_without_llm(
url: Annotated[str, "The url to crawl and extract information from."],
) -> Any:
return await crawl4ai_helper(url=url)
# Crawl with an LLM, using the LLM configuration passed in
async def crawl4ai_with_llm(
url: Annotated[str, "The url to crawl and extract information from."],
instruction: Annotated[str, "The instruction to provide on how and what to extract."],
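            # The three parameters below are injected via Depends(on(...)): they
            # are bound to the values captured in __init__ and hidden from the
            # function signature the LLM sees, so the model only has to supply
            # `url` and `instruction`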
llm_config: Annotated[Any, Depends(on(llm_config))],
llm_strategy_kwargs: Annotated[Optional[dict[str, Any]], Depends(on(llm_strategy_kwargs))],
extraction_model: Annotated[Optional[type[BaseModel]], Depends(on(extraction_model))],
) -> Any:
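            # _get_crawl_config (another static helper on this class, not shown)
            # builds a crawl4ai CrawlerRunConfig around the LLMExtractionStrategy
            # imported above, driven by the injected llm_config and instruction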
browser_cfg = BrowserConfig(headless=True)
crawl_config = Crawl4AITool._get_crawl_config(
llm_config=llm_config,
instruction=instruction,
extraction_model=extraction_model,
llm_strategy_kwargs=llm_strategy_kwargs,
)
return await crawl4ai_helper(url=url, browser_cfg=browser_cfg, crawl_config=crawl_config)
        # Initialise the base Tool class with the name and description the LLM
        # will see, and the function to execute when the tool is called
super().__init__(
name="crawl4ai",
description="Crawl a website and extract information.",
func_or_tool=crawl4ai_without_llm if llm_config is None else crawl4ai_with_llm,
)
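

# --- Usage sketch (illustrative, not part of the tool's source) ---
# Assumes crawl4ai is installed (`pip install ag2[crawl4ai]`) and that a valid
# API key is available; the agent names and model choice are placeholders.
if __name__ == "__main__":
    from autogen import AssistantAgent, UserProxyAgent

    config = {"config_list": [{"model": "gpt-4o", "api_key": "YOUR_API_KEY"}]}

    crawl_tool = Crawl4AITool(llm_config=config)

    assistant = AssistantAgent(name="assistant", llm_config=config)
    executor = UserProxyAgent(
        name="executor", human_input_mode="NEVER", code_execution_config=False
    )

    # The assistant proposes crawl4ai calls; the executor actually runs them
    crawl_tool.register_for_llm(assistant)
    crawl_tool.register_for_execution(executor)

    executor.initiate_chat(
        recipient=assistant,
        message="Extract the page title from https://docs.ag2.ai",
        max_turns=2,
    )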