✨ (summarize): introduce web scraping and summarization feature
Add Firecrawl API integration for web scraping, enabling content
extraction from URLs. Implement a summarization agent to process
scraped data and generate summaries. Update `.env.example` with
`FIRECRAWL_API_KEY` for configuration. Enhance `create_tools.py` to
include the new summarize tool. Update `pyproject.toml` to include the
`firecrawl-py` dependency. Adjust `models.py` to incorporate
summarization usage guidelines.
kyaukyuai committed Feb 19, 2025
1 parent 258f14a commit 307057a
Showing 9 changed files with 298 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .env.example
@@ -17,3 +17,6 @@ LANGGRAPH_TOKEN=admin # Authentication token for LangGraph
LANGSMITH_TRACING=false # Enable tracing for LangSmith
LANGSMITH_API_KEY=your-langsmith-api-key # Required for LangSmith tracing
LANGSMITH_PROJECT=your-langsmith-project-name # Required for LangSmith tracing

# For web scraping
FIRECRAWL_API_KEY=your-firecrawl-api-key # Required for web scraping
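
For reference, a minimal sketch of reading this key at runtime. It assumes the project loads its `.env` file with `python-dotenv` (an assumption about the project's setup; the scraping tool itself only calls `os.getenv`):

import os

from dotenv import load_dotenv  # assumed helper; the project may load .env differently

load_dotenv()  # populate the process environment from .env
api_key = os.getenv("FIRECRAWL_API_KEY")
if not api_key:
    raise RuntimeError("FIRECRAWL_API_KEY is not set; Firecrawl scraping will fail")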
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -1,5 +1,6 @@
{
"cSpell.words": [
"firecrawl",
"Tavily"
]
}
109 changes: 108 additions & 1 deletion poetry.lock


1 change: 1 addition & 0 deletions pyproject.toml
@@ -32,6 +32,7 @@ beautifulsoup4 = "^4.13.1"
slack-sdk = "^3.34.0"
youtube-search = "^2.1.2"
langchain-experimental = "^0.3.4"
firecrawl-py = "^1.12.0"

[tool.poetry.group.dev.dependencies]
ruff = "^0.9.2"
99 changes: 99 additions & 0 deletions slack_ai_agent/agents/summarize_agent.py
@@ -0,0 +1,99 @@
from dataclasses import dataclass
from dataclasses import field
from typing import Optional

from langchain.schema import HumanMessage
from langchain.schema import SystemMessage
from langchain_core.runnables import RunnableConfig
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph

from slack_ai_agent.agents.prompts.summarizer_instructions import (
SUMMARIZER_INSTRUCTIONS,
)
from slack_ai_agent.agents.tools.firecrawl_scrape import firecrawl_scrape
from slack_ai_agent.agents.utils.models import model


@dataclass(kw_only=True)
class SummarizeState:
scrape_result: Optional[str] = field(default=None)
summarize_url: Optional[str] = field(default=None)
summarize_result: Optional[str] = field(default=None)
summarize_loop_count: int = field(default=0)


@dataclass(kw_only=True)
class SummarizeStateInput:
summarize_url: Optional[str] = field(default=None)


@dataclass(kw_only=True)
class SummarizeStateOutput:
summarize_result: Optional[str] = field(default=None)


def scrape_url(state: SummarizeState, config: RunnableConfig):
"""Scrape the URL.
Args:
state: The current state containing the URL to scrape
config: The runnable configuration
Returns:
dict: Dictionary containing the scraped result
"""
scrape_result = firecrawl_scrape(url=state.summarize_url)

return {
"scrape_result": scrape_result,
}


def summarize_sources(state: SummarizeState, config: RunnableConfig):
"""Summarize the gathered sources"""

# Existing summary
existing_summary = state.summarize_result

# Most recent web research
most_recent_web_research = state.scrape_result

# Build the human message
if existing_summary:
human_message_content = (
f"<User Input> \n {state.summarize_url} \n <User Input>\n\n"
f"<Existing Summary> \n {existing_summary} \n <Existing Summary>\n\n"
f"<New Search Results> \n {most_recent_web_research} \n <New Search Results>"
)
else:
human_message_content = (
f"<User Input> \n {state.summarize_url} \n <User Input>\n\n"
f"<Search Results> \n {most_recent_web_research} \n <Search Results>"
)

# Run the LLM
result = model.invoke(
[
SystemMessage(content=SUMMARIZER_INSTRUCTIONS),
HumanMessage(content=human_message_content),
]
)

summarize_result = result.content

return {"summarize_result": summarize_result}


builder = StateGraph(
SummarizeState, input=SummarizeStateInput, output=SummarizeStateOutput
)
builder.add_node("scrape_url", scrape_url)
builder.add_node("summarize_sources", summarize_sources)

builder.add_edge(START, "scrape_url")
builder.add_edge("scrape_url", "summarize_sources")
builder.add_edge("summarize_sources", END)

graph = builder.compile()
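
For context, a minimal sketch of invoking the compiled graph directly — the URL is a placeholder, and in this commit the intended entry point is the `summarize` tool added below:

# hypothetical direct use of the compiled summarization graph
output = graph.invoke({"summarize_url": "https://example.com/article"})  # placeholder URL
print(output["summarize_result"])  # the summary text, as declared in SummarizeStateOutput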
10 changes: 10 additions & 0 deletions slack_ai_agent/agents/tools/create_tools.py
Expand Up @@ -6,6 +6,7 @@
from .python import create_python_repl_tool
from .research import research
from .slack import create_slack_tools
from .summarize import summarize
from .youtube import create_youtube_tool


@@ -26,6 +27,15 @@ def create_tools() -> List:
)
)

# Add summarize tool
tools.append(
Tool.from_function(
func=summarize,
name="summarize",
description="Useful for when you need to summarize the content of a specific URL. Input should be a URL that you want to analyze and summarize.",
)
)

# Add memory tool
tools.append(upsert_memory)

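
Once registered, the tool can be pulled out of the list returned by `create_tools()` by name — a minimal sketch, assuming the standard LangChain `Tool` interface and a placeholder URL:

from slack_ai_agent.agents.tools.create_tools import create_tools

tools = create_tools()
# find the newly registered summarize tool by its name
summarize_tool = next(t for t in tools if getattr(t, "name", None) == "summarize")
result = summarize_tool.run("https://example.com/article")  # placeholder URL
print(result)  # expected to carry the {"result": {"summary": ...}} payload from summarize()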
23 changes: 23 additions & 0 deletions slack_ai_agent/agents/tools/firecrawl_scrape.py
@@ -0,0 +1,23 @@
import os
from typing import Dict

from firecrawl import FirecrawlApp
from langsmith import traceable


@traceable
def firecrawl_scrape(url: str) -> Dict:
"""Scrape a webpage using the Firecrawl API.
Args:
url (str): The URL of the webpage to scrape
Returns:
dict: Scraped webpage content in markdown format, containing:
- content (str): The webpage content converted to markdown
- metadata (dict): Additional metadata about the webpage
- status (str): Status of the scraping request
- url (str): The original URL that was scraped"""

firecrawl_client = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
return firecrawl_client.scrape_url(url, params={"formats": ["markdown"]})
29 changes: 29 additions & 0 deletions slack_ai_agent/agents/tools/summarize.py
@@ -0,0 +1,29 @@
from typing import Any
from typing import Dict


def summarize(url: str) -> Dict[str, Any]:
"""Summarize the content of a given URL.
This tool performs a comprehensive summarization by:
1. Scraping the content from the provided URL
2. Analyzing and summarizing the gathered information
3. Providing a detailed summary of the content
Args:
url (str): The URL to summarize
Returns:
Dict[str, Any]: A dictionary containing the summarization results with:
- result:
- summary: A comprehensive summary of the URL content
"""
# Import here to avoid circular import
from slack_ai_agent.agents.summarize_agent import graph

summarize_result = graph.invoke({"summarize_url": url})
return {
"result": {
"summary": summarize_result["summarize_result"],
}
}
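
The tool is also usable as a plain function — a minimal sketch with a placeholder URL, reading the nested keys exactly as returned above:

from slack_ai_agent.agents.tools.summarize import summarize

result = summarize("https://example.com/article")  # placeholder URL
print(result["result"]["summary"])  # the summary produced by the summarization graph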
