✨ (summarize): introduce web scraping and summarization feature
Add Firecrawl API integration for web scraping, enabling content
extraction from URLs. Implement a summarization agent to process
scraped data and generate summaries. Update `.env.example` with
`FIRECRAWL_API_KEY` for configuration. Enhance `create_tools.py` to
include the new summarize tool. Update `pyproject.toml` to include the
`firecrawl-py` dependency. Adjust `models.py` to incorporate
summarization usage guidelines.
kyaukyuai committed Feb 19, 2025
1 parent 258f14a commit 307057a
Showing 9 changed files with 298 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .env.example
@@ -17,3 +17,6 @@ LANGGRAPH_TOKEN=admin # Authentication token for LangGraph
LANGSMITH_TRACING=false # Enable tracing for LangSmith
LANGSMITH_API_KEY=your-langsmith-api-key # Required for LangSmith tracing
LANGSMITH_PROJECT=your-langsmith-project-name # Required for LangSmith tracing

# For web scraping
FIRECRAWL_API_KEY=your-firecrawl-api-key # Required for web scraping
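
For reference, a minimal sketch of reading this key at runtime. It assumes the project loads its `.env` file with `python-dotenv` (an assumption about the project's setup; the scraping tool itself only calls `os.getenv`):

import os

from dotenv import load_dotenv  # assumed helper; the project may load .env differently

load_dotenv()  # populate the process environment from .env
api_key = os.getenv("FIRECRAWL_API_KEY")
if not api_key:
    raise RuntimeError("FIRECRAWL_API_KEY is not set; Firecrawl scraping will fail")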
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -1,5 +1,6 @@
{
"cSpell.words": [
"firecrawl",
"Tavily"
]
}
109 changes: 108 additions & 1 deletion poetry.lock


1 change: 1 addition & 0 deletions pyproject.toml
@@ -32,6 +32,7 @@ beautifulsoup4 = "^4.13.1"
slack-sdk = "^3.34.0"
youtube-search = "^2.1.2"
langchain-experimental = "^0.3.4"
firecrawl-py = "^1.12.0"

[tool.poetry.group.dev.dependencies]
ruff = "^0.9.2"
99 changes: 99 additions & 0 deletions slack_ai_agent/agents/summarize_agent.py
@@ -0,0 +1,99 @@
from dataclasses import dataclass
from dataclasses import field
from typing import Optional

from langchain.schema import HumanMessage
from langchain.schema import SystemMessage
from langchain_core.runnables import RunnableConfig
from langgraph.graph import END
from langgraph.graph import START
from langgraph.graph import StateGraph

from slack_ai_agent.agents.prompts.summarizer_instructions import (
SUMMARIZER_INSTRUCTIONS,
)
from slack_ai_agent.agents.tools.firecrawl_scrape import firecrawl_scrape
from slack_ai_agent.agents.utils.models import model


@dataclass(kw_only=True)
class SummarizeState:
scrape_result: Optional[str] = field(default=None)
summarize_url: Optional[str] = field(default=None)
summarize_result: Optional[str] = field(default=None)
summarize_loop_count: int = field(default=0)


@dataclass(kw_only=True)
class SummarizeStateInput:
summarize_url: Optional[str] = field(default=None)


@dataclass(kw_only=True)
class SummarizeStateOutput:
summarize_result: Optional[str] = field(default=None)


def scrape_url(state: SummarizeState, config: RunnableConfig):
"""Scrape the URL.
Args:
state: The current state containing the URL to scrape
config: The runnable configuration
Returns:
dict: Dictionary containing the scraped result
"""
scrape_result = firecrawl_scrape(url=state.summarize_url)

return {
"scrape_result": scrape_result,
}


def summarize_sources(state: SummarizeState, config: RunnableConfig):
"""Summarize the gathered sources"""

# Existing summary
existing_summary = state.summarize_result

# Most recent web research
most_recent_web_research = state.scrape_result

# Build the human message
if existing_summary:
human_message_content = (
f"<User Input> \n {state.summarize_url} \n <User Input>\n\n"
f"<Existing Summary> \n {existing_summary} \n <Existing Summary>\n\n"
f"<New Search Results> \n {most_recent_web_research} \n <New Search Results>"
)
else:
human_message_content = (
f"<User Input> \n {state.summarize_url} \n <User Input>\n\n"
f"<Search Results> \n {most_recent_web_research} \n <Search Results>"
)

# Run the LLM
result = model.invoke(
[
SystemMessage(content=SUMMARIZER_INSTRUCTIONS),
HumanMessage(content=human_message_content),
]
)

summarize_result = result.content

return {"summarize_result": summarize_result}


builder = StateGraph(
SummarizeState, input=SummarizeStateInput, output=SummarizeStateOutput
)
builder.add_node("scrape_url", scrape_url)
builder.add_node("summarize_sources", summarize_sources)

builder.add_edge(START, "scrape_url")
builder.add_edge("scrape_url", "summarize_sources")
builder.add_edge("summarize_sources", END)

graph = builder.compile()
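
For context, a minimal sketch of invoking the compiled graph directly — the URL is a placeholder, and in this commit the intended entry point is the `summarize` tool added below:

# hypothetical direct use of the compiled summarization graph
output = graph.invoke({"summarize_url": "https://example.com/article"})  # placeholder URL
print(output["summarize_result"])  # the summary text, as declared in SummarizeStateOutput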
10 changes: 10 additions & 0 deletions slack_ai_agent/agents/tools/create_tools.py
Expand Up @@ -6,6 +6,7 @@
from .python import create_python_repl_tool
from .research import research
from .slack import create_slack_tools
from .summarize import summarize
from .youtube import create_youtube_tool


@@ -26,6 +27,15 @@ def create_tools() -> List:
)
)

# Add summarize tool
tools.append(
Tool.from_function(
func=summarize,
name="summarize",
description="Useful for when you need to summarize the content of a specific URL. Input should be a URL that you want to analyze and summarize.",
)
)

# Add memory tool
tools.append(upsert_memory)

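
Once registered, the tool can be pulled out of the list returned by `create_tools()` by name — a minimal sketch, assuming the standard LangChain `Tool` interface and a placeholder URL:

from slack_ai_agent.agents.tools.create_tools import create_tools

tools = create_tools()
# find the newly registered summarize tool by its name
summarize_tool = next(t for t in tools if getattr(t, "name", None) == "summarize")
result = summarize_tool.run("https://example.com/article")  # placeholder URL
print(result)  # expected to carry the {"result": {"summary": ...}} payload from summarize()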
23 changes: 23 additions & 0 deletions slack_ai_agent/agents/tools/firecrawl_scrape.py
@@ -0,0 +1,23 @@
import os
from typing import Dict

from firecrawl import FirecrawlApp
from langsmith import traceable


@traceable
def firecrawl_scrape(url: str) -> Dict:
"""Scrape a webpage using the Firecrawl API.
Args:
url (str): The URL of the webpage to scrape
Returns:
dict: Scraped webpage content in markdown format, containing:
- content (str): The webpage content converted to markdown
- metadata (dict): Additional metadata about the webpage
- status (str): Status of the scraping request
- url (str): The original URL that was scraped"""

firecrawl_client = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))
return firecrawl_client.scrape_url(url, params={"formats": ["markdown"]})
29 changes: 29 additions & 0 deletions slack_ai_agent/agents/tools/summarize.py
@@ -0,0 +1,29 @@
from typing import Any
from typing import Dict


def summarize(url: str) -> Dict[str, Any]:
"""Summarize the content of a given URL.
This tool performs a comprehensive summarization by:
1. Scraping the content from the provided URL
2. Analyzing and summarizing the gathered information
3. Providing a detailed summary of the content
Args:
url (str): The URL to summarize
Returns:
Dict[str, Any]: A dictionary containing the summarization results with:
- result:
- summary: A comprehensive summary of the URL content
"""
# Import here to avoid circular import
from slack_ai_agent.agents.summarize_agent import graph

summarize_result = graph.invoke({"summarize_url": url})
return {
"result": {
"summary": summarize_result["summarize_result"],
}
}
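
The tool is also usable as a plain function — a minimal sketch with a placeholder URL, reading the nested keys exactly as returned above:

from slack_ai_agent.agents.tools.summarize import summarize

result = summarize("https://example.com/article")  # placeholder URL
print(result["result"]["summary"])  # the summary produced by the summarization graph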
