Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tutorial for evaluating LangGraph agents #1636

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
424 changes: 424 additions & 0 deletions docs/howtos/integrations/_langgraph_agent_evaluation.md

Large diffs are not rendered by default.

783 changes: 783 additions & 0 deletions docs/howtos/integrations/langgraph_agent_evaluation.ipynb

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions docs/references/integrations.md
jjmachan marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@
::: ragas.integrations.helicone
options:
show_root_heading: true

::: ragas.integrations.langgraph
options:
show_root_heading: true
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ nav:
- Integrations:
- howtos/integrations/index.md
- LlamaIndex: howtos/integrations/_llamaindex.md
- LangGraph: howtos/integrations/_langgraph_agent_evaluation.md
- Migrations:
- From v0.1 to v0.2: howtos/migrations/migrate_from_v01_to_v02.md
- 📖 References:
Expand Down
7 changes: 4 additions & 3 deletions src/ragas/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
from langchain_core.callbacks import BaseCallbackHandler, BaseCallbackManager
from langchain_core.embeddings import Embeddings as LangchainEmbeddings
from langchain_core.language_models import BaseLanguageModel as LangchainLLM

from llama_index.core.base.llms.base import BaseLLM as LlamaIndexLLM
from llama_index.core.base.embeddings.base import BaseEmbedding as LlamaIndexEmbedding
from llama_index.core.base.llms.base import BaseLLM as LlamaIndexLLM

from ragas._analytics import EvaluationEvent, track, track_was_completed
from ragas.callbacks import ChainType, RagasTracer, new_group
Expand Down Expand Up @@ -61,7 +60,9 @@ def evaluate(
dataset: t.Union[Dataset, EvaluationDataset],
metrics: t.Optional[t.Sequence[Metric]] = None,
llm: t.Optional[BaseRagasLLM | LangchainLLM | LlamaIndexLLM] = None,
embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings | LlamaIndexEmbedding] = None,
embeddings: t.Optional[
BaseRagasEmbeddings | LangchainEmbeddings | LlamaIndexEmbedding
] = None,
callbacks: Callbacks = None,
in_ci: bool = False,
run_config: RunConfig = RunConfig(),
Expand Down
74 changes: 74 additions & 0 deletions src/ragas/integrations/langgraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import json
from typing import List, Union

from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage

import ragas.messages as r


def convert_to_ragas_messages(
    messages: List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]]
) -> List[Union[r.HumanMessage, r.AIMessage, r.ToolMessage]]:
    """
    Convert LangChain messages into Ragas messages for agent evaluation.

    SystemMessages are dropped from the output; all other messages are
    converted in their original order.

    Args:
        messages: List of LangChain message objects (HumanMessage, SystemMessage,
            AIMessage, ToolMessage)

    Returns:
        List of corresponding Ragas message objects (SystemMessages removed)

    Raises:
        TypeError: If a message's content is not a plain string
        ValueError: If an unsupported message type is encountered
        json.JSONDecodeError: If a tool call's arguments are not valid JSON
    """

    def _validate_string_content(message, message_type: str) -> str:
        # Ragas messages carry plain text only; reject list-of-blocks content.
        if not isinstance(message.content, str):
            raise TypeError(
                f"{message_type} content must be a string, got {type(message.content).__name__}. "
                f"Content: {message.content}"
            )
        return message.content

    MESSAGE_TYPE_MAP = {
        HumanMessage: lambda m: r.HumanMessage(
            content=_validate_string_content(m, "HumanMessage")
        ),
        ToolMessage: lambda m: r.ToolMessage(
            content=_validate_string_content(m, "ToolMessage")
        ),
    }

    def _extract_tool_calls(message: AIMessage) -> List[r.ToolCall]:
        # OpenAI-style tool calls live under additional_kwargs["tool_calls"];
        # each call's arguments arrive as a JSON string and are decoded here.
        tool_calls = message.additional_kwargs.get("tool_calls", [])
        return [
            r.ToolCall(
                name=tool_call["function"]["name"],
                args=json.loads(tool_call["function"]["arguments"]),
            )
            for tool_call in tool_calls
        ]

    def _convert_ai_message(message: AIMessage) -> r.AIMessage:
        # Normalize "no tool calls" to None rather than []: previously an
        # AIMessage whose additional_kwargs was non-empty but lacked
        # "tool_calls" produced tool_calls=[], while one with empty
        # additional_kwargs produced None. Both now yield None.
        extracted = _extract_tool_calls(message) if message.additional_kwargs else []
        return r.AIMessage(
            content=_validate_string_content(message, "AIMessage"),
            tool_calls=extracted or None,
        )

    def _convert_message(message):
        if isinstance(message, SystemMessage):
            return None  # Skip SystemMessages
        if isinstance(message, AIMessage):
            return _convert_ai_message(message)
        converter = MESSAGE_TYPE_MAP.get(type(message))
        if converter is None:
            raise ValueError(f"Unsupported message type: {type(message).__name__}")
        return converter(message)

    return [
        converted
        for message in messages
        if (converted := _convert_message(message)) is not None
    ]
16 changes: 10 additions & 6 deletions src/ragas/metrics/_topic_adherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,7 @@ class TopicClassificationOutput(BaseModel):
class TopicClassificationPrompt(
PydanticPrompt[TopicClassificationInput, TopicClassificationOutput]
):
instruction = (
"Given a set of topics classify if the topic falls into any of the given reference topics."
)
instruction = "Given a set of topics classify if the topic falls into any of the given reference topics."
input_model = TopicClassificationInput
output_model = TopicClassificationOutput
examples = [
Expand Down Expand Up @@ -149,10 +147,14 @@ class TopicAdherenceScore(MetricWithLLM, MultiTurnMetric):
topic_classification_prompt: PydanticPrompt = TopicClassificationPrompt()
topic_refused_prompt: PydanticPrompt = TopicRefusedPrompt()

async def _multi_turn_ascore(self, sample: MultiTurnSample, callbacks: Callbacks) -> float:
async def _multi_turn_ascore(
self, sample: MultiTurnSample, callbacks: Callbacks
) -> float:
assert self.llm is not None, "LLM must be set"
assert isinstance(sample.user_input, list), "Sample user_input must be a list"
assert isinstance(sample.reference_topics, list), "Sample reference_topics must be a list"
assert isinstance(
sample.reference_topics, list
), "Sample reference_topics must be a list"
user_input = sample.pretty_repr()

prompt_input = TopicExtractionInput(user_input=user_input)
Expand All @@ -168,7 +170,9 @@ async def _multi_turn_ascore(self, sample: MultiTurnSample, callbacks: Callbacks
data=prompt_input, llm=self.llm, callbacks=callbacks
)
topic_answered_verdict.append(response.refused_to_answer)
topic_answered_verdict = np.array([not answer for answer in topic_answered_verdict])
topic_answered_verdict = np.array(
[not answer for answer in topic_answered_verdict]
)

prompt_input = TopicClassificationInput(
reference_topics=sample.reference_topics, topics=topics
Expand Down
129 changes: 129 additions & 0 deletions tests/unit/test_langgraph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import json

import pytest
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage

import ragas.messages as r
from ragas.integrations.langgraph import convert_to_ragas_messages


def test_human_message_conversion():
    """A HumanMessage with string content converts to r.HumanMessage intact."""
    langchain_msgs = [
        HumanMessage(content="Hello, add 4 and 5"),
        ToolMessage(content="9", tool_call_id="1"),
    ]
    converted = convert_to_ragas_messages(langchain_msgs)

    assert len(converted) == 2
    first = converted[0]
    assert isinstance(first, r.HumanMessage)
    assert first.content == "Hello, add 4 and 5"


def test_human_message_invalid_content():
    """Non-string HumanMessage content must raise TypeError."""
    bad_message = HumanMessage(content=["invalid", "content"])

    with pytest.raises(TypeError) as excinfo:
        convert_to_ragas_messages([bad_message])
    assert "HumanMessage content must be a string" in str(excinfo.value)


def test_ai_message_conversion():
    """An AIMessage with plain string content converts without tool calls."""
    converted = convert_to_ragas_messages([AIMessage(content="I'm doing well, thanks!")])

    assert len(converted) == 1
    ai_msg = converted[0]
    assert isinstance(ai_msg, r.AIMessage)
    assert ai_msg.content == "I'm doing well, thanks!"
    assert ai_msg.tool_calls is None


def test_ai_message_with_tool_calls():
    """Tool calls in additional_kwargs are parsed into r.ToolCall objects."""
    raw_calls = [
        {
            "function": {
                "arguments": '{"metal_name": "gold"}',
                "name": "get_metal_price",
            }
        },
        {
            "function": {
                "arguments": '{"metal_name": "silver"}',
                "name": "get_metal_price",
            }
        },
    ]
    question = "Find the difference in the price of gold and silver?"

    converted = convert_to_ragas_messages(
        [AIMessage(content=question, additional_kwargs={"tool_calls": raw_calls})]
    )

    assert len(converted) == 1
    ai_msg = converted[0]
    assert isinstance(ai_msg, r.AIMessage)
    assert ai_msg.content == question
    assert len(ai_msg.tool_calls) == 2
    expected_args = [{"metal_name": "gold"}, {"metal_name": "silver"}]
    for tool_call, args in zip(ai_msg.tool_calls, expected_args):
        assert tool_call.name == "get_metal_price"
        assert tool_call.args == args


def test_tool_message_conversion():
    """A ToolMessage with string content converts to r.ToolMessage intact."""
    converted = convert_to_ragas_messages(
        [
            HumanMessage(content="Hello, add 4 and 5"),
            ToolMessage(content="9", tool_call_id="2"),
        ]
    )

    assert len(converted) == 2
    tool_msg = converted[1]
    assert isinstance(tool_msg, r.ToolMessage)
    assert tool_msg.content == "9"


def test_system_message_skipped():
    """SystemMessage instances are dropped from the converted output."""
    converted = convert_to_ragas_messages(
        [SystemMessage(content="System prompt"), HumanMessage(content="Hello")]
    )

    assert len(converted) == 1
    only_msg = converted[0]
    assert isinstance(only_msg, r.HumanMessage)
    assert only_msg.content == "Hello"


def test_unsupported_message_type():
    """A message type outside the supported set raises ValueError."""

    class CustomMessage:
        content = "test"

    with pytest.raises(ValueError) as excinfo:
        convert_to_ragas_messages([CustomMessage()])
    assert "Unsupported message type: CustomMessage" in str(excinfo.value)


def test_empty_message_list():
    """An empty input list converts to an empty output list."""
    assert convert_to_ragas_messages([]) == []


def test_invalid_tool_calls_json():
    """Malformed JSON in tool-call arguments propagates json.JSONDecodeError."""
    bad_call = {"function": {"name": "search", "arguments": "invalid json"}}
    message = AIMessage(content="Test", additional_kwargs={"tool_calls": [bad_call]})

    with pytest.raises(json.JSONDecodeError):
        convert_to_ragas_messages([message])
Loading