Testing agents
How to write reliable tests for agents, tools, and deep agent workflows. SHIPIT Agent is designed to be testable without requiring live LLM calls.
Mock LLM for Testing
Use MockLLM to simulate LLM responses in tests without API calls:
python
from shipit_agent import Agent
from shipit_agent.testing import MockLLM, MockResponse
mock = MockLLM(responses=[MockResponse(content="The price of Bitcoin is $67,432 USD."),])
agent = Agent.with_builtins(llm=mock)
result = agent.run("What is the Bitcoin price?")
assert "67,432" in result.output
assert result.step_count == 1
Testing Tool Calls
Verify that the agent calls the right tools with the right arguments:
python
from shipit_agent.testing import MockLLM, MockToolCall, MockResponse
mock = MockLLM(responses=[MockResponse(tool_calls=[MockToolCall(name="web_search", arguments={"query": "Bitcoin price USD"}),]),
MockResponse(content="Bitcoin is currently $67,432."),])
agent = Agent.with_builtins(llm=mock)
result = agent.run("Find the current Bitcoin price")
# Check that web_search was called
assert any(step.tool_name == "web_search" for step in result.steps if step.tool_name)
Testing Deep Agents
Deep agents like GoalAgent can be tested the same way:
python
from shipit_agent.deep import GoalAgent, Goal
from shipit_agent.testing import MockLLM, MockResponse
mock = MockLLM(responses=[# Planner response
MockResponse(content='{"tasks": [{"description": "Search for data"}]}'),
# Execution response
MockResponse(content="Found the required data."),
# Evaluation response
MockResponse(content='{"status": "completed", "criteria_met": [true, true]}'),])
agent = GoalAgent(
llm=mock,
goal=Goal(
objective="Find relevant data",
success_criteria=["Data found", "Source cited"],
),
)
result = agent.run()
assert result.goal_status == "completed"
Testing Custom Tools
Test custom tools in isolation:
python
from shipit_agent import Tool
@Tool(name="add", description="Add two numbers")
def add_tool(a: int, b: int) -> str:
return str(a + b)
# Direct invocation
result = add_tool.execute({"a": 5, "b": 3})
assert result == "8"
# Schema validation
assert add_tool.schema["properties"]["a"]["type"] == "integer"
Testing Event Streams
Verify the sequence of events emitted during a run:
python
from shipit_agent.testing import MockLLM, MockResponse, MockToolCall
mock = MockLLM(responses=[MockResponse(tool_calls=[MockToolCall(name="web_search", arguments={"query": "test"}),]),
MockResponse(content="Done."),])
agent = Agent.with_builtins(llm=mock)
events = list(agent.stream("test query"))
event_types = [e.type for e in events]
assert "run_started" in event_types
assert "tool_called" in event_types
assert "tool_completed" in event_types
assert "run_completed" in event_types
assert event_types[0] == "run_started"
assert event_types[-1] == "run_completed"
Integration Tests with Real LLMs
For end-to-end validation, run against a real provider:
python
import pytest
import os
@pytest.mark.skipif(
not os.getenv("OPENAI_API_KEY"),
reason="Requires OPENAI_API_KEY"
)
def test_real_agent_run():
from shipit_agent import Agent
from shipit_agent.llms import OpenAIChatLLM
agent = Agent.with_builtins(
llm=OpenAIChatLLM(model="gpt-4o-mini")
)
result = agent.run("What is 2 + 2?")
assert "4" in result.output
assert result.step_count >= 1
Snapshot Testing
Capture and compare agent behavior over time:
python
from shipit_agent.testing import snapshot_run
# First run creates the snapshot
snapshot_run(
agent=agent,
prompt="Explain JWT tokens",
snapshot_path="tests/snapshots/jwt_explain.json",
)
# Subsequent runs compare against the snapshot
# Fails if tool calls or event sequence changed
Best Practices
- Use `MockLLM` for unit tests — fast, deterministic, no API costs
- Test tools in isolation — verify schemas and outputs independently
- Test event streams — ensure correct ordering and completeness
- Run integration tests in CI — with real providers, behind env-var gates
- Set `max_iterations=3` in tests to prevent runaway loops
- Use `timeout=30` to catch infinite loops early
Next Steps
- Custom Tools — build tools that are easy to test
- Error Recovery — test error handling paths
- Deployment — from tested code to production