Testing patterns¶
Common patterns for testing AI agents with Tenro.
Simulating responses¶
Single result (same every call)¶
When a tool should return the same value regardless of how many times it's called:
from tenro import link_tool, Provider, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("search")
def search(query: str) -> list[str]:
return api.search(query)
@tenro
def test_consistent_search_results():
# Search always returns the same docs
tool.simulate(search, result=["doc1", "doc2"])
llm.simulate(Provider.OPENAI, responses=[
ToolCall(search, query="AI"),
"Found 2 documents about AI.",
])
result = research_agent.run("Find AI papers")
tool.verify_many(search, count=1)
assert "2 documents" in result
Sequential results (different each call)¶
When you need different results for successive calls (e.g., retry logic):
from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("api_call")
def api_call() -> dict:
return external_api.call()
@tenro
def test_retry_logic():
# First tool call fails, second succeeds
tool.simulate(api_call, results=[
ConnectionError("Timeout"),
{"status": "ok"},
])
# LLM requests tool, gets error, retries, succeeds
llm.simulate(Provider.OPENAI, responses=[
ToolCall(api_call),
ToolCall(api_call),
"API call succeeded.",
])
result = resilient_agent.run("Make API call")
tool.verify_many(api_call, count=2)
assert "succeeded" in result
Dynamic results (computed at call time)¶
When tool results should depend on what your agent passes:
from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("search")
def search(query: str) -> list[str]:
return api.search(query)
@tenro
def test_search_returns_relevant_results():
# Return different results based on search query
tool.simulate(
search,
side_effect=lambda query: [f"Result for: {query}"],
)
llm.simulate(Provider.OPENAI, responses=[
ToolCall(search, query="AI trends"),
"Found relevant info about AI trends.",
])
result = research_agent.run("AI trends")
tool.verify(search, query="AI trends")
Simulating tool calls¶
ToolCall syntax¶
Use ToolCall to simulate LLM tool call requests:
from tenro import Provider, ToolCall, link_tool
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("search")
def search(query: str) -> list[str]:
return api.search(query)
@tenro
def test_tool_call():
# ToolCall with callable (type-safe, IDE autocomplete)
llm.simulate(Provider.OPENAI, responses=[
ToolCall(search, query="AI papers"),
"Found results.",
])
tool.simulate(search, result=["paper1"])
agent.run("Find AI papers")
tool.verify(search)
ToolCall forms:
# Callable (preferred - type-safe)
ToolCall(search, query="AI")
# String name (when callable unavailable)
ToolCall("search", query="AI")
# Explicit form
ToolCall(name="search", arguments={"query": "AI"})
Single turn vs multiple turns¶
The outer responses= list controls how many LLM calls are made. Each item is consumed by exactly one call:
# THREE separate LLM calls (3 turns)
responses=["First", "Second", "Third"]
# ONE LLM call with interleaved content (1 turn)
responses=[LLMResponse(["Thinking", ToolCall(search, query="AI"), "Done"])]
Use LLMResponse when you need text and tool calls in a single atomic response:
from tenro import LLMResponse, Provider, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@tenro
def test_single_turn_interleaved():
# ONE LLM call returns: text + tool call + more text
llm.simulate(Provider.ANTHROPIC, responses=[
LLMResponse([
"I'll search for that.",
ToolCall(search, query="AI"),
"Let me also check weather.",
ToolCall(get_weather, city="NYC"),
])
])
tool.simulate(search, result=["result"])
tool.simulate(get_weather, result={"temp": 72})
agent.run("Research")
# Only ONE LLM call was made
llm.verify_many(Provider.ANTHROPIC, count=1)
tool.verify_many(count=2)
Provider interleaving support
Anthropic and Gemini preserve block order. OpenAI Chat API flattens blocks (text concatenated, tool calls extracted to separate array). The response still works, but ordering within the turn is lost.
Simulating agentic loops¶
When the LLM decides to call tools and then responds with the result:
from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("get_weather")
def get_weather(city: str) -> dict:
return weather_api.fetch(city)
@tenro
def test_llm_calls_tool():
# Set up tool result first
tool.simulate(get_weather, result={"temp": 72, "condition": "sunny"})
# LLM requests tool (1st response), then gives final answer (2nd response)
llm.simulate(Provider.OPENAI, responses=[
ToolCall(get_weather, city="Paris"),
"It's 72°F and sunny in Paris!",
])
result = weather_agent.run("What's the weather in Paris?")
tool.verify(get_weather, city="Paris")
assert "72" in result
Verifying calls¶
Verify call count¶
from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("search")
def search(query: str) -> list[str]:
return api.search(query)
@tenro
def test_call_counts():
tool.simulate(search, result=["doc1"])
llm.simulate(Provider.OPENAI, responses=[
ToolCall(search, query="AI"),
"Found 1 document.",
])
agent.run("Search for AI")
# Verify LLM was called twice (tool request + final response)
llm.verify_many(Provider.OPENAI, count=2)
tool.verify_many(search, count=1)
Verify arguments¶
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@tenro
def test_correct_arguments():
tool.simulate(search, result=[])
llm.simulate(Provider.OPENAI, responses=[
ToolCall(search, query="AI papers"),
"No papers found.",
])
agent.run("Find AI papers")
# Verify the LLM passed correct arguments to the tool
tool.verify(search, query="AI papers")
Verify output content¶
from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_output_content():
llm.simulate(Provider.OPENAI, response="The answer is 42.")
calculator_agent.run("What is 6 times 7?")
llm.verify(output_contains="42")
Verify never called¶
from tenro import Provider, link_tool
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("dangerous_operation")
def dangerous_operation() -> None:
# Something risky
pass
@tenro
def test_safety_check():
llm.simulate(Provider.OPENAI, response="I cannot do that.")
safe_agent.run("Do something dangerous")
# Verify agent refused without calling dangerous tool
tool.verify_never(dangerous_operation)
Verifying call sequence¶
When your agent performs multiple tool calls in order:
from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("fetch")
def fetch() -> str:
return data_source.fetch()
@link_tool("save")
def save(data: str) -> str:
return storage.save(data)
@tenro
def test_workflow_order():
tool.simulate(fetch, result="raw data")
tool.simulate(save, result="ok")
llm.simulate(Provider.OPENAI, responses=[
ToolCall(fetch),
ToolCall(save, data="processed"),
"Data fetched and saved.",
])
pipeline_agent.run("Process data")
# Verify tools were called in the expected order
tool.verify_sequence([fetch, save])
Testing error handling¶
Simulate API failures to test how your agent handles errors.
Test your agent's behavior¶
When you simulate an exception, your agent sees it as a real API failure. Test how your agent responds—retry logic, fallback messages, graceful degradation:
from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_handles_rate_limit():
llm.simulate(
provider=Provider.OPENAI,
responses=[RateLimitError("Too many requests")],
)
result = resilient_agent.run("Hello")
# Verify your agent handles the error gracefully
assert "try again" in result.lower()
Don't test SDK exception types
Different SDKs wrap errors differently. Anthropic might turn ConnectionError
into APIConnectionError, OpenAI wraps it another way. Test your agent's
behavior, not the specific exception class.
Testing retries¶
Simulate multiple failures followed by success:
from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_retry_succeeds():
llm.simulate(
provider=Provider.OPENAI,
responses=[
ConnectionError("Network error"),
ConnectionError("Network error"),
"Success!",
],
)
result = retry_agent.run("Hello")
# Verify retry worked
assert result == "Success!"
llm.verify_many(count=3) # 2 failures + 1 success
Testing max retries¶
Verify your agent stops after a maximum number of attempts:
from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_gives_up_after_max_retries():
# More failures than your agent will retry
llm.simulate(
provider=Provider.OPENAI,
responses=[ConnectionError("Network error")] * 5,
)
result = agent_with_3_retries.run("Hello")
# Agent should stop after 3 attempts
llm.verify_many(count=3)
assert "failed" in result.lower() or "error" in result.lower()
Need to assert specific exception types?
Use use_http=False for direct exception raising without SDK wrapping.
See Simulating errors in the API reference.
LLM hallucinations (non-existent tools)¶
Test how your agent handles an LLM requesting a tool that doesn't exist:
from tenro import Provider, ToolCall
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_handles_hallucinated_tool():
# LLM requests a tool that doesn't exist in your system
llm.simulate(
Provider.OPENAI,
responses=["I'll use magic", ToolCall("nonexistent_tool", x=1)],
)
result = defensive_agent.run("Do something")
# Verify your agent handles the unknown tool gracefully
llm.verify(Provider.OPENAI)
assert "error" in result.lower() or "cannot" in result.lower()
Invalid tool arguments¶
Test how your agent handles an LLM passing wrong argument types:
from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("search")
def search(query: str, limit: int = 10) -> list[str]:
return api.search(query, limit)
@tenro
def test_handles_invalid_args():
# LLM sends string where int expected
llm.simulate(
Provider.OPENAI,
responses=["Searching", ToolCall("search", query="AI", limit="ten")],
)
tool.simulate(search, result=[])
result = agent.run("Search")
# Your agent should validate args before execution
llm.verify(Provider.OPENAI)
Optional simulations¶
When a tool might or might not be called (e.g., caching):
from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@link_tool("cache_lookup")
def cache_lookup(key: str) -> str | None:
return cache.get(key)
@link_tool("fetch")
def fetch() -> str:
return data_source.fetch()
@tenro
def test_conditional_tool():
# Cache might be hit or missed - we don't know which path
tool.simulate(cache_lookup, result=None, optional=True)
tool.simulate(fetch, result="data")
llm.simulate(Provider.OPENAI, responses=[
ToolCall(fetch),
"Here's the data.",
])
agent.run("Get data")
# Only verify what must happen
tool.verify_many(fetch, count=1)
Multiple providers¶
Testing with multiple LLM providers:
from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_multi_provider():
llm.simulate(Provider.OPENAI, response="OpenAI response")
llm.simulate(Provider.ANTHROPIC, response="Claude response")
multi_agent.run("Compare responses")
llm.verify(Provider.OPENAI)
llm.verify(Provider.ANTHROPIC)
Next steps¶
- Frameworks: Examples with LangChain, CrewAI, and more
- API Reference: Complete Tenro API documentation