Skip to content

Testing patterns

Common patterns for testing AI agents with Tenro.

Simulating responses

Single result (same every call)

When a tool should return the same value regardless of how many times it's called:

from tenro import link_tool, Provider, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("search")
def search(query: str) -> list[str]:
    """Production tool: delegate to the real search API (simulated in tests)."""
    return api.search(query)

@tenro
def test_consistent_search_results():
    """Every call to ``search`` yields the same simulated result."""
    # Search always returns the same docs, no matter how often it is called
    tool.simulate(search, result=["doc1", "doc2"])
    # Turn 1: LLM requests the tool; turn 2: LLM gives the final answer
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI"),
        "Found 2 documents about AI.",
    ])

    result = research_agent.run("Find AI papers")

    # Tool was invoked exactly once, and its output shows up in the answer
    tool.verify_many(search, count=1)
    assert "2 documents" in result

Sequential results (different each call)

When you need different results for successive calls (e.g., retry logic):

from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("api_call")
def api_call() -> dict:
    """Production tool: call the external API (simulated in tests)."""
    return external_api.call()

@tenro
def test_retry_logic():
    """Sequential ``results=`` list: each tool call consumes the next entry."""
    # First tool call fails (exception entries are raised), second succeeds
    tool.simulate(api_call, results=[
        ConnectionError("Timeout"),
        {"status": "ok"},
    ])
    # LLM requests tool, gets error, retries, succeeds
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(api_call),
        ToolCall(api_call),
        "API call succeeded.",
    ])

    result = resilient_agent.run("Make API call")

    # Two tool invocations: the failure plus the retry
    tool.verify_many(api_call, count=2)
    assert "succeeded" in result

Dynamic results (computed at call time)

When tool results should depend on what your agent passes:

from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("search")
def search(query: str) -> list[str]:
    """Production tool: delegate to the real search API (simulated in tests)."""
    return api.search(query)

@tenro
def test_search_returns_relevant_results():
    """``side_effect=`` computes the tool result from the call's arguments."""
    # Return different results based on search query: the lambda receives
    # the same keyword arguments the agent passes to the tool
    tool.simulate(
        search,
        side_effect=lambda query: [f"Result for: {query}"],
    )
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI trends"),
        "Found relevant info about AI trends.",
    ])

    result = research_agent.run("AI trends")

    # Assert the exact arguments the tool was called with
    tool.verify(search, query="AI trends")

Simulating tool calls

ToolCall syntax

Use ToolCall to simulate LLM tool call requests:

from tenro import Provider, ToolCall, link_tool
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("search")
def search(query: str) -> list[str]:
    """Production tool: delegate to the real search API (simulated in tests)."""
    return api.search(query)

@tenro
def test_tool_call():
    """Minimal ToolCall usage: pass the linked callable plus its kwargs."""
    # ToolCall with callable (type-safe, IDE autocomplete)
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI papers"),
        "Found results.",
    ])
    tool.simulate(search, result=["paper1"])

    agent.run("Find AI papers")

    # Verify the tool was invoked (no argument check here)
    tool.verify(search)

ToolCall forms:

# Callable (preferred - type-safe)
ToolCall(search, query="AI")

# String name (when callable unavailable)
ToolCall("search", query="AI")

# Explicit form
ToolCall(name="search", arguments={"query": "AI"})

Single turn vs multiple turns

The outer responses= list controls how many LLM calls are simulated. Each item is consumed by exactly one call:

# THREE separate LLM calls (3 turns)
responses=["First", "Second", "Third"]

# ONE LLM call with interleaved content (1 turn)
responses=[LLMResponse(["Thinking", ToolCall(search, q="AI"), "Done"])]

Use LLMResponse when you need text and tool calls in a single atomic response:

from tenro import LLMResponse, Provider, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@tenro
def test_single_turn_interleaved():
    """One LLMResponse = one atomic turn mixing text and tool-call blocks."""
    # ONE LLM call returns: text + tool call + more text
    llm.simulate(Provider.ANTHROPIC, responses=[
        LLMResponse([
            "I'll search for that.",
            ToolCall(search, query="AI"),
            "Let me also check weather.",
            ToolCall(get_weather, city="NYC"),
        ])
    ])
    tool.simulate(search, result=["result"])
    tool.simulate(get_weather, result={"temp": 72})

    agent.run("Research")

    # Only ONE LLM call was made
    llm.verify_many(Provider.ANTHROPIC, count=1)
    # ...but it triggered two tool executions
    tool.verify_many(count=2)

Provider interleaving support

Anthropic and Gemini preserve block order. OpenAI Chat API flattens blocks (text concatenated, tool calls extracted to separate array). The response still works, but ordering within the turn is lost.

Simulating agentic loops

When the LLM decides to call tools and then responds with the result:

from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("get_weather")
def get_weather(city: str) -> dict:
    """Production tool: fetch weather for *city* (simulated in tests)."""
    return weather_api.fetch(city)

@tenro
def test_llm_calls_tool():
    """Basic agentic loop: tool request turn, then final-answer turn."""
    # Set up tool result first so it is ready when the agent executes the tool
    tool.simulate(get_weather, result={"temp": 72, "condition": "sunny"})

    # LLM requests tool (1st response), then gives final answer (2nd response)
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(get_weather, city="Paris"),
        "It's 72°F and sunny in Paris!",
    ])

    result = weather_agent.run("What's the weather in Paris?")

    # Tool received the city the LLM asked for, and the answer uses its result
    tool.verify(get_weather, city="Paris")
    assert "72" in result

Verifying calls

Verify call count

from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("search")
def search(query: str) -> list[str]:
    """Production tool: delegate to the real search API (simulated in tests)."""
    return api.search(query)

@tenro
def test_call_counts():
    """``verify_many(count=...)`` pins exact LLM and tool call counts."""
    tool.simulate(search, result=["doc1"])
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI"),
        "Found 1 document.",
    ])

    agent.run("Search for AI")

    # Verify LLM was called twice (tool request + final response)
    llm.verify_many(Provider.OPENAI, count=2)
    tool.verify_many(search, count=1)

Verify arguments

from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro
@tenro
def test_correct_arguments():
    """``tool.verify`` with kwargs asserts the exact arguments received."""
    tool.simulate(search, result=[])
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI papers"),
        "No papers found.",
    ])

    agent.run("Find AI papers")

    # Verify the LLM passed correct arguments to the tool
    tool.verify(search, query="AI papers")

Verify output content

from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_output_content():
    """``llm.verify(output_contains=...)`` checks the simulated output text."""
    llm.simulate(Provider.OPENAI, response="The answer is 42.")

    calculator_agent.run("What is 6 times 7?")

    # Assert on a substring of the LLM output rather than the full string
    llm.verify(output_contains="42")

Verify never called

from tenro import Provider, link_tool
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("dangerous_operation")
def dangerous_operation() -> None:
    """Placeholder for a risky tool the agent must refuse to invoke."""
    # Something risky
    pass

@tenro
def test_safety_check():
    """``tool.verify_never`` asserts a tool was never invoked."""
    llm.simulate(Provider.OPENAI, response="I cannot do that.")

    safe_agent.run("Do something dangerous")

    # Verify agent refused without calling dangerous tool
    tool.verify_never(dangerous_operation)

Verifying call sequence

When your agent performs multiple tool calls in order:

from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("fetch")
def fetch() -> str:
    """Production tool: pull raw data from the source (simulated in tests)."""
    return data_source.fetch()

@link_tool("save")
def save(data: str) -> str:
    """Production tool: persist *data* to storage (simulated in tests)."""
    return storage.save(data)

@tenro
def test_workflow_order():
    """``tool.verify_sequence`` asserts tools ran in the given order."""
    tool.simulate(fetch, result="raw data")
    tool.simulate(save, result="ok")
    # Two tool-request turns followed by a final text turn
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(fetch),
        ToolCall(save, data="processed"),
        "Data fetched and saved.",
    ])

    pipeline_agent.run("Process data")

    # Verify tools were called in the expected order
    tool.verify_sequence([fetch, save])

Testing error handling

Simulate API failures to test how your agent handles errors.

Test your agent's behavior

When you simulate an exception, your agent sees it as a real API failure. Test how your agent responds—retry logic, fallback messages, graceful degradation:

from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_handles_rate_limit():
    """An exception in ``responses=`` is delivered as a real API failure."""
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[RateLimitError("Too many requests")],
    )

    result = resilient_agent.run("Hello")

    # Verify your agent handles the error gracefully
    assert "try again" in result.lower()

Don't test SDK exception types

Different SDKs wrap errors differently. Anthropic might turn ConnectionError into APIConnectionError, OpenAI wraps it another way. Test your agent's behavior, not the specific exception class.

Testing retries

Simulate multiple failures followed by success:

from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_retry_succeeds():
    """Two simulated failures, then a success — exercises the retry loop."""
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[
            ConnectionError("Network error"),
            ConnectionError("Network error"),
            "Success!",
        ],
    )

    result = retry_agent.run("Hello")

    # Verify retry worked
    assert result == "Success!"
    llm.verify_many(count=3)  # 2 failures + 1 success

Testing max retries

Verify your agent stops after a maximum number of attempts:

from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_gives_up_after_max_retries():
    """Agent must stop retrying after its configured attempt limit."""
    # More failures than your agent will retry (5 queued, only 3 consumed)
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[ConnectionError("Network error")] * 5,
    )

    result = agent_with_3_retries.run("Hello")

    # Agent should stop after 3 attempts
    llm.verify_many(count=3)
    assert "failed" in result.lower() or "error" in result.lower()

Need to assert specific exception types?

Use use_http=False for direct exception raising without SDK wrapping. See Simulating errors in the API reference.

LLM hallucinations (non-existent tools)

Test how your agent handles an LLM requesting a tool that doesn't exist:

from tenro import Provider, ToolCall
from tenro.simulate import llm
from tenro.testing import tenro

@tenro
def test_handles_hallucinated_tool():
    """Agent should survive the LLM requesting a tool that isn't linked."""
    # LLM requests a tool that doesn't exist in your system
    # (string-name ToolCall form, since there is no callable to reference)
    llm.simulate(
        Provider.OPENAI,
        responses=["I'll use magic", ToolCall("nonexistent_tool", x=1)],
    )

    result = defensive_agent.run("Do something")

    # Verify your agent handles the unknown tool gracefully
    llm.verify(Provider.OPENAI)
    assert "error" in result.lower() or "cannot" in result.lower()

Invalid tool arguments

Test how your agent handles an LLM passing wrong argument types:

from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("search")
def search(query: str, limit: int = 10) -> list[str]:
    """Production tool: search the API, returning at most *limit* results."""
    return api.search(query, limit)

@tenro
def test_handles_invalid_args():
    """Agent should validate tool arguments the LLM got wrong."""
    # LLM sends string where int expected (limit="ten" instead of an int)
    llm.simulate(
        Provider.OPENAI,
        responses=["Searching", ToolCall("search", query="AI", limit="ten")],
    )
    tool.simulate(search, result=[])

    result = agent.run("Search")

    # Your agent should validate args before execution
    llm.verify(Provider.OPENAI)

Optional simulations

When a tool might or might not be called (e.g., caching):

from tenro import Provider, link_tool, ToolCall
from tenro.simulate import llm, tool
from tenro.testing import tenro

@link_tool("cache_lookup")
def cache_lookup(key: str) -> str | None:
    """Production tool: return the cached value for *key*, or None on a miss."""
    return cache.get(key)

@link_tool("fetch")
def fetch() -> str:
    """Production tool: pull raw data from the source (simulated in tests)."""
    return data_source.fetch()

@tenro
def test_conditional_tool():
    """``optional=True`` marks a simulation that may legitimately go unused."""
    # Cache might be hit or missed - we don't know which path
    tool.simulate(cache_lookup, result=None, optional=True)
    tool.simulate(fetch, result="data")
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(fetch),
        "Here's the data.",
    ])

    agent.run("Get data")

    # Only verify what must happen
    tool.verify_many(fetch, count=1)

Multiple providers

Testing with multiple LLM providers:

from tenro import Provider
from tenro.simulate import llm
from tenro.testing import tenro
@tenro
def test_multi_provider():
    """Each provider gets its own simulation and its own verification."""
    llm.simulate(Provider.OPENAI, response="OpenAI response")
    llm.simulate(Provider.ANTHROPIC, response="Claude response")

    multi_agent.run("Compare responses")

    # Both providers must have been called
    llm.verify(Provider.OPENAI)
    llm.verify(Provider.ANTHROPIC)

Next steps