Testing patterns

Common patterns for testing AI agents with Tenro.

Simulating responses

Single result (same every call)

When a tool should return the same value regardless of how many times it's called:

# agents.py
from tenro import link_tool

@link_tool("search")
def search(query: str) -> list[str]:
    return api.search(query)

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_consistent_search_results():
    # Search always returns the same docs
    tool.simulate(search, result=["doc1", "doc2"])
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI"),
        "Found 2 documents about AI.",
    ])

    result = research_agent.run("Find AI papers")

    tool.verify_many(search, count=1)
    assert "2 documents" in result

Sequential results (different each call)

When you need different results for successive calls (e.g., retry logic):

# agents.py
from tenro import link_tool

@link_tool("api_call")
def api_call() -> dict:
    return external_api.call()

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_retry_logic():
    # First tool call fails, second succeeds
    tool.simulate(api_call, results=[
        ConnectionError("Timeout"),
        {"status": "ok"},
    ])
    # LLM requests tool, gets error, retries, succeeds
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(api_call),
        ToolCall(api_call),
        "API call succeeded.",
    ])

    result = resilient_agent.run("Make API call")

    tool.verify_many(api_call, count=2)
    assert "succeeded" in result

Dynamic results (computed at call time)

When tool results should depend on what your agent passes:

# agents.py
from tenro import link_tool

@link_tool("search")
def search(query: str) -> list[str]:
    return api.search(query)

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_search_returns_relevant_results():
    # Return different results based on search query
    tool.simulate(
        search,
        side_effect=lambda query: [f"Result for: {query}"],
    )
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI trends"),
        "Found relevant info about AI trends.",
    ])

    result = research_agent.run("AI trends")

    tool.verify(search, query="AI trends")

Simulating tool calls

ToolCall syntax

Use ToolCall to simulate LLM tool call requests:

# agents.py
from tenro import link_tool

@link_tool("search")
def search(query: str) -> list[str]:
    return api.search(query)

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_tool_call():
    # ToolCall with callable (type-safe, IDE autocomplete)
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI papers"),
        "Found results.",
    ])
    tool.simulate(search, result=["paper1"])

    agent.run("Find AI papers")

    tool.verify(search)

ToolCall forms:

# Callable (preferred - type-safe)
ToolCall(search, query="AI")

# String name (when callable unavailable)
ToolCall("search", query="AI")

# Explicit form
ToolCall(name="search", arguments={"query": "AI"})

Single turn vs multiple turns

The outer responses= list controls how many LLM calls are simulated; each item is consumed by one call:

# THREE separate LLM calls (3 turns)
responses=["First", "Second", "Third"]

# ONE LLM call with interleaved content (1 turn)
responses=[LLMResponse(["Thinking", ToolCall(search, query="AI"), "Done"])]

Use LLMResponse when you need text and tool calls in a single atomic response:

# test_agents.py
from tenro import LLMResponse, Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_single_turn_interleaved():
    # ONE LLM call returns: text + tool call + more text
    llm.simulate(Provider.ANTHROPIC, responses=[
        LLMResponse([
            "I'll search for that.",
            ToolCall(search, query="AI"),
            "Let me also check weather.",
            ToolCall(get_weather, city="NYC"),
        ])
    ])
    tool.simulate(search, result=["result"])
    tool.simulate(get_weather, result={"temp": 72})

    agent.run("Research")

    # Only ONE LLM call was made
    llm.verify_many(Provider.ANTHROPIC, count=1)
    tool.verify_many(count=2)

Provider interleaving support

Anthropic and Gemini preserve block order. The OpenAI Chat API flattens blocks (text concatenated, tool calls extracted to a separate array). The response still works, but ordering within the turn is lost.
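
For example, the interleaved turn above can still be simulated for OpenAI; just avoid assertions that depend on block order within the turn. A minimal sketch, reusing the search tool and agent from the earlier examples:

# test_agents.py
from tenro import LLMResponse, Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_openai_flattened_turn():
    # Same single-turn shape, simulated for OpenAI Chat: the text block
    # is concatenated and the tool call is extracted, so the turn still
    # works but block order is not preserved.
    llm.simulate(Provider.OPENAI, responses=[
        LLMResponse([
            "I'll search for that.",
            ToolCall(search, query="AI"),
        ])
    ])
    tool.simulate(search, result=["result"])

    agent.run("Research")

    # One LLM call, one tool call - just don't assert on ordering
    llm.verify_many(Provider.OPENAI, count=1)
    tool.verify_many(search, count=1)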

Simulating agentic loops

When the LLM decides to call tools and then responds with the result:

# agents.py
from tenro import link_tool

@link_tool("get_weather")
def get_weather(city: str) -> dict:
    return weather_api.fetch(city)

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_llm_calls_tool():
    # Set up tool result first
    tool.simulate(get_weather, result={"temp": 72, "condition": "sunny"})

    # LLM requests tool (1st response), then gives final answer (2nd response)
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(get_weather, city="Paris"),
        "It's 72°F and sunny in Paris!",
    ])

    result = weather_agent.run("What's the weather in Paris?")

    tool.verify(get_weather, city="Paris")
    assert "72" in result

Verifying calls

Verify call count

# agents.py
from tenro import link_tool

@link_tool("search")
def search(query: str) -> list[str]:
    return api.search(query)

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_call_counts():
    tool.simulate(search, result=["doc1"])
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI"),
        "Found 1 document.",
    ])

    agent.run("Search for AI")

    # Verify LLM was called twice (tool request + final response)
    llm.verify_many(Provider.OPENAI, count=2)
    tool.verify_many(search, count=1)

Verify arguments

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_correct_arguments():
    tool.simulate(search, result=[])
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI papers"),
        "No papers found.",
    ])

    agent.run("Find AI papers")

    # Verify the LLM passed correct arguments to the tool
    tool.verify(search, query="AI papers")

Verify output content

# test_agents.py
from tenro import Provider
from tenro.simulate import llm
import tenro

@tenro.simulate
def test_output_content():
    llm.simulate(Provider.OPENAI, response="The answer is 42.")

    calculator_agent.run("What is 6 times 7?")

    llm.verify(output_contains="42")

Verify agent output

Use agent.verify to check what your agent returned:

# agents.py
from tenro import link_agent, link_tool

@link_tool("search")
def search(query: str) -> list[str]:
    return api.search(query)

@link_agent("Researcher")
def researcher(topic: str) -> str:
    docs = search(topic)
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": f"Summarize {topic}: {docs}"}],
    )
    return response.choices[0].message.content

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool, agent
import tenro

@tenro.simulate
def test_agent_output():
    tool.simulate(search, result=["doc1"])
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(search, query="AI"),
        "Summary of AI research.",
    ])

    researcher("AI papers")

    # Verify agent returned the expected result
    agent.verify(researcher, result="Summary of AI research.")

Verify never called

# agents.py
from tenro import link_tool

@link_tool("dangerous_operation")
def dangerous_operation() -> None:
    # Something risky
    pass

# test_agents.py
from tenro import Provider
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_safety_check():
    llm.simulate(Provider.OPENAI, response="I cannot do that.")

    safe_agent.run("Do something dangerous")

    # Verify agent refused without calling dangerous tool
    tool.verify_never(dangerous_operation)

Verifying call sequence

When your agent performs multiple tool calls in order:

# agents.py
from tenro import link_tool

@link_tool("fetch")
def fetch() -> str:
    return data_source.fetch()

@link_tool("save")
def save(data: str) -> str:
    return storage.save(data)

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_workflow_order():
    tool.simulate(fetch, result="raw data")
    tool.simulate(save, result="ok")
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(fetch),
        ToolCall(save, data="processed"),
        "Data fetched and saved.",
    ])

    pipeline_agent.run("Process data")

    # Verify tools were called in the expected order
    tool.verify_sequence([fetch, save])

Testing error handling

Simulate API failures to test how your agent handles errors.

Test your agent's behavior

When you simulate an exception, your agent sees it as a real API failure. Test how your agent responds—retry logic, fallback messages, graceful degradation:

# test_agents.py
from tenro import Provider
from tenro.simulate import llm
import tenro

@tenro.simulate
def test_handles_rate_limit():
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[RateLimitError("Too many requests")],
    )

    result = resilient_agent.run("Hello")

    # Verify your agent handles the error gracefully
    assert "try again" in result.lower()

Don't test SDK exception types

Different SDKs wrap errors differently: Anthropic might turn ConnectionError into APIConnectionError, while OpenAI wraps it another way. Test your agent's behavior, not the specific exception class.
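
A sketch of the distinction, reusing resilient_agent from above and assuming it returns a fallback message rather than letting the error escape, as in the rate-limit example:

# test_agents.py
from tenro import Provider
from tenro.simulate import llm
import tenro

@tenro.simulate
def test_behavior_not_exception_class():
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[ConnectionError("Network error")],
    )

    # Avoid: pytest.raises(SomeSDKSpecificError) - the wrapping class
    # depends on which SDK (and transport) your agent uses.
    result = resilient_agent.run("Hello")

    # Prefer: assert on what your agent does with the failure
    assert "try again" in result.lower()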

Testing retries

Simulate multiple failures followed by success:

# test_agents.py
from tenro import Provider
from tenro.simulate import llm
import tenro

@tenro.simulate
def test_retry_succeeds():
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[
            ConnectionError("Network error"),
            ConnectionError("Network error"),
            "Success!",
        ],
    )

    result = retry_agent.run("Hello")

    # Verify retry worked
    assert result == "Success!"
    llm.verify_many(count=3)  # 2 failures + 1 success

Testing max retries

Verify your agent stops after a maximum number of attempts:

# test_agents.py
from tenro import Provider
from tenro.simulate import llm
import tenro

@tenro.simulate
def test_gives_up_after_max_retries():
    # More failures than your agent will retry
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[ConnectionError("Network error")] * 5,
    )

    result = agent_with_3_retries.run("Hello")

    # Agent should stop after 3 attempts
    llm.verify_many(count=3)
    assert "failed" in result.lower() or "error" in result.lower()

Need to assert specific exception types?

Use use_http=False for direct exception raising without SDK wrapping. See Simulating errors in the API reference.
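
One possible shape, as a hedged sketch: it assumes use_http=False is passed to llm.simulate and that the agent under test (here a hypothetical bare_client_agent) lets the exception propagate:

# test_agents.py
import pytest
import tenro
from tenro import Provider
from tenro.simulate import llm

@tenro.simulate
def test_exact_exception_type():
    # Assumption: use_http=False raises the simulated error directly,
    # without SDK-specific HTTP wrapping (see the API reference).
    llm.simulate(
        provider=Provider.OPENAI,
        responses=[ConnectionError("Network error")],
        use_http=False,
    )

    with pytest.raises(ConnectionError):
        bare_client_agent.run("Hello")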

LLM hallucinations (non-existent tools)

Test how your agent handles an LLM requesting a tool that doesn't exist:

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm
import tenro

@tenro.simulate
def test_handles_hallucinated_tool():
    # LLM requests a tool that doesn't exist in your system
    llm.simulate(
        Provider.OPENAI,
        responses=["I'll use magic", ToolCall("nonexistent_tool", x=1)],
    )

    result = defensive_agent.run("Do something")

    # Verify your agent handles the unknown tool gracefully
    llm.verify(Provider.OPENAI)
    assert "error" in result.lower() or "cannot" in result.lower()

Invalid tool arguments

Test how your agent handles an LLM passing wrong argument types:

# agents.py
from tenro import link_tool

@link_tool("search")
def search(query: str, limit: int = 10) -> list[str]:
    return api.search(query, limit)

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_handles_invalid_args():
    # LLM sends string where int expected
    llm.simulate(
        Provider.OPENAI,
        responses=["Searching", ToolCall("search", query="AI", limit="ten")],
    )
    tool.simulate(search, result=[])

    result = agent.run("Search")

    # Your agent should validate args before execution
    llm.verify(Provider.OPENAI)

Optional simulations

When a tool might or might not be called (e.g., caching):

# agents.py
from tenro import link_tool

@link_tool("cache_lookup")
def cache_lookup(key: str) -> str | None:
    return cache.get(key)

@link_tool("fetch")
def fetch() -> str:
    return data_source.fetch()

# test_agents.py
from tenro import Provider, ToolCall
from tenro.simulate import llm, tool
import tenro

@tenro.simulate
def test_conditional_tool():
    # Cache might be hit or missed - we don't know which path
    tool.simulate(cache_lookup, result=None, optional=True)
    tool.simulate(fetch, result="data")
    llm.simulate(Provider.OPENAI, responses=[
        ToolCall(fetch),
        "Here's the data.",
    ])

    agent.run("Get data")

    # Only verify what must happen
    tool.verify_many(fetch, count=1)

Multiple providers

Testing with multiple LLM providers:

# test_agents.py
from tenro import Provider
from tenro.simulate import llm
import tenro

@tenro.simulate
def test_multi_provider():
    llm.simulate(Provider.OPENAI, response="OpenAI response")
    llm.simulate(Provider.ANTHROPIC, response="Claude response")

    multi_agent.run("Compare responses")

    llm.verify(Provider.OPENAI)
    llm.verify(Provider.ANTHROPIC)

Multi-agent testing

When an orchestrator agent calls sub-agents, you can test at two levels.

Full pipeline

Let all agents run, simulate only tools and LLMs:

# test_pipeline.py
from tenro.simulate import agent, tool, llm
import tenro

@tenro.simulate
def test_full_pipeline():
    tool.simulate(validate_input, result=True)
    tool.simulate(process_data, result={"processed": True})
    tool.simulate(save_result, result="saved")

    PipelineAgent().run({"input": "data"})

    # Verify each agent ran exactly once
    agent.verify_many(PipelineAgent, count=1)
    agent.verify_many(ValidationAgent, count=1)
    agent.verify_many(ProcessingAgent, count=1)
    tool.verify_many(validate_input, count=1)

Isolated (simulate sub-agents)

Simulate sub-agent results directly to test the orchestrator in isolation:

# test_pipeline.py
from tenro.simulate import agent
import tenro

@tenro.simulate
def test_orchestrator_in_isolation():
    agent.simulate(ValidationAgent, result=True)
    agent.simulate(ProcessingAgent, result={"processed": True})
    agent.simulate(PersistenceAgent, result="saved")

    PipelineAgent().run({"input": "data"})

    # Verify orchestrator output and sub-agent call counts
    agent.verify(PipelineAgent, result="saved")
    agent.verify_many(ValidationAgent, count=1)

Third-party tools

When your agent uses functions from libraries you don't control, you have two options:

  1. @link_tool wrapper — create a thin wrapper around the third-party function. Simpler, but only intercepts calls through your wrapper. If the library internally calls the function from its own code, those calls won't be simulated or traced.
  2. register() — works at the function object level so code that already holds a reference to that callable can be intercepted during tests, including calls from code you don't own.

A third-party tool may do purely local work or call an external API internally; that distinction doesn't change how either approach works, since both operate at the Python callable boundary. Choose based on whether you own the call boundary.

Option 1: @link_tool wrapper

Wrap the third-party function in your own linked tool, then simulate the wrapper:

# agents.py
from tenro import link_tool
from third_party_lib import their_search

@link_tool
def search(query: str) -> str:
    return their_search(query)  # thin wrapper

# test_agents.py
from agents import search  # simulate the wrapper, not the original
from tenro.simulate import tool
import tenro

@tenro.simulate
def test_with_wrapper():
    tool.simulate(search, result="simulated")

    result = my_agent.run("query")

    assert result == "simulated"

Important: Simulate and verify the wrapper function (search), not the original third-party callable (their_search).
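
In other words, a short sketch reusing the wrapper above:

# Right: target the linked wrapper
tool.simulate(search, result="simulated")
tool.verify(search)

# Wrong: their_search isn't linked through @link_tool, so simulating it
# won't intercept anything - target the wrapper instead
# tool.simulate(their_search, result="simulated")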

Option 2: register() in conftest.py

Register third-party functions in a fixture so they're available across all tests:

# conftest.py
import pytest
from tenro.simulate import register
from third_party_lib import their_search

@pytest.fixture(autouse=True)
def _register_third_party():
    register(their_search)

# test_agents.py
from tenro.simulate import tool
from third_party_lib import their_search
import tenro

@tenro.simulate
def test_with_third_party_tool():
    tool.simulate(their_search, result="simulated")

    result = my_agent.run("query")

    assert result == "simulated"

Framework tools (LangChain, etc.)

The same register() approach works for tools provided by an agent framework: register the method your agent actually invokes, then simulate it in your tests:

# conftest.py
import pytest
from tenro.simulate import register
from langchain_community import tools as lc_tools

@pytest.fixture(autouse=True)
def _register_langchain():
    register(lc_tools.DuckDuckGoSearchRun.invoke)

# test_agents.py
from langchain_community import tools as lc_tools
from tenro.simulate import tool, llm
from tenro import Provider
import tenro

@tenro.simulate
def test_langchain_agent():
    tool.simulate(
        lc_tools.DuckDuckGoSearchRun.invoke,
        result="Search results about AI...",
    )
    llm.simulate(Provider.OPENAI, response="Summary of AI.")

    result = MyLangChainAgent().run("Research AI")

    assert "AI" in result

Limitations

Constraint          Details
CPython only        Uses CPython-specific internals
No closures         Functions with closure variables can't be registered
Pure Python only    C extensions and builtins can't be registered

If register() can't handle a function, it raises TenroSimulationSetupError with a specific reason.
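
For example, registering a closure should fail fast. A minimal sketch, assuming TenroSimulationSetupError is importable from the top-level tenro package:

# test_register_limits.py
import pytest
from tenro import TenroSimulationSetupError  # assumed import path
from tenro.simulate import register

def make_adder(n: int):
    def add(x: int) -> int:  # closes over n, so it can't be registered
        return x + n
    return add

def test_closure_cannot_be_registered():
    with pytest.raises(TenroSimulationSetupError):
        register(make_adder(3))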

For full details, see register() in the API reference.
