> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cekura.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Tool Call Testing

> Validate that your agent calls the right tools, with the right arguments, at the right time.

export const CopyPageButton = () => {
  // Renders nothing itself. In the browser it schedules (after 50 ms) the
  // injection of a "Copy page" split button into the docs content area. The
  // widget offers three actions: copy the page as Markdown, view that Markdown
  // in a new tab, and open Claude with a prompt that links to this page.
  // Returns null in every case; when `window` is undefined nothing is scheduled.
  if (typeof window === 'undefined') {
    return null;
  }
  setTimeout(() => {
    // Only one widget per page.
    if (document.getElementById('ck-tools')) return;
    const anchor = document.getElementById('content-area') || document.querySelector('.mdx-content');
    if (!anchor) return;
    if (!document.getElementById('ck-style')) {
      const style = document.createElement('style');
      style.id = 'ck-style';
      style.textContent = [
        '#ck-tools{position:absolute;top:6px;right:0;z-index:100;font-family:inherit;}',
        '.ck-row{display:inline-flex;align-items:stretch;border:1px solid rgba(0,0,0,0.15);border-radius:8px;overflow:hidden;background:#fff;}',
        ':root.dark .ck-row{background:rgba(255,255,255,0.06);border-color:rgba(255,255,255,0.12);}',
        '.ck-btn{padding:5px 12px;border:none;background:none;cursor:pointer;font-size:13px;font-weight:500;font-family:inherit;color:#374151;}',
        ':root.dark .ck-btn{color:#d1d5db;}',
        '.ck-btn:hover{background:rgba(0,0,0,0.04);}',
        ':root.dark .ck-btn:hover{background:rgba(255,255,255,0.06);}',
        '.ck-chevron{padding:5px 8px;border:none;background:none;cursor:pointer;font-size:14px;font-family:inherit;color:#374151;}',
        ':root.dark .ck-chevron{color:#d1d5db;}',
        '.ck-chevron:hover{background:rgba(0,0,0,0.04);}',
        ':root.dark .ck-chevron:hover{background:rgba(255,255,255,0.06);}',
        '.ck-divider{width:1px;background:rgba(0,0,0,0.12);flex-shrink:0;}',
        ':root.dark .ck-divider{background:rgba(255,255,255,0.12);}',
        '.ck-dd{position:absolute;top:calc(100% + 4px);right:0;min-width:180px;background:#fff;border:1px solid rgba(0,0,0,0.12);border-radius:8px;box-shadow:0 4px 12px rgba(0,0,0,0.1);padding:4px;display:none;z-index:200;}',
        ':root.dark .ck-dd{background:#1f2937;border-color:rgba(255,255,255,0.1);box-shadow:0 4px 16px rgba(0,0,0,0.35);}',
        '.ck-item{display:block;width:100%;padding:7px 12px;border:none;background:none;border-radius:6px;cursor:pointer;font-size:13px;font-family:inherit;text-align:left;color:#374151;}',
        ':root.dark .ck-item{color:#d1d5db;}',
        '.ck-item:hover{background:rgba(0,0,0,0.05);}',
        ':root.dark .ck-item:hover{background:rgba(255,255,255,0.07);}',
      ].join('');
      document.head.appendChild(style);
    }
    const wrap = document.createElement('div');
    wrap.id = 'ck-tools';
    const row = document.createElement('div');
    row.className = 'ck-row';
    const mainBtn = document.createElement('button');
    mainBtn.type = 'button';
    mainBtn.className = 'ck-btn';
    mainBtn.textContent = 'Copy page';
    const divider = document.createElement('span');
    divider.className = 'ck-divider';
    const chevron = document.createElement('button');
    chevron.type = 'button';
    chevron.className = 'ck-chevron';
    chevron.textContent = '▾';
    // The glyph alone carries no meaning for assistive technology.
    chevron.setAttribute('aria-label', 'More page actions');
    chevron.setAttribute('aria-haspopup', 'true');
    chevron.setAttribute('aria-expanded', 'false');
    const dd = document.createElement('div');
    dd.className = 'ck-dd';
    const closeDD = () => {
      dd.style.display = 'none';
      chevron.setAttribute('aria-expanded', 'false');
    };
    const openDD = () => {
      dd.style.display = 'block';
      chevron.setAttribute('aria-expanded', 'true');
    };
    chevron.onclick = (e) => {
      e.stopPropagation();
      if (dd.style.display === 'block') {
        closeDD();
      } else {
        openDD();
      }
    };
    // Document-level listeners outlive the widget when the content area is
    // replaced (e.g. client-side navigation). Each handler first checks that the
    // widget is still attached and unregisters both listeners if it is not, so
    // re-mounts do not accumulate handlers bound to detached nodes.
    const detachGlobalListeners = () => {
      document.removeEventListener('click', onDocumentClick);
      document.removeEventListener('keydown', onDocumentKeydown);
    };
    const onDocumentClick = (e) => {
      if (!wrap.isConnected) {
        detachGlobalListeners();
        return;
      }
      const target = e.target;
      // Not every event target is an Element, so feature-check closest().
      const isInside = Boolean(target) && typeof target.closest === 'function' && target.closest('#ck-tools');
      if (!isInside) {
        closeDD();
      }
    };
    const onDocumentKeydown = (e) => {
      if (!wrap.isConnected) {
        detachGlobalListeners();
        return;
      }
      if (e.key === 'Escape') {
        closeDD();
      }
    };
    document.addEventListener('click', onDocumentClick);
    document.addEventListener('keydown', onDocumentKeydown);
    // Builds one dropdown entry that runs `fn` and then closes the menu.
    const makeItem = (label, fn) => {
      const item = document.createElement('button');
      item.type = 'button';
      item.className = 'ck-item';
      item.textContent = label;
      item.onclick = () => {
        fn();
        closeDD();
      };
      return item;
    };
    const SKIPPED_TAGS = new Set(['script', 'style', 'svg', 'noscript', 'button', 'iframe']);
    // Serialises the rendered page into approximate Markdown by walking the DOM.
    const getMarkdown = () => {
      const walk = (node) => {
        if (!node) return '';
        if (node.nodeType === 3) return node.textContent || '';
        if (node.nodeType !== 1) return '';
        const tag = node.tagName.toLowerCase();
        if (SKIPPED_TAGS.has(tag) || node.id === 'ck-tools') return '';
        // <pre> is emitted verbatim, so there is no need to walk its children.
        if (tag === 'pre') return '\n```\n' + node.textContent.trim() + '\n```\n\n';
        const inner = Array.from(node.childNodes).map((child) => walk(child)).join('');
        // h1..h6 map to one to six leading '#' characters.
        const heading = /^h([1-6])$/.exec(tag);
        if (heading) return '\n' + '#'.repeat(Number(heading[1])) + ' ' + inner.trim() + '\n\n';
        if (tag === 'p') return '\n' + inner.trim() + '\n\n';
        if (tag === 'li') return '- ' + inner.trim() + '\n';
        if (tag === 'code') return '`' + inner.trim() + '`';
        return inner;
      };
      const content = document.querySelector('.mdx-content') || document.getElementById('content-area') || document.body;
      return walk(content).replace(/\n\n\n+/g, '\n\n').trim();
    };
    let resetTimer;
    // Shows a transient label on the main button, then restores the default.
    // Clearing the previous timer keeps rapid clicks from resetting it early.
    const flash = (label) => {
      mainBtn.textContent = label;
      clearTimeout(resetTimer);
      resetTimer = setTimeout(() => {
        mainBtn.textContent = 'Copy page';
      }, 2000);
    };
    // Fallback for contexts without the async Clipboard API: copy through a
    // temporary off-screen textarea. Returns true when the browser reports success.
    const legacyCopy = (text) => {
      const scratch = document.createElement('textarea');
      scratch.value = text;
      scratch.setAttribute('readonly', '');
      scratch.style.position = 'fixed';
      scratch.style.opacity = '0';
      document.body.appendChild(scratch);
      scratch.select();
      let succeeded = false;
      try {
        succeeded = document.execCommand('copy');
      } catch (err) {
        succeeded = false;
      }
      document.body.removeChild(scratch);
      return succeeded;
    };
    const copyMd = () => {
      const md = getMarkdown();
      const copyWithFallback = () => {
        flash(legacyCopy(md) ? 'Copied!' : 'Copy failed');
      };
      // navigator.clipboard is undefined outside secure contexts, and
      // writeText() can reject (permission denied, document not focused).
      if (navigator.clipboard && typeof navigator.clipboard.writeText === 'function') {
        navigator.clipboard.writeText(md).then(() => flash('Copied!'), copyWithFallback);
      } else {
        copyWithFallback();
      }
    };
    const viewMd = () => {
      const md = getMarkdown();
      // Escape '&' first so the entities produced for '<' and '>' are not re-escaped.
      const safe = md.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
      const html = '<!DOCTYPE html><html><head><meta charset="utf-8"><style>body{font-family:monospace;max-width:860px;margin:40px auto;padding:0 24px;line-height:1.7;white-space:pre-wrap;word-wrap:break-word}</style></head><body>' + safe + '</body></html>';
      const blob = new Blob([html], { type: 'text/html' });
      // The object URL is deliberately not revoked so the opened tab survives a reload.
      window.open(URL.createObjectURL(blob), '_blank');
    };
    const openClaude = () => {
      const prompt = 'Can you read this Cekura docs page ' + window.location.href + ' so I can ask you questions?';
      // 'noopener' keeps the external page from reaching back through window.opener.
      window.open('https://claude.ai/new?q=' + encodeURIComponent(prompt), '_blank', 'noopener');
    };
    mainBtn.onclick = copyMd;
    dd.appendChild(makeItem('Copy page', copyMd));
    dd.appendChild(makeItem('View as Markdown', viewMd));
    dd.appendChild(makeItem('Open in Claude', openClaude));
    row.appendChild(mainBtn);
    row.appendChild(divider);
    row.appendChild(chevron);
    wrap.appendChild(row);
    wrap.appendChild(dd);
    // The widget is absolutely positioned, so the anchor must be a positioning context.
    anchor.style.position = 'relative';
    anchor.insertBefore(wrap, anchor.firstChild);
  }, 50);
  return null;
};

<CopyPageButton />

## Overview

Most real voice agents don't just talk — they call tools: look up a customer, schedule an appointment, transfer to a human, end the call. If your agent says "I've booked your appointment" but never actually invoked the booking tool, the call is a failure no matter how good the transcript sounds.

Tool call testing on Cekura is about making sure:

1. Cekura can **see** the tool calls your agent made (names, arguments, results, latency).
2. You can **assert** against those tool calls — both "did it happen" and "did it happen correctly."

## Step 1 — Make Tool Calls Visible to Cekura

Before you can evaluate tool calls, Cekura needs the tool call events in the transcript and metadata for each call. There are two ways to get this:

### Option A — Use a Native Integration (Recommended)

If you're on one of our supported providers, you don't need to do anything special to capture tool calls from simulated test runs. Once your agent is connected to Cekura via the provider integration, Cekura automatically fetches the tool call names, arguments, results, and latency from the provider after each simulation and attaches them to the transcript.

Supported providers:

* [VAPI](/documentation/integrations/vapi/testing)
* [Retell](/documentation/integrations/retell/testing)
* [ElevenLabs](/documentation/integrations/elevenlabs/testing)
* [LiveKit](/documentation/integrations/livekit/testing)
* [Pipecat](/documentation/integrations/pipecat/automated)

### Option B — Send the Transcript Yourself

If you're on a custom stack or a provider we don't natively support, send the call transcript (including tool call invocations and results) to Cekura directly via the observability API. See:

* [Custom Integration](/documentation/integrations/custom-integration) — how to send calls programmatically
* [Transcript Format](/documentation/advanced/transcript-format) — the exact shape of `tool_call`, `tool_call_invocation`, and `tool_call_result` entries Cekura expects

<Note>
  The transcript format supports both OpenAI-style (`toolCalls` / `tool_call_result`) and generic (`tool_calls` / `tool_results`) representations. As long as tool names, arguments, and results are included, Cekura can evaluate against them.
</Note>

## Step 2 — Test Tool Calls in Evaluators

Once tool calls are flowing into Cekura, you can validate them in two ways.

### Assertion Approach 1 — Expected Outcome

The simplest way to test a tool call is to write the expectation into the evaluator's **expected outcome**. For example:

> The agent must call the `book_appointment` tool with the caller's requested date and time before confirming the booking verbally.

Cekura's expected-outcome judge has access to the full transcript including tool calls, and will fail the evaluator if the tool wasn't invoked (or was invoked with the wrong arguments).

**Use this when:**

* The tool call is part of a specific workflow scenario
* You want a single pass/fail signal per evaluator
* The assertion is naturally expressible in prose

### Assertion Approach 2 — Custom Metrics

For assertions you want to run across **many** evaluators, or that need structured logic (exact argument matching, ordering, counting, etc.), use a custom metric instead:

* [LLM-Judge Metric](/documentation/key-concepts/metrics/llm-judge-metric) — write a prompt that inspects the tool calls and returns pass/fail or a score
* [Python Metric](/documentation/key-concepts/metrics/python-metric) — write code that parses `tool_calls` out of the transcript and makes deterministic assertions

**Use this when:**

* You want to enforce "never call `transfer_to_human` before verifying identity" across every call
* You need exact matching on tool arguments
* You want to track tool-call reliability as a metric over time on a [Dashboard](/documentation/guides/dashboards)

## Putting It Together

<Steps>
  <Step title="Wire up visibility">
    Either connect a native integration or start sending transcripts via the custom integration. Confirm you can see tool call entries in the Cekura transcript viewer for a sample call.
  </Step>

  <Step title="Pick the right assertion mechanism">
    For per-scenario checks, put the tool call requirement into the evaluator's expected outcome. For cross-cutting rules, build a custom LLM-judge or Python metric.
  </Step>

  <Step title="Run and iterate">
    Run the evaluators, inspect failing transcripts, and refine either the agent prompt or the assertion as needed.
  </Step>
</Steps>

## Recommended: Set Up Mock Tools

For reliable tool call testing, we strongly recommend configuring [Mock Tools](/documentation/key-concepts/evaluators/mock-tools) before running evaluators at scale. Mock tools let you return predefined responses for each tool call during a simulation, which gives you:

* **Deterministic tests** — the same evaluator produces the same tool responses on every run, so a failure is a real agent failure and not a flaky backend
* **No backend dependency** — your tests don't hit production systems, don't burn quota on third-party APIs, and don't require network access to internal services
* **Edge-case coverage** — you can force a tool to return an error, a timeout, or an unusual payload to test how the agent handles it

Without mock tools, tool call testing still works, but every failure becomes ambiguous — was it the agent, or was it the backend? Mocking removes that ambiguity.

## Troubleshooting

<AccordionGroup>
  <Accordion title="Tool calls work in text simulation but not in voice testing">
    This is a known behavior difference between Retell's native text simulation and Cekura's voice tester. Two common causes:

    **1. Voice Activity Detection (VAD) triggering early.** In voice mode, VAD may detect a brief silence or noise and interrupt the agent mid-flow, before the tool call is dispatched. The agent announces the intent ("Let me check those appointments") but the actual tool invocation is cut off. This doesn't happen in text/chat mode because there's no VAD.

    **2. Different model checkpoints in voice vs. text.** Some providers use slightly different model configurations for voice calls versus native text simulations, which can affect how reliably tool calls are issued.

    **Recommended steps:**

    * Run the same evaluator in **Chat Mode** within Cekura. If the tool call executes in chat but not in voice, the issue is voice-specific (VAD or model checkpoint).
    * Check your provider's logs for the reproduced run. If you've configured the Retell integration in Cekura, you can inspect the logs directly from the run view.
    * Tighten your agent prompt to be explicit about tool calls — for example, instruct the agent to only continue speaking after a tool call has returned a result.
    * Add a **Tool Call Hallucination** metric to your evaluator to automatically detect cases where the agent claims to invoke a tool but doesn't.
    * If the problem persists, consider trying an OpenAI model (GPT-4o, GPT-4.1) — these have generally shown more reliable tool call execution in voice contexts than Gemini.
  </Accordion>
</AccordionGroup>

## Related Resources

* [Mock Tools](/documentation/key-concepts/evaluators/mock-tools) — Stub tool responses for reproducible tests
* [Transcript Format](/documentation/advanced/transcript-format) — How tool calls appear in the transcript
* [LLM-Judge Metric](/documentation/key-concepts/metrics/llm-judge-metric) — Prompt-based assertions over tool calls
* [Python Metric](/documentation/key-concepts/metrics/python-metric) — Deterministic assertions over tool calls
* [Custom Integration](/documentation/integrations/custom-integration) — Send transcripts from any stack