> ## Documentation Index
> Fetch the complete documentation index at: https://docs.cekura.ai/llms.txt
> Use this file to discover all available pages before exploring further.

# Runs & Results

> Trigger evaluations and inspect outcomes

export const CopyPageButton = () => {
  // Renders nothing itself. In the browser it schedules a one-time DOM injection of a
  // "Copy page" split button (with a dropdown) into the docs content container.
  // Returns null in every case, so it is safe to evaluate where `window` is undefined.
  if (typeof window === 'undefined') return null;
  // Deferred by 50 ms, as in the original, before looking up the content container.
  setTimeout(() => {
    // Idempotence guard: never inject a second toolbar.
    if (document.getElementById('ck-tools')) return;
    const anchor = document.getElementById('content-area') || document.querySelector('.mdx-content');
    if (!anchor) return;
    if (!document.getElementById('ck-style')) {
      const s = document.createElement('style');
      s.id = 'ck-style';
      s.textContent = '#ck-tools{position:absolute;top:6px;right:0;z-index:100;font-family:inherit;}' + '.ck-row{display:inline-flex;align-items:stretch;border:1px solid rgba(0,0,0,0.15);border-radius:8px;overflow:hidden;background:#fff;}' + ':root.dark .ck-row{background:rgba(255,255,255,0.06);border-color:rgba(255,255,255,0.12);}' + '.ck-btn{padding:5px 12px;border:none;background:none;cursor:pointer;font-size:13px;font-weight:500;font-family:inherit;color:#374151;}' + ':root.dark .ck-btn{color:#d1d5db;}' + '.ck-btn:hover{background:rgba(0,0,0,0.04);}' + ':root.dark .ck-btn:hover{background:rgba(255,255,255,0.06);}' + '.ck-chevron{padding:5px 8px;border:none;background:none;cursor:pointer;font-size:14px;font-family:inherit;color:#374151;}' + ':root.dark .ck-chevron{color:#d1d5db;}' + '.ck-chevron:hover{background:rgba(0,0,0,0.04);}' + ':root.dark .ck-chevron:hover{background:rgba(255,255,255,0.06);}' + '.ck-divider{width:1px;background:rgba(0,0,0,0.12);flex-shrink:0;}' + ':root.dark .ck-divider{background:rgba(255,255,255,0.12);}' + '.ck-dd{position:absolute;top:calc(100% + 4px);right:0;min-width:180px;background:#fff;border:1px solid rgba(0,0,0,0.12);border-radius:8px;box-shadow:0 4px 12px rgba(0,0,0,0.1);padding:4px;display:none;z-index:200;}' + ':root.dark .ck-dd{background:#1f2937;border-color:rgba(255,255,255,0.1);box-shadow:0 4px 16px rgba(0,0,0,0.35);}' + '.ck-item{display:block;width:100%;padding:7px 12px;border:none;background:none;border-radius:6px;cursor:pointer;font-size:13px;font-family:inherit;text-align:left;color:#374151;}' + ':root.dark .ck-item{color:#d1d5db;}' + '.ck-item:hover{background:rgba(0,0,0,0.05);}' + ':root.dark .ck-item:hover{background:rgba(255,255,255,0.07);}';
      document.head.appendChild(s);
    }
    const wrap = document.createElement('div');
    wrap.id = 'ck-tools';
    const row = document.createElement('div');
    row.className = 'ck-row';
    const mainBtn = document.createElement('button');
    mainBtn.type = 'button';
    mainBtn.className = 'ck-btn';
    mainBtn.textContent = 'Copy page';
    const divider = document.createElement('span');
    divider.className = 'ck-divider';
    const chevron = document.createElement('button');
    chevron.type = 'button';
    chevron.className = 'ck-chevron';
    chevron.textContent = '▾';
    // The glyph alone carries no meaning for assistive technology.
    chevron.setAttribute('aria-label', 'More page actions');
    chevron.setAttribute('aria-haspopup', 'true');
    chevron.setAttribute('aria-expanded', 'false');
    const dd = document.createElement('div');
    dd.className = 'ck-dd';
    const closeDD = () => {
      dd.style.display = 'none';
      chevron.setAttribute('aria-expanded', 'false');
    };
    const openDD = () => {
      dd.style.display = 'block';
      chevron.setAttribute('aria-expanded', 'true');
    };
    chevron.onclick = (e) => {
      // Keep the document-level click handler from closing the menu immediately.
      e.stopPropagation();
      if (dd.style.display === 'block') {
        closeDD();
      } else {
        openDD();
      }
    };
    // Document-level listeners remove themselves once the toolbar has left the DOM
    // (e.g. after client-side navigation), so they do not accumulate across injections.
    function detachListeners() {
      document.removeEventListener('click', onDocClick);
      document.removeEventListener('keydown', onDocKeydown);
    }
    function onDocClick(e) {
      if (!wrap.isConnected) {
        detachListeners();
        return;
      }
      const target = e.target;
      // Non-Element targets have no closest(); treat them as outside clicks.
      if (!(target instanceof Element) || !target.closest('#ck-tools')) {
        closeDD();
      }
    }
    function onDocKeydown(e) {
      if (!wrap.isConnected) {
        detachListeners();
        return;
      }
      if (e.key === 'Escape') {
        closeDD();
      }
    }
    document.addEventListener('click', onDocClick);
    document.addEventListener('keydown', onDocKeydown);
    // Builds one dropdown entry that runs `fn` and then closes the menu.
    const makeItem = (label, fn) => {
      const b = document.createElement('button');
      b.type = 'button';
      b.className = 'ck-item';
      b.textContent = label;
      b.onclick = () => {
        fn();
        closeDD();
      };
      return b;
    };
    const SKIPPED_TAGS = ['script', 'style', 'svg', 'noscript', 'button', 'iframe'];
    // Converts the rendered page content into a rough Markdown string.
    const getMarkdown = () => {
      const walk = (node) => {
        if (!node) return '';
        if (node.nodeType === 3) return node.textContent || '';
        if (node.nodeType !== 1) return '';
        const tag = node.tagName.toLowerCase();
        if (SKIPPED_TAGS.includes(tag)) return '';
        if (node.id === 'ck-tools') return '';
        const ch = Array.from(node.childNodes).map(walk).join('');
        // h1–h6 map to the matching number of '#' (previously h4–h6 lost their level).
        const heading = /^h([1-6])$/.exec(tag);
        if (heading) return '\n' + '#'.repeat(Number(heading[1])) + ' ' + ch.trim() + '\n\n';
        if (tag === 'p') return '\n' + ch.trim() + '\n\n';
        // Use raw textContent for <pre> so nested <code> does not add backticks.
        if (tag === 'pre') return '\n```\n' + node.textContent.trim() + '\n```\n\n';
        if (tag === 'li') return '- ' + ch.trim() + '\n';
        if (tag === 'code') return '`' + ch.trim() + '`';
        return ch;
      };
      const content = document.querySelector('.mdx-content') || document.getElementById('content-area') || document.body;
      return walk(content).replace(/\n\n\n+/g, '\n\n').trim();
    };
    let resetTimer = null;
    // Shows a transient label on the main button, restoring 'Copy page' after 2 s.
    // A single timer is reused so rapid clicks cannot leave stale resets pending.
    const flashLabel = (text) => {
      mainBtn.textContent = text;
      clearTimeout(resetTimer);
      resetTimer = setTimeout(() => {
        mainBtn.textContent = 'Copy page';
      }, 2000);
    };
    // Fallback copy path for contexts where the async Clipboard API is unavailable
    // or rejects. Returns true when the browser reports the copy succeeded.
    const legacyCopy = (text) => {
      const ta = document.createElement('textarea');
      ta.value = text;
      ta.setAttribute('readonly', '');
      ta.style.position = 'fixed';
      ta.style.opacity = '0';
      document.body.appendChild(ta);
      ta.select();
      let ok = false;
      try {
        ok = document.execCommand('copy');
      } catch (err) {
        ok = false;
      }
      document.body.removeChild(ta);
      return ok;
    };
    const copyMd = () => {
      const md = getMarkdown();
      const fallback = () => {
        flashLabel(legacyCopy(md) ? 'Copied!' : 'Copy failed');
      };
      // navigator.clipboard can be undefined (e.g. non-secure contexts).
      if (!navigator.clipboard || !navigator.clipboard.writeText) {
        fallback();
        return;
      }
      navigator.clipboard.writeText(md).then(() => {
        flashLabel('Copied!');
      }, fallback);
    };
    const viewMd = () => {
      const md = getMarkdown();
      const safe = md.split('&').join('&amp;').split('<').join('&lt;').split('>').join('&gt;');
      const html = '<!DOCTYPE html><html><head><meta charset="utf-8"><style>body{font-family:monospace;max-width:860px;margin:40px auto;padding:0 24px;line-height:1.7;white-space:pre-wrap;word-wrap:break-word}</style></head><body>' + safe + '</body></html>';
      const url = URL.createObjectURL(new Blob([html], {
        type: 'text/html'
      }));
      window.open(url, '_blank');
      // Release the blob once the new tab has had ample time to load it.
      setTimeout(() => {
        URL.revokeObjectURL(url);
      }, 60000);
    };
    const openClaude = () => {
      const prompt = 'Can you read this Cekura docs page ' + window.location.href + ' so I can ask you questions?';
      // noopener: the external page must not get a window.opener handle to the docs.
      window.open('https://claude.ai/new?q=' + encodeURIComponent(prompt), '_blank', 'noopener');
    };
    mainBtn.onclick = copyMd;
    dd.appendChild(makeItem('Copy page', copyMd));
    dd.appendChild(makeItem('View as Markdown', viewMd));
    dd.appendChild(makeItem('Open in Claude', openClaude));
    row.appendChild(mainBtn);
    row.appendChild(divider);
    row.appendChild(chevron);
    wrap.appendChild(row);
    wrap.appendChild(dd);
    // The toolbar is absolutely positioned; only make the anchor a positioning
    // context when it is not one already, so existing positioning is not clobbered.
    if (window.getComputedStyle(anchor).position === 'static') {
      anchor.style.position = 'relative';
    }
    anchor.insertBefore(wrap, anchor.firstChild);
  }, 50);
  return null;
};

<CopyPageButton />

<Note>
  Setup steps and authentication are in the [Overview](/cli-sdk/overview). This page covers triggering runs and reading their output.
</Note>

A **run** is one execution of a scenario against an agent. A **result** is the parent batch (one trigger → many runs, one per scenario × personality combination). The CLI and SDK expose both.

## Pick a mode

The right command depends on the transport:

| Mode                                         | When to use                                                       |
| -------------------------------------------- | ----------------------------------------------------------------- |
| `text`                                       | Fast functional check — no audio, scenarios run as text exchanges |
| `voice`                                      | Outbound voice call via your provider (Vapi/Retell/etc.)          |
| `chirp`                                      | Cekura's hosted voice runner                                      |
| `livekit_v2` / `pipecat_v2`                  | Self-hosted LiveKit or Pipecat agents                             |
| `sip`                                        | SIP-based provider (e.g. Plivo, Twilio SIP)                       |
| `websocket`                                  | Custom WebSocket protocol                                         |
| `vapi_webrtc`, `retell_webrtc`, `elevenlabs` | Provider WebRTC paths                                             |

## Trigger a run

<Tabs>
  <Tab title="CLI">
    ```bash theme={null}
    # Text mode (fastest for iteration)
    cekura scenarios run-text \
      --agent-id 123 \
      --scenario-ids 1,2,3

    # Voice via Cekura's hosted runner
    cekura scenarios run-chirp \
      --agent-id 123 \
      --scenario-ids 1,2,3

    # Self-hosted LiveKit
    cekura scenarios run-livekit-v2 --from-file run.json
    ```

    Each command returns a JSON envelope with a top-level `result_id`.
  </Tab>

  <Tab title="SDK">
    ```python theme={null}
    from cekura import Cekura

    client = Cekura()

    # Text mode
    result = client.scenarios.run_text(
        agent_id=123,
        scenario_ids=[1, 2, 3],
    )
    result_id = result["result_id"]
    ```

    Other modes follow the same shape:

    ```python theme={null}
    client.scenarios.run_chirp(agent_id=123, scenario_ids=[1, 2, 3])
    client.scenarios.run_livekit_v2(agent_id=123, scenario_ids=[1, 2, 3], ...)
    client.scenarios.run_pipecat_v2(agent_id=123, scenario_ids=[1, 2, 3], ...)
    client.scenarios.run_voice(agent_id=123, scenario_ids=[1, 2, 3], ...)
    client.scenarios.run_sip(agent_id=123, scenario_ids=[1, 2, 3], ...)
    ```
  </Tab>
</Tabs>

## Poll status

<Tabs>
  <Tab title="CLI">
    ```bash theme={null}
    cekura runs list --result-id <result-id> --format json
    ```

    Loop in a shell script:

    ```bash theme={null}
    while true; do
      STATUSES=$(cekura runs list --result-id "$RESULT_ID" --format json | jq -r '.[].status' | sort -u)
      [[ "$STATUSES" =~ pending|running ]] || break
      sleep 5
    done
    ```
  </Tab>

  <Tab title="SDK">
    ```python theme={null}
    import time

    while True:
        runs = client.runs.list(result_id=result_id)
        statuses = [r["status"] for r in runs.get("results", runs)]
        if all(s in ("passed", "failed", "errored", "cancelled") for s in statuses):
            break
        time.sleep(5)

    print("done:", statuses)
    ```
  </Tab>
</Tabs>

## Inspect a run

<Tabs>
  <Tab title="CLI">
    ```bash theme={null}
    cekura runs get 5544
    cekura runs list --agent-id 123
    ```
  </Tab>

  <Tab title="SDK">
    ```python theme={null}
    run = client.runs.get(run_id=5544)
    print(run["status"])
    print(run["transcript"])
    for m in run["metric_results"]:
        print(m["metric_name"], m["value"])
    ```
  </Tab>
</Tabs>

## Live operations on in-progress runs

<Tabs>
  <Tab title="CLI">
    ```bash theme={null}
    # Get a live listen URL while a voice run is in progress
    cekura runs listen-url 5544

    # End an in-progress call
    cekura runs end-call 5544
    ```
  </Tab>

  <Tab title="SDK">
    ```python theme={null}
    client.runs.get_listen_url(run_id=5544)
    client.runs.end_call(run_id=5544)
    ```
  </Tab>
</Tabs>

## Vote on a metric result

Capture thumbs up/down feedback on a specific metric evaluation for a run, and optionally attach the expected value and free-text feedback. The run is marked as reviewed and the metric evaluation is updated. This feedback feeds the labs / metric-review workflow.

<Tabs>
  <Tab title="CLI">
    ```bash theme={null}
    cekura runs mark-metric-vote 5544 \
      --metric-id 55 \
      --thumbs-down \
      --expected-value 2 \
      --feedback "Agent missed the reschedule step"
    ```

    `--expected-value` is parsed as JSON when possible (`5`, `true`, `"foo"`), so numeric / boolean metrics get the right type.
  </Tab>

  <Tab title="SDK">
    ```python theme={null}
    client.runs.mark_metric_vote(
        run_id=5544,
        metric_id=55,
        thumbs_up=False,
        expected_value=2,
        feedback="Agent missed the reschedule step",
    )
    ```
  </Tab>
</Tabs>

## Expected outcome

If a scenario has an `expected_outcome`, you can (re)evaluate whether the run met it, then thumbs-vote on the outcome verdict the same way you vote on metric results.

<Tabs>
  <Tab title="SDK">
    ```python theme={null}
    # Re-score expected outcome for this run
    client.runs.run_expected_outcome(run_id=5544)

    # Thumbs vote on the outcome verdict
    client.runs.mark_expected_outcome_vote(
        run_id=5544,
        thumbs_up=True,
        feedback="Agent did escalate to a human",
    )
    ```
  </Tab>
</Tabs>

## Flag a critical-scenario verdict as wrong

<Tabs>
  <Tab title="SDK">
    ```python theme={null}
    client.runs.mark_critical_scenario_wrong(run_id=5544, scenario_id=42)
    client.runs.unmark_critical_scenario_wrong(run_id=5544, scenario_id=42)
    ```
  </Tab>
</Tabs>

## Improve the agent's prompt from run failures

Iterate on the agent's prompt using the failure pattern across recent runs.

<Tabs>
  <Tab title="SDK">
    ```python theme={null}
    # Background job — returns a progress_id you can poll
    job = client.runs.improve_prompt_bg(
        run_ids=[5544, 5545, 5546],
        agent=123,
        prompt="<current agent system prompt>",
    )
    client.runs.improve_prompt_progress(progress_id=job["progress_id"])

    # One-shot synchronous variant (smaller batches)
    client.runs.improve_prompt(run_ids=[5544], agent=123, prompt="<current>")

    # Just the categorized failure issues, without rewriting the prompt
    client.runs.improve_prompt_issues(run_ids=[5544, 5545], agent=123)
    ```
  </Tab>
</Tabs>

## Re-evaluate without re-running

If you change a metric prompt and want to score existing runs against the new definition:

<Tabs>
  <Tab title="CLI">
    ```bash theme={null}
    cekura run rerun 987 --metric-ids 55,56
    ```
  </Tab>

  <Tab title="SDK">
    ```python theme={null}
    client.results.rerun(result_id=987, metric_ids=[55, 56])
    ```
  </Tab>
</Tabs>

## Promote a run into a test set

Take a passing run and freeze it as a regression dataset:

<Tabs>
  <Tab title="CLI">
    ```bash theme={null}
    cekura test-sets create-from-run --run-id 5544 --name "regression-2024-04"
    ```
  </Tab>

  <Tab title="SDK">
    ```python theme={null}
    client.test_sets.create_from_run(run_id=5544, name="regression-2024-04")
    ```
  </Tab>
</Tabs>

## See also

<CardGroup cols={2}>
  <Card title="Evaluators" icon="vial" href="/cli-sdk/evaluators">
    The scenarios that runs execute.
  </Card>

  <Card title="Metrics" icon="database" href="/cli-sdk/metrics">
    Define how runs are scored.
  </Card>

  <Card title="Calls" icon="phone" href="/cli-sdk/calls">
    Production calls — same scoring engine, different input.
  </Card>

  <Card title="API Reference" icon="book" href="/api-reference">
    Full field reference for runs, results, and run-mode payloads.
  </Card>
</CardGroup>