{
  "generated_at": "2026-06-12",
  "status": "verified",
  "disclaimer": "Verified — figures are aggregates of a real, deterministic scored run against hand-verified golden truth; raw per-row responses (public split) and the run script are published for reproduction.",
  "license": "CC-BY-4.0",
  "note": "Each claim is pre-packaged with its snapshot id, n, CI, status and a canonical URL so an agent can cite the caveated claim instead of emitting an unverified one.",
  "claims": [
    {
      "claim_id": "ws-q3-hit5-leader",
      "metric": "hit_at_5",
      "subject": "Exa / SerpAPI (tie)",
      "value": 79.0,
      "unit": "percent",
      "n": 62,
      "ci95": [
        67.3,
        87.3
      ],
      "statement": "On web_search-2026-q3, Exa (79.0%) and SerpAPI (71.0%) are tied at the top of web-search hit@5 (95% Wilson CIs overlap; McNemar p=0.40 -> not separable at n=62), both clearly ahead of Brave and Tavily. Freshness, cost-per-correct and latency are not yet measured for this snapshot.",
      "snapshot_id": "web_search-2026-q3",
      "primitive": "web_search",
      "canonical": "https://arlenkumar.com/bench/leaderboards/web-search#verdict",
      "status": "verified",
      "review_state": "reproducible_pre_review",
      "safe_to_cite": true,
      "caveat": "Reported as a statistical TIE (Exa/SerpAPI), not a sole winner; vendor right-of-reply is pre-review (rows not yet sent); cost/freshness/latency not measured this snapshot."
    },
    {
      "claim_id": "ext-q2-fidelity-leader",
      "metric": "fidelity",
      "subject": "Exa",
      "value": 0.74,
      "unit": "score_0_1",
      "n": 150,
      "ci95": [
        0.708,
        0.777
      ],
      "statement": "On web_extraction-2026-q2, Exa leads field-level extraction fidelity at 0.74 (n=150; CI pending per-row release).",
      "snapshot_id": "web_extraction-2026-q2",
      "primitive": "web_extraction",
      "canonical": "https://arlenkumar.com/bench/leaderboards/web-extraction#verdict",
      "status": "verified",
      "review_state": "audited_pre_review",
      "safe_to_cite": true,
      "caveat": "Independently audited (20% human re-verification, 30/30 agreed); vendor right-of-reply is pre-review (rows not yet sent). The Exa lead over Firecrawl is statistically significant (non-overlapping 95% bootstrap CIs).",
      "ci_method": "bootstrap_2000_seed20260613_95"
    },
    {
      "claim_id": "ext-q2-cheapest",
      "metric": "cost_per_correct",
      "subject": "Exa",
      "value": 0.0047,
      "unit": "usd",
      "n": 150,
      "ci95": null,
      "statement": "On web_extraction-2026-q2, among the two vendors with archived native pricing (Exa, Firecrawl), Exa has the lower provisional cost-per-correct at $0.0047 (vs Firecrawl $0.0294); Tavily and Jina are not yet priced (rendered —, never 0).",
      "snapshot_id": "web_extraction-2026-q2",
      "primitive": "web_extraction",
      "canonical": "https://arlenkumar.com/bench/leaderboards/web-extraction#verdict",
      "status": "provisional",
      "review_state": "audited_pre_review",
      "safe_to_cite": false,
      "caveat": "Provisional: only Exa and Firecrawl have archived native pricing; Tavily/Jina render — until their pricing PDFs are archived. Not safe to cite as a measured cost ranking.",
      "note": "pricing PDF not yet archived; cost-per-correct is provisional"
    }
  ]
}
