{
  "snapshot_id": "web_search-2026-q3",
  "primitive": "web_search",
  "status": "verified",
  "disclaimer": "Verified — figures are aggregates of a real, deterministic scored run against hand-verified golden truth; raw per-row responses (public split) and the run script are published for reproduction.",
  "generated_at": "2026-06-12",
  "license": "CC-BY-4.0",
  "methodology": "https://arlenkumar.com/bench/methodology",
  "cohort_size": 62,
  "cohort_unit": "public verified queries (descriptive form, holdout excluded)",
  "holdout_fraction": 0.3,
  "content_hash": "f4f233ef2031c616f906b1a8d1e585028dbf2c75b6c56a809f508e16d7e5d901",
  "dash_semantics": "A null value (rendered as — on the site) means the metric is undefined, the vendor returned no coverage, or the metric has not yet been measured for this snapshot (status: pending). It is never zero, and is excluded from ranking and from aggregates.",
  "metrics": [
    {
      "key": "hit_at_1",
      "description": "Share of golden queries whose normalized golden URL appears in the vendor's top 1 result.",
      "direction": "higher_better",
      "unit": "percent",
      "statistic": "proportion"
    },
    {
      "key": "hit_at_5",
      "description": "Share of golden queries whose normalized golden URL appears in the vendor's top 5 results.",
      "direction": "higher_better",
      "unit": "percent",
      "statistic": "proportion"
    },
    {
      "key": "fresh_lt_30d",
      "description": "hit@5 restricted to queries whose golden page was published in the last 30 days.",
      "direction": "higher_better",
      "unit": "percent",
      "statistic": "proportion"
    },
    {
      "key": "retrievability_h",
      "description": "Kaplan-Meier median hours from sentinel-page publication to first appearance in vendor results.",
      "direction": "lower_better",
      "unit": "hours",
      "statistic": "km_median",
      "dash_semantics": "vendor returns no timestamped freshness signal; excluded from ranking, never scored 0."
    },
    {
      "key": "cost_per_correct",
      "description": "Billed spend on the disclosed pinned plan divided by the number of verified-correct answers.",
      "direction": "lower_better",
      "unit": "usd",
      "statistic": "ratio",
      "dash_semantics": "zero correct answers; cost-per-correct is undefined, not infinite."
    },
    {
      "key": "p50_latency_ms",
      "description": "Median end-to-end response latency over the cohort.",
      "direction": "lower_better",
      "unit": "ms",
      "statistic": "quantile"
    }
  ],
  "agent_ready": [],
  "vendors": {
    "exa": {
      "vendor": "exa",
      "display_name": "Exa",
      "rank": 1,
      "n": 62,
      "status": "verified",
      "agent_ready": false,
      "overfit_gap_pp": null,
      "metrics": {
        "hit_at_1": {
          "value": 61.3,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            48.9,
            72.4
          ],
          "ci_method": "wilson_score_95"
        },
        "hit_at_5": {
          "value": 79.0,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            67.3,
            87.3
          ],
          "ci_method": "wilson_score_95"
        },
        "fresh_lt_30d": {
          "value": null,
          "unit": "percent",
          "status": "pending",
          "dash_semantics": "Not measured in fresh<30d % for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "retrievability_h": {
          "value": null,
          "unit": "hours",
          "status": "pending",
          "dash_semantics": "Not measured in retrievability h for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "cost_per_correct": {
          "value": null,
          "unit": "usd",
          "status": "pending",
          "dash_semantics": "Not measured in cost/correct $ for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "p50_latency_ms": {
          "value": null,
          "unit": "ms",
          "status": "pending",
          "dash_semantics": "Not measured in p50 latency ms for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        }
      }
    },
    "serpapi": {
      "vendor": "serpapi",
      "display_name": "SerpAPI",
      "rank": 2,
      "n": 62,
      "status": "verified",
      "agent_ready": false,
      "overfit_gap_pp": null,
      "metrics": {
        "hit_at_1": {
          "value": 58.1,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            45.7,
            69.6
          ],
          "ci_method": "wilson_score_95"
        },
        "hit_at_5": {
          "value": 71.0,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            58.7,
            80.8
          ],
          "ci_method": "wilson_score_95"
        },
        "fresh_lt_30d": {
          "value": null,
          "unit": "percent",
          "status": "pending",
          "dash_semantics": "Not measured in fresh<30d % for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "retrievability_h": {
          "value": null,
          "unit": "hours",
          "status": "pending",
          "dash_semantics": "Not measured in retrievability h for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "cost_per_correct": {
          "value": null,
          "unit": "usd",
          "status": "pending",
          "dash_semantics": "Not measured in cost/correct $ for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "p50_latency_ms": {
          "value": null,
          "unit": "ms",
          "status": "pending",
          "dash_semantics": "Not measured in p50 latency ms for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        }
      }
    },
    "brave": {
      "vendor": "brave",
      "display_name": "Brave",
      "rank": 3,
      "n": 62,
      "status": "verified",
      "agent_ready": false,
      "overfit_gap_pp": null,
      "metrics": {
        "hit_at_1": {
          "value": 45.2,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            33.5,
            57.5
          ],
          "ci_method": "wilson_score_95"
        },
        "hit_at_5": {
          "value": 54.8,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            42.5,
            66.5
          ],
          "ci_method": "wilson_score_95"
        },
        "fresh_lt_30d": {
          "value": null,
          "unit": "percent",
          "status": "pending",
          "dash_semantics": "Not measured in fresh<30d % for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "retrievability_h": {
          "value": null,
          "unit": "hours",
          "status": "pending",
          "dash_semantics": "Not measured in retrievability h for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "cost_per_correct": {
          "value": null,
          "unit": "usd",
          "status": "pending",
          "dash_semantics": "Not measured in cost/correct $ for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "p50_latency_ms": {
          "value": null,
          "unit": "ms",
          "status": "pending",
          "dash_semantics": "Not measured in p50 latency ms for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        }
      }
    },
    "tavily": {
      "vendor": "tavily",
      "display_name": "Tavily",
      "rank": 4,
      "n": 62,
      "status": "verified",
      "agent_ready": false,
      "overfit_gap_pp": null,
      "metrics": {
        "hit_at_1": {
          "value": 29.0,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            19.2,
            41.3
          ],
          "ci_method": "wilson_score_95"
        },
        "hit_at_5": {
          "value": 43.5,
          "unit": "percent",
          "status": "verified",
          "ci95": [
            31.9,
            55.9
          ],
          "ci_method": "wilson_score_95"
        },
        "fresh_lt_30d": {
          "value": null,
          "unit": "percent",
          "status": "pending",
          "dash_semantics": "Not measured in fresh<30d % for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "retrievability_h": {
          "value": null,
          "unit": "hours",
          "status": "pending",
          "dash_semantics": "Not measured in retrievability h for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "cost_per_correct": {
          "value": null,
          "unit": "usd",
          "status": "pending",
          "dash_semantics": "Not measured in cost/correct $ for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        },
        "p50_latency_ms": {
          "value": null,
          "unit": "ms",
          "status": "pending",
          "dash_semantics": "Not measured in p50 latency ms for this snapshot — the instrument has not been run yet. Pending, never zero, and excluded from ranking."
        }
      }
    }
  },
  "exclusions": [
    {
      "vendor": "Serper",
      "reason": "not yet scored on web_search this snapshot"
    },
    {
      "vendor": "Firecrawl",
      "reason": "not yet scored on web_search this snapshot"
    }
  ],
  "provenance": {
    "dataset": "GoldenEvalWebSearch engine — data/results/results.jsonl",
    "snapshot": "web_search-2026-q3 (public split, descriptive form)",
    "n": 62,
    "holdout": "30% private holdout excluded from these figures",
    "measured_metrics": [
      "hit_at_1",
      "hit_at_5"
    ],
    "metric_model": "hit@k = share of golden queries whose normalized golden URL appears in the vendor's top-k results.",
    "primary_pair": "exa vs serpapi: hit@5 79.0% vs 71.0%; 95% Wilson CIs overlap; McNemar p=0.40 -> not separable at n=62, reported as a TIE at the top.",
    "open_gates": [
      "freshness lag — sentinel pages minted, awaiting first crawl",
      "cost-per-correct & latency — metering not yet run",
      "small n — required n≈338 for the exa/serpapi pair at 80% power; top pair reported as a tie"
    ]
  },
  "reproduction": {
    "data_files": {
      "rows": "https://arlenkumar.com/bench/api/web_search-2026-q3-rows.json",
      "results": "https://arlenkumar.com/bench/api/web_search-2026-q3-results.json"
    },
    "rows_description": "62 public golden rows scored: each carries the query sent, equivalence_members (accepted golden URLs), truth_token, token_depth, stratum and slices.",
    "results_description": "248 per-(row,vendor) outcomes: hit@1, hit@5, first_rank, miss_reason, sentinel_verdict, promoted, n_results.",
    "holdout_excluded": true,
    "salt_published": false,
    "scoring_rule": "deterministic set membership after URL normalization + truth-token check",
    "method": "Re-issue each row's descriptive query to a vendor, normalize the returned URLs, and check set membership against equivalence_members. Alternatively, audit the recorded results directly. No LLM judge.",
    "pipeline": [
      "pump",
      "verify",
      "split",
      "reslice",
      "probe",
      "report"
    ],
    "engine": "GoldenEvalWebSearch"
  }
}
