{
  "cohort_size": 12,
  "consumer_tasks": [
    "page_extraction"
  ],
  "content_hash": "ab22ff86b0db7532eb4be52f267177e9ea99e98a747aae94e427a0813eb43277",
  "exclusions": [
    {
      "reason": "ToS forbids publishing comparative benchmark results.",
      "vendor": "epsilon_ai"
    }
  ],
  "license": "CC-BY-4.0",
  "metrics": [
    {
      "dash_semantics": "vendor made no scoreable attempt on any item (all blocked/empty)",
      "description": "correct / (correct + incorrect)",
      "direction": "higher_better",
      "key": "accuracy",
      "unit": null
    },
    {
      "dash_semantics": "no rows attempted",
      "description": "scoreable attempts / cohort rows",
      "direction": "higher_better",
      "key": "coverage",
      "unit": null
    },
    {
      "dash_semantics": "zero correct answers \u2014 cost-per-correct is undefined, not infinite",
      "description": "total billed cost / correct answers",
      "direction": "lower_better",
      "key": "cost_per_correct",
      "unit": "usd"
    },
    {
      "dash_semantics": "either split had no scoreable attempts",
      "description": "public accuracy minus holdout accuracy",
      "direction": "lower_better",
      "key": "overfit_gap",
      "unit": null
    }
  ],
  "primitive": "web_extraction",
  "snapshot_id": "web_extraction-2026-q2",
  "vendors": {
    "alpha_extract": {
      "accuracy": 1.0,
      "cost_per_correct": 0.0012,
      "coverage": 1.0,
      "holdout_accuracy": 1.0,
      "mean_latency_ms": 0.0009305755763004223,
      "n": 12,
      "overfit_gap": 0.0,
      "public_accuracy": 1.0,
      "state_counts": {
        "blocked": 0,
        "correct": 12,
        "fetch_failed": 0,
        "incorrect": 0,
        "no_coverage": 0,
        "timeout": 0
      },
      "tos_status": "cleared",
      "total_cost_usd": 0.0144,
      "vendor": "mock_perfect"
    },
    "beta_scrape": {
      "accuracy": 1.0,
      "cost_per_correct": 0.0013333333333333333,
      "coverage": 0.25,
      "holdout_accuracy": 1.0,
      "mean_latency_ms": 0.002232729457318783,
      "n": 12,
      "overfit_gap": 0.0,
      "public_accuracy": 1.0,
      "state_counts": {
        "blocked": 3,
        "correct": 3,
        "fetch_failed": 2,
        "incorrect": 0,
        "no_coverage": 2,
        "timeout": 2
      },
      "tos_status": "cleared",
      "total_cost_usd": 0.004,
      "vendor": "mock_faulty"
    },
    "delta_parse": {
      "accuracy": 1.0,
      "cost_per_correct": 0.0,
      "coverage": 0.25,
      "holdout_accuracy": 1.0,
      "mean_latency_ms": 0.001076521584764123,
      "n": 12,
      "overfit_gap": 0.0,
      "public_accuracy": 1.0,
      "state_counts": {
        "blocked": 3,
        "correct": 3,
        "fetch_failed": 2,
        "incorrect": 0,
        "no_coverage": 2,
        "timeout": 2
      },
      "tos_status": "static_rubric_only",
      "total_cost_usd": 0.0,
      "vendor": "mock_faulty"
    },
    "gamma_render": {
      "accuracy": 1.0,
      "cost_per_correct": 0.004166666666666667,
      "coverage": 0.25,
      "holdout_accuracy": 1.0,
      "mean_latency_ms": 0.0011907444180299838,
      "n": 12,
      "overfit_gap": 0.0,
      "public_accuracy": 1.0,
      "state_counts": {
        "blocked": 3,
        "correct": 3,
        "fetch_failed": 2,
        "incorrect": 0,
        "no_coverage": 2,
        "timeout": 2
      },
      "tos_status": "cleared",
      "total_cost_usd": 0.0125,
      "vendor": "mock_faulty"
    }
  }
}