{
  "generated_at": "2026-05-20T17:38:31Z",
  "host": "shared-cpu-host",
  "n_cells": 10,
  "n_skipped": 2,
  "cells": [
    {
      "cell_id": "gemma-4-e4b_ikllama",
      "model_id": "gemma-4-e4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": 7.62,
      "prompt_eval_tps": 38.58,
      "p50_ms": 5958.74,
      "p95_ms": 9094.69,
      "overall_pass": 85.71,
      "format_pass_rate": 100.0,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "gemma-4-e4b_std",
      "model_id": "gemma-4-e4b_std",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": 8.59,
      "prompt_eval_tps": 31.16,
      "p50_ms": 6240.05,
      "p95_ms": 9544.62,
      "overall_pass": 94.29,
      "format_pass_rate": 94.29,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "phi-4-mini_std",
      "model_id": "phi-4-mini_std",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": 10.62,
      "prompt_eval_tps": 32.08,
      "p50_ms": 7517.19,
      "p95_ms": 20043.15,
      "overall_pass": 0.0,
      "format_pass_rate": 0.0,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "phi-4-mini_std_workaround",
      "model_id": "phi-4-mini",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": null,
      "prompt_eval_tps": null,
      "p50_ms": 7983.35,
      "p95_ms": 19645.79,
      "overall_pass": 74.29,
      "format_pass_rate": 77.14,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "phi-4-mini_tbq3",
      "model_id": "phi-4-mini_tbq3",
      "weight_quant": "Q4_K_M",
      "kv_quant": "tbq3_0",
      "gen_eval_tps": null,
      "prompt_eval_tps": null,
      "p50_ms": 26454.47,
      "p95_ms": 116384.59,
      "overall_pass": 51.43,
      "format_pass_rate": 51.43,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "qwen3.5-4b_ikllama",
      "model_id": "qwen3.5-4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": 8.37,
      "prompt_eval_tps": 58.08,
      "p50_ms": 8977.07,
      "p95_ms": 13373.52,
      "overall_pass": 82.86,
      "format_pass_rate": 100.0,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "qwen3.5-4b_specdec",
      "model_id": "qwen3.5-4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": null,
      "prompt_eval_tps": null,
      "p50_ms": 20376.55,
      "p95_ms": 25534.54,
      "overall_pass": 88.57,
      "format_pass_rate": 91.43,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "qwen3.5-4b_specdec",
      "model_id": "qwen3.5-4b",
      "weight_quant": null,
      "kv_quant": null,
      "gen_eval_tps": null,
      "prompt_eval_tps": null,
      "p50_ms": null,
      "p95_ms": null,
      "overall_pass": 0.0,
      "format_pass_rate": 0.0,
      "n_cases": 1,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "qwen3.5-4b_std",
      "model_id": "qwen3.5-4b_std",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": 9.79,
      "prompt_eval_tps": 35.85,
      "p50_ms": 13738.63,
      "p95_ms": 19325.27,
      "overall_pass": 91.43,
      "format_pass_rate": 97.14,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    },
    {
      "cell_id": "qwen3.5-4b_tbq3",
      "model_id": "qwen3.5-4b_tbq3",
      "weight_quant": "Q4_K_M",
      "kv_quant": "tbq3_0",
      "gen_eval_tps": 4.26,
      "prompt_eval_tps": 24.33,
      "p50_ms": 30008.35,
      "p95_ms": 72045.83,
      "overall_pass": 74.29,
      "format_pass_rate": 88.57,
      "n_cases": 35,
      "status": "complete",
      "skip_reason": null
    }
  ],
  "skipped": [
    {
      "cell_id": "gemma-4-e4b_tbq3",
      "model_id": "gemma-4-e4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "tbq3_0",
      "gen_eval_tps": null,
      "prompt_eval_tps": null,
      "p50_ms": null,
      "p95_ms": null,
      "overall_pass": null,
      "format_pass_rate": null,
      "n_cases": 0,
      "status": "skipped",
      "skip_reason": "PR #21089 branch is based on an older llama.cpp commit (ggml version 0.9.8) that predates Gemma-4 architecture support. Loading gemma-4-E4B-it-Q4_K_M.gguf fails with: \"error loading model architecture: unknown model architecture: 'gemma4'\". Supported gemma archs in this build: gemma, gemma2, gemma3, gemma3n. The std cell ran on ghcr.io/ggml-org/llama.cpp:full which is a newer build with gemma4 support. To unblock this cell the PR would need to be rebased on a current llama.cpp master that has gemma4 weights support, OR use a gemma3 model instead."
    },
    {
      "cell_id": "phi-4-mini_ikllama",
      "model_id": "phi-4-mini",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "gen_eval_tps": null,
      "prompt_eval_tps": null,
      "p50_ms": null,
      "p95_ms": null,
      "overall_pass": null,
      "format_pass_rate": null,
      "n_cases": 0,
      "status": "skipped",
      "skip_reason": "ik_llama.cpp@40254a5 cannot load Phi-4-mini-instruct-Q4_K_M.gguf \u2014 model loader errors with: check_tensor_dims: tensor output.weight not found. Phi-4 uses tied embeddings (the input embedding matrix doubles as the output projection), and ik_llama.cpp master does not yet handle that tensor layout. The same GGUF loads fine on stock llama.cpp:full and on PR #21089. To unblock this cell ik_llama would need to add tied-embedding support, OR use a Phi-4 GGUF rebuild that exposes a separate output.weight."
    }
  ],
  "full": [
    {
      "cell_id": "gemma-4-e4b_ikllama",
      "model_id": "gemma-4-e4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ikawrakow/ik_llama.cpp@40254a5 (CPU-only AVX2, IQK matmul + IQK FA kernels)",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": 38.58,
        "gen_eval_tps": 7.62
      },
      "memory": {
        "peak_rss_str": null
      },
      "latency_ms": {
        "p50": 5958.74,
        "p95": 9094.69,
        "mean": 5885.94
      },
      "tool_calling": {
        "format_pass_rate": 100.0,
        "function_accuracy": 85.71,
        "argument_accuracy": 85.71,
        "overall_pass": 85.71
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 100.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 0.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 100.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-20T12:32:26Z",
      "duration_sec": 208.02,
      "notes": "Gemma-4-E4B-it on ik_llama.cpp: only 1.05x faster end-to-end p50 (5959 vs 6240 ms) \u2014 Gemma was already efficient on stock llama.cpp. Same parallel-call collapse as Qwen on ik_llama (100% -> 0%) \u2014 points at a master-branch chat-template bug, not a model-specific issue."
    },
    {
      "cell_id": "gemma-4-e4b_std",
      "model_id": "gemma-4-e4b_std",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ghcr.io/ggml-org/llama.cpp:full",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": 31.16,
        "gen_eval_tps": 8.59
      },
      "memory": {
        "peak_rss_str": "2.781GiB"
      },
      "latency_ms": {
        "p50": 6240.05,
        "p95": 9544.62,
        "mean": 5918.86
      },
      "tool_calling": {
        "format_pass_rate": 94.29,
        "function_accuracy": 94.29,
        "argument_accuracy": 94.29,
        "overall_pass": 94.29
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 100.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 100.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 90.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-19T20:30:03Z",
      "duration_sec": 215.35
    },
    {
      "cell_id": "gemma-4-e4b_tbq3",
      "model_id": "gemma-4-e4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "tbq3_0",
      "llamacpp_variant": "elusznik/llama.cpp#21089@0aae7d78c7e1c3029cebdbe4c318704d4057c18e",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": null,
        "gen_eval_tps": null
      },
      "memory": {
        "peak_rss_str": null
      },
      "latency_ms": {
        "p50": null,
        "p95": null,
        "mean": null
      },
      "tool_calling": {
        "format_pass_rate": null,
        "function_accuracy": null,
        "argument_accuracy": null,
        "overall_pass": null
      },
      "by_category": {},
      "n_cases": 0,
      "started_at": "2026-05-20T02:30:54Z",
      "duration_sec": 0,
      "status": "skipped",
      "skip_reason": "PR #21089 branch is based on an older llama.cpp commit (ggml version 0.9.8) that predates Gemma-4 architecture support. Loading gemma-4-E4B-it-Q4_K_M.gguf fails with: \"error loading model architecture: unknown model architecture: 'gemma4'\". Supported gemma archs in this build: gemma, gemma2, gemma3, gemma3n. The std cell ran on ghcr.io/ggml-org/llama.cpp:full which is a newer build with gemma4 support. To unblock this cell the PR would need to be rebased on a current llama.cpp master that has gemma4 weights support, OR use a gemma3 model instead."
    },
    {
      "cell_id": "phi-4-mini_ikllama",
      "model_id": "phi-4-mini",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ikawrakow/ik_llama.cpp@40254a5",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": null,
        "gen_eval_tps": null
      },
      "memory": {
        "peak_rss_str": null
      },
      "latency_ms": {
        "p50": null,
        "p95": null,
        "mean": null
      },
      "tool_calling": {
        "format_pass_rate": null,
        "function_accuracy": null,
        "argument_accuracy": null,
        "overall_pass": null
      },
      "by_category": {},
      "n_cases": 0,
      "started_at": "2026-05-20T12:42:41Z",
      "duration_sec": 0,
      "status": "skipped",
      "skip_reason": "ik_llama.cpp@40254a5 cannot load Phi-4-mini-instruct-Q4_K_M.gguf \u2014 model loader errors with: check_tensor_dims: tensor output.weight not found. Phi-4 uses tied embeddings (the input embedding matrix doubles as the output projection), and ik_llama.cpp master does not yet handle that tensor layout. The same GGUF loads fine on stock llama.cpp:full and on PR #21089. To unblock this cell ik_llama would need to add tied-embedding support, OR use a Phi-4 GGUF rebuild that exposes a separate output.weight."
    },
    {
      "cell_id": "phi-4-mini_std",
      "model_id": "phi-4-mini_std",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ghcr.io/ggml-org/llama.cpp:full",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": 32.08,
        "gen_eval_tps": 10.62
      },
      "memory": {
        "peak_rss_str": "2.146GiB"
      },
      "latency_ms": {
        "p50": 7517.19,
        "p95": 20043.15,
        "mean": 8656.31
      },
      "tool_calling": {
        "format_pass_rate": 0.0,
        "function_accuracy": 0.0,
        "argument_accuracy": 0.0,
        "overall_pass": 0.0
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 0.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 0.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 0.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-19T20:34:45Z",
      "duration_sec": 302.67
    },
    {
      "cell_id": "phi-4-mini_std_workaround",
      "model_id": "phi-4-mini",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ghcr.io/ggml-org/llama.cpp:full + tools-in-system-prompt workaround",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": null,
        "gen_eval_tps": null
      },
      "memory": {
        "peak_rss_str": null
      },
      "latency_ms": {
        "p50": 7983.35,
        "p95": 19645.79,
        "mean": 9365.64
      },
      "tool_calling": {
        "format_pass_rate": 77.14,
        "function_accuracy": 77.14,
        "argument_accuracy": 74.29,
        "overall_pass": 74.29
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 90.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 0.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 85.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-19T20:42:56Z",
      "duration_sec": 332.44,
      "notes": "Same Q4_K_M weights + FP16 KV as phi-4-mini_std. The harness was run with a tools-in-system-prompt that explicitly tells the model to emit JSON of form {name, arguments}. This works around the GGUF chat-template gap where llama.cpp falls back to chat_format peg-native and does not surface tool schemas to Phi-4-mini natively."
    },
    {
      "cell_id": "phi-4-mini_tbq3",
      "model_id": "phi-4-mini_tbq3",
      "weight_quant": "Q4_K_M",
      "kv_quant": "tbq3_0",
      "llamacpp_variant": "elusznik/llama.cpp#21089@0aae7d78c7e1c3029cebdbe4c318704d4057c18e",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": null,
        "gen_eval_tps": null
      },
      "memory": {
        "peak_rss_str": "1.388GiB"
      },
      "latency_ms": {
        "p50": 26454.47,
        "p95": 116384.59,
        "mean": 38747.45
      },
      "tool_calling": {
        "format_pass_rate": 51.43,
        "function_accuracy": 51.43,
        "argument_accuracy": 51.43,
        "overall_pass": 51.43
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 70.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 0.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 55.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-20T02:36:52Z",
      "duration_sec": 1332.73,
      "notes": "llama-bench was run with -fa 1 (flash-attn) which is required for the tbq3_0 cache type at the model's full ctx (262144); the BFCL harness itself runs at ctx-size 4096 without -fa (matching the std-cell settings)."
    },
    {
      "cell_id": "qwen3.5-4b_ikllama",
      "model_id": "qwen3.5-4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ikawrakow/ik_llama.cpp@40254a5 (CPU-only AVX2, IQK matmul + IQK FA kernels)",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": 58.08,
        "gen_eval_tps": 8.37
      },
      "memory": {
        "peak_rss_str": null
      },
      "latency_ms": {
        "p50": 8977.07,
        "p95": 13373.52,
        "mean": 9463.25
      },
      "tool_calling": {
        "format_pass_rate": 100.0,
        "function_accuracy": 85.71,
        "argument_accuracy": 82.86,
        "overall_pass": 82.86
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 90.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 0.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 100.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-20T12:24:28Z",
      "duration_sec": 340.41,
      "notes": "ik_llama.cpp 1.53x faster end-to-end p50 vs stock llama.cpp on Qwen (8977 vs 13739 ms). Prompt eval is the source of the win: 58.08 vs 35.85 tps (1.62x). Generation tok/s is actually slightly slower (8.37 vs 9.79). Parallel-call accuracy collapses 80% -> 0% \u2014 likely a chat-template handling difference in ik_llama master. Simple + multi-function categories unaffected or improved (100% / 90%)."
    },
    {
      "cell_id": "qwen3.5-4b_specdec",
      "model_id": "qwen3.5-4b",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ghcr.io/ggml-org/llama.cpp:full + speculative decoding (draft=Qwen3.5-0.8B-Q4_K_M, --spec-draft-n-max 8 --spec-draft-n-min 2)",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": null,
        "gen_eval_tps": null
      },
      "memory": {
        "peak_rss_str": null
      },
      "latency_ms": {
        "p50": 20376.55,
        "p95": 25534.54,
        "mean": 20076.01
      },
      "tool_calling": {
        "format_pass_rate": 91.43,
        "function_accuracy": 91.43,
        "argument_accuracy": 88.57,
        "overall_pass": 88.57
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 90.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 80.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 90.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-20T03:38:35Z",
      "duration_sec": 686.09,
      "notes": "Speculative decoding on a 4B target with a 0.8B draft on 4-core CPU cgroup: p50 latency increased 1.48x vs std baseline (20.4s vs 13.7s). Overall accuracy dropped 2.9pp (88.6 vs 91.4). The CPU draft+verify cycle cost exceeds the savings at this target size \u2014 published 2.5x speedups are typically on 7B+ targets with much smaller drafts. Anti-pattern at the 4B-class scale."
    },
    {
      "cell_id": "qwen3.5-4b_specdec",
      "model_id": "qwen3.5-4b",
      "endpoint": "http://127.0.0.1:11434/v1",
      "tests_file": "bfcl_subset.json",
      "n_cases": 1,
      "started_at": "2026-05-20T03:02:03Z",
      "duration_sec": 2.02,
      "tool_calling": {
        "format_pass_rate": 0.0,
        "function_accuracy": 0.0,
        "argument_accuracy": 0.0,
        "overall_pass": 0.0
      },
      "latency_ms": {
        "p50": null,
        "p95": null,
        "mean": null
      },
      "by_category": {
        "simple": {
          "n": 1,
          "overall_pass": 0.0
        }
      },
      "results": [
        {
          "id": "simple_001",
          "category": "simple",
          "ok": false,
          "error": "HTTPError: HTTP Error 503: Service Unavailable",
          "elapsed_ms": 0.0,
          "calls": [],
          "score": {
            "format": false,
            "function": false,
            "argument": false,
            "overall": false
          }
        }
      ]
    },
    {
      "cell_id": "qwen3.5-4b_std",
      "model_id": "qwen3.5-4b_std",
      "weight_quant": "Q4_K_M",
      "kv_quant": "fp16",
      "llamacpp_variant": "ghcr.io/ggml-org/llama.cpp:full",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": 35.85,
        "gen_eval_tps": 9.79
      },
      "memory": {
        "peak_rss_str": "4.536GiB"
      },
      "latency_ms": {
        "p50": 13738.63,
        "p95": 19325.27,
        "mean": 14005.06
      },
      "tool_calling": {
        "format_pass_rate": 97.14,
        "function_accuracy": 97.14,
        "argument_accuracy": 91.43,
        "overall_pass": 91.43
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 90.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 80.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 95.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-19T20:20:51Z",
      "duration_sec": 495.06
    },
    {
      "cell_id": "qwen3.5-4b_tbq3",
      "model_id": "qwen3.5-4b_tbq3",
      "weight_quant": "Q4_K_M",
      "kv_quant": "tbq3_0",
      "llamacpp_variant": "elusznik/llama.cpp#21089@0aae7d78c7e1c3029cebdbe4c318704d4057c18e",
      "host": "shared-cpu-host",
      "throughput": {
        "prompt_eval_tps": 24.33,
        "gen_eval_tps": 4.26
      },
      "memory": {
        "peak_rss_str": "1.764GiB"
      },
      "latency_ms": {
        "p50": 30008.35,
        "p95": 72045.83,
        "mean": 35686.3
      },
      "tool_calling": {
        "format_pass_rate": 88.57,
        "function_accuracy": 74.29,
        "argument_accuracy": 74.29,
        "overall_pass": 74.29
      },
      "by_category": {
        "multiple_function": {
          "n": 10,
          "overall_pass": 80.0
        },
        "parallel": {
          "n": 5,
          "overall_pass": 0.0
        },
        "simple": {
          "n": 20,
          "overall_pass": 90.0
        }
      },
      "n_cases": 35,
      "started_at": "2026-05-20T02:06:35Z",
      "duration_sec": 1240.34,
      "notes": "llama-bench was run with -fa 1 (flash-attn) which is required for the tbq3_0 cache type at the model's full ctx (262144); the BFCL harness itself runs at ctx-size 4096 without -fa (matching the std-cell settings) and observed ~2.9 gen tok/s in server timings."
    }
  ]
}