feat : 新增 HumanEval pass@k 代码生成评测

- InferenceEngine.generate() 批量生成 n 个补全 - 正则提取函数体 + 停止符截断 - multiprocessing sandbox 执行 + timeout 保护 - 标准无偏 pass@k 公式 (1, 10, 100)
2026-06-03 10:52:32 +08:00 · 2026-06-03 10:52:32 +08:00 · 615ba5d8ef
parent 02a7cb9fa0
commit 615ba5d8ef
1 changed files with 336 additions and 0 deletions
--- a/scripts/tools/evaluate_humaneval.py
+++ b/scripts/tools/evaluate_humaneval.py
@ -0,0 +1,336 @@
+"""HumanEval code generation benchmark.
+
+Generates n completions per problem, extracts function bodies, executes
+against hidden tests, and computes pass@k.
+
+Usage::
+
+    python scripts/tools/evaluate_humaneval.py --param_path ./params \
+        --data_path HumanEval.jsonl.gz --output results.json \
+        --num_samples 200 --temperature 0.8 --max_tokens 512
+"""
+
+import argparse
+import json
+import os
+import re
+import signal
+import sys
+from math import prod
+from multiprocessing import Process, Queue
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import tqdm
+
+from astrai.inference import InferenceEngine
+from astrai.model import AutoModel
+from astrai.tokenize import AutoTokenizer
+
+HUMANEVAL_URL = (
+    "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
+)
+
+_STOP_SEQUENCES = [
+    "\nclass ",
+    "\ndef ",
+    "\n# ",
+    "\nif __name__",
+    "\nprint(",
+    "\n\n\n",
+]
+
+
+def _download_humaneval(data_path: str):
+    if os.path.exists(data_path):
+        return
+    import gzip
+    import urllib.request
+
+    os.makedirs(os.path.dirname(data_path) or ".", exist_ok=True)
+    print(f"Downloading HumanEval from {HUMANEVAL_URL} ...")
+    tmp = data_path + ".tmp"
+    urllib.request.urlretrieve(HUMANEVAL_URL, tmp)
+    with gzip.open(tmp, "rb") as f_in:
+        with open(data_path, "wb") as f_out:
+            f_out.write(f_in.read())
+    os.remove(tmp)
+    print(f"  saved to {data_path}")
+
+
+def _load_problems(data_path: str) -> List[dict]:
+    problems = []
+    with open(data_path, "r", encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                problems.append(json.loads(line))
+    return problems
+
+
+def _extract_function_body(code: str, entry_point: str) -> Optional[str]:
+    """Extract the function body from a completion."""
+    pattern = rf"def\s+{re.escape(entry_point)}\b[^:]*:"
+    match = re.search(pattern, code)
+    if not match:
+        # Use the full code as-is if we can't find the function
+        return code
+
+    body_start = match.end()
+    lines = code[body_start:].split("\n")
+    body_lines = []
+    started = False
+
+    for line in lines:
+        stripped = line.rstrip()
+        if not stripped and not started:
+            continue
+        if not stripped and started:
+            body_lines.append("")
+            continue
+        if not started:
+            started = True
+        if stripped.lstrip() == stripped and started:
+            break
+        body_lines.append(stripped)
+
+    body = "\n".join(body_lines)
+    if not body.strip():
+        return None
+    return body
+
+
+def _trim_stop_sequences(text: str) -> str:
+    for stop in _STOP_SEQUENCES:
+        idx = text.find(stop)
+        if idx != -1:
+            text = text[:idx]
+    return text
+
+
+def _execute_code(problem: dict, completion: str, timeout: float = 3.0) -> bool:
+    """Run the completion against hidden tests in a subprocess."""
+
+    def _worker(queue, full_code):
+        try:
+            namespace = {}
+            exec(full_code, namespace)
+            check = namespace.get("check")
+            if check is None:
+                queue.put(False)
+                return
+            check(namespace.get(problem["entry_point"]))
+            queue.put(True)
+        except Exception:
+            queue.put(False)
+
+    full_code = problem["prompt"] + completion + "\n" + problem["test"]
+
+    queue: Queue = Queue()
+    proc = Process(target=_worker, args=(queue, full_code))
+    proc.start()
+    proc.join(timeout)
+
+    if proc.is_alive():
+        proc.terminate()
+        proc.join()
+        return False
+
+    try:
+        return queue.get_nowait()
+    except Exception:
+        return False
+
+
+def _pass_at_k(n: int, c: int, k: int) -> float:
+    """Unbiased estimator of pass@k."""
+    if n - c < k:
+        return 1.0
+    return 1.0 - float(prod(1.0 - k / np.arange(n - c + 1, n + 1)))
+
+
+def _deduplicate(completions: List[str]) -> List[str]:
+    seen = set()
+    unique = []
+    for c in completions:
+        if c not in seen:
+            seen.add(c)
+            unique.append(c)
+    return unique
+
+
+def _generate(
+    engine: InferenceEngine,
+    prompt: str,
+    num_samples: int,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    batch_size: int,
+) -> List[str]:
+    batches = [prompt] * min(batch_size, num_samples)
+    completions = []
+    remaining = num_samples
+
+    while remaining > 0:
+        current = min(batch_size, remaining)
+        batch_prompts = batches[:current]
+        outputs = engine.generate(
+            prompt=batch_prompts,
+            stream=False,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+        )
+        if isinstance(outputs, str):
+            outputs = [outputs]
+        completions.extend(outputs)
+        remaining -= current
+
+    return _deduplicate(completions)
+
+
+def evaluate(
+    engine: InferenceEngine,
+    problems: List[dict],
+    num_samples: int,
+    max_tokens: int,
+    temperature: float,
+    top_p: float,
+    top_k: int,
+    batch_size: int,
+    k_values: Tuple[int, ...] = (1, 10, 100),
+) -> Dict:
+    results = {}
+    all_pass_at_k = {k: [] for k in k_values}
+
+    for problem in tqdm.tqdm(problems, desc="HumanEval", unit="problem"):
+        task_id = problem["task_id"]
+        prompt = problem["prompt"]
+        entry_point = problem["entry_point"]
+
+        raw_completions = _generate(
+            engine,
+            prompt,
+            num_samples,
+            max_tokens,
+            temperature,
+            top_p,
+            top_k,
+            batch_size,
+        )
+
+        completions = []
+        for raw in raw_completions:
+            trimmed = _trim_stop_sequences(raw)
+            body = _extract_function_body(trimmed, entry_point)
+            if body:
+                completions.append(body)
+
+        passed = 0
+        for comp in completions:
+            if _execute_code(problem, comp):
+                passed += 1
+
+        n = len(completions)
+        c = passed
+        result = {"task_id": task_id, "n": n, "passed": c}
+        for k in k_values:
+            result[f"pass@{k}"] = round(_pass_at_k(n, c, k), 4)
+            all_pass_at_k[k].append(_pass_at_k(n, c, k))
+        results[task_id] = result
+
+    summary = {}
+    for k in k_values:
+        vals = all_pass_at_k[k]
+        summary[f"pass@{k}"] = round(float(np.mean(vals)), 4)
+    results["_summary"] = summary
+
+    return results
+
+
+def main():
+    parser = argparse.ArgumentParser(description="HumanEval benchmark")
+    parser.add_argument(
+        "--param_path", type=str, default="./params", help="Model directory"
+    )
+    parser.add_argument(
+        "--data_path",
+        type=str,
+        default="./humaneval/HumanEval.jsonl",
+        help="HumanEval JSONL file (auto-download if missing)",
+    )
+    parser.add_argument("--output", type=str, default=None, help="Output JSON path")
+    parser.add_argument(
+        "--num_samples",
+        type=int,
+        default=200,
+        help="Completions per problem",
+    )
+    parser.add_argument(
+        "--max_tokens", type=int, default=512, help="Max generation tokens"
+    )
+    parser.add_argument(
+        "--temperature", type=float, default=0.8, help="Sampling temperature"
+    )
+    parser.add_argument("--top_p", type=float, default=0.95, help="Top-p sampling")
+    parser.add_argument("--top_k", type=int, default=50, help="Top-k sampling")
+    parser.add_argument(
+        "--batch_size", type=int, default=1, help="Inference batch size"
+    )
+    parser.add_argument(
+        "--problems",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Specific problem indices (0-based)",
+    )
+    args = parser.parse_args()
+
+    _download_humaneval(args.data_path)
+    problems = _load_problems(args.data_path)
+    if args.problems:
+        problems = [problems[i] for i in args.problems if i < len(problems)]
+
+    model = AutoModel.from_pretrained(args.param_path)
+    tokenizer = AutoTokenizer.from_pretrained(args.param_path)
+    model.to(device="cuda", dtype=torch.bfloat16)
+
+    engine = InferenceEngine(
+        model=model,
+        tokenizer=tokenizer,
+        max_batch_size=args.batch_size,
+    )
+
+    results = evaluate(
+        engine=engine,
+        problems=problems,
+        num_samples=args.num_samples,
+        max_tokens=args.max_tokens,
+        temperature=args.temperature,
+        top_p=args.top_p,
+        top_k=args.top_k,
+        batch_size=args.batch_size,
+        k_values=(1, 10, 100),
+    )
+
+    summary = results.pop("_summary")
+    print(f"\n{'=' * 60}")
+    for k, v in summary.items():
+        print(f"  {k}: {v:.2%}")
+    print(f"{'=' * 60}")
+
+    if args.output:
+        results["_summary"] = summary
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(results, f, indent=2, ensure_ascii=False)
+        print(f"Results saved to {args.output}")
+
+    engine.shutdown()
+
+
+if __name__ == "__main__":
+    main()