Humaneval tests #43

Open
wants to merge 8 commits into main

scripts/evalplus/extract_tests.py: 230 additions & 0 deletions (new file)
@@ -0,0 +1,230 @@
# %%
import json
import sys
import os
import glob
from pathlib import Path
import subprocess
import shutil
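# Raise Python's int<->str digit limit (Python 3.11+) so that very large integer
# test values can be rendered without a ValueError.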
sys.set_int_max_str_digits(100000000)
# %%
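# Test cases collected from EvalPlus/HumanEval: for each problem, an entrypoint name
# and a list of (arguments, expected result) pairs under "base".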
with open("benches/HumanEval-Dafny/tests/collected_tests.json") as f:
tests = json.load(f)

# %%
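# Map each numeric problem id (the first three characters of the benchmark file name)
# to its Dafny file name.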
names = [Path(f).name for f in glob.glob("benches/HumanEval-Dafny/*.dfy")]

name_by_id = {int(n[:3]): n for n in names}

# %%
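# Generated harness: a Main() method containing one `expect` per test case;
# `decreases *` allows Main to call methods that are not proven to terminate.
# test_pattern holds per-problem templates: problem 4 compares reals with a
# tolerance (relying on an `abs` function in the Dafny program), and several
# problems return two values instead of one.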
outer_pattern = "method Main()\n decreases *\n{{\n{tests}}}"

default_test_pattern = '\t{{var v := {entrypoint}({args}); expect v == {result}, "test {i} failed";}}\n'

two_rets_pattern = '\t{{var s, p := {entrypoint}({args}); expect [s, p] == {result}, "test {i} failed";}}\n'

test_pattern = {
4: '\t{{var v := {entrypoint}({args}); expect abs(v - {result}) < 0.00001, "test {i} failed";}}\n',
8: two_rets_pattern,
20: two_rets_pattern,
107: two_rets_pattern,
112: '\t{{var s, p := {entrypoint}({args}); expect (s == {result1} && p == {result2}), "test {i} failed";}}\n',
136: two_rets_pattern,
155: two_rets_pattern
}

skips = {58, 69, 84, 90, 92, 116, 119, 127, 145, 146, 148, 151, 158}
# Problems with no generated tests; 116, 119, 127, 145, 146, 148, 151 (?), and 158 are pending a rewrite of their Dafny programs.

default_formatter = lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args)),
result=preprocess(result),
i=i
)
)

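# Render an expected result as an Option-style Dafny value: Some(...) for non-null
# results, None otherwise.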
preprocess_optional = lambda r: f"Some({preprocess(r)})" if r is not None else "None"

optional_result = lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args)),
result=preprocess_optional(result),
i=i
)
)

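# Per-problem overrides of the default formatter, mostly coercing arguments or
# results (ints to floats, lists to tuples/ints, optional and paired results)
# before rendering them as Dafny literals.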
alternative_formatters = {
12: optional_result,
45: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, [float(a) for a in args])),
result=preprocess(result),
i=i
)
),
81: lambda problem_id, entrypoint, args, result, i: (
default_test_pattern.format(
entrypoint=entrypoint,
args=', '.join(map(lambda xs: preprocess([float(x) for x in xs]), args)),
result=preprocess(result),
i=i
)
),
87: lambda problem_id, entrypoint, args, result, i: (
default_test_pattern.format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args)),
result=preprocess([tuple(x) for x in result]),
i=i
)
),
90: optional_result,
111: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args)),
result=preprocess(result).replace('"', "'"),
i=i
)
),
112: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args)),
result1=preprocess(result[0]),
result2=preprocess(result[1]),
i=i
)
),
119: lambda problem_id, entrypoint, args, result, i: default_formatter(problem_id, entrypoint, args[0], result, i),
127: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args[0] + args[1])),
result=preprocess(result),
i=i
)
),
128: optional_result,
130: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args)),
result=preprocess([int(r) for r in result]),
i=i
)
),
133: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, [[float(a) for a in args[0]]])),
result=preprocess(result),
i=i
)
),
136: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, args)),
result=f'[{preprocess_optional(result[0])}, {preprocess_optional(result[1])}]',
i=i
)
),
151: lambda problem_id, entrypoint, args, result, i: (
test_pattern.get(problem_id, default_test_pattern).format(
entrypoint=entrypoint,
args=', '.join(map(preprocess, [[float(a) for a in args[0]]])),
result=preprocess(result),
i=i
)
)
}


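# Render a Python value as a Dafny literal: floats rounded to 7 decimal places,
# strings double-quoted, dicts as map[k := v, ...]; the final replace calls fix up
# booleans, quote characters and newlines.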
def preprocess(s):
if isinstance(s, float):
s = round(s, 7)
if isinstance(s, str):
s = f'"{s}"'
if isinstance(s, dict):
sr = "map["
for k, v in s.items():
sr += preprocess(k)
sr += " := "
sr += preprocess(v)
sr += ", "
if s:
sr = sr[:-2]
sr += "]"
s = sr
else:
s = str(s)
s = s.replace("False", "false").replace("True", "true")
s = s.replace("'", '"').replace('\n', '\\n')
return s


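# Build the Main() test harness for one problem; returns None if the problem has no
# Dafny benchmark file.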
def make_test(problem_id: int, problem_data):
if problem_id not in name_by_id:
return None
entrypoint = problem_data["entrypoint"]
test_lines = ""
for i, (args, res) in enumerate(problem_data["base"]):
test_lines += (alternative_formatters.get(problem_id, default_formatter)
(problem_id, entrypoint, args, res, i))
return outer_pattern.format(tests=test_lines)


# %%
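# Write the generated Main() harness for every non-skipped problem to
# scripts/evalplus/tests/ (the directory is assumed to exist already).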
for pid, pd in tests.items():
pid = int(pid.rsplit("/", 1)[1])
if pid in skips:
continue
ptest = make_test(pid, pd)
if ptest is not None:
with open(f"scripts/evalplus/tests/{name_by_id[pid]}", "w") as f:
f.write(ptest)
# %%
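# Sanity check: append the generated harness to each original Dafny program and run
# it with `dafny run --no-verify`; a timeout skips the problem, and the loop stops on
# the first failing test run so it can be inspected.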

test_dir = Path("scripts/evalplus/tests/")
for pid, pd in tests.items():
pid = int(pid.rsplit("/", 1)[1])
if pid not in name_by_id:
continue
if pid in skips:
continue
working_dir = test_dir / "working" / Path(name_by_id[pid]).stem
working_dir.mkdir(parents=True, exist_ok=True)
test_file = working_dir / f"{name_by_id[pid]}"
if test_file.exists():
continue
with open(f"benches/HumanEval-Dafny/{name_by_id[pid]}") as f:
prg = f.read()
ptest = make_test(pid, pd)
if ptest is not None:
with open(test_file, "w") as f:
f.write(prg)
f.write("\n")
f.write(ptest)
try:
res = subprocess.run(
f'dafny run --no-verify --allow-warnings "{test_file}"',
capture_output=True,
shell=True,
timeout=60
)
except subprocess.TimeoutExpired as e:
print(name_by_id[pid], "timeout")
shutil.rmtree(working_dir)
continue
if res.returncode != 0:
print(name_by_id[pid], "fail")
print(res.stdout.decode("utf-8"))
# shutil.rmtree(working_dir)
break
else:
print(name_by_id[pid], "success")

scripts/generate_several_modes_table.py: 83 additions & 0 deletions (new file)
@@ -0,0 +1,83 @@
# %%
import matplotlib.pyplot as plt
import json
from itertools import accumulate
import pathlib

# %%


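# Problem names (file stems) in a benchmark directory, filtered by file extension.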
def read_bench(d, ext):
bench = pathlib.Path(d)
return [x.name.removesuffix(f'.{ext}') for x in bench.glob(f'*.{ext}')]


def read_result(p):
with open(p) as f:
return json.load(f)


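# Tries per file from one run, restricted to the selected benchmark files;
# -1 (never verified) entries are dropped.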
def get_tries(d):
return [value for file, value in d.items() if file.removesuffix('.dfy') in files and value != -1]


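# Best (minimum) number of tries per file across several runs, i.e. the union of
# the runs' successes.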
def get_best_tries(ds):
return get_tries({file: min((d[file] for d in ds if d[file] != -1), default=-1) for file in ds[0]})


# %%
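# Benchmark selection: either the full Dafny benchmark or only the problems it
# shares with the Nagini benchmark.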
intersect_bench = False
dafny_bench = pathlib.Path("benches/HumanEval-Dafny")
nagini_bench = pathlib.Path("benches/HumanEval-Nagini/Bench")
dafny_files = read_bench(dafny_bench, "dfy")
nagini_files = read_bench(nagini_bench, "py")
shared_files = [x for x in dafny_files if x in nagini_files]
files = shared_files if intersect_bench else dafny_files

# %%
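# One result file per (mode, run); each maps a benchmark file to the number of
# tries needed to verify it, with -1 marking failures.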
base_name = "results/tries_HumanEval-Dafny_mode{mode}_{run}.json"
modes = [6]
runs = [0, 1, 2, 3, 4]
for mode in modes:
print(f"Mode {mode}:")
paths = [base_name.format(mode=mode, run=run) for run in runs]
file_cnt = len(files)
data = [read_result(p) for p in paths]
tries = [get_tries(d) for d in data]

max_tries = max(max(t) for t in tries)
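    # cnt[r][t]: number of files that run r verified in exactly t tries; the extra
    # last row accumulates all runs and is then averaged.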
cnt = [[0] * (max_tries + 1) for _ in range(len(runs) + 1)]
for i, t in enumerate(tries):
for tt in t:
cnt[i][tt] += 1
cnt[-1][tt] += 1
cnt[-1] = [x / len(runs) for x in cnt[-1]]
cumulative = [list(accumulate(c)) for c in cnt]
perc = [[100 * (c / file_cnt) for c in cm] for cm in cumulative]

for run in runs:
print(f"Run {run}: {cumulative[run][-1]} / {file_cnt}")

print("Average:")
print(f"{round(cumulative[-1][-1], 1)} / {file_cnt}")
print(f"{round(perc[-1][-1])}% of the files were successfully verified")

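    # "Unique": a file counts as verified if it succeeded in at least one run
    # (best try per file across runs).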
best_tries = get_best_tries(data)
cnt_best = [0] * (max_tries + 1)
for t in best_tries:
cnt_best[t] += 1
cumulative_best = list(accumulate(cnt_best))
perc_best = [100 * (c / file_cnt) for c in cumulative_best]
print("Unique:")
print(f"{cumulative_best[-1]} / {file_cnt}")
print(f"{round(perc_best[-1], 2)}% of the files were successfully verified")
# fig, ax = plt.subplots() # type: ignore
# plt.title("Percentage of files verified") # type: ignore
# plt.xlabel("Number of tries") # type: ignore
# plt.ylabel("Percentage of files") # type: ignore
# ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f"{x}%")) # type: ignore
# ax.plot(perc) # type: ignore
# # %%
# plt.show()
# %%
