From fcf7969f9bbeb8b00691d00127c410c9519c9463 Mon Sep 17 00:00:00 2001 From: Daniel Nezamabadi <55559979+dnezam@users.noreply.github.com> Date: Tue, 16 Jun 2026 13:48:47 +0800 Subject: [PATCH 1/3] regression.py: Exchange checkpointing for parallelism Since I have the vague feeling that regression has to be rewritten in the future anyway, all of these changes have been LLM generated. For now, my priority is understanding whether or not the 100 Great Theorems are broken or not. --- candle/regression.py | 603 +++++++++++++++++++++++-------------------- 1 file changed, 327 insertions(+), 276 deletions(-) diff --git a/candle/regression.py b/candle/regression.py index c5b47a77..8f319452 100644 --- a/candle/regression.py +++ b/candle/regression.py @@ -1,34 +1,41 @@ """ -Simple Python script to test Candle. - -Note that this script assumes passwordless sudo for criu, which can be -achieved by: -1. Calling visudo -2. Adding - user ALL = (root) NOPASSWD: /usr/bin/criu - to it, taking care to replace user with the proper username. - -No claims are made on whether this a good idea in regards to security. - -The reason criu is used, is because DMTCP segfaulted on my machine. -However, I suspect that that might have been due to the packaged -version being outdated, so it is worth reconsidering, as it appears -that DMTCP supports one checkpoint to be restored multiple times -simulatenously, which would make parallelism during testing easier. +!!! Large amount of LLM generated code !!! + +Parallel regression test runner for Candle. + +Each test runs in its own fresh Candle process that loads hol.ml and then the +test file(s). There is no checkpointing (no criu/DMTCP, no sudo): running one +process per test, up to -j at a time, is what hides the cost of reloading +hol.ml for every test. + +The Candle REPL transcript for each test is written to its own temporary log +file under /tmp (via tempfile.mkstemp), e.g. /tmp/candle-100_arithmetic-XXXX.log. + +Two suites are available: + * REGRESSION - a small subset, run by default. + * TOP100 - the full "Top 100 theorems" set (from holtest.mk's + GREAT_100_THEOREMS), run with --top100. """ import sys -import pexpect -import time -import subprocess import os +import time import argparse +import tempfile +import concurrent.futures from dataclasses import dataclass from enum import Enum from pathlib import Path -# /candle/candle-regression.py +import pexpect + +# /candle/regression.py CANDLE_ROOT = Path(__file__).resolve().parent.parent +# The Top 100 suite needs a larger CakeML heap than the build's default. +# Set via CML_HEAP_SIZE (MB) per candle process; parallelism is capped so the +# combined heap reservation stays within available memory. +TOP100_HEAP_MB = 6000 + # --------------------------------------------------------------------------- # Exceptions # --------------------------------------------------------------------------- @@ -45,9 +52,8 @@ class LoadFailure(Exception): """Loading a file failed.""" - # --------------------------------------------------------------------------- -# Test status and result +# Test definitions # --------------------------------------------------------------------------- class TestStatus(Enum): @@ -55,70 +61,137 @@ class TestStatus(Enum): FAIL = "FAIL" TIMEOUT = "TIMEOUT" -@dataclass -class TestResult: - name: str - status: TestStatus - elapsed: float = 0.0 - error_message: str = "" +@dataclass(frozen=True) +class Test: + """A named test, defined by the file(s) it loads (in order) on top of hol.ml.""" + name: str + files: tuple + + +def _t(name, *files): + """Build a Test. With no explicit files, loads ".ml".""" + return Test(name, files if files else (f"{name}.ml",)) + + +# The default regression subset. +REGRESSION = [ + _t("100/arithmetic"), + _t("100/cantor"), + _t("100/konigsberg"), + _t("100/gcd"), + _t("100/wilson"), + _t("100/combinations"), + _t("100/ratcountable"), + _t("100/euler"), + _t("100/lhopital"), + _t("100/stirling"), + _t("100/liouville"), + _t("100/cayley_hamilton"), +] -TESTS = [ - "100/arithmetic", - "100/cantor", - "100/konigsberg", - "100/gcd", - "100/wilson", - "100/combinations", - "100/ratcountable", - "100/euler", - "100/lhopital", - "100/stirling", - "100/liouville", - "100/cayley_hamilton", +# The full "Top 100 theorems" suite, mirroring GREAT_100_THEOREMS in holtest.mk. +# bertrand-primerecip is special: primerecip.ml depends on bertrand.ml, so both +# are loaded (in order) in the same session. +TOP100 = [ + _t("100/arithmetic_geometric_mean"), + _t("100/arithmetic"), + _t("100/ballot"), + _t("100/bernoulli"), + _t("100/bertrand-primerecip", "100/bertrand.ml", "100/primerecip.ml"), + _t("100/birthday"), + _t("100/buffon"), + _t("100/cantor"), + _t("100/cayley_hamilton"), + _t("100/ceva"), + _t("100/circle"), + _t("100/chords"), + _t("100/combinations"), + _t("100/constructible"), + _t("100/cosine"), + _t("100/cubedissection"), + _t("100/cubic"), + _t("100/derangements"), + _t("100/desargues"), + _t("100/descartes"), + _t("100/dirichlet"), + _t("100/div3"), + _t("100/divharmonic"), + _t("100/e_is_transcendental"), + _t("100/euler"), + _t("100/feuerbach"), + _t("100/fourier"), + _t("100/four_squares"), + _t("100/friendship"), + _t("100/fta"), + _t("100/gcd"), + _t("100/green"), + _t("100/heron"), + _t("100/isoperimetric"), + _t("100/inclusion_exclusion"), + _t("100/independence"), + _t("100/isosceles"), + _t("100/konigsberg"), + _t("100/lagrange"), + _t("100/leibniz"), + _t("100/lhopital"), + _t("100/liouville"), + _t("100/minkowski"), + _t("100/morley"), + _t("100/pascal"), + _t("100/perfect"), + _t("100/pick"), + _t("100/piseries"), + _t("100/platonic"), + _t("100/pnt"), + _t("100/polyhedron"), + _t("100/ptolemy"), + _t("100/pythagoras"), + _t("100/quartic"), + _t("100/ramsey"), + _t("100/ratcountable"), + _t("100/realsuncountable"), + _t("100/reciprocity"), + _t("100/stirling"), + _t("100/subsequence"), + _t("100/thales"), + _t("100/transcendence"), + _t("100/triangular"), + _t("100/two_squares"), + _t("100/wilson"), ] +# Lookup by name, so --test can reuse multi-file definitions (e.g. the +# bertrand-primerecip pairing) rather than always assuming ".ml". +BY_NAME = {t.name: t for t in TOP100} + # --------------------------------------------------------------------------- # CandleREPL # --------------------------------------------------------------------------- class CandleREPL: - def __init__(self, restore=False): - # Easier to assume that we are in the candle directory for now. - os.chdir(CANDLE_ROOT) - - # Indicates whether we are a restored instance. In that case, I suspect - # that the pid of self.process might be of sudo criu restore instead of - # Candle. - self._cake_pid = None - - self._logfile = sys.stdout - self._checkpoint_dir = str(CANDLE_ROOT / "checkpoint") - self._pidfile_name = "cake.pid" - + def __init__(self, logfile, env=None): + self._logfile = logfile self.load_stack = [] self.last_val = None - if restore: - self.restore() - else: - try: - self.process = pexpect.spawn( - str(CANDLE_ROOT / "candle.sh"), - encoding='utf-8', - logfile=self._logfile - ) - except Exception as e: - raise StartFailure from e - try: - self._check_boot() - except BootFailure: - self.kill() - raise - - def _pidfile(self): - return os.path.join(self._checkpoint_dir, self._pidfile_name) + try: + self.process = pexpect.spawn( + str(CANDLE_ROOT / "candle.sh"), + encoding="utf-8", + logfile=logfile, + cwd=str(CANDLE_ROOT), + env=env, + ) + except Exception as e: + raise StartFailure from e + + try: + self._check_boot() + except BootFailure: + self.kill() + raise def _check_boot(self): try: @@ -135,7 +208,7 @@ def _check_boot(self): reasons = { 1: str(self._get_match(1)), 2: "Timeout", - 3: "Process exited unexpectedly" + 3: "Process exited unexpectedly", } raise BootFailure(reasons[index]) @@ -143,9 +216,9 @@ def _get_match(self, idx): return self.process.match.group(idx) def load_stack_str(self): - f"[while loading: {' > '.join(self.load_stack)}]" + return f"[while loading: {' > '.join(self.load_stack)}]" - def _check_output(self, timeout=600): + def _check_output(self, timeout): try: index = self.process.expect([ r'\n\- Loading (\S+)', @@ -167,19 +240,20 @@ def _check_output(self, timeout=600): case 1: self.last_val = self._get_match(1) case 2 | 3 | 4: - raise LoadFailure(f"{self._get_match(1)} {self.load_stack_str()}") + raise LoadFailure(self._get_match(1)) case 5: finished = self._get_match(1) expected = self.load_stack.pop() - assert finished == expected, f'Expected to finish loading {expected}. Actual: {finished}' + assert finished == expected, ( + f"Expected to finish loading {expected}. Actual: {finished}") case 6: - raise LoadFailure(f"Timeout waiting for output {self.load_stack_str()}") + raise LoadFailure("Timeout waiting for output") case 7: - raise LoadFailure(f"Process exited unexpectedly {self.load_stack_str()}") + raise LoadFailure("Process exited unexpectedly") case _: assert False, "Unreachable: Did you add a new case in _check_output?" - def load(self, file, timeout=600): + def load(self, file, timeout): self.process.sendline(f'#use "{file}";;') self._check_output(timeout=timeout) @@ -187,53 +261,83 @@ def load(self, file, timeout=600): self._check_output(timeout=timeout) def kill(self): + # No criu restore anymore, so there is no detached process group to + # chase down: closing the pexpect child (and its candle.sh subtree) is + # enough. self.process.close(force=True) - # We need to make sure that the Candle process is killed and reaped, - # as criu tries to restore into the same pid (I think), - # which will fail if the pid is occupied. - # My understanding is that after restoring at least Candle is a process - # group that is not the child of this process. The former motivates the - # g (group), and the latter is why we have the loop with killpg. - if self._cake_pid: - subprocess.run(["pkill", "-9", "-g", str(self._cake_pid)]) - while True: - try: - os.killpg(self._cake_pid, 0) - except ProcessLookupError: - return - time.sleep(0.1) - - def dump(self): - os.makedirs(self._checkpoint_dir, exist_ok=True) - - # We are not in the situation where Candle has been restored, so the - # pid of process is correct. We also save it for future restoring. - if not self._cake_pid: - pid = self.process.pid - with open(self._pidfile(), "w") as f: - f.write(str(pid)) - else: - pid = self._cake_pid - - subprocess.run( - ["sudo", "criu", "dump", "-D", self._checkpoint_dir, "-t", str(pid), "--shell-job"], - check=True, - ) - - # If I remember correctly, this is for making sure the dumped process - # gets reaped. I think. - self.process.wait() - - def restore(self): - self.process = pexpect.spawn( - "sudo", ["criu", "restore", "-D", self._checkpoint_dir, "--shell-job"], - encoding='utf-8', - logfile=self._logfile, - ) - # Assumption: CRIU restores processes with their original PIDs. - # Read the cake PID (saved during dump) so kill() can target it. - self._cake_pid = int(open(self._pidfile()).read().strip()) + +# --------------------------------------------------------------------------- +# Test result and runner +# --------------------------------------------------------------------------- + +@dataclass +class TestResult: + name: str + status: TestStatus + hol_elapsed: float = 0.0 + test_elapsed: float = 0.0 + error_message: str = "" + log_path: str = "" + + @property + def total(self): + return self.hol_elapsed + self.test_elapsed + + +def _format_error(repl, exc): + """One-line failure summary: the error, where it happened, last value bound.""" + err = str(exc) or exc.__class__.__name__ + if repl is not None: + if repl.load_stack: + err += f" {repl.load_stack_str()}" # where: the file/dep being loaded + if repl.last_val: + err += f" (last val: {repl.last_val})" # last value bound before failure + return err + + +def run_test(test, test_timeout, env=None): + """Run a single test in a fresh Candle process. Never raises.""" + safe_name = test.name.replace("/", "_") + fd, log_path = tempfile.mkstemp(prefix=f"candle-{safe_name}-", suffix=".log") + logfile = os.fdopen(fd, "w") + + repl = None + start = None # set once Candle is booted; marks the start of hol.ml load + hol_elapsed = 0.0 + test_elapsed = 0.0 + try: + repl = CandleREPL(logfile=logfile, env=env) + + start = time.perf_counter() + repl.load("hol.ml", timeout=test_timeout) + hol_elapsed = time.perf_counter() - start + + for f in test.files: + repl.load(f, timeout=test_timeout) + test_elapsed = time.perf_counter() - start - hol_elapsed + + return TestResult(test.name, TestStatus.PASS, hol_elapsed, test_elapsed, + log_path=log_path) + + except Exception as e: # pylint: disable=broad-exception-caught + # A test must never crash the runner: record any failure and move on. + # Attribute elapsed time to whichever phase we failed in. + if start is not None: + if hol_elapsed: + test_elapsed = time.perf_counter() - start - hol_elapsed + else: + hol_elapsed = time.perf_counter() - start + status = (TestStatus.TIMEOUT + if isinstance(e, pexpect.TIMEOUT) or "Timeout" in str(e) + else TestStatus.FAIL) + return TestResult(test.name, status, hol_elapsed, test_elapsed, + error_message=_format_error(repl, e), log_path=log_path) + + finally: + if repl is not None: + repl.kill() + logfile.close() # --------------------------------------------------------------------------- @@ -248,31 +352,32 @@ class Reporter: } @staticmethod - def print_summary(results): - # Header + def print_summary(results, wall): print() - print(f"{'Test':<40} {'Status':>6} {'Time':>8}") + print(f"{'Test':<40} {'Status':>6} {'Time':>9}") print("-" * 58) - for r in results: - elapsed_str = f"{r.elapsed:.1f}s" sym = Reporter.STATUS_SYMBOLS[r.status] - print(f"{r.name:<40} {sym:>6} {elapsed_str:>8}") + print(f"{r.name:<40} {sym:>6} {f'{r.total:.1f}s':>9}") + print("-" * 58) - # Footer counts = {} for s in TestStatus: c = sum(1 for r in results if r.status == s) if c: counts[s] = c - - print("-" * 58) parts = [f"{s.value}: {c}" for s, c in counts.items()] + + sum_total = sum(r.total for r in results) + hol_times = [r.hol_elapsed for r in results if r.hol_elapsed > 0] + avg_hol = sum(hol_times) / len(hol_times) if hol_times else 0.0 + print(f"Total: {len(results)} | " + " ".join(parts)) + print(f"Sum of per-test time: {sum_total:.1f}s") + print(f"Wall clock: {wall:.1f}s") + print(f"Average hol.ml load: {avg_hol:.1f}s") - # Highlight problems failures = [r for r in results if r.status in (TestStatus.FAIL, TestStatus.TIMEOUT)] - if failures: print() print("FAILURES:") @@ -281,121 +386,66 @@ def print_summary(results): if r.error_message: msg += f" — {r.error_message}" print(msg) + print(f" log: {r.log_path}") # --------------------------------------------------------------------------- -# TestRunner +# Runner # --------------------------------------------------------------------------- -class TestRunner: - def __init__(self, timeout=600, fail_fast=False): - self.timeout = timeout - self.fail_fast = fail_fast - - def setup(self, reuse_checkpoint=False): - """Start candle, load hol.ml, and dump a checkpoint.""" - checkpoint_path = CANDLE_ROOT / "checkpoint" - if reuse_checkpoint and os.path.isdir(checkpoint_path): - print("Reusing existing checkpoint.") - return - - print("Starting candle and loading hol.ml...") - repl = CandleREPL() +def _available_memory_mb(): + """Best-effort available RAM in MB (Linux /proc/meminfo), else None.""" + try: + with open("/proc/meminfo", encoding="utf-8") as f: + for line in f: + if line.startswith("MemAvailable:"): + return int(line.split()[1]) // 1024 + except (OSError, ValueError, IndexError): + pass + return None + + +def cap_jobs_for_heap(jobs, heap_mb): + """Cap workers so heap_mb * jobs stays within available RAM. + + Each candle process reserves its full heap, so heap_mb * jobs is the + memory ceiling. Falls back to the requested jobs if memory is unknown. + """ + avail = _available_memory_mb() + if avail is None: + return jobs + return max(1, min(jobs, avail // heap_mb)) + + +def run_suite(tests, jobs, test_timeout, env=None): + total = len(tests) + print(f"Running {total} test(s) with {jobs} parallel worker(s).") + print("Logs: /tmp/candle--*.log\n") + + results = [] + done = 0 + wall_start = time.perf_counter() + + with concurrent.futures.ThreadPoolExecutor(max_workers=jobs) as ex: + futures = {ex.submit(run_test, t, test_timeout, env): t for t in tests} try: - repl.load("hol.ml", timeout=3600) - print("hol.ml loaded. Dumping checkpoint...") - repl.dump() - print("Checkpoint created.") - except Exception: - repl.kill() - raise - - def run_test(self, name): - """Restore from checkpoint, run a single test, return TestResult.""" - start = time.perf_counter() - - try: - repl = CandleREPL(restore=True) - except Exception as e: - elapsed = time.perf_counter() - start - return TestResult( - name=name, status=TestStatus.FAIL, - elapsed=elapsed, error_message=f"Restore failed: {e}", - ) - - try: - repl.load(f"{name}.ml", timeout=self.timeout) - elapsed = time.perf_counter() - start - - return TestResult(name=name, status=TestStatus.PASS, elapsed=elapsed) - - except LoadFailure as e: - elapsed = time.perf_counter() - start - err = str(e) - if repl.load_stack: - err += f" {repl.load_stack_str()}" - if repl.last_val: - err += f" (last val: {repl.last_val})" - if "Timeout" in str(e): - status = TestStatus.TIMEOUT - - else: - status = TestStatus.FAIL - return TestResult(name=name, status=status, elapsed=elapsed, error_message=err) - - except pexpect.TIMEOUT: - elapsed = time.perf_counter() - start - err = "Timeout" - if repl.load_stack: - err += f" {repl.load_stack_str()}" - if repl.last_val: - err += f" (last val: {repl.last_val})" - else: - status = TestStatus.TIMEOUT - return TestResult( - name=name, status=status, - elapsed=elapsed, error_message=err, - ) - - except Exception as e: - elapsed = time.perf_counter() - start - err = str(e) - if repl.load_stack: - err += f" {repl.load_stack_str()}" - if repl.last_val: - err += f" (last val: {repl.last_val})" - else: - status = TestStatus.FAIL - return TestResult( - name=name, status=status, - elapsed=elapsed, error_message=err, - ) - - finally: - repl.kill() - - - def run_all(self, tests): - """Run all tests, printing progress inline.""" - results = [] - total = len(tests) - - try: - for i, name in enumerate(tests, 1): - print(f"[{i}/{total}] {name} ... ", end="", flush=True) - result = self.run_test(name) - sym = Reporter.STATUS_SYMBOLS[result.status] - print(f"{sym} ({result.elapsed:.1f}s)") - if result.error_message and result.status in (TestStatus.FAIL, TestStatus.TIMEOUT): - print(f" {result.error_message}") - results.append(result) - if self.fail_fast and result.status in (TestStatus.FAIL, TestStatus.TIMEOUT): - print("Stopping early due to --fail-fast.") - break + for fut in concurrent.futures.as_completed(futures): + r = fut.result() + done += 1 + sym = Reporter.STATUS_SYMBOLS[r.status] + print(f"[{done}/{total}] {sym} {r.name} ({r.total:.1f}s)") + if r.status is not TestStatus.PASS: + if r.error_message: + print(f" {r.error_message}") + print(f" log: {r.log_path}") + results.append(r) except KeyboardInterrupt: - print("\nInterrupted — showing results so far.") + print("\nInterrupted — cancelling pending tests; showing results so far.") + for fut in futures: + fut.cancel() - return results + wall = time.perf_counter() - wall_start + return results, wall # --------------------------------------------------------------------------- @@ -403,60 +453,61 @@ def run_all(self, tests): # --------------------------------------------------------------------------- def main(): - parser = argparse.ArgumentParser( - description="Candle regression test suite", - ) + parser = argparse.ArgumentParser(description="Candle parallel regression suite") parser.add_argument( - "--reuse-checkpoint", action="store_true", - help="Skip hol.ml loading if a checkpoint already exists", + "--top100", action="store_true", + help="Run the full Top 100 theorems suite instead of the regression subset", ) parser.add_argument( "--test", nargs="+", - help="Run specific test(s) by name", + help="Run specific test name(s), e.g. 100/arithmetic (overrides suite selection)", ) parser.add_argument( "--list", action="store_true", - help="List available tests and exit", + help="List the selected suite and exit", ) parser.add_argument( - "--fail-fast", action="store_true", - help="Stop after the first unexpected failure", + "-j", "--jobs", type=int, default=max(1, (os.cpu_count() or 2) - 1), + help="Number of parallel workers (default: CPU count - 1)", ) parser.add_argument( "--timeout", type=int, default=600, - help="Per-test timeout in seconds (default: 600)", + help="Timeout in seconds for each #use load, including hol.ml (default: 600)", ) args = parser.parse_args() - available = TESTS - if args.list: - for name in available: - print(f" {name}") - print(f"\n{len(available)} tests") - return - - # Determine which tests to run if args.test: - tests = args.test + tests = [BY_NAME.get(name, _t(name)) for name in args.test] + running_top100 = False + elif args.top100: + tests = TOP100 + running_top100 = True else: - tests = list(TESTS) - - runner = TestRunner( - timeout=args.timeout, - fail_fast=args.fail_fast, - ) + tests = REGRESSION + running_top100 = False - # Setup checkpoint - runner.setup(reuse_checkpoint=args.reuse_checkpoint) + if args.list: + for t in tests: + extra = f" ({', '.join(t.files)})" if t.files != (f"{t.name}.ml",) else "" + print(f" {t.name}{extra}") + print(f"\n{len(tests)} test(s)") + return - # Run tests - results = runner.run_all(tests) + # The Top 100 suite gets a larger heap; cap parallelism so the combined + # per-process heap reservation does not exceed available memory. + child_env = None + jobs = args.jobs + if running_top100: + child_env = {**os.environ, "CML_HEAP_SIZE": str(TOP100_HEAP_MB)} + jobs = cap_jobs_for_heap(args.jobs, TOP100_HEAP_MB) + if jobs < args.jobs: + print(f"Capping workers {args.jobs} -> {jobs}: {TOP100_HEAP_MB} MB heap " + f"x {args.jobs} workers exceeds available RAM.") - # Report - Reporter.print_summary(results) + results, wall = run_suite(tests, jobs, args.timeout, child_env) + Reporter.print_summary(results, wall) - # Exit code: 0 if no unexpected failures unexpected = [r for r in results if r.status in (TestStatus.FAIL, TestStatus.TIMEOUT)] sys.exit(1 if unexpected else 0) From 06ae8f51523af9268e4485e407485f5e7b4831de Mon Sep 17 00:00:00 2001 From: Daniel Nezamabadi <55559979+dnezam@users.noreply.github.com> Date: Tue, 16 Jun 2026 13:51:44 +0800 Subject: [PATCH 2/3] Use two cores for standard regression testing --- test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.sh b/test.sh index 5a09811f..67c31e26 100755 --- a/test.sh +++ b/test.sh @@ -5,4 +5,4 @@ set -euo pipefail ./build-instructions.sh # Run regression suite -python candle/regression.py +python candle/regression.py -j2 From 30a61939ce984161115277c4c2ee11f444f6e8db Mon Sep 17 00:00:00 2001 From: Daniel Nezamabadi <55559979+dnezam@users.noreply.github.com> Date: Tue, 16 Jun 2026 14:26:13 +0800 Subject: [PATCH 3/3] build-instructions.sh: python -> python3 Seems like Debian does not have this symlink by default. --- build-instructions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-instructions.sh b/build-instructions.sh index ade35db3..c1824bc6 100755 --- a/build-instructions.sh +++ b/build-instructions.sh @@ -26,7 +26,7 @@ make ./cake --types < /dev/null > types.txt 2>&1 # Generate candle_insulate.ml -python ../insulate.py types.txt insulate.ml +python3 ../insulate.py types.txt insulate.ml # The working directory of the binary will be CANDLE_ROOT/candle/build, # so it needs to change directory to CANDLE_ROOT after booting..