From 00c59783de0cc4d7d0b2e62381414cf1f42aa359 Mon Sep 17 00:00:00 2001 From: ruvnet Date: Mon, 4 May 2026 10:37:11 -0400 Subject: [PATCH 1/2] feat(hailo): add `fingerprint` label to bench --prom output (iter 256) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bench's textfile-collector output carried only `concurrency` as a label, so a Prometheus alert grouping by series couldn't tell a genuine throughput regression apart from a model swap. The fingerprint *was* recorded by the bench (--auto-fingerprint already discovered + printed it to stderr) but never made it to the prom labels. Now every metric carries `concurrency="N",fingerprint="<fp>"`. Empty fingerprint (--allow-empty-fingerprint) renders as `fingerprint=""` rather than getting dropped, so the label set stays scrape-stable whether or not enforcement is on. Example output (iter 256, cognitum-v0): ruvector_hailo_bench_throughput_per_second{concurrency="2",fingerprint="9c56e5965aea9afd99ad51826805f1be01bb0ea3301aafb74982e29e3b9cf3fa"} 70.712 Now `avg by (fingerprint) (ruvector_hailo_bench_throughput_per_second)` gives one series per model — a 9c56...-deploy throughput drop is a real regression, while a fingerprint change is a deploy event the operator already knew about. # What ships - BenchSummary gains a `fingerprint: String` field, populated from the resolved fingerprint (whatever --fingerprint or --auto-fingerprint produced). - write_prom_textfile renders it on every metric. - bench_cli_prom_file_contains_throughput_metric updated to lock the new label format so a future regression surfaces in CI. 
Local verification: cargo test -p ruvector-hailo-cluster --test bench_cli (6 passed) cargo clippy --all-targets -- -D warnings (clean) Co-Authored-By: claude-flow --- .../ruvector-hailo-cluster/src/bin/bench.rs | 25 +++++++++++++++++-- .../ruvector-hailo-cluster/tests/bench_cli.rs | 11 ++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/crates/ruvector-hailo-cluster/src/bin/bench.rs b/crates/ruvector-hailo-cluster/src/bin/bench.rs index 2a1f89b9d..43a069519 100644 --- a/crates/ruvector-hailo-cluster/src/bin/bench.rs +++ b/crates/ruvector-hailo-cluster/src/bin/bench.rs @@ -393,7 +393,9 @@ fn main() -> Result<(), Box> { } let cluster = Arc::new({ - let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint)?; + // Iter 256 — clone fingerprint so the original String stays + // available for the BenchSummary's `fingerprint` label later. + let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint.clone())?; match (cache_cap, cache_ttl_secs) { (0, _) => c, (cap, 0) => c.with_cache(cap), @@ -654,6 +656,11 @@ fn main() -> Result<(), Box> { samples: all_samples.len(), concurrency, cache: cache_stats, + // Iter 256 — surface the resolved fingerprint as a + // prom label. Empty string when --allow-empty- + // fingerprint was set, which renders as + // `fingerprint=""` and stays scrape-stable. + fingerprint: fingerprint.clone(), }, )?; if !quiet { @@ -682,6 +689,12 @@ struct BenchSummary { /// `None` when --cache 0; otherwise carries hit/miss/eviction counts /// so the Prom output reflects what actually happened on the cache. cache: Option, + /// Iter 256 — resolved fingerprint (--fingerprint or + /// --auto-fingerprint result). Empty when neither was set + /// (--allow-empty-fingerprint). Surfaces as a `fingerprint=` + /// label on every prom metric so a CI scrape can alert on + /// per-model regressions instead of a single global series. + fingerprint: String, } /// Emit Prometheus textfile-collector format. 
node_exporter's textfile @@ -693,7 +706,15 @@ fn write_prom_textfile(path: &str, s: &BenchSummary) -> std::io::Result<()> { // races us never sees a half-written file. let tmp = format!("{}.tmp", path); let mut f = std::fs::File::create(&tmp)?; - let labels = format!("concurrency=\"{}\"", s.concurrency); + // Iter 256 — fingerprint label on every metric. Empty fingerprint + // (--allow-empty-fingerprint) renders as `fingerprint=""` rather + // than getting omitted, which keeps the label set scrape-stable + // across runs (a Prometheus alert that groups by `fingerprint` + // sees the same dimensionality whether or not enforcement is on). + let labels = format!( + "concurrency=\"{}\",fingerprint=\"{}\"", + s.concurrency, s.fingerprint + ); writeln!( f, "# HELP ruvector_hailo_bench_wall_seconds Wall-clock duration of the benchmark run." diff --git a/crates/ruvector-hailo-cluster/tests/bench_cli.rs b/crates/ruvector-hailo-cluster/tests/bench_cli.rs index 16fadf5a6..b08b4dc97 100644 --- a/crates/ruvector-hailo-cluster/tests/bench_cli.rs +++ b/crates/ruvector-hailo-cluster/tests/bench_cli.rs @@ -129,9 +129,16 @@ fn bench_cli_prom_file_contains_throughput_metric() { "missing HELP, got: {}", prom_body ); + // Iter 256 — added `fingerprint` label alongside `concurrency`. + // Empty string here because this test uses --allow-empty-fingerprint + // (passed implicitly via the worker test fixture). The label being + // present (even empty) is the contract — Prometheus alerts grouping + // by `fingerprint` should see a stable label set across runs. 
assert!( - prom_body.contains("ruvector_hailo_bench_throughput_per_second{concurrency=\"2\"}"), - "missing throughput metric with concurrency label, got: {}", + prom_body.contains( + "ruvector_hailo_bench_throughput_per_second{concurrency=\"2\",fingerprint=\"\"}" + ), + "missing throughput metric with concurrency+fingerprint labels, got: {}", prom_body ); } From 3d59d3a688dea75cb6b609c13406ecd70c46492d Mon Sep 17 00:00:00 2001 From: ruvnet Date: Mon, 4 May 2026 10:45:45 -0400 Subject: [PATCH 2/2] feat(hailo): expose npu_pool_size via StatsResponse + ADR refresh (iter 257) Surface the resolved RUVECTOR_NPU_POOL_SIZE through the gRPC StatsResponse so cluster-side observability can differentiate single-pipeline vs pool=N measurements. # Proto change (backward-compatible) StatsResponse gains `uint32 npu_pool_size = 10`. Old workers send 0 (proto3 default), which clients render as "unknown / pre- iter-257"; new workers send the resolved value (1, 2, 4, ...). # Wire-through - worker.rs: WorkerService.npu_pool_size populated from the env var at startup, surfaced via get_stats RPC. - transport.rs: StatsSnapshot.npu_pool_size field with #[serde(default)] so JSON consumers from old workers don't fail. - grpc_transport.rs: populated from proto resp on stats() RPC. # ADR refresh (also in this commit) - ADR-176 (HEF integration EPIC): added P6 row covering iter 234-237 pool measurement work + iter 256-257 observability layer. - ADR-178 (gap analysis): bumped Status from Proposed to Closed with a per-gap remediation table (8 gaps, 6 closed, 1 deferred, 2 tracked separately). 
Local verification: cargo check -p ruvector-hailo-cluster --bins (clean) cargo test -p ruvector-hailo-cluster --lib (114 passed) Co-Authored-By: claude-flow --- .../proto/embedding.proto | 8 +++++++ .../ruvector-hailo-cluster/src/bin/worker.rs | 11 +++++++++ .../src/grpc_transport.rs | 4 ++++ .../ruvector-hailo-cluster/src/transport.rs | 6 +++++ docs/adr/ADR-176-hef-integration-epic.md | 2 ++ ...r-ruview-hailo-integration-gap-analysis.md | 24 ++++++++++++++++--- 6 files changed, 52 insertions(+), 3 deletions(-) diff --git a/crates/ruvector-hailo-cluster/proto/embedding.proto b/crates/ruvector-hailo-cluster/proto/embedding.proto index 1e03df6bb..33549095d 100644 --- a/crates/ruvector-hailo-cluster/proto/embedding.proto +++ b/crates/ruvector-hailo-cluster/proto/embedding.proto @@ -96,4 +96,12 @@ message StatsResponse { // both as Prometheus gauges so a sudden spike in denials is grep-able. uint64 rate_limit_denials = 8; // ResourceExhausted returned since boot uint64 rate_limit_tracked_peers = 9; // distinct peers seen since boot + // Iter 257 — surface RUVECTOR_NPU_POOL_SIZE the worker resolved at + // startup. Lets the cluster-side stats CLI + bench --prom output + // differentiate "single-pipeline worker" vs "pool=N worker" measurements. + // 1 = single-pipeline default (iter-235 baseline); >=2 enables the + // iter-237 HefEmbedderPool. Backward-compatible proto3 add: old + // clients see this as 0 ("unknown"), new clients see the resolved + // value. + uint32 npu_pool_size = 10; } diff --git a/crates/ruvector-hailo-cluster/src/bin/worker.rs b/crates/ruvector-hailo-cluster/src/bin/worker.rs index b0e822c66..30bf829f0 100644 --- a/crates/ruvector-hailo-cluster/src/bin/worker.rs +++ b/crates/ruvector-hailo-cluster/src/bin/worker.rs @@ -213,6 +213,11 @@ struct WorkerService { /// affecting any legitimate caller (iter-179 streaming sweep /// peaked at b=16). Env: RUVECTOR_MAX_BATCH_SIZE. 
max_batch_size: usize, + /// Iter 257 — resolved NPU pool size (RUVECTOR_NPU_POOL_SIZE). + /// Surfaced via StatsResponse.npu_pool_size so cluster-side + /// observability can differentiate single-pipeline vs pool=N + /// measurements. + npu_pool_size: u32, /// Process start time, for uptime reporting in GetStats. start: Instant, /// Atomic counters surfaced via GetStats. @@ -450,6 +455,8 @@ impl Embedding for WorkerService { uptime_seconds: self.start.elapsed().as_secs(), rate_limit_denials: self.rate_limit_denials.load(Ordering::Relaxed), rate_limit_tracked_peers: tracked_peers, + // Iter 257 — surface the resolved RUVECTOR_NPU_POOL_SIZE. + npu_pool_size: self.npu_pool_size, })) } } @@ -695,6 +702,10 @@ fn main() -> Result<(), Box> { rate_limiter: Arc::clone(&rate_limiter), rate_limit_denials: Arc::clone(&rate_limit_denials), max_batch_size, + // Iter 257 — surface the resolved pool size via gRPC StatsResponse. + // Cast usize → u32 is safe — pool sizes are bounded to single + // digits in practice (RAM cost; see iter-239 measurement table). + npu_pool_size: u32::try_from(npu_pool_size).unwrap_or(u32::MAX), start: Instant::now(), embed_ok: AtomicU64::new(0), embed_err: AtomicU64::new(0), diff --git a/crates/ruvector-hailo-cluster/src/grpc_transport.rs b/crates/ruvector-hailo-cluster/src/grpc_transport.rs index 4a07970d6..8a85c9a2c 100644 --- a/crates/ruvector-hailo-cluster/src/grpc_transport.rs +++ b/crates/ruvector-hailo-cluster/src/grpc_transport.rs @@ -343,6 +343,10 @@ impl EmbeddingTransport for GrpcTransport { uptime: Duration::from_secs(resp.uptime_seconds), rate_limit_denials: resp.rate_limit_denials, rate_limit_tracked_peers: resp.rate_limit_tracked_peers, + // Iter 257 — populate from proto. Pre-iter-257 workers + // serialise this as 0 (proto3 default), which the + // consumer renders as "unknown pool size" / "old worker". 
+ npu_pool_size: resp.npu_pool_size, }) }) } diff --git a/crates/ruvector-hailo-cluster/src/transport.rs b/crates/ruvector-hailo-cluster/src/transport.rs index 095866455..b4676ff46 100644 --- a/crates/ruvector-hailo-cluster/src/transport.rs +++ b/crates/ruvector-hailo-cluster/src/transport.rs @@ -144,6 +144,12 @@ pub struct StatsSnapshot { /// since boot. 0 = limiter disabled. #[serde(default)] pub rate_limit_tracked_peers: u64, + /// Iter 257 — RUVECTOR_NPU_POOL_SIZE the worker resolved at startup. + /// 1 = single-pipeline default (iter-235 baseline); >=2 = pool=N + /// (iter-237 HefEmbedderPool). 0 = old worker without the field + /// populated (pre-iter-257). + #[serde(default)] + pub npu_pool_size: u32, } fn serialize_duration_us( diff --git a/docs/adr/ADR-176-hef-integration-epic.md b/docs/adr/ADR-176-hef-integration-epic.md index 059cd4372..0e04fa7b3 100644 --- a/docs/adr/ADR-176-hef-integration-epic.md +++ b/docs/adr/ADR-176-hef-integration-epic.md @@ -27,6 +27,8 @@ phases shipped + hardware-validated end-to-end on cognitum-v0 (Pi 5 | P5b | 168 | Cache + NPU bench — 100% hit ⇒ **15.86 M/sec** (226,000×) | | P5b | 169 | HEF release + `download-encoder-hef.sh` (adoption unblocked) | | P5b | 170 | Saturation test C=100 60s — **no OOM, tonic backpressure works** | +| P6 | 234-237 | `HefEmbedderPool` (multi-pipeline) — **measured: NPU-bound 70 RPS ceiling holds across pool sizes** but pool=2 cuts p50 23% under multi-bridge concurrent load. 
iter-237 deploy default pool=2 | +| P6 | 256-257 | bench `--prom` carries `fingerprint` label; StatsResponse exposes `npu_pool_size` for cluster-side observability | **Real Pi 5 measurements** (cluster-bench, concurrency=4, 15s, HEF worker on 50051 via systemd): diff --git a/docs/adr/ADR-178-ruvector-ruview-hailo-integration-gap-analysis.md b/docs/adr/ADR-178-ruvector-ruview-hailo-integration-gap-analysis.md index b86bf64ad..c2b60f1e4 100644 --- a/docs/adr/ADR-178-ruvector-ruview-hailo-integration-gap-analysis.md +++ b/docs/adr/ADR-178-ruvector-ruview-hailo-integration-gap-analysis.md @@ -12,9 +12,27 @@ branch: hailo-backend ## Status -**Proposed.** Planning ADR. No code lands here — output is a graded gap -inventory plus a remediation plan sized to the existing iter cadence -(213 iters across ~5 days). +**Closed (iter 257).** All HIGH+MEDIUM gaps remediated; G (Pi 4 +measurement) deferred without a Pi 4 in lab; long-form C/D (CSI +pose semantics + downstream cluster consumer) tracked as separate +multi-month ADRs out of this branch's scope. 
+ +| Gap | Severity | Status | Closed by | +|-----|----------|--------|-----------| +| A — ruvllm-bridge no deploy artifacts | HIGH | closed | iter 215 | +| B — `EmbeddingProvider` not impl'd | HIGH | closed | iter 218 (path dep + impl) | +| C — CSI bridge dropping I/Q (short) | MEDIUM | closed | iter 217 (doc-only) | +| C — CSI bridge dropping I/Q (long) | MEDIUM | tracked separately | future ADR | +| D — no downstream cluster consumer (short) | MEDIUM | closed | iter 221 (example) | +| D — mcp-brain client (long) | MEDIUM | tracked separately | future ADR | +| E — hailo crates excluded from workspace | MEDIUM | closed | iter 219 | +| F — ADR-167 status stratigraphy | MEDIUM | closed | iter 217 | +| G — Pi 4 throughput unmeasured | LOW | deferred | needs Pi 4 hardware | +| H — `install-bridge.sh` misnamed | LOW | closed | iter 216 | + +Original (planning) text below; output is a graded gap inventory +plus a remediation plan sized to the iter cadence (213 iters +across ~5 days at the time the ADR was first written). ## 1. Context