Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions crates/ruvector-hailo-cluster/proto/embedding.proto
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,12 @@ message StatsResponse {
// both as Prometheus gauges so a sudden spike in denials is grep-able.
uint64 rate_limit_denials = 8; // ResourceExhausted returned since boot
uint64 rate_limit_tracked_peers = 9; // distinct peers seen since boot
// Iter 257 — surface RUVECTOR_NPU_POOL_SIZE the worker resolved at
// startup. Lets the cluster-side stats CLI + bench --prom output
// differentiate "single-pipeline worker" vs "pool=N worker" measurements.
// 1 = single-pipeline default (iter-235 baseline); >=2 enables the
// iter-237 HefEmbedderPool. Backward-compatible proto3 add: old
// workers omit the field, so new clients read the proto3 default
// 0 ("unknown"); old clients simply skip the unknown field.
uint32 npu_pool_size = 10;
}
25 changes: 23 additions & 2 deletions crates/ruvector-hailo-cluster/src/bin/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}

let cluster = Arc::new({
let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint)?;
// Iter 256 — clone fingerprint so the original String stays
// available for the BenchSummary's `fingerprint` label later.
let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint.clone())?;
match (cache_cap, cache_ttl_secs) {
(0, _) => c,
(cap, 0) => c.with_cache(cap),
Expand Down Expand Up @@ -654,6 +656,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
samples: all_samples.len(),
concurrency,
cache: cache_stats,
// Iter 256 — surface the resolved fingerprint as a
// prom label. Empty string when --allow-empty-
// fingerprint was set, which renders as
// `fingerprint=""` and stays scrape-stable.
fingerprint: fingerprint.clone(),
},
)?;
if !quiet {
Expand Down Expand Up @@ -682,6 +689,12 @@ struct BenchSummary {
/// `None` when --cache 0; otherwise carries hit/miss/eviction counts
/// so the Prom output reflects what actually happened on the cache.
cache: Option<ruvector_hailo_cluster::cache::CacheStats>,
/// Iter 256 — resolved fingerprint (--fingerprint or
/// --auto-fingerprint result). Empty when neither was set
/// (--allow-empty-fingerprint). Surfaces as a `fingerprint=`
/// label on every prom metric so a CI scrape can alert on
/// per-model regressions instead of a single global series.
fingerprint: String,
}

/// Emit Prometheus textfile-collector format. node_exporter's textfile
Expand All @@ -693,7 +706,15 @@ fn write_prom_textfile(path: &str, s: &BenchSummary) -> std::io::Result<()> {
// races us never sees a half-written file.
let tmp = format!("{}.tmp", path);
let mut f = std::fs::File::create(&tmp)?;
let labels = format!("concurrency=\"{}\"", s.concurrency);
// Iter 256 — fingerprint label on every metric. Empty fingerprint
// (--allow-empty-fingerprint) renders as `fingerprint=""` rather
// than getting omitted, which keeps the label set scrape-stable
// across runs (a Prometheus alert that groups by `fingerprint`
// sees the same dimensionality whether or not enforcement is on).
let labels = format!(
"concurrency=\"{}\",fingerprint=\"{}\"",
s.concurrency, s.fingerprint
);
writeln!(
f,
"# HELP ruvector_hailo_bench_wall_seconds Wall-clock duration of the benchmark run."
Expand Down
11 changes: 11 additions & 0 deletions crates/ruvector-hailo-cluster/src/bin/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,11 @@ struct WorkerService {
/// affecting any legitimate caller (iter-179 streaming sweep
/// peaked at b=16). Env: RUVECTOR_MAX_BATCH_SIZE.
max_batch_size: usize,
/// Iter 257 — resolved NPU pool size (RUVECTOR_NPU_POOL_SIZE).
/// Surfaced via StatsResponse.npu_pool_size so cluster-side
/// observability can differentiate single-pipeline vs pool=N
/// measurements.
npu_pool_size: u32,
/// Process start time, for uptime reporting in GetStats.
start: Instant,
/// Atomic counters surfaced via GetStats.
Expand Down Expand Up @@ -450,6 +455,8 @@ impl Embedding for WorkerService {
uptime_seconds: self.start.elapsed().as_secs(),
rate_limit_denials: self.rate_limit_denials.load(Ordering::Relaxed),
rate_limit_tracked_peers: tracked_peers,
// Iter 257 — surface the resolved RUVECTOR_NPU_POOL_SIZE.
npu_pool_size: self.npu_pool_size,
}))
}
}
Expand Down Expand Up @@ -695,6 +702,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
rate_limiter: Arc::clone(&rate_limiter),
rate_limit_denials: Arc::clone(&rate_limit_denials),
max_batch_size,
// Iter 257 — surface the resolved pool size via gRPC StatsResponse.
// usize → u32 via try_from, saturating to u32::MAX on overflow.
// In practice pool sizes are bounded to single digits (RAM cost;
// see iter-239 measurement table), so saturation never fires.
npu_pool_size: u32::try_from(npu_pool_size).unwrap_or(u32::MAX),
start: Instant::now(),
embed_ok: AtomicU64::new(0),
embed_err: AtomicU64::new(0),
Expand Down
4 changes: 4 additions & 0 deletions crates/ruvector-hailo-cluster/src/grpc_transport.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,10 @@ impl EmbeddingTransport for GrpcTransport {
uptime: Duration::from_secs(resp.uptime_seconds),
rate_limit_denials: resp.rate_limit_denials,
rate_limit_tracked_peers: resp.rate_limit_tracked_peers,
// Iter 257 — populate from proto. Pre-iter-257 workers
// serialise this as 0 (proto3 default), which the
// consumer renders as "unknown pool size" / "old worker".
npu_pool_size: resp.npu_pool_size,
})
})
}
Expand Down
6 changes: 6 additions & 0 deletions crates/ruvector-hailo-cluster/src/transport.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ pub struct StatsSnapshot {
/// since boot. 0 = limiter disabled.
#[serde(default)]
pub rate_limit_tracked_peers: u64,
/// Iter 257 — RUVECTOR_NPU_POOL_SIZE the worker resolved at startup.
/// 1 = single-pipeline default (iter-235 baseline); >=2 = pool=N
/// (iter-237 HefEmbedderPool). 0 = old worker without the field
/// populated (pre-iter-257).
#[serde(default)]
pub npu_pool_size: u32,
}

fn serialize_duration_us<S: serde::Serializer>(
Expand Down
11 changes: 9 additions & 2 deletions crates/ruvector-hailo-cluster/tests/bench_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,16 @@ fn bench_cli_prom_file_contains_throughput_metric() {
"missing HELP, got: {}",
prom_body
);
// Iter 256 — added `fingerprint` label alongside `concurrency`.
// Empty string here because this test uses --allow-empty-fingerprint
// (passed implicitly via the worker test fixture). The label being
// present (even empty) is the contract — Prometheus alerts grouping
// by `fingerprint` should see a stable label set across runs.
assert!(
prom_body.contains("ruvector_hailo_bench_throughput_per_second{concurrency=\"2\"}"),
"missing throughput metric with concurrency label, got: {}",
prom_body.contains(
"ruvector_hailo_bench_throughput_per_second{concurrency=\"2\",fingerprint=\"\"}"
),
"missing throughput metric with concurrency+fingerprint labels, got: {}",
prom_body
);
}
Expand Down
2 changes: 2 additions & 0 deletions docs/adr/ADR-176-hef-integration-epic.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ phases shipped + hardware-validated end-to-end on cognitum-v0 (Pi 5
| P5b | 168 | Cache + NPU bench — 100% hit ⇒ **15.86 M/sec** (226,000×) |
| P5b | 169 | HEF release + `download-encoder-hef.sh` (adoption unblocked) |
| P5b | 170 | Saturation test C=100 60s — **no OOM, tonic backpressure works** |
| P6 | 234-237 | `HefEmbedderPool` (multi-pipeline) — **measured: NPU-bound 70 RPS ceiling holds across pool sizes** but pool=2 cuts p50 latency by 23% under multi-bridge concurrent load. iter-237 deploy default pool=2 |
| P6 | 256-257 | bench `--prom` carries `fingerprint` label; StatsResponse exposes `npu_pool_size` for cluster-side observability |

**Real Pi 5 measurements** (cluster-bench, concurrency=4, 15s,
HEF worker on 50051 via systemd):
Expand Down
24 changes: 21 additions & 3 deletions docs/adr/ADR-178-ruvector-ruview-hailo-integration-gap-analysis.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,27 @@ branch: hailo-backend

## Status

**Proposed.** Planning ADR. No code lands here — output is a graded gap
inventory plus a remediation plan sized to the existing iter cadence
(213 iters across ~5 days).
**Closed (iter 257).** All HIGH+MEDIUM gaps remediated; G (Pi 4
measurement) deferred without a Pi 4 in lab; long-form C/D (CSI
pose semantics + downstream cluster consumer) tracked as separate
multi-month ADRs out of this branch's scope.

| Gap | Severity | Status | Closed by |
|-----|----------|--------|-----------|
| A — ruvllm-bridge no deploy artifacts | HIGH | closed | iter 215 |
| B — `EmbeddingProvider` not impl'd | HIGH | closed | iter 218 (path dep + impl) |
| C — CSI bridge dropping I/Q (short) | MEDIUM | closed | iter 217 (doc-only) |
| C — CSI bridge dropping I/Q (long) | MEDIUM | tracked separately | future ADR |
| D — no downstream cluster consumer (short) | MEDIUM | closed | iter 221 (example) |
| D — mcp-brain client (long) | MEDIUM | tracked separately | future ADR |
| E — hailo crates excluded from workspace | MEDIUM | closed | iter 219 |
| F — ADR-167 status stratigraphy | MEDIUM | closed | iter 217 |
| G — Pi 4 throughput unmeasured | LOW | deferred | needs Pi 4 hardware |
| H — `install-bridge.sh` misnamed | LOW | closed | iter 216 |

Original (planning) text below; output is a graded gap inventory
plus a remediation plan sized to the iter cadence (213 iters
across ~5 days at the time the ADR was first written).

## 1. Context

Expand Down
Loading