Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions crates/ruvector-hailo-cluster/proto/embedding.proto
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,12 @@ message StatsResponse {
// both as Prometheus gauges so a sudden spike in denials is grep-able.
uint64 rate_limit_denials = 8; // ResourceExhausted returned since boot
uint64 rate_limit_tracked_peers = 9; // distinct peers seen since boot
// Iter 257 — surface RUVECTOR_NPU_POOL_SIZE the worker resolved at
// startup. Lets the cluster-side stats CLI + bench --prom output
// differentiate "single-pipeline worker" vs "pool=N worker" measurements.
// 1 = single-pipeline default (iter-235 baseline); >=2 enables the
// iter-237 HefEmbedderPool. Backward-compatible proto3 add: old
// workers omit the field, so new clients read the proto3 default
// 0 ("unknown"); old clients simply skip the unknown field.
uint32 npu_pool_size = 10;
}
25 changes: 23 additions & 2 deletions crates/ruvector-hailo-cluster/src/bin/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,9 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
}

let cluster = Arc::new({
let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint)?;
// Iter 256 — clone fingerprint so the original String stays
// available for the BenchSummary's `fingerprint` label later.
let c = HailoClusterEmbedder::new(workers, transport, dim, fingerprint.clone())?;
match (cache_cap, cache_ttl_secs) {
(0, _) => c,
(cap, 0) => c.with_cache(cap),
Expand Down Expand Up @@ -654,6 +656,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
samples: all_samples.len(),
concurrency,
cache: cache_stats,
// Iter 256 — surface the resolved fingerprint as a
// prom label. Empty string when --allow-empty-
// fingerprint was set, which renders as
// `fingerprint=""` and stays scrape-stable.
fingerprint: fingerprint.clone(),
},
)?;
if !quiet {
Expand Down Expand Up @@ -682,6 +689,12 @@ struct BenchSummary {
/// `None` when --cache 0; otherwise carries hit/miss/eviction counts
/// so the Prom output reflects what actually happened on the cache.
cache: Option<ruvector_hailo_cluster::cache::CacheStats>,
/// Iter 256 — resolved fingerprint (--fingerprint or
/// --auto-fingerprint result). Empty when neither was set
/// (--allow-empty-fingerprint). Surfaces as a `fingerprint=`
/// label on every prom metric so a CI scrape can alert on
/// per-model regressions instead of a single global series.
fingerprint: String,
}

/// Emit Prometheus textfile-collector format. node_exporter's textfile
Expand All @@ -693,7 +706,15 @@ fn write_prom_textfile(path: &str, s: &BenchSummary) -> std::io::Result<()> {
// races us never sees a half-written file.
let tmp = format!("{}.tmp", path);
let mut f = std::fs::File::create(&tmp)?;
let labels = format!("concurrency=\"{}\"", s.concurrency);
// Iter 256 — fingerprint label on every metric. Empty fingerprint
// (--allow-empty-fingerprint) renders as `fingerprint=""` rather
// than getting omitted, which keeps the label set scrape-stable
// across runs (a Prometheus alert that groups by `fingerprint`
// sees the same dimensionality whether or not enforcement is on).
let labels = format!(
"concurrency=\"{}\",fingerprint=\"{}\"",
s.concurrency, s.fingerprint
);
writeln!(
f,
"# HELP ruvector_hailo_bench_wall_seconds Wall-clock duration of the benchmark run."
Expand Down
11 changes: 11 additions & 0 deletions crates/ruvector-hailo-cluster/src/bin/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,11 @@ struct WorkerService {
/// affecting any legitimate caller (iter-179 streaming sweep
/// peaked at b=16). Env: RUVECTOR_MAX_BATCH_SIZE.
max_batch_size: usize,
/// Iter 257 — resolved NPU pool size (RUVECTOR_NPU_POOL_SIZE).
/// Surfaced via StatsResponse.npu_pool_size so cluster-side
/// observability can differentiate single-pipeline vs pool=N
/// measurements.
npu_pool_size: u32,
/// Process start time, for uptime reporting in GetStats.
start: Instant,
/// Atomic counters surfaced via GetStats.
Expand Down Expand Up @@ -450,6 +455,8 @@ impl Embedding for WorkerService {
uptime_seconds: self.start.elapsed().as_secs(),
rate_limit_denials: self.rate_limit_denials.load(Ordering::Relaxed),
rate_limit_tracked_peers: tracked_peers,
// Iter 257 — surface the resolved RUVECTOR_NPU_POOL_SIZE.
npu_pool_size: self.npu_pool_size,
}))
}
}
Expand Down Expand Up @@ -695,6 +702,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
rate_limiter: Arc::clone(&rate_limiter),
rate_limit_denials: Arc::clone(&rate_limit_denials),
max_batch_size,
// Iter 257 — surface the resolved pool size via gRPC StatsResponse.
// usize → u32 via try_from, saturating to u32::MAX on overflow.
// In practice pool sizes are bounded to single digits (RAM cost;
// see iter-239 measurement table), so saturation never fires.
npu_pool_size: u32::try_from(npu_pool_size).unwrap_or(u32::MAX),
start: Instant::now(),
embed_ok: AtomicU64::new(0),
embed_err: AtomicU64::new(0),
Expand Down
4 changes: 4 additions & 0 deletions crates/ruvector-hailo-cluster/src/grpc_transport.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,10 @@ impl EmbeddingTransport for GrpcTransport {
uptime: Duration::from_secs(resp.uptime_seconds),
rate_limit_denials: resp.rate_limit_denials,
rate_limit_tracked_peers: resp.rate_limit_tracked_peers,
// Iter 257 — populate from proto. Pre-iter-257 workers
// serialise this as 0 (proto3 default), which the
// consumer renders as "unknown pool size" / "old worker".
npu_pool_size: resp.npu_pool_size,
})
})
}
Expand Down
6 changes: 6 additions & 0 deletions crates/ruvector-hailo-cluster/src/transport.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,12 @@ pub struct StatsSnapshot {
/// since boot. 0 = limiter disabled.
#[serde(default)]
pub rate_limit_tracked_peers: u64,
/// Iter 257 — RUVECTOR_NPU_POOL_SIZE the worker resolved at startup.
/// 1 = single-pipeline default (iter-235 baseline); >=2 = pool=N
/// (iter-237 HefEmbedderPool). 0 = old worker without the field
/// populated (pre-iter-257).
#[serde(default)]
pub npu_pool_size: u32,
}

fn serialize_duration_us<S: serde::Serializer>(
Expand Down
11 changes: 9 additions & 2 deletions crates/ruvector-hailo-cluster/tests/bench_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,16 @@ fn bench_cli_prom_file_contains_throughput_metric() {
"missing HELP, got: {}",
prom_body
);
// Iter 256 — added `fingerprint` label alongside `concurrency`.
// Empty string here because this test uses --allow-empty-fingerprint
// (passed implicitly via the worker test fixture). The label being
// present (even empty) is the contract — Prometheus alerts grouping
// by `fingerprint` should see a stable label set across runs.
assert!(
prom_body.contains("ruvector_hailo_bench_throughput_per_second{concurrency=\"2\"}"),
"missing throughput metric with concurrency label, got: {}",
prom_body.contains(
"ruvector_hailo_bench_throughput_per_second{concurrency=\"2\",fingerprint=\"\"}"
),
"missing throughput metric with concurrency+fingerprint labels, got: {}",
prom_body
);
}
Expand Down
2 changes: 2 additions & 0 deletions docs/adr/ADR-176-hef-integration-epic.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ phases shipped + hardware-validated end-to-end on cognitum-v0 (Pi 5
| P5b | 168 | Cache + NPU bench — 100% hit ⇒ **15.86 M/sec** (226,000×) |
| P5b | 169 | HEF release + `download-encoder-hef.sh` (adoption unblocked) |
| P5b | 170 | Saturation test C=100 60s — **no OOM, tonic backpressure works** |
| P6 | 234-237 | `HefEmbedderPool` (multi-pipeline) — **measured: NPU-bound 70 RPS ceiling holds across pool sizes** but pool=2 cuts p50 latency by 23% under multi-bridge concurrent load. iter-237 deploy default pool=2 |
| P6 | 256-257 | bench `--prom` carries `fingerprint` label; StatsResponse exposes `npu_pool_size` for cluster-side observability |

**Real Pi 5 measurements** (cluster-bench, concurrency=4, 15s,
HEF worker on 50051 via systemd):
Expand Down
24 changes: 21 additions & 3 deletions docs/adr/ADR-178-ruvector-ruview-hailo-integration-gap-analysis.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,27 @@ branch: hailo-backend

## Status

**Proposed.** Planning ADR. No code lands here — output is a graded gap
inventory plus a remediation plan sized to the existing iter cadence
(213 iters across ~5 days).
**Closed (iter 257).** All HIGH+MEDIUM gaps remediated; G (Pi 4
measurement) deferred without a Pi 4 in lab; long-form C/D (CSI
pose semantics + downstream cluster consumer) tracked as separate
multi-month ADRs out of this branch's scope.

| Gap | Severity | Status | Closed by |
|-----|----------|--------|-----------|
| A — ruvllm-bridge no deploy artifacts | HIGH | closed | iter 215 |
| B — `EmbeddingProvider` not impl'd | HIGH | closed | iter 218 (path dep + impl) |
| C — CSI bridge dropping I/Q (short) | MEDIUM | closed | iter 217 (doc-only) |
| C — CSI bridge dropping I/Q (long) | MEDIUM | tracked separately | future ADR |
| D — no downstream cluster consumer (short) | MEDIUM | closed | iter 221 (example) |
| D — mcp-brain client (long) | MEDIUM | tracked separately | future ADR |
| E — hailo crates excluded from workspace | MEDIUM | closed | iter 219 |
| F — ADR-167 status stratigraphy | MEDIUM | closed | iter 217 |
| G — Pi 4 throughput unmeasured | LOW | deferred | needs Pi 4 hardware |
| H — `install-bridge.sh` misnamed | LOW | closed | iter 216 |

Original (planning) text below; output is a graded gap inventory
plus a remediation plan sized to the iter cadence (213 iters
across ~5 days at the time the ADR was first written).

## 1. Context

Expand Down
Loading