fix(diagnostics): count all token types (input, output, cached, reasoning) (#213)

devin-ai-integration[bot] · dcramer · web-flow · commit b1bc92c7cff4 · 2026-04-17T15:44:54.000-07:00
## Summary The turn-diagnostics usage extractor was under-counting tokens for two reasons: 1. The key-alias list only recognised `input_tokens`/`output_tokens`/`total_tokens` style names, so the pi-ai `AssistantMessage.usage` shape (`input`, `output`, `cacheRead`, `cacheWrite`, `totalTokens`) was only matching on `totalTokens`. Cache-read, cache-write, and reasoning tokens were dropped on the floor. 2. When a turn produced multiple assistant messages (tool calls → another model call → final answer), the extractor used `.find((v) => v !== undefined)` and took the **first** message's usage instead of summing across the turn. The Slack footer also computed total tokens as `inputTokens + outputTokens` only, which missed cached/cache-creation/reasoning tokens even when individual counters were available. ### Changes - `packages/junior/src/chat/usage.ts` — extend `AgentTurnUsage` with `cachedInputTokens`, `cacheCreationTokens`, and `reasoningTokens`. Diagnostics now carry every counter the provider reports as its own field so renderers can choose how to present them. - `packages/junior/src/chat/logging.ts` — `extractGenAiUsageSummary` now: - recognises pi-ai aliases (`input`, `output`, `cacheRead`, `cacheWrite`) alongside the previous OpenAI/Anthropic/Gemini aliases; - extracts each field per-source and **sums across sources**, so multi-message turns report aggregate usage. - `packages/junior/src/chat/slack/footer.ts` — render the `Tokens` footer item as the sum of every reported component counter (`input + output + cachedInput + cacheCreation + reasoning`). Falls back to `totalTokens` only when no component counters were reported, since providers disagree on whether `totalTokens` includes cached tokens. - `packages/junior/src/chat/respond.ts` — detect "has usage" by checking any field instead of hard-coding the old three. - New unit tests in `tests/unit/logging/extract-gen-ai-usage-summary.test.ts` and additional cases in `tests/unit/slack/footer.test.ts`. ## Review & Testing Checklist for Human - [ ] Verify on a real Slack turn that the `Tokens` footer value now reflects cached + cache-creation tokens (e.g. a turn against an Anthropic model that hits prompt caching). - [ ] Confirm downstream consumers of `AssistantReply.diagnostics.usage` (logs, metrics, evals) handle the new optional fields correctly. - [ ] Sanity-check that summing `totalTokens` across sources is acceptable; if any call site currently expects `totalTokens` to be a single-message value rather than a turn aggregate, that assumption changes with this PR. ### Notes - `totalTokens` is still preserved as an individual field. We prefer the sum of component counters when any are present because pi-ai's provider adapters disagree on whether their `totalTokens` already includes `cacheRead` (openai-completions adds it, openai-responses passes the provider value through). Summing components avoids both under- and over-counting. - Reasoning tokens are captured if a provider surfaces them as a top-level `reasoning_tokens`/`reasoningTokens` key. pi-ai currently folds reasoning tokens into `output` for the OpenAI completions path, so `reasoningTokens` will often remain undefined — no double counting. Link to Devin session: https://app.devin.ai/sessions/dcea113d0cba43448157973f8f4b7105 Requested by: @dcramer --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: Devin <devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: David Cramer <david@sentry.io>
diff --git a/packages/junior/src/chat/logging.ts b/packages/junior/src/chat/logging.ts
@@ -1780,110 +1780,54 @@ function toFiniteTokenCount(value: unknown): number | undefined {
   return rounded >= 0 ? rounded : undefined;
 }
 
-function readTokenCount(
-  root: Record<string, unknown>,
-  keys: string[],
-): number | undefined {
-  for (const key of keys) {
-    const value = toFiniteTokenCount(root[key]);
-    if (value !== undefined) {
-      return value;
-    }
-  }
-  return undefined;
-}
-
-function collectUsageRoots(source: unknown): Record<string, unknown>[] {
-  const sourceRecord = asRecord(source);
-  if (!sourceRecord) {
-    return [];
-  }
-
-  const roots: Record<string, unknown>[] = [sourceRecord];
-  const usage = asRecord(sourceRecord.usage);
-  if (usage) {
-    roots.push(usage);
-  }
-
-  const tokenUsage = asRecord(sourceRecord.tokenUsage);
-  if (tokenUsage) {
-    roots.push(tokenUsage);
-  }
+// pi-ai `Usage` field name -> our camelCase equivalent. This is the only shape
+// that reaches the extractor today; pi-ai normalizes every provider response
+// into this canonical set before we ever see it.
+const PI_USAGE_FIELDS: ReadonlyArray<[string, keyof AgentTurnUsage]> = [
+  ["input", "inputTokens"],
+  ["output", "outputTokens"],
+  ["cacheRead", "cachedInputTokens"],
+  ["cacheWrite", "cacheCreationTokens"],
+  ["totalTokens", "totalTokens"],
+];
 
-  const providerMetadata = asRecord(sourceRecord.providerMetadata);
-  if (providerMetadata) {
-    roots.push(providerMetadata);
-    const providerUsage = asRecord(providerMetadata.usage);
-    if (providerUsage) {
-      roots.push(providerUsage);
-    }
+function readPiUsage(source: unknown): AgentTurnUsage {
+  const record = asRecord(source);
+  if (!record) {
+    return {};
   }
-
-  const response = asRecord(sourceRecord.response);
-  if (response) {
-    roots.push(response);
-    const responseUsage = asRecord(response.usage);
-    if (responseUsage) {
-      roots.push(responseUsage);
+  // Accept either a pi-ai AssistantMessage (has `.usage`) or a bare Usage record.
+  const usage = asRecord(record.usage) ?? record;
+  const summary: AgentTurnUsage = {};
+  for (const [piKey, ourKey] of PI_USAGE_FIELDS) {
+    const value = toFiniteTokenCount(usage[piKey]);
+    if (value !== undefined) {
+      summary[ourKey] = value;
     }
   }
-
-  return roots;
+  return summary;
 }
 
-/** Extract a structured token-usage summary from provider metadata roots. */
+/**
+ * Sum pi-ai `Usage` counters across every source into an `AgentTurnUsage`.
+ *
+ * Callers pass every assistant message produced during a turn so the result
+ * reflects the aggregate usage for the entire turn rather than a single model
+ * call. Sources without a recognized usage record contribute nothing.
+ */
 export function extractGenAiUsageSummary(
   ...sources: unknown[]
 ): AgentTurnUsage {
-  const roots = sources.flatMap((source) => collectUsageRoots(source));
-  if (roots.length === 0) {
-    return {};
+  const summary: AgentTurnUsage = {};
+  for (const source of sources) {
+    const single = readPiUsage(source);
+    for (const field of Object.keys(single) as (keyof AgentTurnUsage)[]) {
+      const value = single[field];
+      if (value === undefined) continue;
+      summary[field] = (summary[field] ?? 0) + value;
+    }
   }
-
-  const inputTokens =
-    roots
-      .map((root) =>
-        readTokenCount(root, [
-          "input_tokens",
-          "inputTokens",
-          "prompt_tokens",
-          "promptTokens",
-          "inputTokenCount",
-          "promptTokenCount",
-        ]),
-      )
-      .find((value) => value !== undefined) ?? undefined;
-
-  const outputTokens =
-    roots
-      .map((root) =>
-        readTokenCount(root, [
-          "output_tokens",
-          "outputTokens",
-          "completion_tokens",
-          "completionTokens",
-          "outputTokenCount",
-          "completionTokenCount",
-        ]),
-      )
-      .find((value) => value !== undefined) ?? undefined;
-
-  const totalTokens =
-    roots
-      .map((root) =>
-        readTokenCount(root, [
-          "total_tokens",
-          "totalTokens",
-          "totalTokenCount",
-        ]),
-      )
-      .find((value) => value !== undefined) ?? undefined;
-
-  return {
-    ...(inputTokens !== undefined ? { inputTokens } : {}),
-    ...(outputTokens !== undefined ? { outputTokens } : {}),
-    ...(totalTokens !== undefined ? { totalTokens } : {}),
-  };
+  return summary;
 }
 
 /** Extract input/output token counts from AI provider usage metadata for tracing. */
diff --git a/packages/junior/src/chat/respond.ts b/packages/junior/src/chat/respond.ts
@@ -864,12 +864,11 @@ export async function generateAssistantReply(
             agent.state,
             ...outputMessages,
           );
-          turnUsage =
-            usageSummary.inputTokens !== undefined ||
-            usageSummary.outputTokens !== undefined ||
-            usageSummary.totalTokens !== undefined
-              ? usageSummary
-              : undefined;
+          turnUsage = Object.values(usageSummary).some(
+            (value) => value !== undefined,
+          )
+            ? usageSummary
+            : undefined;
           setSpanAttributes({
             ...(outputMessagesAttribute
               ? { "gen_ai.output.messages": outputMessagesAttribute }
diff --git a/packages/junior/src/chat/slack/footer.ts b/packages/junior/src/chat/slack/footer.ts
@@ -53,15 +53,26 @@ function formatSlackDuration(durationMs: number): string {
 function resolveTotalTokens(
   usage: AgentTurnUsage | undefined,
 ): number | undefined {
-  if (usage?.totalTokens !== undefined) {
-    return usage.totalTokens;
+  if (!usage) {
+    return undefined;
   }
 
-  if (usage?.inputTokens !== undefined && usage.outputTokens !== undefined) {
-    return usage.inputTokens + usage.outputTokens;
+  // Sum every individual counter the provider reported so cached + cache
+  // creation tokens are included in the displayed total. Provider `totalTokens`
+  // fields are inconsistent across vendors (some exclude cached tokens, some
+  // include them), so prefer the sum when component counts exist.
+  const components = [
+    usage.inputTokens,
+    usage.outputTokens,
+    usage.cachedInputTokens,
+    usage.cacheCreationTokens,
+  ].filter((value): value is number => value !== undefined);
+
+  if (components.length > 0) {
+    return components.reduce((sum, value) => sum + value, 0);
   }
 
-  return undefined;
+  return usage.totalTokens;
 }
 
 /** Build a compact Slack reply footer so operators can correlate visible replies with backend state. */
diff --git a/packages/junior/src/chat/usage.ts b/packages/junior/src/chat/usage.ts
@@ -1,5 +1,20 @@
+/**
+ * Structured token usage captured for a single agent turn.
+ *
+ * Mirrors the fields pi-ai emits on `AssistantMessage.usage` (see
+ * `@mariozechner/pi-ai` `Usage`) so diagnostics carry every counter the
+ * provider normalizes into the pi-ai shape as its own item. Renderers decide
+ * whether to display a breakdown or a single aggregate.
+ */
 export interface AgentTurnUsage {
+  /** Non-cached input tokens (pi-ai subtracts cached tokens from this). */
   inputTokens?: number;
+  /** Output tokens; pi-ai folds reasoning tokens into this for providers that report them. */
   outputTokens?: number;
+  /** Cached input tokens read from the provider's prompt cache. */
+  cachedInputTokens?: number;
+  /** Input tokens written into the provider's prompt cache. */
+  cacheCreationTokens?: number;
+  /** Provider-reported total. May not equal the sum of individual counters across providers. */
   totalTokens?: number;
 }
diff --git a/packages/junior/tests/unit/logging/extract-gen-ai-usage-summary.test.ts b/packages/junior/tests/unit/logging/extract-gen-ai-usage-summary.test.ts
@@ -0,0 +1,99 @@
+import { describe, expect, it } from "vitest";
+import { extractGenAiUsageSummary } from "@/chat/logging";
+
+describe("extractGenAiUsageSummary", () => {
+  it("returns empty object for sources with no usage metadata", () => {
+    expect(extractGenAiUsageSummary({}, undefined, null)).toEqual({});
+  });
+
+  it("captures the pi-ai AssistantMessage.usage shape", () => {
+    const assistantMessage = {
+      role: "assistant",
+      usage: {
+        input: 120,
+        output: 45,
+        cacheRead: 900,
+        cacheWrite: 60,
+        totalTokens: 1125,
+      },
+    };
+
+    expect(extractGenAiUsageSummary(assistantMessage)).toEqual({
+      inputTokens: 120,
+      outputTokens: 45,
+      cachedInputTokens: 900,
+      cacheCreationTokens: 60,
+      totalTokens: 1125,
+    });
+  });
+
+  it("accepts a bare pi-ai Usage record as a source", () => {
+    expect(
+      extractGenAiUsageSummary({
+        input: 10,
+        output: 5,
+        cacheRead: 0,
+        cacheWrite: 0,
+        totalTokens: 15,
+      }),
+    ).toEqual({
+      inputTokens: 10,
+      outputTokens: 5,
+      cachedInputTokens: 0,
+      cacheCreationTokens: 0,
+      totalTokens: 15,
+    });
+  });
+
+  it("sums usage across multiple sources (multi-message turn)", () => {
+    const firstCall = {
+      usage: {
+        input: 100,
+        output: 50,
+        cacheRead: 10,
+        cacheWrite: 0,
+        totalTokens: 160,
+      },
+    };
+    const secondCall = {
+      usage: {
+        input: 200,
+        output: 30,
+        cacheRead: 5,
+        cacheWrite: 0,
+        totalTokens: 235,
+      },
+    };
+
+    expect(extractGenAiUsageSummary(firstCall, secondCall)).toEqual({
+      inputTokens: 300,
+      outputTokens: 80,
+      cachedInputTokens: 15,
+      cacheCreationTokens: 0,
+      totalTokens: 395,
+    });
+  });
+
+  it("ignores sources without a usage record while summing the rest", () => {
+    const emptyAgentState = { messages: [] };
+    const assistantMessage = {
+      usage: {
+        input: 10,
+        output: 2,
+        cacheRead: 0,
+        cacheWrite: 0,
+        totalTokens: 12,
+      },
+    };
+
+    expect(
+      extractGenAiUsageSummary(undefined, emptyAgentState, assistantMessage),
+    ).toEqual({
+      inputTokens: 10,
+      outputTokens: 2,
+      cachedInputTokens: 0,
+      cacheCreationTokens: 0,
+      totalTokens: 12,
+    });
+  });
+});
diff --git a/packages/junior/tests/unit/slack/footer.test.ts b/packages/junior/tests/unit/slack/footer.test.ts
@@ -40,6 +40,42 @@ describe("buildSlackReplyFooter", () => {
   it("omits the footer when no items are available", () => {
     expect(buildSlackReplyFooter({})).toBeUndefined();
   });
+
+  it("sums individual token counters when rendering the Tokens item", () => {
+    expect(
+      buildSlackReplyFooter({
+        usage: {
+          inputTokens: 100,
+          outputTokens: 50,
+          cachedInputTokens: 200,
+          cacheCreationTokens: 10,
+          totalTokens: 9999,
+        },
+      }),
+    ).toEqual({
+      items: [
+        {
+          label: "Tokens",
+          value: "360",
+        },
+      ],
+    });
+  });
+
+  it("falls back to totalTokens when no component counters are reported", () => {
+    expect(
+      buildSlackReplyFooter({
+        usage: { totalTokens: 1234 },
+      }),
+    ).toEqual({
+      items: [
+        {
+          label: "Tokens",
+          value: "1,234",
+        },
+      ],
+    });
+  });
 });
 
 describe("buildSlackReplyBlocks", () => {