JCV's Portfolio

src/voice/voiceGoldenHarness.ts

import { performance } from "node:perf_hooks"; import { appConfig } from "../config.ts"; import { LLMService } from "../llm.ts"; import { ClankerBot } from "../bot.ts"; import { DEFAULT_SETTINGS } from "../settings/settingsSchema.ts"; import { normalizeSettings } from "../store/settingsNormalization.ts"; import { parseBooleanFlag } from "../normalization/valueParsers.ts"; import { WebSearchService } from "../services/search.ts"; import { runJsonJudge } from "../../scripts/replay/core/judge.ts"; import { summarizeNamedMetricRows, type NumericStats } from "../../scripts/replay/core/metrics.ts"; import { formatPct, stableNumber } from "../../scripts/replay/core/utils.ts"; import { VoiceSessionManager } from "./voiceSessionManager.ts"; import { VOICE_RUNTIME_MODES, parseVoiceRuntimeMode } from "./voiceModes.ts"; import { sleep } from "../normalization/time.ts";

/** Voice runtime modes the golden harness can exercise (the runtime mode list, re-exported). */
export const VOICE_GOLDEN_MODES = VOICE_RUNTIME_MODES;

/** A single entry of {@link VOICE_GOLDEN_MODES}. */
type VoiceGoldenMode = (typeof VOICE_GOLDEN_MODES)[number];

/** Execution strategy for a harness run. */
type VoiceGoldenRunMode = "simulated" | "live";

/** One golden scenario: an utterance plus the admission/response outcome expected for it. */
type VoiceGoldenCase = {
  id: string;
  title: string;
  /** Utterance handed to the reply decision as the transcript. */
  userText: string;
  /** Expected value of the decision's `allow` flag. */
  expectedAllow: boolean;
  /** Expected response shape; when omitted it is derived from `expectedAllow`. */
  expectedResponse?: "non_empty" | "empty";
  /** Human-readable goal, forwarded to the judge prompt. */
  objective: string;
  // The fields below describe call context. Their consumer is not visible in this
  // part of the file, so the exact semantics should be confirmed against it.
  participantCount?: number;
  participantDisplayNames?: string[];
  sessionAgeMs?: number;
  recentAssistantReplyMs?: number;
  recentDirectAddressMs?: number;
  recentDirectAddressUserId?: string;
};

/** Settings of the optional LLM judge. */
type VoiceGoldenJudgeConfig = {
  enabled: boolean;
  provider: string;
  model: string;
};

/** Caller-facing options; every field is optional and resolved by `resolveDefaults`. */
type VoiceGoldenHarnessOptions = {
  mode?: VoiceGoldenRunMode;
  modes?: VoiceGoldenMode[];
  iterations?: number;
  actorProvider?: string;
  actorModel?: string;
  deciderProvider?: string;
  deciderModel?: string;
  // `Partial` needs its type argument; the judge override is a partial judge config.
  judge?: Partial<VoiceGoldenJudgeConfig>;
  allowMissingCredentials?: boolean;
  maxCases?: number;
  onCaseProgress?: (event: VoiceGoldenCaseProgressEvent) => void;
};

/** Fully-resolved options (no optional fields) as produced by `resolveDefaults`. */
type VoiceGoldenResolvedOptions = {
  mode: VoiceGoldenRunMode;
  modes: VoiceGoldenMode[];
  iterations: number;
  actorProvider: string;
  actorModel: string;
  deciderProvider: string;
  deciderModel: string;
  judge: VoiceGoldenJudgeConfig;
  allowMissingCredentials: boolean;
  maxCases: number;
};

/** Per-stage latency measurements (milliseconds) recorded for one case execution. */
type StageTimings = { totalMs: number; decisionMs: number; connectMs: number; inputPrepMs: number; inputSendMs: number; actorMs: number; asrMs: number; ttsMs: number; outputAsrMs: number; responseMs: number; };

/** Normalized outcome of the reply-admission decision for one case. */
type DecisionResult = { allow: boolean; reason: string; directAddressed: boolean; transcript: string; error: string; };

/** Result of executing a case after admission; `stage` carries every timing except total and decision. */
type ModeExecutionResult = { transcript: string; responseText: string; audioBytes: number; stage: Omit<StageTimings, "totalMs" | "decisionMs">; };

/** Verdict for one case, produced either by the LLM judge or by the deterministic fallback. */
type JudgeResult = { pass: boolean; score: number; confidence: number; summary: string; issues: string[]; rawText: string; };

/** Full record for one (mode, case, iteration) execution. */
type VoiceGoldenCaseResult = { mode: VoiceGoldenMode; caseId: string; caseTitle: string; iteration: number; expectedAllow: boolean; decision: DecisionResult; transcript: string; responseText: string; audioBytes: number; timings: StageTimings; pass: boolean; judge: JudgeResult; error: string | null; };

/** All results for one voice mode plus pass/fail counts and per-stage statistics. */
type VoiceGoldenModeReport = { mode: VoiceGoldenMode; skippedReason: string | null; results: VoiceGoldenCaseResult[]; aggregates: { executed: number; passed: number; failed: number; passRate: number; stageStats: Record<string, StageStat>; }; };

/** Statistics for one named stage; alias of the replay metrics `NumericStats`. */
type StageStat = NumericStats;

/** Top-level harness report: resolved options, per-mode reports and a cross-mode summary. */
type VoiceGoldenHarnessReport = { startedAt: string; finishedAt: string; options: VoiceGoldenResolvedOptions; modeReports: VoiceGoldenModeReport[]; summary: { executed: number; passed: number; failed: number; passRate: number; stageStats: Record<string, StageStat>; }; };

/** Progress notification emitted before ("start") and after ("done") each case; the optional fields are only supplied on "done". */
type VoiceGoldenCaseProgressEvent = { phase: "start" | "done"; mode: VoiceGoldenMode; iteration: number; modeCaseIndex: number; modeCaseCount: number; globalCaseIndex: number; globalCaseCount: number; caseId: string; caseTitle: string; expectedAllow: boolean; pass?: boolean; decisionAllow?: boolean; decisionReason?: string; error?: string | null; durationMs?: number; };

/** Callback signature for consumers of case progress events. */
type VoiceGoldenCaseProgressLogger = (event: VoiceGoldenCaseProgressEvent) => void;

/**
 * Formats a progress event as a single space-separated `key=value` log line.
 *
 * "start" events yield `[voice-golden] start <base>`; any other phase yields
 * `[voice-golden] done <base> pass=… allow=… reason=… durationMs=… error=…`.
 *
 * @param event Progress event to render.
 * @returns The formatted log line.
 */
function buildVoiceGoldenCaseProgressLine(event: VoiceGoldenCaseProgressEvent): string {
  // Every segment is a template literal; without the backticks this is not valid TypeScript.
  const base = [
    `mode=${event.mode}`,
    `iteration=${event.iteration}`,
    `modeCase=${event.modeCaseIndex}/${event.modeCaseCount}`,
    `globalCase=${event.globalCaseIndex}/${event.globalCaseCount}`,
    `case=${event.caseId}`
  ].join(" ");

  if (event.phase === "start") {
    return `[voice-golden] start ${base}`;
  }

  // Missing or NaN durations collapse to 0; negative values are clamped to 0.
  const durationMs = Math.max(0, Number(event.durationMs || 0)).toFixed(1);
  return [
    "[voice-golden] done",
    base,
    `pass=${event.pass ? "yes" : "no"}`,
    `allow=${event.decisionAllow ? "yes" : "no"}`,
    `reason=${event.decisionReason || "none"}`,
    `durationMs=${durationMs}`,
    `error=${event.error || "none"}`
  ].join(" ");
}

/**
 * Creates a progress callback that renders each event to one line and hands it to `log`.
 *
 * @param log Sink for formatted lines; defaults to `console.log`.
 * @returns A callback suitable for the harness `onCaseProgress` option.
 */
export function createVoiceGoldenCaseProgressLogger({
  log = (line: string) => {
    console.log(line);
  }
}: {
  log?: (line: string) => void;
} = {}): VoiceGoldenCaseProgressLogger {
  return function logCaseProgress(event) {
    const line = buildVoiceGoldenCaseProgressLine(event);
    log(line);
  };
}

/** Trace metadata attached to a decision LLM call. */
type DecisionLlmTrace = {
  guildId: string | null;
  channelId: string | null;
  userId: string | null;
  source: string | null;
  event: string | null;
  reason: string | null;
  messageId: string | null;
};

/** Request payload accepted by a decision LLM `generate` call. */
type DecisionLlmGeneratePayload = {
  settings: Record<string, unknown>;
  systemPrompt: string;
  userPrompt: string;
  contextMessages?: unknown[];
  trace?: DecisionLlmTrace;
  jsonSchema?: string;
};

/** Response shape of a decision LLM `generate` call. */
type DecisionLlmGenerateResult = {
  text?: string;
  provider?: string;
  model?: string;
};

/** Minimal LLM interface needed by the reply decision. */
type DecisionLlm = {
  // `Promise` needs its type argument; the call resolves to the generate result declared above.
  generate: (payload: DecisionLlmGeneratePayload) => Promise<DecisionLlmGenerateResult>;
};

/** Shape of an action entry recorded by {@link HarnessStore}. */
type HarnessStoreAction = {
  kind?: string;
  content?: string;
  metadata?: Record<string, unknown>;
  usdCost?: number;
  createdAt?: string;
};

/** In-memory store that records logged actions and serves fixed settings. */
class HarnessStore {
  /** Logged actions, in insertion order. */
  actions: HarnessStoreAction[] = [];

  /**
   * Appends a copy of `action`. A blank or missing `createdAt` is replaced by the
   * current time in ISO format; a provided one is stored trimmed.
   */
  logAction(action: HarnessStoreAction) {
    const providedTimestamp = String(action?.createdAt || "").trim();
    const createdAt = providedTimestamp !== "" ? providedTimestamp : new Date().toISOString();
    this.actions.push({ ...(action || {}), createdAt });
  }

  /** Returns the fixed settings object exposed by this store. */
  getSettings() {
    return { botName: "clanky" };
  }

  /**
   * Counts logged actions of `kind` whose `createdAt` is at or after `sinceIso`.
   * Returns 0 when `kind` is blank or `sinceIso` cannot be parsed as a date.
   */
  countActionsSince(kind: string, sinceIso: string) {
    const wantedKind = String(kind || "").trim();
    const threshold = Date.parse(String(sinceIso || ""));
    if (wantedKind === "" || !Number.isFinite(threshold)) return 0;

    return this.actions.filter((entry) => {
      if (String(entry?.kind || "") !== wantedKind) return false;
      const loggedAt = Date.parse(String(entry?.createdAt || ""));
      return Number.isFinite(loggedAt) && loggedAt >= threshold;
    }).length;
  }
}

/**
 * Golden scenarios executed by the harness, in run order.
 * Every entry expects admission (`expectedAllow: true`); only the laughter
 * follow-up case expects an empty response.
 */
const VOICE_GOLDEN_CASES: VoiceGoldenCase[] = [
  {
    id: "wake-ping",
    title: "Wake Ping",
    userText: "yo clanker",
    expectedAllow: true,
    objective: "Give a short natural acknowledgement suitable for live voice."
  },
  {
    id: "wake-ping-laughter-followup",
    title: "Wake Ping Laughter Followup",
    userText: "haha",
    expectedAllow: true,
    expectedResponse: "empty",
    objective:
      "After a recent direct wake ping and bot acknowledgement, skip pure laughter/backchannel follow-up instead of filling space.",
    participantCount: 1,
    participantDisplayNames: ["alice"],
    sessionAgeMs: 8_000,
    recentAssistantReplyMs: 3_000,
    recentDirectAddressMs: 4_000,
    recentDirectAddressUserId: "speaker-1"
  },
  {
    id: "direct-question",
    title: "Direct Question",
    userText: "clanker can you explain in one sentence why rust ownership matters?",
    expectedAllow: true,
    objective: "Reply with a short, relevant explanation tied to Rust ownership."
  },
  {
    id: "merged-name",
    title: "Merged Name",
    userText: "clankerconk are you there right now?",
    expectedAllow: true,
    objective: "Acknowledge the direct callout and respond briefly."
  },
  {
    id: "fresh-join-greeting-yo-single",
    title: "Fresh Join Greeting Yo (Single)",
    userText: "yo",
    expectedAllow: true,
    objective: "Right after join, treat a short greeting as worth a brief acknowledgement.",
    participantCount: 1,
    participantDisplayNames: ["alice"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-greeting-yo-multi",
    title: "Fresh Join Greeting Yo (Multi)",
    userText: "yo",
    expectedAllow: true,
    objective: "Right after join in a group call, treat a short greeting as worth a brief acknowledgement.",
    participantCount: 2,
    participantDisplayNames: ["alice", "bob"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-greeting-hi-single",
    title: "Fresh Join Greeting Hi (Single)",
    userText: "hi",
    expectedAllow: true,
    objective: "Right after join, treat a hi greeting as worth a brief acknowledgement.",
    participantCount: 1,
    participantDisplayNames: ["alice"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-greeting-hi-multi",
    title: "Fresh Join Greeting Hi (Multi)",
    userText: "hi",
    expectedAllow: true,
    objective: "Right after join in a group call, treat a hi greeting as worth a brief acknowledgement.",
    participantCount: 2,
    participantDisplayNames: ["alice", "bob"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-greeting-sup-single",
    title: "Fresh Join Greeting Sup (Single)",
    userText: "sup",
    expectedAllow: true,
    objective: "Right after join, treat a sup check-in as worth a brief acknowledgement.",
    participantCount: 1,
    participantDisplayNames: ["alice"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-greeting-sup-multi",
    title: "Fresh Join Greeting Sup (Multi)",
    userText: "sup",
    expectedAllow: true,
    objective: "Right after join in a group call, treat a sup check-in as worth a brief acknowledgement.",
    participantCount: 2,
    participantDisplayNames: ["alice", "bob"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-greeting-yo-clanka-single",
    title: "Fresh Join Greeting Yo Clanka (Single)",
    userText: "yo clanka",
    expectedAllow: true,
    // This literal was split by a raw line break, which is an unterminated string; rejoined.
    objective: "Treat likely wake-word variants in greeting form as direct enough to acknowledge.",
    participantCount: 1,
    participantDisplayNames: ["alice"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-greeting-yo-clanka-multi",
    title: "Fresh Join Greeting Yo Clanka (Multi)",
    userText: "yo clanka",
    expectedAllow: true,
    objective: "In group calls, treat likely wake-word variant greetings as direct enough to acknowledge.",
    participantCount: 2,
    participantDisplayNames: ["alice", "bob"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-non-greeting-undirected",
    title: "Fresh Join Non-Greeting Undirected",
    userText: "the build passed on main",
    expectedAllow: true,
    objective: "Non-greeting, non-directed chatter is passed to the brain which may skip.",
    participantCount: 2,
    participantDisplayNames: ["alice", "bob"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-non-greeting-undirected-single",
    title: "Fresh Join Non-Greeting Undirected (Single)",
    userText: "the build passed on main",
    expectedAllow: true,
    objective: "Non-greeting, non-directed chatter is passed to the brain which may skip, even in 1:1.",
    participantCount: 1,
    participantDisplayNames: ["alice"],
    sessionAgeMs: 4_000
  },
  {
    id: "fresh-join-directed-to-other",
    title: "Fresh Join Directed To Other Human",
    userText: "bob can you share the link",
    expectedAllow: true,
    objective: "Passed to brain which detects this is addressed to another human and may skip.",
    participantCount: 2,
    participantDisplayNames: ["alice", "bob"],
    sessionAgeMs: 4_000
  },
  {
    id: "low-signal-lol",
    title: "Low Signal Fragment",
    userText: "lol",
    expectedAllow: true,
    objective: "Low-signal fragment passed to brain which decides whether to skip."
  },
  {
    id: "low-signal-comment",
    title: "Low Signal Comment",
    userText: "ha!",
    expectedAllow: true,
    objective: "Low-signal fragment passed to brain which decides whether to skip."
  },
  {
    id: "unaddressed-clear-question",
    title: "Unaddressed Clear Question",
    userText: "what's the fastest way to reduce build times in this project?",
    expectedAllow: true,
    objective: "Provide a concise practical suggestion even without explicit bot naming."
  },
  {
    id: "fresh-fact-check",
    title: "Fresh Fact Check",
    userText: "clanker what's the latest rust stable version right now?",
    expectedAllow: true,
    objective: "Use a web lookup if needed for freshness, then answer in one short line."
  }
];

/** Default number of cases to run: the entire golden set. */
const DEFAULT_MAX_CASES = VOICE_GOLDEN_CASES.length;

// NOTE(review): the consumers of the next two defaults are not visible in this part of
// the file; presumably they fill in case context that a case leaves unset. Confirm.
const DEFAULT_CASE_SESSION_AGE_MS = 40_000;
const DEFAULT_CASE_PARTICIPANTS = ["alice", "bob"];

/** Upper bound on the number of participant display names kept after normalization. */
const MAX_CASE_PARTICIPANTS = 10;

/**
 * Cleans a list of participant display names.
 *
 * Each entry is stringified, has whitespace runs collapsed to one space, is trimmed
 * and then cut to 80 characters. Blank entries and case-insensitive duplicates are
 * dropped, and collection stops once `MAX_CASE_PARTICIPANTS` names are kept.
 *
 * @param value Candidate list; anything that is not an array yields `[]`.
 * @returns Cleaned names in first-seen order.
 */
function normalizeParticipantDisplayNames(value: unknown) {
  const kept: string[] = [];
  if (!Array.isArray(value)) return kept;

  const seenKeys = new Set<string>();
  for (const entry of value) {
    const collapsed = String(entry || "").replace(/\s+/g, " ");
    const candidate = collapsed.trim().slice(0, 80);
    if (candidate === "") continue;

    const key = candidate.toLowerCase();
    if (seenKeys.has(key)) continue;

    seenKeys.add(key);
    kept.push(candidate);
    if (kept.length >= MAX_CASE_PARTICIPANTS) break;
  }
  return kept;
}

/**
 * Returns the response expectation for a case: its explicit `expectedResponse` when
 * set, otherwise "non_empty" for cases expected to be allowed and "empty" for the rest.
 */
function resolveExpectedResponse(caseRow: VoiceGoldenCase) {
  if (caseRow.expectedResponse) {
    return caseRow.expectedResponse;
  }
  return caseRow.expectedAllow ? "non_empty" : "empty";
}

/**
 * Coerces an arbitrary value to a run mode. Only the text "live" (after trimming,
 * case-insensitive) selects live mode; everything else means "simulated".
 */
function normalizeMode(value: unknown): VoiceGoldenRunMode {
  const requested = String(value || "simulated").trim().toLowerCase();
  if (requested === "live") {
    return "live";
  }
  return "simulated";
}

/** Parses a value as a voice runtime mode by delegating to `parseVoiceRuntimeMode`. */
function normalizeVoiceMode(value: unknown): VoiceGoldenMode | null {
  const parsedMode = parseVoiceRuntimeMode(value);
  return parsedMode;
}

/**
 * Fills in every harness option.
 *
 * Text options fall back to their default when missing or blank; `iterations` and
 * `maxCases` are floored and clamped to at least 1 (`maxCases` also to the size of
 * the golden set); an empty mode list expands to all golden modes.
 *
 * @param options Caller-supplied options.
 * @returns The fully-resolved options.
 */
function resolveDefaults(options: VoiceGoldenHarnessOptions = {}): VoiceGoldenResolvedOptions {
  // Shared "trimmed text or fallback" rule used by every provider/model option.
  const textOr = (raw: unknown, fallback: string): string => String(raw || fallback).trim() || fallback;

  const requestedModes = normalizeVoiceModeList(options.modes);
  const modes = requestedModes.length ? requestedModes : [...VOICE_GOLDEN_MODES];

  const iterations = Math.max(1, Math.floor(Number(options.iterations) || 1));

  const judgeOverride = options.judge;
  const judgeEnabled = judgeOverride?.enabled !== undefined ? Boolean(judgeOverride.enabled) : true;

  const requestedMaxCases = Math.floor(Number(options.maxCases) || DEFAULT_MAX_CASES);
  const maxCases = Math.max(1, Math.min(VOICE_GOLDEN_CASES.length, requestedMaxCases));

  return {
    mode: normalizeMode(options.mode),
    modes,
    iterations,
    actorProvider: textOr(options.actorProvider, "claude-oauth"),
    actorModel: textOr(options.actorModel, "claude-sonnet-4-5"),
    deciderProvider: textOr(options.deciderProvider, "claude-oauth"),
    deciderModel: textOr(options.deciderModel, "claude-sonnet-4-6"),
    judge: {
      enabled: judgeEnabled,
      provider: textOr(judgeOverride?.provider, "claude-oauth"),
      model: textOr(judgeOverride?.model, "claude-sonnet-4-6")
    },
    allowMissingCredentials: parseBooleanFlag(options.allowMissingCredentials, false),
    maxCases
  };
}

/**
 * Builds the normalized bot settings for one voice mode: memory off, web search on,
 * the actor model for generation and the decider model for reply decisions.
 */
function buildHarnessSettings({
  voiceMode,
  actorProvider,
  actorModel,
  deciderProvider,
  deciderModel
}: {
  voiceMode: VoiceGoldenMode;
  actorProvider: string;
  actorModel: string;
  deciderProvider: string;
  deciderModel: string;
}) {
  const llm = {
    provider: actorProvider,
    model: actorModel,
    temperature: 0.25,
    maxOutputTokens: 160
  };

  const voice = {
    enabled: true,
    mode: voiceMode,
    replyPath: "brain",
    ambientReplyEagerness: 65,
    generationLlm: { provider: actorProvider, model: actorModel },
    replyDecisionLlm: { provider: deciderProvider, model: deciderModel },
    xai: {
      voice: "Rex",
      audioFormat: "audio/pcm",
      sampleRateHz: 24000,
      region: "us-east-1"
    },
    openaiRealtime: {
      model: "gpt-realtime",
      voice: "alloy",
      inputAudioFormat: "pcm16",
      outputAudioFormat: "pcm16",
      inputTranscriptionModel: "gpt-4o-mini-transcribe"
    },
    geminiRealtime: {
      model: "gemini-2.5-flash-native-audio-preview-12-2025",
      voice: "Aoede",
      apiBaseUrl: "https://generativelanguage.googleapis.com",
      inputSampleRateHz: 24000,
      outputSampleRateHz: 24000
    },
    openaiAudioApi: {
      ttsModel: "gpt-4o-mini-tts",
      ttsVoice: "alloy",
      ttsSpeed: 1
    }
  };

  return normalizeSettings({
    ...DEFAULT_SETTINGS,
    botName: "clanky",
    memory: { enabled: false },
    webSearch: { enabled: true, maxSearchesPerHour: 12 },
    llm,
    voice
  });
}

/** Builds normalized settings for the judge LLM: memory off, temperature 0, 260 output tokens. */
function buildJudgeSettings(judge: VoiceGoldenJudgeConfig) {
  const judgeLlm = {
    provider: judge.provider,
    model: judge.model,
    temperature: 0,
    maxOutputTokens: 260
  };

  return normalizeSettings({
    ...DEFAULT_SETTINGS,
    memory: { enabled: false },
    llm: judgeLlm
  });
}

/** Summarizes the timing fields of the given results into per-stage statistics. */
function buildStageStats(rows: VoiceGoldenCaseResult[]): Record<string, StageStat> {
  // Copy each timings object so the summarizer never sees the result's own instance.
  const timingRows = rows.map(({ timings }) => ({ ...timings }));
  return summarizeNamedMetricRows(timingRows);
}

/** Computes the pass rate for `passed` out of `executed` by delegating to `formatPct`. */
function stablePassRate(passed: number, executed: number) {
  const rate = formatPct(passed, executed);
  return rate;
}

/**
 * Deterministic 32-bit polynomial hash (multiplier 31) over the string's UTF-16 code units.
 *
 * @param value Text to hash; falsy input hashes as the empty string.
 * @returns An unsigned 32-bit integer.
 */
function hashString(value: string) {
  const text = String(value || "");
  // split("") yields UTF-16 code units, which matches charCodeAt-by-index iteration.
  return text.split("").reduce((acc, unit) => (acc * 31 + unit.charCodeAt(0)) >>> 0, 0);
}

/**
 * Derives a repeatable delay from `key`: `baseMs` plus the key's hash modulo the
 * spread, where the spread is treated as at least 1.
 */
function simulatedDelayMs(key: string, baseMs: number, spreadMs: number) {
  const spread = Math.max(1, spreadMs);
  const jitter = hashString(key) % spread;
  return baseMs + jitter;
}

/**
 * Reports whether an utterance opens like a greeting or check-in.
 *
 * The text is lower-cased, characters other than a-z, 0-9, apostrophes and whitespace
 * become spaces, and whitespace is collapsed. It is a greeting when the first word is a
 * short greeting, the first two words are a "what's up" variant, or the whole text is
 * "whatsup" / "wassup".
 */
function isGreetingCheckIn(text: string) {
  const cleaned = String(text || "")
    .toLowerCase()
    .replace(/[^a-z0-9'\s]/g, " ")
    .replace(/\s+/g, " ")
    .trim();

  const words = cleaned.split(" ").filter((word) => word.length > 0);
  if (words.length === 0) return false;

  const leadWord = words[0];
  const leadPair = words.slice(0, 2).join(" ");

  return (
    ["yo", "hi", "hey", "hello", "sup", "hola"].includes(leadWord as string) ||
    ["what up", "whats up", "what's up"].includes(leadPair) ||
    cleaned === "whatsup" ||
    cleaned === "wassup"
  );
}

/**
 * Extracts the first double-quoted value following `transcript:` (or `latest transcript:`)
 * from a prompt, trimmed and lower-cased. Returns "" when no such value is present.
 */
function extractPromptTranscript(prompt: string) {
  const quotedTranscript = /(?:latest\s+transcript|transcript):\s*"([^"]+)"/u;
  const found = prompt.match(quotedTranscript);
  if (!found) return "";
  return String(found[1] || "").trim().toLowerCase();
}

bot.client.user = { id: "bot-user", username: "clanky", tag: "clanky#0001" };

const manager = bot.voiceSessionManager; applyCaseContextToManager({ manager, caseRow: null });

return { bot, manager }; }

/**
 * Runs the manager's reply decision for one case and measures how long it takes.
 *
 * @returns The elapsed decision time in milliseconds and the decision coerced to
 *          plain booleans and strings.
 */
async function evaluateDecision({
  manager,
  settings,
  mode,
  caseRow
}: {
  manager: VoiceSessionManager;
  settings: Record<string, unknown>;
  mode: VoiceGoldenMode;
  caseRow: VoiceGoldenCase;
}) {
  // The clock starts before the session is built, so session setup is part of decisionMs.
  const clockStart = performance.now();
  const raw = await manager.evaluateVoiceReplyDecision({
    session: createDecisionSession({ mode, caseRow }),
    userId: "speaker-1",
    settings,
    transcript: caseRow.userText,
    source: "realtime"
  });
  const decisionMs = performance.now() - clockStart;

  const decision = {
    allow: Boolean(raw.allow),
    reason: String(raw.reason || ""),
    directAddressed: Boolean(raw.directAddressed),
    transcript: String(raw.transcript || caseRow.userText || "").trim(),
    error: String(raw.error || "")
  };

  return { decisionMs, decision };
}

if (mode === "openai_realtime") { session.realtimeClient = { updateInstructions() { return undefined; } }; }

return session; }

/**
 * Executes one case through the manager's realtime brain reply path.
 *
 * While the reply runs, `manager.speakVoiceLineWithTts` is replaced by a stub that
 * stamps `lastAudioDeltaAt` on the session and reports success; the previous
 * implementation is restored afterwards, even on failure. The response text is taken
 * from the actions logged to `store` during the call, and `audioBytes` is a synthetic
 * figure (UTF-8 byte length of the response × 24).
 *
 * @returns Transcript, response text, synthetic audio size and stage timings; only
 *          `actorMs` and `responseMs` are measured, both as the elapsed reply time.
 */
async function runLiveProductionCase({
  manager,
  store,
  settings,
  mode,
  caseRow,
  directAddressed
}: {
  manager: VoiceSessionManager;
  store: HarnessStore;
  settings: Record<string, unknown>;
  mode: VoiceGoldenMode;
  caseRow: VoiceGoldenCase;
  directAddressed: boolean;
}): Promise<ModeExecutionResult> {
  const stage = {
    connectMs: 0,
    inputPrepMs: 0,
    inputSendMs: 0,
    actorMs: 0,
    asrMs: 0,
    ttsMs: 0,
    outputAsrMs: 0,
    responseMs: 0
  };
  const session = buildExecutionSession({ mode, caseRow });
  // Remember where this case's actions begin so only new ones are inspected.
  const actionStart = store.actions.length;
  const responseStartedAt = performance.now();
  const originalSpeakVoiceLineWithTts = manager.speakVoiceLineWithTts.bind(manager);

  manager.speakVoiceLineWithTts = async ({ session: activeSession }) => {
    if (activeSession && typeof activeSession === "object") {
      (activeSession as { lastAudioDeltaAt?: number }).lastAudioDeltaAt = Date.now();
    }
    return true;
  };

  try {
    applyCaseContextToManager({ manager, caseRow });
    await manager.runRealtimeBrainReply({
      session,
      settings,
      userId: "speaker-1",
      transcript: caseRow.userText,
      directAddressed,
      source: "voice_golden_production"
    });
  } finally {
    manager.speakVoiceLineWithTts = originalSpeakVoiceLineWithTts;
  }

  stage.responseMs = performance.now() - responseStartedAt;
  stage.actorMs = stage.responseMs;
  const actionDelta = store.actions.slice(actionStart);
  const responseText = latestVoiceReplyFromActions({ actions: actionDelta });

  return {
    transcript: caseRow.userText,
    responseText,
    audioBytes: responseText ? Buffer.byteLength(responseText, "utf8") * 24 : 0,
    stage
  };
}

/**
 * Simulates executing one case: sleeps for the sum of the simulated stage delays and
 * fabricates a response.
 *
 * The response is non-empty only when the decision allowed a reply and the case
 * expects a non-empty response; `audioBytes` is the UTF-8 byte length of that
 * response × 24.
 */
async function runSimulatedCase({
  mode,
  caseRow,
  decisionAllow,
  iteration
}: {
  mode: VoiceGoldenMode;
  caseRow: VoiceGoldenCase;
  decisionAllow: boolean;
  iteration: number;
}): Promise<ModeExecutionResult> {
  const idSeed = `${mode}:${caseRow.id}:${iteration}`;

  // NOTE(review): the statements declaring connectMs, inputPrepMs, inputSendMs, asrMs,
  // actorMs, ttsMs and responseMs are missing from the reviewed text and belong here.
  // They are presumably derived from `idSeed` (e.g. via simulatedDelayMs), but the
  // base/spread values are unknown, so they are not invented; restore them from the
  // original file.

  await sleep(connectMs + inputPrepMs + inputSendMs + asrMs + actorMs + ttsMs + responseMs);

  const transcript = caseRow.userText;
  const responseText =
    decisionAllow && resolveExpectedResponse(caseRow) === "non_empty"
      ? `simulated reply (${mode}): ${caseRow.objective.slice(0, 90)}`
      : "";

  return {
    transcript,
    responseText,
    audioBytes: responseText ? Buffer.byteLength(responseText, "utf8") * 24 : 0,
    stage: {
      connectMs,
      inputPrepMs,
      inputSendMs,
      actorMs,
      asrMs,
      ttsMs,
      outputAsrMs: 0,
      responseMs
    }
  };
}

/**
 * Reports whether credentials exist for an LLM provider (trimmed, case-insensitive).
 * "claude-oauth" always counts as available; unknown providers never do.
 */
function hasProviderCredentials(provider: string) {
  const providerKey = String(provider || "").trim().toLowerCase();
  switch (providerKey) {
    case "openai":
      return Boolean(appConfig.openaiApiKey);
    case "anthropic":
      return Boolean(appConfig.anthropicApiKey);
    case "xai":
      return Boolean(appConfig.xaiApiKey);
    case "claude-oauth":
      return true;
    default:
      return false;
  }
}

/**
 * Lists the providers required by the resolved options that have no credentials.
 *
 * The actor and decider providers are required only in live mode; the judge provider
 * only when the judge is enabled.
 *
 * @returns De-duplicated `role:provider` entries, empty when nothing is missing.
 */
function validateHarnessCredentials(options: VoiceGoldenResolvedOptions): string[] {
  const required = [
    ...(options.mode === "live"
      ? [
          { role: "actor", provider: options.actorProvider },
          { role: "decider", provider: options.deciderProvider }
        ]
      : []),
    ...(options.judge.enabled ? [{ role: "judge", provider: options.judge.provider }] : [])
  ];
  const missing = new Set<string>();

  for (const item of required) {
    const provider = String(item.provider || "").trim().toLowerCase();
    if (!provider) continue;
    if (hasProviderCredentials(provider)) continue;
    // The entry is a template literal; without backticks this statement does not parse.
    missing.add(`${item.role}:${provider}`);
  }

  return [...missing];
}

/**
 * Asks the judge LLM to grade one case and normalizes its JSON verdict.
 *
 * The parsed score is floored and clamped to 0..100, confidence is clamped to 0..1
 * and at most 8 non-blank issues are kept. When the judge output cannot be parsed,
 * a low-confidence verdict is derived from the admission and response expectations.
 */
async function runJudge({
  llm,
  judgeSettings,
  mode,
  runMode,
  caseRow,
  decision,
  responseText,
  timings,
  error
}: {
  llm: LLMService;
  judgeSettings: Record<string, unknown>;
  mode: VoiceGoldenMode;
  runMode: VoiceGoldenRunMode;
  caseRow: VoiceGoldenCase;
  decision: DecisionResult;
  responseText: string;
  timings: StageTimings;
  error: string | null;
}): Promise<JudgeResult> {
  const systemPrompt = [
    "You are a strict evaluator for voice chat validation tests.",
    "Return strict JSON only.",
    "Score whether the observed behavior matches expected admission and response quality."
  ].join(" ");

  // Each interpolated segment is a template literal; the backticks were missing.
  const userPrompt = [
    `Run mode: ${runMode}`,
    `Voice mode: ${mode}`,
    `Case: ${caseRow.id} (${caseRow.title})`,
    `User utterance: ${caseRow.userText}`,
    `Expectation shouldAllow: ${caseRow.expectedAllow ? "true" : "false"}`,
    `Expectation response: ${resolveExpectedResponse(caseRow)}`,
    `Case objective: ${caseRow.objective}`,
    `Observed decision.allow: ${decision.allow ? "true" : "false"}`,
    `Observed decision.reason: ${decision.reason}`,
    `Observed response text: ${responseText || "(empty)"}`,
    `Observed error: ${error || "(none)"}`,
    `Timings totalMs=${timings.totalMs.toFixed(1)} decisionMs=${timings.decisionMs.toFixed(1)} responseMs=${timings.responseMs.toFixed(1)}`,
    "Scoring rules:",
    "1) A failing admission expectation is a hard fail.",
    "2) If expected response is non_empty, response should be non-empty and reasonably aligned with objective.",
    "3) If expected response is empty, the correct behavior is an empty response or [SKIP]-equivalent silence.",
    'Output schema: {"pass":true|false,"score":0..100,"confidence":0..1,"summary":"...","issues":["..."]}'
  ].join(" ");

  return await runJsonJudge({
    llm,
    settings: judgeSettings,
    systemPrompt,
    userPrompt,
    trace: {
      guildId: "voice-golden-guild",
      channelId: "voice-golden-text",
      userId: "judge",
      source: "voice_golden_judge",
      event: "judge_case",
      reason: null,
      messageId: null
    },
    onParsed: (parsed, rawText) => {
      const issues = Array.isArray(parsed.issues)
        ? parsed.issues.map((item) => String(item || "").trim()).filter(Boolean).slice(0, 8)
        : [];

      return {
        pass: Boolean(parsed.pass),
        score: Math.max(0, Math.min(100, Math.floor(stableNumber(parsed.score, 0)))),
        confidence: Math.max(0, Math.min(1, stableNumber(parsed.confidence, 0))),
        summary: String(parsed.summary || "").trim(),
        issues,
        rawText
      };
    },
    onParseError: (rawText) => {
      // Fallback verdict: admission must match and the response must have the expected shape.
      const deterministicPass =
        decision.allow === caseRow.expectedAllow &&
        (resolveExpectedResponse(caseRow) === "non_empty" ? Boolean(responseText.trim()) : !responseText.trim());
      return {
        pass: deterministicPass,
        score: deterministicPass ? 75 : 25,
        confidence: 0.2,
        summary: "judge_output_parse_failed",
        issues: ["judge returned non-JSON output"],
        rawText
      };
    }
  });
}

/**
 * Rule-based verdict used when no LLM judge runs.
 *
 * A case passes when the admission decision matches the expectation, the response is
 * empty or non-empty as expected, and no runtime error occurred. Each violated rule
 * adds one issue tag.
 */
function buildDeterministicJudge({
  caseRow,
  decision,
  responseText,
  error
}: {
  caseRow: VoiceGoldenCase;
  decision: DecisionResult;
  responseText: string;
  error: string | null;
}): JudgeResult {
  const hasResponse = Boolean(responseText.trim());
  const admissionOk = decision.allow === caseRow.expectedAllow;
  const responseOk = resolveExpectedResponse(caseRow) === "non_empty" ? hasResponse : !hasResponse;

  const issues: string[] = [];
  if (!admissionOk) issues.push("admission_mismatch");
  if (!responseOk) issues.push("response_mismatch");
  if (error) issues.push("runtime_error");

  const pass = admissionOk && responseOk && !error;
  if (pass) {
    return { pass, score: 100, confidence: 1, summary: "deterministic_pass", issues, rawText: "" };
  }
  return { pass, score: 20, confidence: 1, summary: "deterministic_fail", issues, rawText: "" };
}

/**
 * Creates a timings record in which only the decision has been measured: `totalMs`
 * and `decisionMs` both equal `decisionMs`, and every other stage is 0.
 */
function buildEmptyTimings(decisionMs = 0): StageTimings {
  const unmeasuredStages = {
    connectMs: 0,
    inputPrepMs: 0,
    inputSendMs: 0,
    actorMs: 0,
    asrMs: 0,
    ttsMs: 0,
    outputAsrMs: 0,
    responseMs: 0
  };
  return { totalMs: decisionMs, decisionMs, ...unmeasuredStages };
}

/**
 * Runs one (mode, case, iteration) through decision, execution and judging.
 *
 * Flow: apply the case context, evaluate the reply decision, then (only if allowed)
 * execute the case in simulated or live fashion. Any error in those steps is captured
 * as text, not thrown. The verdict comes from the LLM judge when it is enabled and
 * available, otherwise from the deterministic rules. A case passes only if the judge
 * passes it and no error was captured.
 */
async function runSingleCase({
  options,
  llm,
  judgeSettings,
  mode,
  settings,
  manager,
  executionStore,
  caseRow,
  iteration
}: {
  options: VoiceGoldenResolvedOptions;
  llm: LLMService | null;
  judgeSettings: Record<string, unknown> | null;
  mode: VoiceGoldenMode;
  settings: Record<string, unknown>;
  manager: VoiceSessionManager;
  executionStore: HarnessStore;
  caseRow: VoiceGoldenCase;
  iteration: number;
}): Promise<VoiceGoldenCaseResult> {
  const startedAt = performance.now();

  let errorText: string | null = null;
  let transcript = "";
  let responseText = "";
  let audioBytes = 0;
  let decisionData: DecisionResult = {
    allow: false,
    reason: "",
    directAddressed: false,
    transcript: "",
    error: ""
  };
  let timings = buildEmptyTimings(0);

  try {
    applyCaseContextToManager({ manager, caseRow });
    const decisionResult = await evaluateDecision({ manager, settings, mode, caseRow });
    decisionData = decisionResult.decision;
    timings = buildEmptyTimings(decisionResult.decisionMs);

    if (!decisionData.allow) {
      // Denied: nothing is executed, so only the transcript is recorded.
      transcript = decisionData.transcript || caseRow.userText;
    } else if (options.mode === "simulated") {
      const simulated = await runSimulatedCase({
        mode,
        caseRow,
        decisionAllow: true,
        iteration
      });
      transcript = simulated.transcript;
      responseText = simulated.responseText;
      audioBytes = simulated.audioBytes;
      timings = {
        totalMs: 0,
        decisionMs: decisionResult.decisionMs,
        ...simulated.stage
      };
    } else {
      if (!llm) {
        throw new Error("Live mode requires an initialized LLM service.");
      }
      const liveResult = await runLiveProductionCase({
        manager,
        store: executionStore,
        settings,
        mode,
        caseRow,
        directAddressed: Boolean(decisionData.directAddressed)
      });

      transcript = liveResult.transcript || decisionData.transcript || caseRow.userText;
      responseText = liveResult.responseText;
      audioBytes = liveResult.audioBytes;
      timings = {
        totalMs: 0,
        decisionMs: decisionResult.decisionMs,
        ...liveResult.stage
      };
    }
  } catch (error) {
    errorText = String((error as Error)?.message || error || "unknown_error");
  }

  // totalMs is wall-clock time for decision + execution and excludes judging.
  timings.totalMs = Math.max(0, performance.now() - startedAt);

  let judge: JudgeResult;
  if (options.judge.enabled && llm && judgeSettings) {
    try {
      judge = await runJudge({
        llm,
        judgeSettings,
        mode,
        runMode: options.mode,
        caseRow,
        decision: decisionData,
        responseText,
        timings,
        error: errorText
      });
    } catch (error) {
      judge = {
        pass: false,
        score: 0,
        confidence: 0,
        summary: "judge_error",
        issues: [String((error as Error)?.message || error || "unknown judge error")],
        rawText: ""
      };
    }
  } else {
    judge = buildDeterministicJudge({
      caseRow,
      decision: decisionData,
      responseText,
      error: errorText
    });
  }

  return {
    mode,
    caseId: caseRow.id,
    caseTitle: caseRow.title,
    iteration,
    expectedAllow: caseRow.expectedAllow,
    decision: decisionData,
    transcript,
    responseText,
    audioBytes,
    timings,
    pass: Boolean(judge.pass) && !errorText,
    judge,
    error: errorText
  };
}

/** Wraps one mode's results in a report with executed/passed/failed counts, pass rate and stage statistics. */
function aggregateModeReport(
  mode: VoiceGoldenMode,
  skippedReason: string | null,
  results: VoiceGoldenCaseResult[]
): VoiceGoldenModeReport {
  const passed = results.reduce((count, row) => (row.pass ? count + 1 : count), 0);
  const executed = results.length;

  const aggregates = {
    executed,
    passed,
    failed: executed - passed,
    passRate: stablePassRate(passed, executed),
    stageStats: buildStageStats(results)
  };

  return { mode, skippedReason, results, aggregates };
}

/**
 * Runs the golden cases for every requested voice mode and returns the full report.
 *
 * For each mode a fresh runtime is created: live mode wires a real LLM service and
 * web search onto a harness store, while simulated mode uses the simulated decision
 * runtime and only creates an LLM service when the judge is enabled. Cases run
 * sequentially, `iterations` times each, with progress events emitted before and
 * after every case. The mode's manager is always disposed, even if a case or a
 * progress callback throws.
 *
 * @param inputOptions Harness options; missing values are filled by `resolveDefaults`.
 * @returns Per-mode reports plus a cross-mode summary.
 */
export async function runVoiceGoldenHarness(
  inputOptions: VoiceGoldenHarnessOptions = {}
): Promise<VoiceGoldenHarnessReport> {
  const options = resolveDefaults(inputOptions);
  const startedAtIso = new Date().toISOString();
  const onCaseProgress = typeof inputOptions.onCaseProgress === "function" ? inputOptions.onCaseProgress : null;

  // NOTE(review): `cases`, `totalCaseCount`, `globalCaseIndex`, `modeReports` and
  // `judgeSettings` are used below, but their declarations are missing from the
  // reviewed text and belong here, along with any other setup that sat beside them.
  // Restore them from the original file; they are deliberately not reconstructed.

  for (const mode of options.modes) {
    const settings = buildHarnessSettings({
      voiceMode: mode,
      actorProvider: options.actorProvider,
      actorModel: options.actorModel,
      deciderProvider: options.deciderProvider,
      deciderModel: options.deciderModel
    });

    let manager: VoiceSessionManager;
    let executionStore: HarnessStore;
    let llm: LLMService | null = null;

    if (options.mode === "live") {
      executionStore = new HarnessStore();
      llm = new LLMService({
        appConfig,
        store: executionStore
      });
      const search = new WebSearchService({
        appConfig,
        store: executionStore
      });
      const runtime = createLiveExecutionRuntime({
        llm,
        search,
        store: executionStore
      });
      manager = runtime.manager;
    } else {
      const runtime = createDecisionRuntime(buildSimulatedDecisionLlm());
      manager = runtime.manager;
      executionStore = runtime.store;
      if (options.judge.enabled) {
        // Simulated execution needs no LLM, but the judge does.
        llm = new LLMService({
          appConfig,
          store: new HarnessStore()
        });
      }
    }

    try {
      const results: VoiceGoldenCaseResult[] = [];
      const modeCaseCount = Math.max(0, options.iterations * cases.length);
      let modeCaseIndex = 0;
      for (let iteration = 1; iteration <= options.iterations; iteration += 1) {
        for (const caseRow of cases) {
          modeCaseIndex += 1;
          globalCaseIndex += 1;
          onCaseProgress?.({
            phase: "start",
            mode,
            iteration,
            modeCaseIndex,
            modeCaseCount,
            globalCaseIndex,
            globalCaseCount: totalCaseCount,
            caseId: caseRow.id,
            caseTitle: caseRow.title,
            expectedAllow: Boolean(caseRow.expectedAllow)
          });
          const row = await runSingleCase({
            options,
            llm,
            judgeSettings,
            mode,
            settings,
            manager,
            executionStore,
            caseRow,
            iteration
          });
          results.push(row);
          onCaseProgress?.({
            phase: "done",
            mode,
            iteration,
            modeCaseIndex,
            modeCaseCount,
            globalCaseIndex,
            globalCaseCount: totalCaseCount,
            caseId: caseRow.id,
            caseTitle: caseRow.title,
            expectedAllow: Boolean(caseRow.expectedAllow),
            pass: Boolean(row.pass),
            decisionAllow: Boolean(row.decision.allow),
            decisionReason: String(row.decision.reason || ""),
            error: row.error,
            durationMs: Number(row.timings.totalMs || 0)
          });
        }
      }

      modeReports.push(aggregateModeReport(mode, null, results));
    } finally {
      // Dispose in `finally` so a throwing progress callback cannot leak the manager.
      // Disposal stays best-effort: a failure is logged and never masks the run.
      try {
        await manager.dispose("voice_golden_harness_done");
      } catch (error) {
        console.warn(
          `[voice-golden-harness] manager.dispose failed for mode=${mode}: ${String((error as Error)?.message || error)}`
        );
      }
    }
  }

  const allResults = modeReports.flatMap((report) => report.results);
  const passed = allResults.filter((row) => row.pass).length;
  const executed = allResults.length;
  const failed = executed - passed;

  return {
    startedAt: startedAtIso,
    finishedAt: new Date().toISOString(),
    options,
    modeReports,
    summary: {
      executed,
      passed,
      failed,
      passRate: stablePassRate(passed, executed),
      stageStats: buildStageStats(allResults)
    }
  };
}

/**
 * Prints a human-readable summary of a harness report to the console.
 *
 * Output: a header with the run options, then per mode either a "skipped" line or the
 * pass counts, p50/p95/avg for the total, decision and response stages (when present)
 * and up to six failing cases with their judge issues, and finally the overall summary.
 *
 * @param report Report returned by `runVoiceGoldenHarness`.
 */
export function printVoiceGoldenHarnessReport(report: VoiceGoldenHarnessReport) {
  console.log("Voice Golden Validation Harness");
  console.log(`startedAt=${report.startedAt}`);
  console.log(`finishedAt=${report.finishedAt}`);
  console.log(`mode=${report.options.mode}`);
  console.log(`modes=[${report.options.modes.join(", ")}]`);
  console.log(`iterations=${report.options.iterations}`);
  console.log(`judge=${report.options.judge.enabled ? "on" : "off"}`);
  console.log("");

  // Stages shown per mode, in print order.
  const printedStages = ["totalMs", "decisionMs", "responseMs"];

  for (const modeReport of report.modeReports) {
    if (modeReport.skippedReason) {
      console.log(`mode=${modeReport.mode} skipped (${modeReport.skippedReason})`);
      continue;
    }

    const { aggregates } = modeReport;
    console.log(
      `mode=${modeReport.mode} executed=${aggregates.executed} pass=${aggregates.passed} fail=${aggregates.failed} passRate=${aggregates.passRate.toFixed(1)}%`
    );

    for (const stageName of printedStages) {
      const stat = aggregates.stageStats[stageName];
      if (!stat) continue; // stage absent from the stats: nothing to print
      console.log(
        `  ${stageName} p50=${stat.p50Ms.toFixed(1)} p95=${stat.p95Ms.toFixed(1)} avg=${stat.avgMs.toFixed(1)}`
      );
    }

    // Cap the failure listing so a badly failing mode stays readable.
    const failedRows = modeReport.results.filter((row) => !row.pass).slice(0, 6);
    for (const row of failedRows) {
      console.log(
        `  fail case=${row.caseId} iter=${row.iteration} reason=${row.decision.reason || row.error || row.judge.summary}`
      );
      if (row.judge.issues.length) {
        console.log(`    issues=${row.judge.issues.join(" | ")}`);
      }
    }
  }

  console.log("");
  console.log(
    `summary executed=${report.summary.executed} pass=${report.summary.passed} fail=${report.summary.failed} passRate=${report.summary.passRate.toFixed(1)}%`
  );
}