// src/voice/voiceInterruptClassifier.ts

import { buildSingleTurnPromptLog } from "../promptLogging.ts"; import { getPromptBotName } from "../prompts/promptCore.ts"; import { applyOrchestratorOverrideSettings, getResolvedVoiceInterruptClassifierBinding } from "../settings/agentStack.ts"; import { isCancelIntent } from "../tools/cancelDetection.ts"; import { defaultVoiceReplyDecisionModel, normalizeVoiceReplyDecisionProvider, resolveVoiceReplyDecisionMaxOutputTokens } from "./voiceDecisionRuntime.ts"; import { VOICE_INTERRUPT_CLASSIFIER_TIMEOUT_MS, STT_REPLY_MAX_CHARS, STT_TRANSCRIPT_MAX_CHARS } from "./voiceSessionManager.constants.ts"; import { normalizeInlineText, normalizeVoiceText } from "./voiceSessionHelpers.ts"; import type { LoggedVoicePromptBundle, VoiceInterruptOverlapBurstEntry, VoiceSession } from "./voiceSessionTypes.ts";

/**
 * Settings bag accepted by the classifier. Its shape is opaque to this module:
 * the value is only forwarded to settings helpers and to the LLM host.
 * `null` is accepted.
 */
type InterruptClassifierSettings = null | Record<string, unknown>;

/**
 * Minimal store surface used by this module: an optional `logAction` sink.
 * Only `content` is required on a logged entry; every other field is optional.
 */
type InterruptClassifierStoreLike = {
  logAction?: (entry: {
    kind?: string;
    guildId?: string | null;
    channelId?: string | null;
    userId?: string | null;
    content: string;
    metadata?: Record<string, unknown>;
  }) => void;
};

/** Part of an LLM response this module reads: the generated text, if any. */
type InterruptClassifierGenerateResult = {
  text?: string | null;
};

/**
 * Host dependencies injected into the classifier.
 *
 * - `store`: optional logging sink (see `InterruptClassifierStoreLike`).
 * - `llm.generate`: optional text-generation entry point. It resolves to an
 *   object whose `text` field carries the raw model output. The promise is
 *   parameterised explicitly because a bare `Promise` is not a valid type.
 */
type InterruptClassifierHost = {
  store?: InterruptClassifierStoreLike | null;
  llm?: {
    generate?: (args: {
      settings: InterruptClassifierSettings;
      systemPrompt: string;
      userPrompt: string;
      contextMessages: unknown[];
      trace?: {
        guildId?: string | null;
        channelId?: string | null;
        userId?: string | null;
        source?: string | null;
      };
      signal?: AbortSignal;
    }) => Promise<InterruptClassifierGenerateResult>;
  } | null;
};

/** Outcome of one interrupt-classification pass. */
type VoiceInterruptClassifierResult = {
  /** Final verdict for the overlapping burst. */
  decision: "interrupt" | "ignore";
  /** Label of the heuristic, model path, or fallback that produced the verdict. */
  source: string;
  /** Elapsed milliseconds; the heuristic-only paths in this file report 0. */
  latencyMs: number;
  /** Prompt bundle for logging, or `null` when no prompt was built. */
  promptLog: LoggedVoicePromptBundle | null;
  /** Raw model text, when a model call completed. */
  rawOutput?: string | null;
  /** Error description attached by the fallback paths. */
  error?: string | null;
};

/**
 * Normalizes an ASR transcript fragment by delegating to `normalizeVoiceText`.
 *
 * @param text - Raw transcript text.
 * @param maxChars - Cap forwarded to the normalizer; defaults to
 *   `STT_TRANSCRIPT_MAX_CHARS`.
 * @returns Whatever `normalizeVoiceText` returns for that text and cap.
 */
function normalizeBurstText(text: string, maxChars = STT_TRANSCRIPT_MAX_CHARS) {
  const cleaned = normalizeVoiceText(text, maxChars);
  return cleaned;
}

/**
 * Counts whitespace-separated tokens in a transcript after normalizing it
 * with `normalizeBurstText` (capped at `STT_REPLY_MAX_CHARS`).
 *
 * @param text - Raw transcript text.
 * @returns Number of non-empty tokens; 0 when the normalized text is empty.
 */
function countTokens(text: string) {
  const cleaned = normalizeBurstText(text, STT_REPLY_MAX_CHARS);
  let total = 0;
  if (cleaned) {
    for (const piece of cleaned.split(/\s+/u)) {
      if (piece) total += 1;
    }
  }
  return total;
}

/**
 * Reports whether a transcript consists only of laughter tokens.
 *
 * The text is normalized with `normalizeInlineText` (capped at 120 chars),
 * lower-cased, and stripped of punctuation. Text that is empty after
 * stripping counts as laughter. Otherwise all whitespace is removed and the
 * remainder must be a pure repetition of known laughter fragments.
 *
 * @param text - Raw transcript text.
 * @returns `true` for empty or laughter-only text, `false` otherwise.
 */
function isLikelyLaughterToken(text: string) {
  const normalized = normalizeInlineText(text, 120)
    .toLowerCase()
    // `]`, `/` and `-` are escaped so the class stays one well-formed set
    // under the `u` flag; unescaped, the `]` closes the class early and the
    // `/` ends the regex literal.
    .replace(/[.!?,'"`~^*_()[\]{}:;|\/\-]+/gu, " ")
    .replace(/\s+/gu, " ")
    .trim();
  if (!normalized) return true;

  // Doubled forms ("haha", "哈哈", "하하", "ㅋㅋ", "ww") are already matched by
  // repeating the single fragment under `+`. Listing both gives the regex
  // engine overlapping alternatives and exponential backtracking on near-miss
  // input, so only the single fragments are kept. The accepted strings are
  // the same.
  const squashed = normalized.replace(/\s+/gu, "");
  return /^(?:ha|heh|hehe|lol|lmao|lmfao|rofl|哈|하|ㅋ|w)+$/u.test(squashed);
}

/**
 * Reports whether a transcript is low-signal overlap: nothing, laughter,
 * backchannel words, or a very short utterance.
 *
 * Checks run in this order on the normalized, lower-cased,
 * punctuation-stripped text (capped at 160 chars):
 * 1. empty text,
 * 2. laughter, via `isLikelyLaughterToken`,
 * 3. a sequence made only of backchannel/acknowledgement words,
 * 4. at most 2 tokens and at most 12 characters.
 *
 * @param text - Raw transcript text.
 * @returns `true` when any check above matches.
 */
function isLikelyLowSignalOverlapText(text: string) {
  const normalized = normalizeInlineText(text, 160)
    .toLowerCase()
    // `]`, `/` and `-` are escaped so the class is well-formed under the `u`
    // flag. `*` is included so this strips the same punctuation as
    // `isLikelyLaughterToken`.
    .replace(/[.!?,'"`~^*_()[\]{}:;|\/\-]+/gu, " ")
    .replace(/\s+/gu, " ")
    .trim();
  if (!normalized) return true;
  if (isLikelyLaughterToken(normalized)) return true;

  // One leading backchannel word followed by zero or more further ones. The
  // trailing `*` matters: without it only two-word inputs could match, so
  // "yeah yeah yeah yeah" would not count as backchannel.
  const backchannelOnly =
    /^(?:mm+|mhm+|mhmm+|uh+|uh huh|uhhuh|uh-huh|yeah+|yep+|ya+|ok(?:ay)?|right+|true+|damn+|bro+|woah+|wow+)(?:\s+(?:mm+|mhm+|yeah+|ok(?:ay)?|right+|true+|damn+|bro+|woah+|wow+))*$/u;
  if (backchannelOnly.test(normalized)) return true;

  // Anything this short is treated as too weak to matter.
  return countTokens(normalized) <= 2 && normalized.length <= 12;
}

/**
 * Reports whether a transcript is an unmistakable attempt to take the floor.
 *
 * The text is normalized with `normalizeInlineText` (capped at 200 chars),
 * lower-cased and whitespace-collapsed. It matches when `isCancelIntent`
 * accepts it or when it contains one of a small set of explicit takeover
 * phrases as whole words.
 *
 * @param text - Raw transcript text.
 * @returns `true` for cancel intents and explicit takeover phrases; `false`
 *   otherwise, including for empty text.
 */
function isObviousInterruptTakeoverText(text: string) {
  const normalized = normalizeInlineText(text, 200)
    .toLowerCase()
    .replace(/\s+/gu, " ")
    .trim();
  if (!normalized) return false;

  // Cancel intent covers "stop", "cancel", "nevermind", "abort", etc.
  if (isCancelIntent(normalized)) return true;

  // Only match phrases that are unambiguously about taking the conversational
  // floor. Bare words like "wait", "stop", "hold on" fire constantly in
  // gaming sessions (e.g. "oh wait, I don't have teleportation potions") and
  // cause false-positive interruptions.
  return /\b(?:let me talk|lemme talk|can i talk|can i say something|let me finish|shut up|be quiet|shush|hush)\b/u.test(
    normalized
  );
}

/**
 * Reports whether any entry in an overlap burst is an obvious takeover,
 * as judged by `isObviousInterruptTakeoverText` on the entry's transcript.
 *
 * @param entries - Burst entries; a non-array value is treated as empty.
 * @returns `true` when at least one entry's transcript is a takeover.
 */
export function hasObviousInterruptTakeoverBurst(entries: VoiceInterruptOverlapBurstEntry[]) {
  const burst = Array.isArray(entries) ? entries : [];
  const isTakeover = (candidate: VoiceInterruptOverlapBurstEntry) =>
    isObviousInterruptTakeoverText(candidate.transcript);
  return burst.some(isTakeover);
}

/**
 * Reports whether any burst entry carries clear semantic content. This is the
 * fallback signal used when no model verdict is available.
 *
 * An entry qualifies when its normalized transcript is non-empty and is an
 * obvious takeover, contains a question mark, has at least 6 tokens, or is at
 * least 28 characters long.
 *
 * @param entries - Burst entries to inspect.
 * @returns `true` when at least one entry qualifies.
 */
function hasClearlySemanticBurst(entries: VoiceInterruptOverlapBurstEntry[]) {
  const looksSemantic = (rawTranscript: string): boolean => {
    const cleaned = normalizeBurstText(rawTranscript, STT_REPLY_MAX_CHARS);
    if (!cleaned) return false;
    return (
      isObviousInterruptTakeoverText(cleaned) ||
      cleaned.includes("?") ||
      countTokens(cleaned) >= 6 ||
      cleaned.length >= 28
    );
  };
  return entries.some((entry) => looksSemantic(entry.transcript));
}

/**
 * Renders burst entries as prompt text, one entry per line, in the form
 * `speaker (final|partial): "transcript"`.
 *
 * The speaker name is normalized with `normalizeInlineText` (capped at 80
 * chars) and falls back to "someone" when empty. The transcript is normalized
 * with `normalizeBurstText`.
 *
 * @param entries - Burst entries to render.
 * @returns The rendered lines joined with newlines; an empty string for an
 *   empty list.
 */
function formatBurstEntries(entries: VoiceInterruptOverlapBurstEntry[]) {
  return entries
    .map((entry) => {
      const speaker = normalizeInlineText(entry.speakerName, 80) || "someone";
      const transcript = normalizeBurstText(entry.transcript, STT_REPLY_MAX_CHARS);
      const phase = entry.isFinal ? "final" : "partial";
      // Must be a template literal; without backticks this is not valid code.
      return `${speaker} (${phase}): "${transcript}"`;
    })
    .filter(Boolean)
    // One entry per line keeps speakers visually separate in the prompt.
    .join("\n");
}

/**
 * Builds the system and user prompts for the interrupt classifier, plus a
 * prompt-log bundle for them.
 *
 * @param settings - Settings forwarded to `getPromptBotName` to resolve the
 *   assistant's display name.
 * @param interruptedUtteranceText - What the assistant was saying; rendered
 *   as "[unknown]" when it normalizes to empty.
 * @param entries - Overlapping burst entries; rendered as "[none]" when
 *   `formatBurstEntries` returns an empty string.
 * @returns `{ systemPrompt, userPrompt, promptLog }`, where `promptLog` is
 *   the result of `buildSingleTurnPromptLog` for the two prompts.
 */
function buildInterruptClassifierPrompt({
  settings,
  interruptedUtteranceText,
  entries
}: {
  settings: InterruptClassifierSettings;
  interruptedUtteranceText: string;
  entries: VoiceInterruptOverlapBurstEntry[];
}) {
  const botName = getPromptBotName(settings);
  const assistantLine = normalizeBurstText(interruptedUtteranceText, STT_REPLY_MAX_CHARS) || "[unknown]";
  const burstLines = formatBurstEntries(entries);

  const systemPrompt = [
    `You are deciding whether ${botName} should stop speaking right now in a Discord voice chat.`,
    "Return exactly one token: INTERRUPT or IGNORE.",
    "Use INTERRUPT only when someone is clearly taking the floor, redirecting the conversation, asking a real question, giving a command, or meaningfully stopping the assistant.",
    "Use IGNORE for laughter, backchannel, short acknowledgements, filler, ambient reaction noise, or overlap that is too weak or ambiguous to justify stopping the assistant."
  ].join(" ");

  // The "" elements are blank-line separators between sections, so the parts
  // are joined with newlines.
  const userPrompt = [
    `Assistant speech in progress: "${assistantLine}"`,
    "",
    "Recent overlapping ASR burst:",
    burstLines || "[none]",
    "",
    "Should the assistant stop speaking right now?"
  ].join("\n");

  return {
    systemPrompt,
    userPrompt,
    promptLog: buildSingleTurnPromptLog({ systemPrompt, userPrompt })
  };
}

/**
 * Maps raw model output to a verdict.
 *
 * The text is normalized with `normalizeInlineText` (capped at 80 chars) and
 * upper-cased. "INTERRUPT" is looked for first, then "IGNORE", so output that
 * contains both resolves to "interrupt".
 *
 * @param text - Raw model output.
 * @returns "interrupt", "ignore", or `null` when neither token is present or
 *   the normalized text is empty.
 */
function parseInterruptDecision(text: string): "interrupt" | "ignore" | null {
  const upper = normalizeInlineText(text, 80).toUpperCase();
  if (!upper) return null;

  const verdicts = [
    ["INTERRUPT", "interrupt"],
    ["IGNORE", "ignore"]
  ] as const;
  for (const [token, verdict] of verdicts) {
    if (upper.includes(token)) return verdict;
  }
  return null;
}

/**
 * Decides whether a burst of overlapping speech should interrupt the
 * assistant's current utterance.
 *
 * Decision pipeline (first match wins):
 * 1. No non-empty transcripts            -> ignore (`empty_burst`).
 * 2. Any entry is an obvious takeover    -> interrupt (`takeover_heuristic`).
 * 3. Every entry is low-signal overlap   -> ignore (`low_signal_heuristic`).
 * 4. `skipLlm` is set                    -> ignore (`classifier_disabled`).
 * 5. No `host.llm.generate`              -> semantic fallback
 *    (`llm_unavailable_fallback`).
 * 6. Model call: a parseable answer gives `model_interrupt` / `model_ignore`;
 *    an unparseable answer, a timeout, or a thrown error falls back to the
 *    semantic heuristic (`unparseable_fallback`, `timeout_fallback`,
 *    `runtime_error_fallback`).
 *
 * The takeover check runs before the low-signal check on purpose. Phrases
 * such as "shut up", "hush" or "be quiet" are short enough to pass the
 * low-signal length test, so the opposite order would ignore them.
 *
 * Model-call failures are caught, reported through `host.store.logAction`
 * when available, and converted into a fallback result; the catch block does
 * not rethrow.
 *
 * @param host - Injected LLM and logging dependencies.
 * @param session - Session identifiers used for tracing and logging.
 * @param settings - Settings used to resolve the classifier binding and
 *   forwarded to the model call.
 * @param interruptedUtteranceText - What the assistant was saying.
 * @param entries - Overlapping burst entries; a non-array is treated as empty.
 * @param traceUserId - Optional user id attached to traces and logs.
 * @param skipLlm - When true, never calls the model.
 * @returns The verdict with its source label, latency, prompt log and, when
 *   applicable, the raw model output or error description.
 */
export async function classifyVoiceInterruptBurst(
  host: InterruptClassifierHost,
  {
    session,
    settings,
    interruptedUtteranceText,
    entries,
    traceUserId = null,
    skipLlm = false
  }: {
    session: Pick<VoiceSession, "id" | "guildId" | "textChannelId">;
    settings: InterruptClassifierSettings;
    interruptedUtteranceText: string;
    entries: VoiceInterruptOverlapBurstEntry[];
    traceUserId?: string | null;
    skipLlm?: boolean;
  }
): Promise<VoiceInterruptClassifierResult> {
  const normalizedEntries = (Array.isArray(entries) ? entries : [])
    .map((entry) => ({
      ...entry,
      transcript: normalizeBurstText(entry.transcript, STT_REPLY_MAX_CHARS)
    }))
    .filter((entry) => Boolean(entry.transcript));

  if (normalizedEntries.length === 0) {
    return { decision: "ignore", source: "empty_burst", latencyMs: 0, promptLog: null };
  }

  // Checked before the low-signal heuristic: short takeover phrases would
  // otherwise be ignored by its "<= 2 tokens and <= 12 chars" rule.
  if (normalizedEntries.some((entry) => isObviousInterruptTakeoverText(entry.transcript))) {
    return { decision: "interrupt", source: "takeover_heuristic", latencyMs: 0, promptLog: null };
  }

  if (normalizedEntries.every((entry) => isLikelyLowSignalOverlapText(entry.transcript))) {
    return { decision: "ignore", source: "low_signal_heuristic", latencyMs: 0, promptLog: null };
  }

  if (skipLlm) {
    return { decision: "ignore", source: "classifier_disabled", latencyMs: 0, promptLog: null };
  }

  const binding = getResolvedVoiceInterruptClassifierBinding(settings);
  const llmProvider = normalizeVoiceReplyDecisionProvider(binding?.provider || "openai");
  const llmModel =
    String(binding?.model || defaultVoiceReplyDecisionModel(llmProvider)).trim() ||
    defaultVoiceReplyDecisionModel(llmProvider);
  const maxOutputTokens = resolveVoiceReplyDecisionMaxOutputTokens(llmProvider, llmModel);
  const { systemPrompt, userPrompt, promptLog } = buildInterruptClassifierPrompt({
    settings,
    interruptedUtteranceText,
    entries: normalizedEntries
  });

  // Heuristic verdict shared by every path that has no usable model answer.
  const fallbackDecision = (): VoiceInterruptClassifierResult["decision"] =>
    hasClearlySemanticBurst(normalizedEntries) ? "interrupt" : "ignore";

  if (!host.llm?.generate) {
    return {
      decision: fallbackDecision(),
      source: "llm_unavailable_fallback",
      latencyMs: 0,
      promptLog,
      error: "llm_generate_unavailable"
    };
  }

  const startedAt = Date.now();
  // AbortController may be missing on some runtimes; the call then runs
  // without a timeout.
  const abortController = typeof AbortController === "function" ? new AbortController() : null;
  const timeout = abortController
    ? setTimeout(() => {
        abortController.abort(new Error("voice_interrupt_classifier_timeout"));
      }, VOICE_INTERRUPT_CLASSIFIER_TIMEOUT_MS)
    : null;

  try {
    const result = await host.llm.generate({
      settings: applyOrchestratorOverrideSettings(settings, {
        provider: llmProvider,
        model: llmModel,
        temperature: 0,
        maxOutputTokens,
        reasoningEffort: "minimal"
      }),
      systemPrompt,
      userPrompt,
      contextMessages: [],
      trace: {
        guildId: session.guildId,
        channelId: session.textChannelId,
        userId: traceUserId,
        source: "voice_interrupt_classifier"
      },
      signal: abortController?.signal
    });

    const rawOutput = String(result?.text || "");
    const decision = parseInterruptDecision(rawOutput);
    if (decision) {
      return {
        decision,
        source: decision === "interrupt" ? "model_interrupt" : "model_ignore",
        latencyMs: Date.now() - startedAt,
        promptLog,
        rawOutput
      };
    }

    return {
      decision: fallbackDecision(),
      source: "unparseable_fallback",
      latencyMs: Date.now() - startedAt,
      promptLog,
      rawOutput,
      error: `unparseable_interrupt_classifier_output:${rawOutput.slice(0, 60)}`
    };
  } catch (error: unknown) {
    // Narrow before reading `.message`; the thrown value may be anything.
    const message = String(
      (typeof error === "object" && error !== null && "message" in error && error.message) ||
        error ||
        "unknown_error"
    );
    // A timeout is recognised by our own abort reason surfacing as the error.
    const timedOut =
      Boolean(abortController?.signal.aborted) && message.includes("voice_interrupt_classifier_timeout");

    host.store?.logAction?.({
      kind: "voice_error",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId: traceUserId,
      content: timedOut
        ? "voice_interrupt_classifier_timed_out"
        : `voice_interrupt_classifier_failed: ${message}`,
      metadata: {
        sessionId: session.id,
        burstEntryCount: normalizedEntries.length,
        timeoutMs: timedOut ? VOICE_INTERRUPT_CLASSIFIER_TIMEOUT_MS : undefined
      }
    });

    return {
      decision: fallbackDecision(),
      source: timedOut ? "timeout_fallback" : "runtime_error_fallback",
      latencyMs: Date.now() - startedAt,
      promptLog,
      error: message
    };
  } finally {
    if (timeout) {
      clearTimeout(timeout);
    }
  }
}