// src/voice/voiceReplyPipeline.ts

import { getActivitySettings, getVoiceConversationPolicy } from "../settings/agentStack.ts";
import { clamp } from "../utils.ts";
import { buildVoiceReplyScopeKey } from "../tools/activeReplyRegistry.ts";
import { isAbortError } from "../tools/abortError.ts";
import { getMusicWakeFollowupState } from "./musicWakeLatch.ts";
import {
  VOICE_GENERATION_ONLY_WATCHDOG_MS,
  VOICE_GENERATION_SOUNDBOARD_CANDIDATE_TIMEOUT_MS,
  REALTIME_CONTEXT_MEMBER_LIMIT,
  STT_CONTEXT_MAX_MESSAGES,
  STT_REPLY_MAX_CHARS,
  STT_TRANSCRIPT_MAX_CHARS,
  VOICE_CHANNEL_EFFECT_EVENT_PROMPT_LIMIT,
  VOICE_MEMBERSHIP_EVENT_PROMPT_LIMIT
} from "./voiceSessionManager.constants.ts";
import {
  SOUNDBOARD_MAX_CANDIDATES,
  extractNoteDirectives,
  formatSoundboardCandidateLine,
  isRealtimeMode,
  normalizeVoiceAddressingTargetToken,
  normalizeVoiceText
} from "./voiceSessionHelpers.ts";
import { providerSupports } from "./voiceModes.ts";
import {
  normalizeSoundboardRefs as normalizeSoundboardRefsModule,
  resolveSoundboardCandidates as resolveSoundboardCandidatesModule
} from "./voiceSoundboard.ts";
import { setVoiceLivePromptSnapshot } from "./voicePromptState.ts";
import { buildSharedVoiceTurnContext } from "./voiceTurnContext.ts";

import { normalizeVoiceOutputLeaseMode } from "./voiceOutputLease.ts";
import { appendStreamWatchNoteEntry } from "./voiceStreamWatch.ts";
import { getCompactionCursor, maybeStartVoiceContextCompaction } from "./voiceContextCompaction.ts";
import type { ReplyInterruptionPolicy } from "./bargeInController.ts";
import type {
  InFlightAcceptedBrainTurn,
  VoiceConversationContext,
  VoiceGenerationContextSnapshot,
  LoggedVoicePromptBundle,
  VoiceOutputLeaseMode,
  VoicePendingResponseLatencyContext,
  VoiceRuntimeEventContext,
  VoiceRealtimeToolSettings,
  VoiceSession
} from "./voiceSessionTypes.ts";
import { musicPhaseIsActive } from "./voiceSessionTypes.ts";
import type { VoiceSessionManager } from "./voiceSessionManager.ts";

type GeneratedPayload = {
  text?: string;
  /**
   * Raw generation text before normalizeVoiceReplyText strips [[NOTE:...]] directives.
   * Used by the pipeline to extract and store screen-watch notes.
   */
  rawText?: string;
  playedSoundboardRefs?: unknown[];
  streamedRequestedRealtimeUtterance?: boolean;
  usedWebSearchFollowup?: boolean;
  usedScreenShareOffer?: boolean;
  leaveVoiceChannelRequested?: boolean;
  voiceAddressing?: unknown;
  voiceOutputLeaseMode?: unknown;
  streamedSentenceCount?: number;
  generationContextSnapshot?: VoiceGenerationContextSnapshot | null;
  replyPrompts?: LoggedVoicePromptBundle | null;
};

type ContextMessage = { role: "assistant" | "user"; content: string; };

interface VoiceReplyPipelineParams {
  session: VoiceSession;
  settings: VoiceRealtimeToolSettings | null;
  userId: string | null;
  transcript: string;
  directAddressed?: boolean;
  directAddressConfidence?: number;
  conversationContext?: VoiceConversationContext | null;
  musicWakeFollowupEligibleAtCapture?: boolean;
  mode: "realtime_transport";
  source?: string;
  inputKind?: string;
  latencyContext?: VoicePendingResponseLatencyContext | null;
  frozenFrameSnapshot?: { mimeType: string; dataBase64: string } | null;
  runtimeEventContext?: VoiceRuntimeEventContext | null;
}

type VoiceReplyPipelineHost = Pick<
  VoiceSessionManager,
  | "buildVoiceConversationContext"
  | "buildVoiceReplyPlaybackPlan"
  | "buildVoiceToolCallbacks"
  | "collapsePendingRealtimeAssistantStreamTail"
  | "endSession"
  | "generateVoiceTurn"
  | "getRecentVoiceChannelEffectEvents"
  | "getRecentVoiceMembershipEvents"
  | "getStreamWatchNotesForPrompt"
  | "getVoiceScreenWatchCapability"
  | "getVoiceChannelParticipants"
  | "logVoiceLatencyStage"
  | "maybeSupersedeRealtimeReplyBeforePlayback"
  | "maybeClearActiveReplyInterruptionPolicy"
  | "normalizeVoiceAddressingAnnotation"
  | "playVoiceReplyInOrder"
  | "recordVoiceTurn"
  | "resolveVoiceSpeakerName"
  | "resolveReplyInterruptionPolicy"
  | "requestRealtimeTextUtterance"
  | "schedulePassiveMusicWakeLatchRefresh"
  | "getMusicPhase"
  | "soundboardDirector"
  | "updateModelContextSummary"
  | "waitForLeaveDirectivePlayback"
> & {
  client: VoiceSessionManager["client"];
  instructionManager: VoiceSessionManager["instructionManager"];
  llm: VoiceSessionManager["llm"];
  memory: VoiceSessionManager["memory"];
  sessionLifecycle: VoiceSessionManager["sessionLifecycle"];
  store: VoiceSessionManager["store"];
  activeReplies: VoiceSessionManager["activeReplies"];
};

function toGeneratedPayload(value: unknown): GeneratedPayload {
  if (value && typeof value === "object") {
    return value as GeneratedPayload;
  }
  return {
    text: typeof value === "string" ? value : "",
    playedSoundboardRefs: [],
    usedWebSearchFollowup: false,
    usedScreenShareOffer: false,
    leaveVoiceChannelRequested: false,
    voiceAddressing: null
  };
}
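// Defensive normalization: generateVoiceTurn is expected to resolve with a
// payload object, but a bare string (or any other value) degrades to a minimal
// text-only payload, e.g. toGeneratedPayload("hey") → { text: "hey", ... }.
// Objects are trusted as-is and cast to GeneratedPayload.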

type VoiceGenerationTimeoutError = Error & { code?: string; stage?: string; timeoutMs?: number; };

const VOICE_GENERATION_TIMEOUT_CODE = "voice_generation_timeout";

function createVoiceGenerationTimeoutError(stage: string, timeoutMs: number): VoiceGenerationTimeoutError {
  const error = new Error(`${stage} timed out after ${timeoutMs}ms.`) as VoiceGenerationTimeoutError;
  error.name = "TimeoutError";
  error.code = VOICE_GENERATION_TIMEOUT_CODE;
  error.stage = String(stage || "").trim() || "unknown";
  error.timeoutMs = Math.max(0, Math.round(Number(timeoutMs) || 0));
  return error;
}

function isVoiceGenerationTimeoutError(error: unknown): error is VoiceGenerationTimeoutError {
  return String((error as VoiceGenerationTimeoutError | null)?.code || "").trim() === VOICE_GENERATION_TIMEOUT_CODE;
}

async function waitForVoiceGenerationStage<T>({ stage, timeoutMs, task }: {
  stage: string;
  timeoutMs: number;
  task: Promise<T>;
}): Promise<T> {
  return await new Promise<T>((resolve, reject) => {
    let settled = false;
    const timer = setTimeout(() => {
      if (settled) return;
      settled = true;
      reject(createVoiceGenerationTimeoutError(stage, timeoutMs));
    }, Math.max(1, Math.round(Number(timeoutMs) || 1)));

    void task.then((value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(value);
    }).catch((error: unknown) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      reject(error);
    });
  });
}
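// Usage sketch (mirrors the soundboard prep call later in this file): bound a
// prep stage so a hung provider call cannot stall the whole turn, and
// distinguish the timeout from real failures via isVoiceGenerationTimeoutError.
//
//   const info = await waitForVoiceGenerationStage({
//     stage: "soundboard_candidates",
//     timeoutMs: VOICE_GENERATION_SOUNDBOARD_CANDIDATE_TIMEOUT_MS,
//     task: resolveSoundboardCandidatesModule(host, { session, settings })
//   });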

function normalizeAssistantReplyAddressing(
  host: Pick<VoiceReplyPipelineHost, "normalizeVoiceAddressingAnnotation">,
  rawAddressing: unknown
) {
  const normalizedRaw = rawAddressing && typeof rawAddressing === "object" && !Array.isArray(rawAddressing)
    ? rawAddressing as Record<string, unknown>
    : null;
  const talkingTo = normalizeVoiceAddressingTargetToken(String(normalizedRaw?.talkingTo || ""));
  if (!talkingTo) return null;
  return host.normalizeVoiceAddressingAnnotation({
    rawAddressing: { talkingTo },
    source: "generation",
    reason: "assistant_reply_target"
  });
}

function resolveAssistantReplyTargeting(
  host: Pick<VoiceReplyPipelineHost, "normalizeVoiceAddressingAnnotation" | "resolveReplyInterruptionPolicy">,
  { session, userId, rawAddressing }: { session: VoiceSession; userId: string | null; rawAddressing: unknown; }
) {
  const generatedVoiceAddressing = normalizeAssistantReplyAddressing(host, rawAddressing);
  const replyInterruptionPolicy: ReplyInterruptionPolicy | null = host.resolveReplyInterruptionPolicy({
    session,
    userId,
    talkingTo: generatedVoiceAddressing?.talkingTo || null,
    source: "assistant_reply_target",
    reason: generatedVoiceAddressing?.talkingTo === "ALL"
      ? "assistant_target_all"
      : generatedVoiceAddressing?.talkingTo
        ? "assistant_target_speaker"
        : "assistant_target_missing"
  });

  return { generatedVoiceAddressing, replyInterruptionPolicy };
}

function normalizeAssistantReplyOutputLeaseMode(rawOutputLeaseMode: unknown) {
  const normalizedRaw = rawOutputLeaseMode && typeof rawOutputLeaseMode === "object" && !Array.isArray(rawOutputLeaseMode)
    ? (rawOutputLeaseMode as Record<string, unknown>).mode
    : rawOutputLeaseMode;
  const normalizedMode = normalizeVoiceOutputLeaseMode(normalizedRaw);
  return normalizedMode === "ambient" ? null : normalizedMode;
}
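// Accepts either a bare mode value or an { mode } wrapper object. The default
// "ambient" lease collapses to null so callers only propagate explicit,
// non-ambient leases: { mode: "ambient" } → null, while any other mode string
// that normalizeVoiceOutputLeaseMode admits passes through normalized.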

function isScreenWatchQuestion(transcript: string, directAddressed: boolean) {
  const normalized = String(transcript || "").trim().toLowerCase();
  if (!normalized) return false;
  const explicitScreenQuestion = /\b(what(?:'s| is)? (?:on|in) (?:the )?(?:screen|stream|share)|what do you see|what can you see|can you see (?:my|the) (?:screen|stream|share)|look at (?:my|the) (?:screen|stream|share)|what(?:'s| is) happening on (?:my|the) (?:screen|stream|share))\b/i
    .test(normalized);
  if (explicitScreenQuestion) return true;
  if (!directAddressed) return false;
  return /\b(do you see|can you tell what|what am i looking at)\b/i.test(normalized);
}
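// Examples: "what's on the screen" matches unconditionally via the explicit
// pattern; "do you see that" only counts as a screen question when the speaker
// directly addressed the bot, which keeps ambient chatter from pulling a frame
// attachment into the prompt.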

function buildContextMessages(session: VoiceSession, normalizedTranscript: string) {
  const contextTranscript = normalizeVoiceText(normalizedTranscript, STT_REPLY_MAX_CHARS);
  const transcriptTurns = Array.isArray(session.transcriptTurns)
    ? session.transcriptTurns.filter((row) => row && typeof row === "object")
    : [];
  const compactionCursor = getCompactionCursor(session);
  const contextTurnRows = (
    compactionCursor > 0
      ? transcriptTurns.slice(compactionCursor)
      : transcriptTurns.slice(-STT_CONTEXT_MAX_MESSAGES)
  );
  // Drop the most recent user turn when it duplicates the transcript we are
  // about to send, so the model does not see the same utterance twice.
  if (contextTurnRows.length > 0 && contextTranscript) {
    for (let index = contextTurnRows.length - 1; index >= 0; index -= 1) {
      const row = contextTurnRows[index];
      if (!row || typeof row !== "object" || row.kind === "membership" || row.kind === "effect") {
        continue;
      }
      const lastRole = row?.role === "assistant" ? "assistant" : "user";
      const lastContent = normalizeVoiceText(row?.text, STT_REPLY_MAX_CHARS);
      if (lastRole === "user" && lastContent && lastContent === contextTranscript) {
        contextTurnRows.splice(index, 1);
      }
      break;
    }
  }
  const contextTurns = contextTurnRows.map((row) => ({
    role: row.role === "assistant" ? "assistant" : "user" as const,
    kind: ("kind" in row ? row.kind : "speech") as string | undefined,
    content: normalizeVoiceText(row.text, STT_REPLY_MAX_CHARS),
    at: Number(row?.at || 0)
  }));
  const contextMessages: ContextMessage[] = contextTurns
    .sort((a, b) => a.at - b.at)
    .slice(compactionCursor > 0 ? 0 : -STT_CONTEXT_MAX_MESSAGES)
    .map((row) => ({
      role: row.role === "assistant" ? "assistant" : "user",
      content: row.kind === "thought" ? `[thought: ${row.content}]` : row.content
    }))
    .filter((row): row is ContextMessage => Boolean(row.content));
  const contextMessageChars = contextMessages.reduce((total, row) => total + row.content.length, 0);
  return { contextMessages, contextMessageChars, contextTurns };
}
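// contextMessages is the deduplicated, chronologically ordered slice actually
// sent to the model (thought turns wrapped as "[thought: ...]"); contextTurns
// keeps the wider pre-filter list so logging can compare sent vs. available.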

function logReplySkipped({
  host,
  params,
  replyText,
  replyPrompts,
  usedWebSearchFollowup,
  usedScreenShareOffer,
  generatedVoiceAddressing,
  leaveVoiceChannelRequested,
  resolvedConversationContext,
  contextMessages,
  contextTurns,
  contextMessageChars
}: {
  host: VoiceReplyPipelineHost;
  params: VoiceReplyPipelineParams;
  replyText: string;
  replyPrompts: LoggedVoicePromptBundle | null;
  usedWebSearchFollowup: boolean;
  usedScreenShareOffer: boolean;
  generatedVoiceAddressing: ReturnType<VoiceReplyPipelineHost["normalizeVoiceAddressingAnnotation"]>;
  leaveVoiceChannelRequested: boolean;
  resolvedConversationContext: VoiceConversationContext | null;
  contextMessages: ContextMessage[];
  contextTurns: Array<Record<string, unknown>>;
  contextMessageChars: number;
}) {
  const skipCause = !replyText
    ? "empty_reply_text"
    : replyText === "[SKIP]"
      ? "model_skip"
      : "no_playback_steps";
  host.store.logAction({
    kind: "voice_runtime",
    guildId: params.session.guildId,
    channelId: params.session.textChannelId,
    userId: host.client.user?.id || null,
    content: "realtime_reply_skipped",
    metadata: {
      sessionId: params.session.id,
      mode: params.session.mode,
      source: String(params.source || params.mode),
      usedWebSearchFollowup,
      usedScreenShareOffer,
      talkingTo: generatedVoiceAddressing?.talkingTo || null,
      directedConfidence: Number.isFinite(Number(generatedVoiceAddressing?.directedConfidence))
        ? Number(clamp(Number(generatedVoiceAddressing.directedConfidence), 0, 1).toFixed(3))
        : 0,
      soundboardRefs: [],
      leaveVoiceChannelRequested,
      skipCause,
      replyTextPreview: replyText ? replyText.slice(0, 220) : null,
      replyPrompts,
      attentionMode: resolvedConversationContext?.attentionMode || null,
      currentSpeakerActive: Boolean(resolvedConversationContext?.currentSpeakerActive),
      contextTurnsSent: contextMessages.length,
      contextTurnsAvailable: contextTurns.length,
      contextCharsSent: contextMessageChars
    }
  });
}

export async function runVoiceReplyPipeline(
  host: VoiceReplyPipelineHost,
  params: VoiceReplyPipelineParams
): Promise<boolean> {
  const { session } = params;
  const source = String(params.source || params.mode).trim() || params.mode;
  if (!session || session.ending) return false;

  if (!isRealtimeMode(session.mode)) return false;
  if (typeof host.generateVoiceTurn !== "function") {
    host.store.logAction({
      kind: "voice_error",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId: params.userId,
      content: "realtime_generation_unavailable",
      metadata: { sessionId: session.id, source }
    });
    return false;
  }

  const normalizedTranscript = normalizeVoiceText(params.transcript, STT_TRANSCRIPT_MAX_CHARS);
  if (!normalizedTranscript) return false;

  // Determine early whether this is a stream_watch commentary turn so we
  // can tag the ActiveReply and conditionally skip the image attachment.
  const isStreamWatchCommentaryTurn = params.inputKind === "event"
    && params.runtimeEventContext?.category === "screen_share";

  const currentMusicPhase = host.getMusicPhase(session);
  const musicWakeFollowupState = getMusicWakeFollowupState(session, params.userId || null);
  const shouldRefreshMusicWakeAfterSpeech = musicPhaseIsActive(currentMusicPhase)
    && currentMusicPhase !== "paused_wake_word"
    && (
      Boolean(params.directAddressed)
      || Boolean(params.musicWakeFollowupEligibleAtCapture)
      || musicWakeFollowupState.passiveWakeFollowupAllowed
    );

  const normalizedLatencyContext = params.latencyContext && typeof params.latencyContext === "object"
    ? params.latencyContext
    : null;
  const latencyFinalizedAtMs = Math.max(0, Number(normalizedLatencyContext?.finalizedAtMs || 0));
  const latencyAsrStartedAtMs = Math.max(0, Number(normalizedLatencyContext?.asrStartedAtMs || 0));
  const latencyAsrCompletedAtMs = Math.max(0, Number(normalizedLatencyContext?.asrCompletedAtMs || 0));
  const latencyQueueWaitMs = Number.isFinite(Number(normalizedLatencyContext?.queueWaitMs))
    ? Math.max(0, Math.round(Number(normalizedLatencyContext?.queueWaitMs)))
    : null;
  const latencyPendingQueueDepth = Number.isFinite(Number(normalizedLatencyContext?.pendingQueueDepth))
    ? Math.max(0, Math.round(Number(normalizedLatencyContext?.pendingQueueDepth)))
    : null;
  const latencyCaptureReason = String(normalizedLatencyContext?.captureReason || "").trim() || null;
  const prePlaybackInterruptionPolicy = params.inputKind === "event"
    ? null
    : host.resolveReplyInterruptionPolicy({ session, userId: params.userId, source });
  const generationStartedAt = Date.now();
  const voiceReplyScopeKey = buildVoiceReplyScopeKey(session.id);
  const inFlightAcceptedBrainTurn: InFlightAcceptedBrainTurn = {
    transcript: normalizedTranscript,
    userId: params.userId || null,
    pcmBuffer: null,
    source,
    acceptedAt: generationStartedAt,
    phase: "generation_only",
    captureReason: String(params.latencyContext?.captureReason || params.source || "stream_end"),
    directAddressed: Boolean(params.directAddressed),
    interruptionPolicy: prePlaybackInterruptionPolicy,
    toolPhaseRecoveryEligible: false,
    toolPhaseRecoveryReason: null,
    toolPhaseLastToolName: null
  };
  session.inFlightAcceptedBrainTurn = inFlightAcceptedBrainTurn;
  const clearInFlightAcceptedBrainTurn = () => {
    if (session.inFlightAcceptedBrainTurn === inFlightAcceptedBrainTurn) {
      session.inFlightAcceptedBrainTurn = null;
    }
  };

  if (host.maybeSupersedeRealtimeReplyBeforePlayback({
    session,
    source: `${source}:generation_preflight`,
    generationStartedAtMs: latencyFinalizedAtMs || generationStartedAt,
    replyUserId: params.userId || null
  })) {
    clearInFlightAcceptedBrainTurn();
    return false;
  }
  const activeReply = host.activeReplies && voiceReplyScopeKey
    ? host.activeReplies.begin(voiceReplyScopeKey, "voice-generation", ["voice_generation"])
    : null;
  if (activeReply && session.inFlightAcceptedBrainTurn === inFlightAcceptedBrainTurn) {
    session.inFlightAcceptedBrainTurn.acceptedAt = activeReply.startedAt;
  }
  const generationSignal = activeReply?.abortController.signal;
  const generationInterrupted = () => Boolean(
    session.ending
    || generationSignal?.aborted
    || (activeReply && host.activeReplies?.isStale(voiceReplyScopeKey, activeReply.startedAt))
  );

  void maybeStartVoiceContextCompaction(host, { session, settings: params.settings, source });

  const { contextMessages, contextMessageChars, contextTurns } = buildContextMessages(session, normalizedTranscript);
  host.updateModelContextSummary(session, "generation", {
    source,
    capturedAt: new Date().toISOString(),
    availableTurns: contextTurns.length,
    sentTurns: contextMessages.length,
    maxTurns: STT_CONTEXT_MAX_MESSAGES,
    contextChars: contextMessageChars,
    transcriptChars: normalizedTranscript.length,
    directAddressed: Boolean(params.directAddressed)
  });

host.store.logAction({ kind: "voice_runtime", guildId: session.guildId, channelId: session.textChannelId, userId: params.userId, content: "voice_generation_prep_stage", metadata: { sessionId: session.id, source, stage: "soundboard_candidates", state: "start", timeoutMs: VOICE_GENERATION_SOUNDBOARD_CANDIDATE_TIMEOUT_MS } }); const soundboardCandidatesStartedAt = Date.now(); let soundboardCandidateInfo: Awaited<ReturnType> | null = null; try { soundboardCandidateInfo = await waitForVoiceGenerationStage({ stage: "soundboard_candidates", timeoutMs: VOICE_GENERATION_SOUNDBOARD_CANDIDATE_TIMEOUT_MS, task: resolveSoundboardCandidatesModule(host, { session, settings: params.settings }) }); host.store.logAction({ kind: "voice_runtime", guildId: session.guildId, channelId: session.textChannelId, userId: params.userId, content: "voice_generation_prep_stage", metadata: { sessionId: session.id, source, stage: "soundboard_candidates", state: "ok", elapsedMs: Math.max(0, Date.now() - soundboardCandidatesStartedAt), candidateCount: Array.isArray(soundboardCandidateInfo?.candidates) ? soundboardCandidateInfo.candidates.length : 0 } }); } catch (error) { if (isAbortError(error) || generationInterrupted()) { throw error; } const timedOut = isVoiceGenerationTimeoutError(error); host.store.logAction({ kind: timedOut ? "voice_runtime" : "voice_error", guildId: session.guildId, channelId: session.textChannelId, userId: params.userId, content: timedOut ? "voice_generation_prep_stage" : voice_generation_soundboard_candidates_failed: ${String((error as Error)?.message || error)}, metadata: { sessionId: session.id, source, stage: "soundboard_candidates", state: timedOut ? "timeout" : "error", elapsedMs: Math.max(0, Date.now() - soundboardCandidatesStartedAt), timeoutMs: VOICE_GENERATION_SOUNDBOARD_CANDIDATE_TIMEOUT_MS, fallbackUsed: true, error: String((error as Error)?.message || error) } }); soundboardCandidateInfo = null; } const soundboardCandidateLines = (Array.isArray(soundboardCandidateInfo?.candidates) ? soundboardCandidateInfo.candidates : [] ) .map((entry) => formatSoundboardCandidateLine(entry)) .filter(Boolean) .slice(0, SOUNDBOARD_MAX_CANDIDATES);

  const resolvedConversationContext = params.conversationContext && typeof params.conversationContext === "object"
    ? params.conversationContext
    : host.buildVoiceConversationContext({ session, userId: params.userId, directAddressed: Boolean(params.directAddressed) });
  const sharedTurnContext = buildSharedVoiceTurnContext(host, {
    session,
    settings: params.settings,
    speakerUserId: params.userId,
    maxParticipants: REALTIME_CONTEXT_MEMBER_LIMIT,
    maxMembershipEvents: VOICE_MEMBERSHIP_EVENT_PROMPT_LIMIT,
    maxVoiceEffects: VOICE_CHANNEL_EFFECT_EVENT_PROMPT_LIMIT
  });
  const participantRoster = sharedTurnContext.participantRoster;
  const recentMembershipEvents = sharedTurnContext.recentMembershipEvents;
  const recentVoiceEffectEvents = sharedTurnContext.recentVoiceEffectEvents;
  const sessionTiming = host.sessionLifecycle.buildVoiceSessionTimingContext(session);
  const streamWatchNotes = sharedTurnContext.streamWatchNotes;

  // Only attach the raw image frame for stream_watch commentary turns.
  // User-speech voice replies still get the rolling [[NOTE:...]] context
  // (via streamWatchNotes above) but skip the image to cut ~1500-2000
  // tokens and halve generation latency.
  const shouldAttachStreamWatchFrame = isStreamWatchCommentaryTurn
    || (session.streamWatch?.active && isScreenWatchQuestion(normalizedTranscript, Boolean(params.directAddressed)));
  const streamWatchLatestFrame = shouldAttachStreamWatchFrame
    ? (params.frozenFrameSnapshot?.dataBase64
      ? params.frozenFrameSnapshot
      : session.streamWatch?.active && session.streamWatch?.latestFrameDataBase64
        ? {
          mimeType: String(session.streamWatch.latestFrameMimeType || "image/jpeg"),
          dataBase64: String(session.streamWatch.latestFrameDataBase64)
        }
        : null)
    : null;
  const generationConversationContext = {
    ...(resolvedConversationContext || {}),
    sessionTimeoutWarningActive: Boolean(sessionTiming?.timeoutWarningActive),
    sessionTimeoutWarningReason: String(sessionTiming?.timeoutWarningReason || "none"),
    streamWatchNotes,
    compactedSessionSummary: sharedTurnContext.compactedSessionSummary
  };

  const markInFlightAcceptedBrainTurnPhase = (phase: "generation_only" | "tool_call_started" | "playback_requested") => {
    if (session.inFlightAcceptedBrainTurn === inFlightAcceptedBrainTurn) {
      session.inFlightAcceptedBrainTurn.phase = phase;
    }
  };
  let generatedPayload: GeneratedPayload | null = null;
  let generationFinished = false;
  const voiceConversation = getVoiceConversationPolicy(params.settings);
  const useRealtimeTts = String(voiceConversation.ttsMode || "").trim().toLowerCase() !== "api";
  const streamingVoiceReplyEnabled = useRealtimeTts && Boolean(voiceConversation.streaming?.enabled);
  let streamedReplyRequestedAt = 0;

  if (
    streamingVoiceReplyEnabled
    && providerSupports(session.mode || "", "updateInstructions")
  ) {
    void host.instructionManager.prepareRealtimeTurnContext({
      session,
      settings: params.settings,
      userId: params.userId,
      transcript: normalizedTranscript,
      captureReason: source
    }).catch((error: unknown) => {
      host.store.logAction({
        kind: "voice_error",
        guildId: session.guildId,
        channelId: session.textChannelId,
        userId: host.client.user?.id || null,
        content: `openai_realtime_turn_context_refresh_failed: ${String((error as Error)?.message || error)}`,
        metadata: { sessionId: session.id, source }
      });
    });
  }
  try {
    const activity = getActivitySettings(params.settings);
    const generateVoiceTurnPromise = host.generateVoiceTurn({
      settings: params.settings,
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId: params.userId,
      transcript: normalizedTranscript,
      inputKind: params.inputKind || "transcript",
      directAddressed: Boolean(params.directAddressed),
      source,
      contextMessages,
      sessionId: session.id,
      isEagerTurn: !params.directAddressed && !generationConversationContext?.currentSpeakerActive,
      voiceAmbientReplyEagerness: Number(voiceConversation.ambientReplyEagerness) || 0,
      responseWindowEagerness: Number(activity.responseWindowEagerness) || 0,
      conversationContext: generationConversationContext,
      runtimeEventContext: params.runtimeEventContext || null,
      sessionTiming,
      participantRoster,
      recentMembershipEvents,
      recentVoiceEffectEvents,
      recentToolOutcomes: sharedTurnContext.recentToolOutcomeLines,
      soundboardCandidates: soundboardCandidateLines,
      streamWatchLatestFrame,
      nativeDiscordSharers: sharedTurnContext.nativeDiscordSharers,
      voiceToolCallbacks: host.buildVoiceToolCallbacks({ session, settings: params.settings }),
      onSpokenSentence: async ({ text, index, voiceAddressing, voiceOutputLeaseMode }: {
        text: string;
        index: number;
        voiceAddressing?: { talkingTo: string | null } | null;
        voiceOutputLeaseMode?: VoiceOutputLeaseMode | null;
      }) => {
        if (generationInterrupted()) return false;
        const playbackPlan = host.buildVoiceReplyPlaybackPlan({ replyText: String(text || ""), trailingSoundboardRefs: [] });
        if (!playbackPlan.steps.length) return false;
        const { replyInterruptionPolicy: streamedReplyInterruptionPolicy } = resolveAssistantReplyTargeting(host, {
          session,
          userId: params.userId,
          rawAddressing: voiceAddressing || null
        });
        const streamedReplyOutputLeaseMode = normalizeAssistantReplyOutputLeaseMode(voiceOutputLeaseMode);
        const requestedAt = Date.now();
        const latencyContext = index === 0
          ? {
            finalizedAtMs: latencyFinalizedAtMs,
            asrStartedAtMs: latencyAsrStartedAtMs,
            asrCompletedAtMs: latencyAsrCompletedAtMs,
            generationStartedAtMs: generationStartedAt,
            replyRequestedAtMs: requestedAt,
            audioStartedAtMs: 0,
            source,
            captureReason: latencyCaptureReason,
            queueWaitMs: latencyQueueWaitMs,
            pendingQueueDepth: latencyPendingQueueDepth
          }
          : null;
        const playbackSource = `${source}:stream_chunk_${Math.max(0, Number(index || 0))}`;
        // Derive supersede user from the resolved addressing:
        // TO:ALL → null (un-supersedable by queue).
        // TO:specific-user → that user. No addressing → triggering speaker.
        const streamSupersedeUserId = voiceAddressing?.talkingTo === "ALL"
          ? null
          : (streamedReplyInterruptionPolicy?.allowedUserId || params.userId || null);
        // Fast path: realtime utterance request (no soundboard, realtime TTS available)
        if (useRealtimeTts && playbackPlan.soundboardRefs.length === 0) {
          const normalizedText = normalizeVoiceText(playbackPlan.spokenText, STT_REPLY_MAX_CHARS);
          if (!normalizedText) return false;
          if (generationInterrupted()) return false;
          if (host.maybeSupersedeRealtimeReplyBeforePlayback({
            session,
            source: playbackSource,
            speechStep: index,
            generationStartedAtMs: generationStartedAt,
            outputLeaseMode: streamedReplyOutputLeaseMode,
            replyUserId: streamSupersedeUserId
          })) {
            return false;
          }
          const requested = host.requestRealtimeTextUtterance({
            session,
            text: normalizedText,
            userId: host.client.user?.id || null,
            source: playbackSource,
            interruptionPolicy: streamedReplyInterruptionPolicy,
            outputLeaseMode: streamedReplyOutputLeaseMode,
            latencyContext,
            musicWakeRefreshAfterSpeech: shouldRefreshMusicWakeAfterSpeech
          });
          if (requested && streamedReplyRequestedAt === 0) {
            streamedReplyRequestedAt = requestedAt;
            session.lastAssistantReplyAt = requestedAt;
            markInFlightAcceptedBrainTurnPhase("playback_requested");
          }
          return { accepted: requested, playedSoundboardRefs: [], requestedRealtimeUtterance: requested };
        }
        // Full playback path: API TTS or mixed speech+soundboard
        const playbackResult = await host.playVoiceReplyInOrder({
          session,
          settings: params.settings,
          spokenText: playbackPlan.spokenText,
          playbackSteps: playbackPlan.steps,
          source: playbackSource,
          preferRealtimeUtterance: useRealtimeTts,
          interruptionPolicy: streamedReplyInterruptionPolicy,
          outputLeaseMode: streamedReplyOutputLeaseMode,
          latencyContext,
          musicWakeRefreshAfterSpeech: shouldRefreshMusicWakeAfterSpeech,
          replyUserId: streamSupersedeUserId
        });
        if (generationInterrupted()) return false;
        const accepted = Boolean(playbackResult.completed)
          && (Boolean(playbackResult.spokeLine) || Number(playbackResult.playedSoundboardCount || 0) > 0);
        if (accepted && streamedReplyRequestedAt === 0) {
          streamedReplyRequestedAt = requestedAt;
          session.lastAssistantReplyAt = requestedAt;
          markInFlightAcceptedBrainTurnPhase("playback_requested");
        }
        if (
          accepted
          && shouldRefreshMusicWakeAfterSpeech
          && playbackResult.spokeLine
          && !playbackResult.requestedRealtimeUtterance
        ) {
          host.schedulePassiveMusicWakeLatchRefresh({ session, settings: params.settings, userId: params.userId || null });
        }
        return {
          accepted,
          playedSoundboardRefs: playbackPlan.soundboardRefs.slice(
            0,
            Math.max(0, Number(playbackResult.playedSoundboardCount || 0))
          ),
          requestedRealtimeUtterance: Boolean(playbackResult.requestedRealtimeUtterance)
        };
      },
      streamingSentencesEnabled: streamingVoiceReplyEnabled,
      signal: generationSignal
    });
    // Watchdog: if the turn is still in "generation_only" after the budget
    // elapses (no tool call started, no playback requested), abort the active
    // reply and surface a timeout so the race below rejects instead of hanging.
    const generationOnlyWatchdogPromise = new Promise((_, reject) => {
      const watchdogTimer = setTimeout(() => {
        const currentPhase = session.inFlightAcceptedBrainTurn?.phase || null;
        if (
          session.inFlightAcceptedBrainTurn !== inFlightAcceptedBrainTurn
          || currentPhase !== "generation_only"
          || generationInterrupted()
        ) {
          return;
        }
        try {
          activeReply?.abortController.abort(`voice_generation_only_watchdog_timeout:${VOICE_GENERATION_ONLY_WATCHDOG_MS}`);
        } catch {
          // best-effort
        }
        host.store.logAction({
          kind: "voice_runtime",
          guildId: session.guildId,
          channelId: session.textChannelId,
          userId: params.userId,
          content: "voice_generation_watchdog_timeout",
          metadata: {
            sessionId: session.id,
            source,
            phase: currentPhase,
            timeoutMs: VOICE_GENERATION_ONLY_WATCHDOG_MS,
            transcriptChars: normalizedTranscript.length
          }
        });
        reject(createVoiceGenerationTimeoutError("generation_only_watchdog", VOICE_GENERATION_ONLY_WATCHDOG_MS));
      }, VOICE_GENERATION_ONLY_WATCHDOG_MS);
      void generateVoiceTurnPromise.then(
        () => { clearTimeout(watchdogTimer); },
        () => { clearTimeout(watchdogTimer); }
      );
    });
    generatedPayload = toGeneratedPayload(await Promise.race([
      generateVoiceTurnPromise,
      generationOnlyWatchdogPromise
    ]));
    if (generatedPayload?.generationContextSnapshot) {
      session.lastGenerationContext = { ...generatedPayload.generationContextSnapshot, source, mode: session.mode || source };
    }
    generationFinished = true;
  } catch (error) {
    if (isAbortError(error) || generationInterrupted()) {
      return false;
    }
    if (isVoiceGenerationTimeoutError(error)) {
      return false;
    }
    host.store.logAction({
      kind: "voice_error",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId: params.userId,
      content: `realtime_generation_failed: ${String((error as Error)?.message || error)}`,
      metadata: { sessionId: session.id, source }
    });
    return false;
  } finally {
    host.activeReplies?.clear(activeReply);
    if (!generationFinished || generationInterrupted()) {
      clearInFlightAcceptedBrainTurn();
    }
  }

  if (generationInterrupted()) {
    clearInFlightAcceptedBrainTurn();
    return false;
  }

  if (streamedReplyRequestedAt > 0) {
    host.collapsePendingRealtimeAssistantStreamTail({ session, source });
  }

  // Extract [[NOTE:...]] directives from the raw (pre-normalization) generation
  // text. normalizeVoiceReplyText strips notes to prevent TTS from reading them
  // aloud, so generatedPayload.text is already note-free. The rawText field
  // preserves the original output for note extraction and storage.
  const noteExtraction = extractNoteDirectives(generatedPayload?.rawText || generatedPayload?.text);
  const replyText = normalizeVoiceText(noteExtraction.text || "", STT_REPLY_MAX_CHARS);
  const playedSoundboardRefs = normalizeSoundboardRefsModule(generatedPayload?.playedSoundboardRefs);
  const streamedSentenceCount = Math.max(0, Number(generatedPayload?.streamedSentenceCount || 0));
  const streamedRequestedRealtimeUtterance = Boolean(generatedPayload?.streamedRequestedRealtimeUtterance);
  const usedWebSearchFollowup = Boolean(generatedPayload?.usedWebSearchFollowup);
  const usedScreenShareOffer = Boolean(generatedPayload?.usedScreenShareOffer);
  const leaveVoiceChannelRequested = Boolean(generatedPayload?.leaveVoiceChannelRequested);
  const replyPrompts = generatedPayload?.replyPrompts && typeof generatedPayload.replyPrompts === "object"
    ? generatedPayload.replyPrompts
    : null;
  setVoiceLivePromptSnapshot(session, "generation", { replyPrompts, source });
  const { generatedVoiceAddressing, replyInterruptionPolicy } = resolveAssistantReplyTargeting(host, {
    session,
    userId: params.userId,
    rawAddressing: generatedPayload?.voiceAddressing
  });
  const replyOutputLeaseMode = normalizeAssistantReplyOutputLeaseMode(generatedPayload?.voiceOutputLeaseMode);
  if (session.inFlightAcceptedBrainTurn === inFlightAcceptedBrainTurn) {
    session.inFlightAcceptedBrainTurn.outputLeaseMode = replyOutputLeaseMode;
  }
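  // Illustrative (hypothetical model output): a rawText of
  //   "nice clutch. [[NOTE: ally is saving for next round]]"
  // yields replyText "nice clutch." plus one extracted note, so the directive
  // reaches future prompts without ever being spoken aloud.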

  // Store any [[NOTE:...]] directives the brain wrote as private self-notes.
  // These persist as noteEntries and are injected into future turns
  // so the brain can maintain its own visual memory without a separate triage model.
  const storeExtractedNotes = () => {
    if (noteExtraction.notes.length === 0) return;
    if (!session.streamWatch?.active) return;
    const settingsObj = params.settings as Record<string, Record<string, Record<string, unknown>>> | null;
    const maxEntries = Number(settingsObj?.voice?.streamWatch?.maxNoteEntries) || 12;
    for (const note of noteExtraction.notes) {
      appendStreamWatchNoteEntry({
        session,
        text: note,
        at: Date.now(),
        provider: null,
        model: null,
        speakerName: null,
        maxEntries
      });
    }
  };

  const playbackPlan = host.buildVoiceReplyPlaybackPlan({ replyText, trailingSoundboardRefs: [] });
  if (!playbackPlan.spokenText && playedSoundboardRefs.length === 0 && !leaveVoiceChannelRequested) {
    storeExtractedNotes();
    logReplySkipped({
      host,
      params: { ...params, source },
      replyText,
      usedWebSearchFollowup,
      usedScreenShareOffer,
      replyPrompts,
      generatedVoiceAddressing,
      leaveVoiceChannelRequested,
      resolvedConversationContext,
      contextMessages,
      contextTurns,
      contextMessageChars
    });
    clearInFlightAcceptedBrainTurn();
    return true;
  }

  if (
    !streamingVoiceReplyEnabled
    && playbackPlan.spokenText
    && providerSupports(session.mode || "", "updateInstructions")
  ) {
    void host.instructionManager.prepareRealtimeTurnContext({
      session,
      settings: params.settings,
      userId: params.userId,
      transcript: normalizedTranscript,
      captureReason: source
    }).catch((error: unknown) => {
      host.store.logAction({
        kind: "voice_error",
        guildId: session.guildId,
        channelId: session.textChannelId,
        userId: host.client.user?.id || null,
        content: `openai_realtime_turn_context_refresh_failed: ${String((error as Error)?.message || error)}`,
        metadata: { sessionId: session.id, source }
      });
    });
  }

  const streamedSpeechPlayed = streamedSentenceCount > 0;
  const replyRequestedAt = streamedReplyRequestedAt || Date.now();
  const replyLatencyContext = {
    finalizedAtMs: latencyFinalizedAtMs,
    asrStartedAtMs: latencyAsrStartedAtMs,
    asrCompletedAtMs: latencyAsrCompletedAtMs,
    generationStartedAtMs: generationStartedAt,
    replyRequestedAtMs: replyRequestedAt,
    audioStartedAtMs: 0,
    source,
    captureReason: latencyCaptureReason,
    queueWaitMs: latencyQueueWaitMs,
    pendingQueueDepth: latencyPendingQueueDepth
  };
  session.lastAssistantReplyAt = replyRequestedAt;

  const playbackSource = `${source}:reply`;
  if (!streamedSpeechPlayed && playbackPlan.steps.length > 0) {
    markInFlightAcceptedBrainTurnPhase("playback_requested");
  }
  const playbackResult = await (async () => {
    try {
      return streamedSpeechPlayed
        ? {
          completed: true,
          spokeLine: Boolean(playbackPlan.spokenText),
          requestedRealtimeUtterance: streamedRequestedRealtimeUtterance,
          playedSoundboardCount: playedSoundboardRefs.length
        }
        : await host.playVoiceReplyInOrder({
          session,
          settings: params.settings,
          spokenText: playbackPlan.spokenText,
          playbackSteps: playbackPlan.steps,
          source: playbackSource,
          preferRealtimeUtterance: useRealtimeTts,
          interruptionPolicy: replyInterruptionPolicy,
          outputLeaseMode: replyOutputLeaseMode,
          latencyContext: replyLatencyContext,
          musicWakeRefreshAfterSpeech: shouldRefreshMusicWakeAfterSpeech,
          // Derive supersede user from resolved addressing:
          // TO:ALL → null (un-supersedable). TO:specific or no addressing → triggering speaker.
          replyUserId: generatedVoiceAddressing?.talkingTo === "ALL"
            ? null
            : (replyInterruptionPolicy?.allowedUserId || params.userId || null)
        });
    } finally {
      clearInFlightAcceptedBrainTurn();
    }
  })();
  if (!playbackResult.completed) {
    if (playbackPlan.spokenText) {
      host.recordVoiceTurn(session, {
        role: "assistant",
        userId: host.client.user?.id || null,
        text: `[interrupted] ${playbackPlan.spokenText}`,
        addressing: generatedVoiceAddressing
      });
    }
    host.maybeClearActiveReplyInterruptionPolicy(session);
    return false;
  }

  const requestedRealtimeUtterance = Boolean(playbackResult.requestedRealtimeUtterance);
  if (
    shouldRefreshMusicWakeAfterSpeech
    && playbackResult.spokeLine
    && !requestedRealtimeUtterance
  ) {
    host.schedulePassiveMusicWakeLatchRefresh({ session, settings: params.settings, userId: params.userId || null });
  }
  try {
    const pendingRequestId = Number(session.pendingResponse?.requestId || 0) || null;
    host.logVoiceLatencyStage({
      session,
      userId: host.client.user?.id || null,
      stage: "reply_requested",
      source,
      captureReason: latencyCaptureReason,
      requestId: pendingRequestId,
      queueWaitMs: latencyQueueWaitMs,
      pendingQueueDepth: latencyPendingQueueDepth,
      finalizedAtMs: latencyFinalizedAtMs,
      asrStartedAtMs: latencyAsrStartedAtMs,
      asrCompletedAtMs: latencyAsrCompletedAtMs,
      generationStartedAtMs: generationStartedAt,
      replyRequestedAtMs: replyRequestedAt,
      audioStartedAtMs: 0
    });
    if (playbackResult.spokeLine && !requestedRealtimeUtterance) {
      session.lastAudioDeltaAt = replyRequestedAt;
      session.lastAssistantReplyAt = replyRequestedAt;
    }
    if (playbackPlan.spokenText && !requestedRealtimeUtterance) {
      host.recordVoiceTurn(session, {
        role: "assistant",
        userId: host.client.user?.id || null,
        text: playbackPlan.spokenText,
        addressing: generatedVoiceAddressing
      });
    }
    storeExtractedNotes();
    const promptSizeSummary = (() => {
      if (!replyPrompts || typeof replyPrompts !== "object") return {};
      const rp = replyPrompts as Record<string, unknown>;
      const sysChars = typeof rp.systemPrompt === "string" ? rp.systemPrompt.length : 0;
      const userChars = typeof rp.initialUserPrompt === "string" ? rp.initialUserPrompt.length : 0;
      const toolsArr = Array.isArray(rp.tools) ? rp.tools : [];
      const toolDefChars = JSON.stringify(toolsArr).length;
      return {
        systemPromptChars: sysChars,
        userPromptChars: userChars,
        toolCount: toolsArr.length,
        toolDefinitionChars: toolDefChars,
        totalPromptChars: sysChars + userChars + contextMessageChars + toolDefChars
      };
    })();
    host.store.logAction({
      kind: "voice_runtime",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId: host.client.user?.id || null,
      content: "realtime_reply_requested",
      metadata: {
        sessionId: session.id,
        mode: session.mode,
        source,
        requestId: pendingRequestId,
        replyText: playbackPlan.spokenText || null,
        requestedRealtimeUtterance,
        soundboardRefs: streamedSpeechPlayed ? playedSoundboardRefs : [...playedSoundboardRefs, ...playbackPlan.soundboardRefs],
        playedSoundboardCount: Number(playbackResult.playedSoundboardCount || playedSoundboardRefs.length || 0),
        usedWebSearchFollowup,
        usedScreenShareOffer,
        talkingTo: generatedVoiceAddressing?.talkingTo || null,
        directedConfidence: Number.isFinite(Number(generatedVoiceAddressing?.directedConfidence))
          ? Number(clamp(Number(generatedVoiceAddressing.directedConfidence), 0, 1).toFixed(3))
          : 0,
        leaveVoiceChannelRequested,
        replyPrompts,
        contextTurnsSent: contextMessages.length,
        contextTurnsAvailable: contextTurns.length,
        contextCharsSent: contextMessageChars,
        ...promptSizeSummary
      }
    });
  } catch (error) {
    host.store.logAction({
      kind: "voice_error",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId: host.client.user?.id || null,
      content: `realtime_audio_write_failed: ${String((error as Error)?.message || error)}`,
      metadata: { sessionId: session.id, mode: session.mode, source }
    });
  }

  if (!leaveVoiceChannelRequested || session.ending) {
    return true;
  }

  if (playbackPlan.spokenText && playbackResult.spokeLine) {
    await host.waitForLeaveDirectivePlayback({
      session,
      expectRealtimeAudio: requestedRealtimeUtterance,
      source: `${source}:leave_directive`
    });
  }

  await host.endSession({
    guildId: session.guildId,
    reason: "assistant_leave_directive",
    requestedByUserId: host.client.user?.id || null,
    settings: params.settings,
    announcement: "wrapping up vc."
  }).catch((error: unknown) => {
    host.store.logAction({
      kind: "voice_error",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId: host.client.user?.id || null,
      content: `assistant_leave_directive_failed: ${String((error as Error)?.message || error)}`,
      metadata: { sessionId: session.id, mode: session.mode, source }
    });
  });

  return true;
}