src/voice/captureManager.ts

import {
  appendAudioToAsr,
  beginAsrUtterance,
  commitAsrUtterance,
  discardAsrUtterance,
  getOrCreatePerUserAsrState,
  getOrCreateSharedAsrState,
  releaseSharedAsrActiveUser,
  scheduleAsrIdleClose,
  tryHandoffSharedAsr,
  type AsrBridgeDeps,
  type AsrCommitResult,
  type AsrUtteranceState
} from "./voiceAsrBridge.ts";
import {
  ACTIVITY_TOUCH_THROTTLE_MS,
  CAPTURE_IDLE_FLUSH_MS,
  CAPTURE_MAX_DURATION_MS,
  CAPTURE_NEAR_SILENCE_ABORT_ACTIVE_RATIO_MAX,
  CAPTURE_NEAR_SILENCE_ABORT_MIN_AGE_MS,
  CAPTURE_NEAR_SILENCE_ABORT_PEAK_MAX,
  INPUT_SPEECH_END_SILENCE_MS,
  OPENAI_ASR_BRIDGE_MAX_WAIT_MS,
  STT_TRANSCRIPT_MAX_CHARS,
  VOICE_SILENCE_GATE_ACTIVE_SAMPLE_MIN_ABS
} from "./voiceSessionManager.constants.ts";
import { getMusicWakeFollowupState } from "./musicWakeLatch.ts";
import { inspectAsrTranscript, isRealtimeMode, normalizeVoiceText } from "./voiceSessionHelpers.ts";
import type { BargeInController, BargeInDecisionEvaluation } from "./bargeInController.ts";
import type { DeferredActionQueue } from "./deferredActionQueue.ts";
import type { TurnProcessor } from "./turnProcessor.ts";
import type { CaptureState, VoiceSession } from "./voiceSessionTypes.ts";
import { buildRuntimeDecisionCorrelation } from "../services/runtimeCorrelation.ts";

type CaptureManagerSettings = Record<string, unknown> | null;

interface CaptureSignalMetrics {
  sampleCount: number;
  activeSampleRatio: number;
  peak: number;
  rms: number;
}

interface PcmSilenceGateResult extends CaptureSignalMetrics {
  clipDurationMs: number;
  drop: boolean;
}

type CaptureManagerStoreLike = {
  logAction: (entry: {
    kind: string;
    guildId?: string | null;
    channelId?: string | null;
    userId?: string | null;
    content: string;
    metadata?: Record<string, unknown>;
  }) => void;
};

interface CaptureManagerHost {
  client: { user?: { id?: string | null; } | null; };
  store: CaptureManagerStoreLike;
  bargeInController: Pick<BargeInController, "getCaptureSignalMetrics" | "evaluateBargeInDecision">;
  turnProcessor: Pick<TurnProcessor, "queueRealtimeTurn" | "queueFileAsrTurn">;
  shouldUsePerUserTranscription: (args: { session: VoiceSession; settings?: CaptureManagerSettings; }) => boolean;
  shouldUseFileTurnTranscription: (args: { session: VoiceSession; settings?: CaptureManagerSettings; }) => boolean;
  shouldUseSharedTranscription: (args: { session: VoiceSession; settings?: CaptureManagerSettings; }) => boolean;
  shouldUseTranscriptOverlapInterrupts: (args: { session: VoiceSession; settings?: CaptureManagerSettings; }) => boolean;
  buildAsrBridgeDeps: (session: VoiceSession) => AsrBridgeDeps;
  hasDeferredTurnBlockingActiveCapture: (session: VoiceSession) => boolean;
  flushHeldRoomCoalesceTurns: (session: VoiceSession, reason?: string) => void;
  deferredActionQueue: Pick<DeferredActionQueue, "recheckDeferredVoiceActions">;
  drainPendingRealtimeAssistantUtterances: (session: VoiceSession, reason?: string) => boolean;
  hasCaptureBeenPromoted: (capture: CaptureState) => boolean;
  resolveCaptureTurnPromotionReason: (args: { session: VoiceSession; capture: CaptureState; }) => string | null;
  hasCaptureServerVadSpeech: (args: { session: VoiceSession; capture: CaptureState; }) => boolean;
  touchActivity: (guildId: string, settings?: CaptureManagerSettings) => void;
  isCaptureConfirmedLiveSpeech: (args: { session: VoiceSession; capture: CaptureState; }) => boolean;
  interruptBotSpeechForBargeIn: (args: {
    session: VoiceSession;
    userId?: string | null;
    source?: string;
    minCaptureBytes?: number;
    captureState?: CaptureState | null;
  }) => boolean;
  ensurePendingSpeechStartedInterruptFromLocalCapture: (args: {
    session: VoiceSession;
    userId?: string | null;
    captureState?: CaptureState | null;
    source?: string;
  }) => boolean;
  evaluatePcmSilenceGate: (args: { pcmBuffer: Buffer; sampleRateHz?: number; }) => PcmSilenceGateResult;
  estimatePcm16MonoDurationMs: (pcmByteLength: number, sampleRateHz?: number) => number;
  queueRealtimeTurnFromAsrBridge: (args: {
    session: VoiceSession;
    userId: string;
    pcmBuffer?: Buffer | null;
    captureReason?: string;
    finalizedAt?: number;
    musicWakeFollowupEligibleAtCapture?: boolean;
    bridgeUtteranceId?: number | null;
    asrResult?: AsrCommitResult | null;
    source?: string;
    serverVadConfirmed?: boolean;
  }) => boolean;
  handoffInterruptedTurnToVoiceBrain: (args: {
    session: VoiceSession;
    reason?: string;
    userId?: string | null;
    source?: string;
    bridgeUtteranceId?: number | null;
  }) => boolean;
}

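/**
 * Manages per-user inbound voice captures: subscribes to subprocess PCM,
 * promotes provisional captures into real turns, gates barge-in decisions,
 * and routes finalized audio into the ASR/turn-processing pipeline.
 */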
export class CaptureManager {
constructor(private readonly host: CaptureManagerHost) {}

private shouldLogBargeInGate(evaluation: BargeInDecisionEvaluation) {
  return (
    evaluation.outputState.locked ||
    evaluation.outputState.pendingResponse ||
    evaluation.outputState.openAiActiveResponse ||
    evaluation.outputState.botTurnOpen ||
    evaluation.outputState.bufferedBotSpeech ||
    evaluation.liveAudioStreaming
  );
}

private logBargeInGateDecision({
  session,
  userId,
  source = "speaking_data",
  captureState,
  evaluation,
  allow,
  reason,
  transcriptOverlapInterruptsEnabled
}: {
  session: VoiceSession;
  userId: string;
  source?: string;
  captureState: CaptureState;
  evaluation: BargeInDecisionEvaluation;
  allow: boolean;
  reason: string;
  transcriptOverlapInterruptsEnabled: boolean;
}) {
  const now = Date.now();
  captureState.bargeInGateLoggedAt = now;
  this.host.store.logAction({
    kind: "voice_runtime",
    guildId: session.guildId,
    channelId: session.textChannelId,
    userId,
    content: "voice_barge_in_gate",
    metadata: {
      ...buildRuntimeDecisionCorrelation({
        botId: this.host.client.user?.id || null,
        sessionId: session.id,
        source,
        stage: "barge_in",
        allow,
        reason
      }),
      baseAllow: evaluation.allowed,
      baseReason: evaluation.reason,
      pendingRequestId: evaluation.pendingRequestId,
      minCaptureBytes: evaluation.minCaptureBytes,
      captureUtteranceId: Math.max(0, Number(captureState.asrUtteranceId || 0)) || null,
      captureStartedAt: Math.max(0, Number(captureState.startedAt || 0)) || null,
      capturePromotedAt: Math.max(0, Number(captureState.promotedAt || 0)) || null,
      captureAgeMs: evaluation.captureAgeMs,
      captureBytesSent: evaluation.captureBytesSent,
      promotionReason: String(captureState.promotionReason || "").trim() || null,
      transcriptOverlapInterruptsEnabled,
      signalPeak: evaluation.signal.peak,
      signalRms: evaluation.signal.rms,
      signalActiveSampleRatio: evaluation.signal.activeSampleRatio,
      outputLocked: evaluation.outputState.locked,
      outputLockReason: evaluation.outputState.lockReason,
      outputMusicActive: evaluation.outputState.musicActive,
      outputBargeInSuppressed: evaluation.outputState.bargeInSuppressed,
      outputBotTurnOpen: evaluation.outputState.botTurnOpen,
      outputBufferedBotSpeech: evaluation.outputState.bufferedBotSpeech,
      outputPendingResponse: evaluation.outputState.pendingResponse,
      outputOpenAiActiveResponse: evaluation.outputState.openAiActiveResponse,
      interruptionPolicyScope: evaluation.interruptionPolicy?.scope || null,
      interruptionPolicyAllowedUserId: evaluation.interruptionPolicy?.allowedUserId || null,
      interruptionPolicySource: evaluation.interruptionPolicy?.source || null,
      interruptionPolicyReason: evaluation.interruptionPolicy?.reason || null
    }
  });
}

startInboundCapture({
  session,
  userId,
  settings = session?.settingsSnapshot
}: {
  session: VoiceSession;
  userId: string;
  settings?: CaptureManagerSettings;
}) {
if (!session || !userId) return;
if (session.userCaptures.has(userId)) return;
const useOpenAiPerUserAsr = this.host.shouldUsePerUserTranscription({ session, settings });
const useOpenAiSharedAsr = this.host.shouldUseSharedTranscription({ session, settings });

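// Realtime sessions capture at session.realtimeInputSampleRateHz (default
// 24 kHz); everything else uses 24 kHz mono PCM16. The subprocess subscription
// carries the speech-end silence window and sample rate up front.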
const sampleRate = isRealtimeMode(session.mode) ? Number(session.realtimeInputSampleRateHz) || 24000 : 24000;
session.voxClient?.subscribeUser(userId, INPUT_SPEECH_END_SILENCE_MS, sampleRate);

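// Fresh capture accumulator. Signal counters are maintained incrementally per
// chunk so mid-capture checks (barge-in, near-silence abort) read running
// totals instead of rescanning PCM.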
const captureState: CaptureState = {
  userId,
  startedAt: Date.now(),
  promotedAt: 0,
  promotionReason: null,
  bargeInGateLoggedAt: 0,
  musicWakeFollowupEligibleAtPromotion: false,
  asrUtteranceId: 0,
  bytesSent: 0,
  signalSampleCount: 0,
  signalActiveSampleCount: 0,
  signalPeakAbs: 0,
  signalSumSquares: 0,
  pcmChunks: [],
  sharedAsrBytesSent: 0,
  lastActivityTouchAt: 0,
  idleFlushTimer: null,
  maxFlushTimer: null,
  speakingEndFinalizeTimer: null,
  finalize: null,
  abort: null,
  removeSubprocessListeners: null
};

session.userCaptures.set(userId, captureState);
const asrDeps = this.host.buildAsrBridgeDeps(session);
const beginSharedAsrUtterance = (targetUserId: string) =>
  beginAsrUtterance("shared", session, asrDeps, settings || null, targetUserId);
const appendToSharedAsr = (targetUserId: string, pcmChunk: Buffer) =>
  appendAudioToAsr("shared", session, asrDeps, settings || null, targetUserId, pcmChunk);
const scheduleSharedAsrIdleClose = () => {
  scheduleAsrIdleClose("shared", session, asrDeps, "");
};
const releaseSharedAsrUser = (targetUserId: string | null = userId) => {
  releaseSharedAsrActiveUser(session, targetUserId);
};
const tryHandoffSharedAsrToWaitingCapture = () => {
  const asrState = getOrCreateSharedAsrState(session);
  return tryHandoffSharedAsr({
    session,
    asrState,
    deps: asrDeps,
    settings: settings || null,
    beginUtterance: beginSharedAsrUtterance,
    appendAudio: appendToSharedAsr,
    releaseUser: (targetUserId) => releaseSharedAsrUser(targetUserId)
  });
};

if (useOpenAiPerUserAsr) {
  beginAsrUtterance("per_user", session, asrDeps, settings || null, userId);
  const asrState = getOrCreatePerUserAsrState(session, userId);
  captureState.asrUtteranceId = Math.max(0, Number(asrState?.utterance?.id || 0));
}

const cleanupCapture = (reason?: string) => {
  const current = session.userCaptures.get(userId);
  if (!current) return;
  session.userCaptures.delete(userId);

  if (current.idleFlushTimer) {
    clearTimeout(current.idleFlushTimer);
  }
  if (current.maxFlushTimer) {
    clearTimeout(current.maxFlushTimer);
  }
  if (current.speakingEndFinalizeTimer) {
    clearTimeout(current.speakingEndFinalizeTimer);
  }
  try {
    current.removeSubprocessListeners?.();
  } catch {
    // ignore
  }
  // Room-coalesce: if this was the last active capture, flush any held turns.
  // Skip for max_duration — the user is still speaking; a new capture will
  // start and eventually finalize with a real speech-end reason.
  const isRealSpeechEnd = reason !== "max_duration";
  if (
    isRealSpeechEnd &&
    session.userCaptures.size <= 0 &&
    typeof this.host.flushHeldRoomCoalesceTurns === "function"
  ) {
    this.host.flushHeldRoomCoalesceTurns(session, "last_capture_finalized");
  }
};

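// Deferred actions and queued assistant speech are only released once no
// remaining active capture could still produce a blocking turn.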
const maybeTriggerDeferredActions = (reason = "capture_resolved") => {
  if (!this.host.hasDeferredTurnBlockingActiveCapture(session)) {
    this.host.deferredActionQueue.recheckDeferredVoiceActions({ session, reason });
  }
};

const maybeDrainQueuedAssistantSpeechAfterNoTurn = (reason = "capture_resolved") => {
  if (this.host.hasDeferredTurnBlockingActiveCapture(session)) return;
  this.host.drainPendingRealtimeAssistantUtterances(session, reason);
};

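// Shared ASR only hears a capture once it is promoted; replaying the buffered
// chunks lets the transcription stream start from the first audio received.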
const appendBufferedCaptureToAsr = () => {
  if (useOpenAiPerUserAsr || !useOpenAiSharedAsr) return;
  let appendedBytes = 0;
  for (const chunk of captureState.pcmChunks) {
    if (!Buffer.isBuffer(chunk) || !chunk.length) continue;
    const appended = appendToSharedAsr(userId, chunk);
    if (appended) appendedBytes += chunk.length;
  }
  if (appendedBytes > 0) {
    captureState.sharedAsrBytesSent =
      Math.max(0, Number(captureState.sharedAsrBytesSent || 0)) + appendedBytes;
  }
};

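// Promotion upgrades a provisional capture into a real turn: it stamps
// music-wake follow-up eligibility, opens the shared ASR utterance, flushes
// buffered audio into it, and logs the turn start.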
const promoteCapture = (now = Date.now()) => {
  if (this.host.hasCaptureBeenPromoted(captureState)) return true;
  const promotionReason = this.host.resolveCaptureTurnPromotionReason({
    session,
    capture: captureState
  });
  if (!promotionReason) return false;
  const signal = this.host.bargeInController.getCaptureSignalMetrics(captureState);
  captureState.promotedAt = now;
  captureState.promotionReason = String(promotionReason);
  captureState.musicWakeFollowupEligibleAtPromotion =
    getMusicWakeFollowupState(session, userId, now).passiveWakeFollowupAllowed;
  if (useOpenAiSharedAsr) {
    beginSharedAsrUtterance(userId);
  }
  appendBufferedCaptureToAsr();
  session.lastInboundAudioAt = now;
  captureState.lastActivityTouchAt = now;
  this.host.touchActivity(session.guildId, settings);
  this.host.store.logAction({
    kind: "voice_turn_in",
    guildId: session.guildId,
    channelId: session.textChannelId,
    userId,
    content: "voice_activity_started",
    metadata: {
      sessionId: session.id,
      promotionReason: captureState.promotionReason,
      promotionDelayMs: Math.max(0, now - Number(captureState.startedAt || now)),
      promotionBytes: Math.max(0, Number(captureState.bytesSent || 0)),
      promotionServerVadConfirmed: this.host.hasCaptureServerVadSpeech({
        session,
        capture: captureState
      }),
      promotionPeak: signal.peak,
      promotionRms: signal.rms,
      promotionActiveSampleRatio: signal.activeSampleRatio
    }
  });
  return true;
};

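// Finalization fans out by outcome: never-promoted or empty captures are
// dropped, near-silent clips are gated, and surviving audio is routed to file
// ASR, the realtime ASR bridge, or a direct realtime turn.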
let captureFinalized = false;
const finalizeUserTurn = (reason = "stream_end") => {
  if (captureFinalized) return;
  captureFinalized = true;
  const finalizedAt = Date.now();
  const captureDurationMs = Math.max(0, finalizedAt - captureState.startedAt);
  const signal = this.host.bargeInController.getCaptureSignalMetrics(captureState);

  if (
    !this.host.hasCaptureBeenPromoted(captureState) &&
    Number(captureState.bytesSent || 0) > 0 &&
    !session.ending
  ) {
    this.host.store.logAction({
      kind: "voice_runtime",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId,
      content: "voice_turn_dropped_provisional_capture",
      metadata: {
        sessionId: session.id,
        reason: String(reason || "stream_end"),
        bytesSent: Number(captureState.bytesSent || 0),
        durationMs: captureDurationMs,
        peak: signal.peak,
        rms: signal.rms,
        activeSampleRatio: signal.activeSampleRatio
      }
    });
    cleanupCapture(reason);
    maybeTriggerDeferredActions(String(reason || "stream_end"));
    maybeDrainQueuedAssistantSpeechAfterNoTurn(String(reason || "stream_end"));
    if (useOpenAiPerUserAsr) {
      discardAsrUtterance("per_user", session, userId);
      scheduleAsrIdleClose("per_user", session, asrDeps, userId);
    } else if (useOpenAiSharedAsr) {
      releaseSharedAsrUser();
      if (!tryHandoffSharedAsrToWaitingCapture()) {
        scheduleSharedAsrIdleClose();
      }
    }
    return;
  }

  this.host.store.logAction({
    kind: "voice_runtime",
    guildId: session.guildId,
    channelId: session.textChannelId,
    userId,
    content: "voice_turn_finalized",
    metadata: {
      sessionId: session.id,
      reason: String(reason || "stream_end"),
      bytesSent: captureState.bytesSent,
      durationMs: captureDurationMs
    }
  });

  if (captureState.bytesSent <= 0 || session.ending) {
    this.host.store.logAction({
      kind: "voice_runtime",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId,
      content: "voice_turn_skipped_empty_capture",
      metadata: {
        sessionId: session.id,
        reason: String(reason || "stream_end"),
        bytesSent: Number(captureState.bytesSent || 0),
        ending: Boolean(session.ending)
      }
    });
    cleanupCapture(reason);
    maybeTriggerDeferredActions(String(reason || "stream_end"));
    maybeDrainQueuedAssistantSpeechAfterNoTurn(String(reason || "stream_end"));
    if (useOpenAiPerUserAsr) {
      discardAsrUtterance("per_user", session, userId);
      scheduleAsrIdleClose("per_user", session, asrDeps, userId);
    } else if (useOpenAiSharedAsr) {
      releaseSharedAsrUser();
      if (!tryHandoffSharedAsrToWaitingCapture()) {
        scheduleSharedAsrIdleClose();
      }
    }
    return;
  }

  const pcmBuffer = Buffer.concat(captureState.pcmChunks);
  const sampleRateHz = isRealtimeMode(session.mode)
    ? Number(session.realtimeInputSampleRateHz) || 24000
    : 24000;
  const silenceGate = this.host.evaluatePcmSilenceGate({ pcmBuffer, sampleRateHz });
  const audioDurationMs = silenceGate.clipDurationMs;
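  // Burst heuristic: a clip holding 4x+ more audio than the wall-clock capture
  // window suggests the subprocess flushed buffered audio all at once.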
  const isBurstArtifact = audioDurationMs > 200 && captureDurationMs < audioDurationMs * 0.25;

  if (silenceGate.drop) {
    this.host.store.logAction({
      kind: "voice_runtime",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId,
      content: "voice_turn_dropped_silence_gate",
      metadata: {
        sessionId: session.id,
        reason: String(reason || "stream_end"),
        bytesSent: captureState.bytesSent,
        captureDurationMs,
        audioDurationMs,
        rms: silenceGate.rms,
        peak: silenceGate.peak,
        activeSampleRatio: silenceGate.activeSampleRatio,
        isBurstArtifact,
        silenceGateDrop: silenceGate.drop
      }
    });
    cleanupCapture(reason);
    maybeTriggerDeferredActions(String(reason || "stream_end"));
    maybeDrainQueuedAssistantSpeechAfterNoTurn(String(reason || "stream_end"));
    if (useOpenAiPerUserAsr) {
      discardAsrUtterance("per_user", session, userId);
      scheduleAsrIdleClose("per_user", session, asrDeps, userId);
    } else if (useOpenAiSharedAsr) {
      releaseSharedAsrUser();
      // Match the other terminal paths: offer the shared ASR stream to any
      // waiting capture before letting it idle-close.
      if (!tryHandoffSharedAsrToWaitingCapture()) {
        scheduleSharedAsrIdleClose();
      }
    }
    return;
  }

  if (isBurstArtifact) {
    this.host.store.logAction({
      kind: "voice_runtime",
      guildId: session.guildId,
      channelId: session.textChannelId,
      userId,
      content: "voice_turn_burst_artifact_processed",
      metadata: {
        sessionId: session.id,
        reason: String(reason || "stream_end"),
        bytesSent: captureState.bytesSent,
        captureDurationMs,
        audioDurationMs
      }
    });
  }

  cleanupCapture(reason);
  if (this.host.shouldUseFileTurnTranscription({ session, settings })) {
    this.host.turnProcessor.queueFileAsrTurn({
      session,
      userId,
      pcmBuffer,
      captureReason: reason
    });
    return;
  }

  if (useOpenAiPerUserAsr || useOpenAiSharedAsr) {
    void this.runAsrBridgeCommit({
      session,
      userId,
      settings,
      captureState,
      pcmBuffer,
      captureReason: reason,
      finalizedAt,
      useOpenAiPerUserAsr,
      useOpenAiSharedAsr
    });
    return;
  }
  this.host.turnProcessor.queueRealtimeTurn({
    session,
    userId,
    pcmBuffer,
    captureReason: reason,
    finalizedAt,
    musicWakeFollowupEligibleAtCapture: captureState.musicWakeFollowupEligibleAtPromotion,
    serverVadConfirmed: this.host.hasCaptureServerVadSpeech({ session, capture: captureState })
  });
};

const scheduleIdleFlush = () => {
  if (captureState.idleFlushTimer) {
    clearTimeout(captureState.idleFlushTimer);
  }
  captureState.idleFlushTimer = setTimeout(() => {
    finalizeUserTurn("idle_timeout");
  }, CAPTURE_IDLE_FLUSH_MS);
};

const onUserAudio = (audioUserId: string, pcmPayload: Buffer | string) => {
  if (String(audioUserId || "") !== userId) return;
  const now = Date.now();
  // Subprocess payloads arrive either as raw PCM buffers or base64 strings.
  let normalizedPcm: Buffer;
  try {
    if (Buffer.isBuffer(pcmPayload)) {
      normalizedPcm = pcmPayload;
    } else {
      normalizedPcm = Buffer.from(String(pcmPayload || ""), "base64");
    }
  } catch (error) {
    this.host.store.logAction({
      kind: "voice_warn",
      guildId: session.guildId,
      channelId: session.textChannelId,
      content: "invalid_pcm_base64_from_subprocess",
      metadata: {
        userId,
        error: error instanceof Error ? error.message : String(error)
      }
    });
    return;
  }
  if (!normalizedPcm.length) return;
  captureState.bytesSent += normalizedPcm.length;
  const sampleCount = Math.floor(normalizedPcm.length / 2);
  if (sampleCount > 0) {
    let peakAbs = Math.max(0, Number(captureState.signalPeakAbs || 0));
    let activeSamples = 0;
    let sumSquares = Math.max(0, Number(captureState.signalSumSquares || 0));
    for (let offset = 0; offset + 1 < normalizedPcm.length; offset += 2) {
      const sample = normalizedPcm.readInt16LE(offset);
      const absSample = Math.abs(sample);
      if (absSample > peakAbs) peakAbs = absSample;
      if (absSample >= VOICE_SILENCE_GATE_ACTIVE_SAMPLE_MIN_ABS) {
        activeSamples += 1;
      }
      sumSquares += sample * sample;
    }
    captureState.signalSampleCount = Math.max(0, Number(captureState.signalSampleCount || 0)) + sampleCount;
    captureState.signalActiveSampleCount =
      Math.max(0, Number(captureState.signalActiveSampleCount || 0)) + activeSamples;
    captureState.signalPeakAbs = peakAbs;
    captureState.signalSumSquares = sumSquares;
  }
  captureState.pcmChunks.push(normalizedPcm);
  if (useOpenAiPerUserAsr) {
    appendAudioToAsr("per_user", session, asrDeps, settings || null, userId, normalizedPcm);
  }
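  // A capture promoted during this very callback has already forwarded this
  // chunk via appendBufferedCaptureToAsr, so live forwarding only applies when
  // promotion happened on an earlier chunk (wasPromoted).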
  const wasPromoted = this.host.hasCaptureBeenPromoted(captureState);
  if (!wasPromoted) {
    promoteCapture(now);
  }
  const isPromoted = this.host.hasCaptureBeenPromoted(captureState);
  if (isPromoted && wasPromoted && useOpenAiSharedAsr) {
    const appendedToSharedAsr = appendToSharedAsr(userId, normalizedPcm);
    if (appendedToSharedAsr) {
      captureState.sharedAsrBytesSent =
        Math.max(0, Number(captureState.sharedAsrBytesSent || 0)) + normalizedPcm.length;
    }
  }
  if (captureState.speakingEndFinalizeTimer) {
    clearTimeout(captureState.speakingEndFinalizeTimer);
    captureState.speakingEndFinalizeTimer = null;
  }
  scheduleIdleFlush();

  if (isPromoted) {
    session.lastInboundAudioAt = now;
    if (
      this.host.isCaptureConfirmedLiveSpeech({ session, capture: captureState }) &&
      now - captureState.lastActivityTouchAt >= ACTIVITY_TOUCH_THROTTLE_MS
    ) {
      this.host.touchActivity(session.guildId, settings);
      captureState.lastActivityTouchAt = now;
    }
  }

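  // Barge-in gate: in transcript-overlap mode the immediate audio-energy
  // interrupt is vetoed and overlap handling arms a pending interrupt instead.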
  const bargeEvaluation = this.host.bargeInController.evaluateBargeInDecision({
    session,
    userId,
    captureState
  });
  const transcriptOverlapInterruptsEnabled = this.host.shouldUseTranscriptOverlapInterrupts({
    session,
    settings: settings || null
  });
  let bargeInAllowed = bargeEvaluation.allowed;
  let bargeInReason: string = bargeEvaluation.reason;
  if (bargeInAllowed && transcriptOverlapInterruptsEnabled) {
    bargeInAllowed = false;
    bargeInReason = "transcript_overlap_interrupts_enabled";
  }
  if (
    !wasPromoted &&
    isPromoted &&
    Number(captureState.bargeInGateLoggedAt || 0) <= 0 &&
    this.shouldLogBargeInGate(bargeEvaluation)
  ) {
    this.logBargeInGateDecision({
      session,
      userId,
      source: "speaking_data",
      captureState,
      evaluation: bargeEvaluation,
      allow: bargeInAllowed,
      reason: bargeInReason,
      transcriptOverlapInterruptsEnabled
    });
  }
  // bargeInAllowed already folds the transcript-overlap veto into the base
  // evaluation, so it is the single gate for immediate interruption.
  if (bargeInAllowed) {
    this.host.interruptBotSpeechForBargeIn({
      session,
      userId,
      source: "speaking_data",
      minCaptureBytes: bargeEvaluation.minCaptureBytes || 0,
      captureState
    });
  } else if (transcriptOverlapInterruptsEnabled && isPromoted) {
    this.host.ensurePendingSpeechStartedInterruptFromLocalCapture({
      session,
      userId,
      captureState,
      source: "local_capture_overlap"
    });
  }

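  // Near-silence early abort: an old-enough capture that still shows almost no
  // active samples and a low peak is finalized now instead of waiting for the
  // idle flush.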
  const captureAgeMs = Math.max(0, now - Number(captureState.startedAt || now));
  const signalSampleCount = Math.max(0, Number(captureState.signalSampleCount || 0));
  if (captureAgeMs >= CAPTURE_NEAR_SILENCE_ABORT_MIN_AGE_MS && signalSampleCount > 0) {
    const activeSampleCount = Math.max(0, Number(captureState.signalActiveSampleCount || 0));
    const activeSampleRatio = activeSampleCount / signalSampleCount;
    const peak = Math.max(0, Number(captureState.signalPeakAbs || 0)) / 32768;
    if (
      activeSampleRatio <= CAPTURE_NEAR_SILENCE_ABORT_ACTIVE_RATIO_MAX &&
      peak <= CAPTURE_NEAR_SILENCE_ABORT_PEAK_MAX
    ) {
      finalizeUserTurn("near_silence_early_abort");
    }
  }
};

captureState.finalize = finalizeUserTurn;
captureState.abort = (reason = "capture_suppressed") => {
  if (captureFinalized) return;
  captureFinalized = true;
  this.host.store.logAction({
    kind: "voice_runtime",
    guildId: session.guildId,
    channelId: session.textChannelId,
    userId,
    content: "voice_turn_dropped",
    metadata: {
      sessionId: session.id,
      reason: String(reason || "capture_suppressed"),
      bytesSent: captureState.bytesSent,
      durationMs: Math.max(0, Date.now() - captureState.startedAt)
    }
  });
  cleanupCapture(reason);
  maybeTriggerDeferredActions(String(reason || "capture_suppressed"));
  maybeDrainQueuedAssistantSpeechAfterNoTurn(String(reason || "capture_suppressed"));
  if (useOpenAiPerUserAsr) {
    scheduleAsrIdleClose("per_user", session, asrDeps, userId);
  } else if (useOpenAiSharedAsr) {
    releaseSharedAsrUser();
    if (!tryHandoffSharedAsrToWaitingCapture()) {
      scheduleSharedAsrIdleClose();
    }
  }
};
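// Hard duration cap; cleanupCapture skips the room-coalesce flush for
// max_duration because a follow-on capture is expected to start immediately.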
captureState.maxFlushTimer = setTimeout(() => {
  finalizeUserTurn("max_duration");
}, CAPTURE_MAX_DURATION_MS);

const onUserAudioEnd = (audioUserId: string) => {
  if (String(audioUserId || "") !== userId) return;
  if (!captureFinalized) {
    finalizeUserTurn("stream_end");
  }
};

let listenersRemoved = false;
const removeListeners = () => {
  if (listenersRemoved) return;
  listenersRemoved = true;
  session.voxClient?.off("userAudio", onUserAudio);
  session.voxClient?.off("userAudioEnd", onUserAudioEnd);
};
captureState.removeSubprocessListeners = removeListeners;

if (session.voxClient) {
  session.voxClient.on("userAudio", onUserAudio);
  session.voxClient.on("userAudioEnd", onUserAudioEnd);
  session.cleanupHandlers.push(removeListeners);
}

}

private async runAsrBridgeCommit({
  session,
  userId,
  settings,
  captureState,
  pcmBuffer,
  captureReason,
  finalizedAt,
  useOpenAiPerUserAsr,
  useOpenAiSharedAsr
}: {
  session: VoiceSession;
  userId: string;
  settings?: CaptureManagerSettings;
  captureState: CaptureState;
  pcmBuffer: Buffer;
  captureReason: string;
  finalizedAt: number;
  useOpenAiPerUserAsr: boolean;
  useOpenAiSharedAsr: boolean;
}) {
const asrMode = useOpenAiPerUserAsr ? "per_user" : "shared";
const asrSource = asrMode;
const asrDeps = this.host.buildAsrBridgeDeps(session);
// Per-user ASR may roll into a new utterance while this commit is in flight;
// pin the utterance object that belongs to this capture.
const committedPerUserUtterance: AsrUtteranceState | null = useOpenAiPerUserAsr
  ? getOrCreatePerUserAsrState(session, userId)?.utterance || null
  : null;

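// If promotion never forwarded audio to the shared ASR stream there is nothing
// to commit; fall back to a plain realtime turn built from the raw PCM.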
if (useOpenAiSharedAsr) {
  const hasSharedAsrAudio = Math.max(0, Number(captureState.sharedAsrBytesSent || 0)) > 0;
  if (!hasSharedAsrAudio) {
    this.host.turnProcessor.queueRealtimeTurn({
      session,
      userId,
      pcmBuffer,
      captureReason,
      finalizedAt,
      musicWakeFollowupEligibleAtCapture: captureState.musicWakeFollowupEligibleAtPromotion,
      serverVadConfirmed: this.host.hasCaptureServerVadSpeech({ session, capture: captureState })
    });
    return;
  }
  const sharedAsrState = getOrCreateSharedAsrState(session);
  if (sharedAsrState) {
    sharedAsrState.phase = "committing";
  }
}

const asrBridgeMaxWaitMs = Math.max(120, Number(OPENAI_ASR_BRIDGE_MAX_WAIT_MS) || 700);
let bridgeForwarded = false;
let latestForwardedTranscript = "";
const forwardAsrBridgeTurn = (
  asrResult: AsrCommitResult | null,
  source: string,
  allowRevision = false
) => {
  if (session.ending) return false;
  const nextTranscript = inspectAsrTranscript(
    asrResult?.transcript || "",
    STT_TRANSCRIPT_MAX_CHARS
  ).transcript;
  if (!nextTranscript) return false;
  if (bridgeForwarded && !allowRevision) return false;
  if (bridgeForwarded && nextTranscript === latestForwardedTranscript) return false;
  const queued = this.host.queueRealtimeTurnFromAsrBridge({
    session,
    userId,
    pcmBuffer,
    captureReason,
    finalizedAt,
    musicWakeFollowupEligibleAtCapture: captureState.musicWakeFollowupEligibleAtPromotion,
    bridgeUtteranceId: Math.max(0, Number(captureState.asrUtteranceId || 0)) || null,
    asrResult,
    source,
    serverVadConfirmed: this.host.hasCaptureServerVadSpeech({ session, capture: captureState })
  });
  if (queued) {
    bridgeForwarded = true;
    latestForwardedTranscript = nextTranscript;
  }
  return queued;
};

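// The fallback timer is telemetry-only: it flags a slow bridge commit, while
// actual forwarding happens once the awaited commit resolves or throws.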
const fallbackTimer = setTimeout(() => {
  if (bridgeForwarded || session.ending) return;
  this.host.store.logAction({
    kind: "voice_runtime",
    guildId: session.guildId,
    channelId: session.textChannelId,
    userId,
    content: "openai_realtime_asr_bridge_timeout_fallback",
    metadata: {
      sessionId: session.id,
      captureReason: String(captureReason || "stream_end"),
      source: asrSource,
      waitMs: asrBridgeMaxWaitMs
    }
  });
}, asrBridgeMaxWaitMs);

try {
  const asrResult = asrMode === "per_user"
    ? await commitAsrUtterance("per_user", asrDeps, settings || null, userId, captureReason)
    : await commitAsrUtterance("shared", asrDeps, settings || null, userId, captureReason);
  clearTimeout(fallbackTimer);
  const commitTranscript = inspectAsrTranscript(
    asrResult?.transcript || "",
    STT_TRANSCRIPT_MAX_CHARS
  ).transcript;

  if (commitTranscript) {
    forwardAsrBridgeTurn(asrResult, asrSource);
  }

  if (!session.ending) {
    const lateAsrState = asrMode === "per_user"
      ? getOrCreatePerUserAsrState(session, userId)
      : getOrCreateSharedAsrState(session);
    // Per-user ASR can roll into a new live utterance before the previous
    // committed utterance finishes delivering late final segments. Keep
    // watching the committed utterance object so we don't lose the real
    // transcript just because a fresh provisional capture started.
    const trackedUtterance = asrMode === "per_user"
      ? committedPerUserUtterance
      : lateAsrState?.utterance || null;
    if (trackedUtterance) {
      const lateWatchStartedAt = Date.now();
      const lateDeadlineMs = lateWatchStartedAt + 1500;
      while (Date.now() < lateDeadlineMs && !session.ending) {
        await new Promise((resolve) => setTimeout(resolve, 80));
        if (asrMode !== "per_user" && lateAsrState?.utterance !== trackedUtterance) break;
        const lateFinal = normalizeVoiceText(
          Array.isArray(trackedUtterance.finalSegments)
            ? trackedUtterance.finalSegments.join(" ")
            : "",
          STT_TRANSCRIPT_MAX_CHARS
        );
        if (lateFinal && lateFinal !== latestForwardedTranscript) {
          const hadForwardedTranscript = Boolean(latestForwardedTranscript);
          const priorTranscriptChars = latestForwardedTranscript.length;
          const lateForwarded = forwardAsrBridgeTurn(
            {
              ...(asrResult || {
                transcript: "",
                asrStartedAtMs: 0,
                asrCompletedAtMs: 0,
                transcriptionModelPrimary: "",
                transcriptionModelFallback: null,
                transcriptionPlanReason: "",
                usedFallbackModel: false,
                captureReason,
                transcriptLogprobs: null
              }),
              transcript: lateFinal
            },
            hadForwardedTranscript
              ? `${asrSource}_late_streaming_revision`
              : `${asrSource}_late_streaming`,
            true
          );
          if (lateForwarded) {
            this.host.store.logAction({
              kind: "voice_runtime",
              guildId: session.guildId,
              channelId: session.textChannelId,
              userId,
              content: hadForwardedTranscript
                ? "openai_realtime_asr_bridge_late_streaming_revised"
                : "openai_realtime_asr_bridge_late_streaming_recovered",
              metadata: {
                sessionId: session.id,
                captureReason: String(captureReason || "stream_end"),
                source: asrSource,
                transcriptChars: lateFinal.length,
                priorTranscriptChars,
                lateWaitMs: Date.now() - lateWatchStartedAt
              }
            });
          }
        }
      }
    }
  }

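  // Last chance: retry forwarding the commit result; if it still yields no
  // transcript, log the drop and hand the turn to the voice brain as unclear.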
  if (!bridgeForwarded) {
    const forwarded = forwardAsrBridgeTurn(asrResult, asrSource);
    if (!forwarded) {
      const clipDurationMs = this.host.estimatePcm16MonoDurationMs(
        pcmBuffer.length,
        Number(session.realtimeInputSampleRateHz) || 24000
      );
      this.host.store.logAction({
        kind: "voice_runtime",
        guildId: session.guildId,
        channelId: session.textChannelId,
        userId,
        content: "openai_realtime_asr_bridge_empty_dropped",
        metadata: {
          sessionId: session.id,
          captureReason: String(captureReason || "stream_end"),
          source: asrSource,
          waitMs: asrBridgeMaxWaitMs,
          pcmBytes: pcmBuffer.length,
          clipDurationMs,
          asrResultAvailable: Boolean(asrResult)
        }
      });
      this.host.handoffInterruptedTurnToVoiceBrain({
        session,
        reason: "empty_asr_bridge_drop",
        userId,
        source: "unclear_empty_asr_bridge_turn",
        bridgeUtteranceId: Math.max(0, Number(captureState.asrUtteranceId || 0)) || null
      });
      this.host.deferredActionQueue.recheckDeferredVoiceActions({
        session,
        reason: "empty_asr_bridge_drop"
      });
      return;
    }
  }
  const lateTranscript = inspectAsrTranscript(
    asrResult?.transcript || "",
    STT_TRANSCRIPT_MAX_CHARS
  ).transcript;
  if (!lateTranscript || lateTranscript === latestForwardedTranscript) return;
  this.host.store.logAction({
    kind: "voice_runtime",
    guildId: session.guildId,
    channelId: session.textChannelId,
    userId,
    content: "openai_realtime_asr_bridge_late_result_ignored",
    metadata: {
      sessionId: session.id,
      captureReason: String(captureReason || "stream_end"),
      source: asrSource,
      transcriptChars: lateTranscript.length
    }
  });
} catch (error) {
  clearTimeout(fallbackTimer);
  this.host.store.logAction({
    kind: "voice_error",
    guildId: session.guildId,
    channelId: session.textChannelId,
    userId,
    content: `openai_realtime_asr_turn_failed: ${String(error instanceof Error ? error.message : error)}`,
    metadata: {
      sessionId: session.id,
      captureReason: String(captureReason || "stream_end")
    }
  });
  forwardAsrBridgeTurn(null, `${asrSource}_error`);
}

}
}