src/voice/voiceJoinFlow.ts

import { randomUUID } from "node:crypto"; import { clamp } from "../utils.ts"; import { createWarmMemoryState } from "./voiceSessionWarmMemory.ts"; import { ClankvoxClient } from "./clankvoxClient.ts"; import { OpenAiRealtimeClient } from "./openaiRealtimeClient.ts"; import { GeminiRealtimeClient } from "./geminiRealtimeClient.ts"; import { XaiRealtimeClient } from "./xaiRealtimeClient.ts"; import { getRealtimeConnectErrorDiagnostics } from "./realtimeClientCore.ts"; import { SOUNDBOARD_MAX_CANDIDATES, isRealtimeMode, resolveVoiceAsrLanguageGuidance, resolveRealtimeProvider, resolveTranscriberProvider, resolveVoiceApiTtsProvider, resolveVoiceRuntimeMode, shortError } from "./voiceSessionHelpers.ts"; import { getPromptBotName } from "../prompts/promptCore.ts"; import { buildVoiceInstructions, resolveRealtimeToolOwnership } from "./voiceConfigResolver.ts"; import { resolveSoundboardCandidates } from "./voiceSoundboard.ts"; import { buildRealtimeFunctionTools, getVoiceMcpServerStatuses } from "./voiceToolCallToolRegistry.ts"; import { providerSupports } from "./voiceModes.ts"; import { createEmptyVoiceLivePromptState } from "./voicePromptState.ts"; import { createNativeDiscordScreenShareState } from "./nativeDiscordScreenShare.ts"; import { createGoLiveStreamState, buildGoLiveStreamStateFromStream, syncPrimaryGoLiveStream, upsertSessionGoLiveStream } from "../selfbot/streamDiscovery.ts"; import { createStreamPublishState } from "./voiceStreamPublish.ts"; import type { VoiceSession } from "./voiceSessionTypes.ts"; import { createAssistantOutputState } from "./assistantOutputState.ts"; import { OPENAI_REALTIME_DEFAULT_SESSION_MODEL, OPENAI_REALTIME_DEFAULT_TRANSCRIPTION_MODEL, normalizeOpenAiRealtimeSessionModel, normalizeOpenAiRealtimeTranscriptionModel } from "./realtimeProviderNormalization.ts"; import { getVoiceChannelPolicy, getVoiceConversationPolicy, getVoiceRuntimeConfig, getVoiceSessionLimits, getVoiceSettings } from "../settings/agentStack.ts"; import { 
sendOperationalMessage } from "./voiceOperationalMessaging.ts";

/** Lowest value the configured max-session-minutes setting is clamped to. */
const MIN_MAX_SESSION_MINUTES = 1;

/** Upper clamp for max-session-minutes when the runtime mode is not "openai_realtime". */
const MAX_MAX_SESSION_MINUTES = 120;

/** Upper clamp for max-session-minutes when the runtime mode is "openai_realtime". */
const OPENAI_REALTIME_MAX_SESSION_MINUTES = 60;

// NOTE(review): the two bounds below are not referenced in the visible part of
// this file; presumably they clamp the inactivity-leave timeout (seconds) — confirm.
const MIN_INACTIVITY_SECONDS = 20;
const MAX_INACTIVITY_SECONDS = 3600;

/**
 * Creates a logging callback that forwards realtime-client log entries to
 * `manager.store.logAction`, tagged with a fixed guild, channel, and bot user.
 *
 * @param manager Object exposing `store.logAction`; `store` is looked up on
 *   every call rather than captured when the logger is created.
 * @param context `guildId`, `channelId`, and `botUserId` copied onto each
 *   logged action (`botUserId` is written as the action's `userId`).
 * @returns A function taking `{ level, event, metadata }`. Entries whose
 *   level is "warn" are logged with kind "voice_error"; every other level is
 *   logged with kind "voice_runtime". The callback itself returns nothing.
 */
function createRealtimeRuntimeLogger(manager, { guildId, channelId, botUserId }) {
  // Only "warn" is escalated to the error kind; all other levels share one kind.
  const toActionKind = (level) => {
    if (level === "warn") {
      return "voice_error";
    }
    return "voice_runtime";
  };

  return (entry) => {
    const { level, event, metadata } = entry;
    manager.store.logAction({
      kind: toActionKind(level),
      guildId,
      channelId,
      userId: botUserId,
      content: event,
      metadata
    });
  };
}

export async function requestJoin(manager, { message, settings, intentConfidence = null }) { if (!message?.guild || !message?.member || !message?.channel) return false;

const guildId = String(message.guild.id); const userId = String(message.author?.id || ""); if (!userId) return false; const voiceSettings = getVoiceSettings(settings); const voiceChannelPolicy = getVoiceChannelPolicy(settings); const voiceSessionLimits = getVoiceSessionLimits(settings); const voiceRuntime = getVoiceRuntimeConfig(settings);

return await manager.withJoinLock(guildId, async () => { if (!voiceSettings.enabled) { await sendOperationalMessage(manager, { channel: message.channel, settings, guildId, channelId: message.channelId, userId, messageId: message.id, event: "voice_join_request", reason: "voice_disabled", details: { voiceEnabled: Boolean(voiceSettings.enabled) }, mustNotify: true }); return true; }

const blockedUsers = [...(voiceChannelPolicy.blockedUserIds || [])].map((value) => String(value));
if (blockedUsers.includes(userId)) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "requester_blocked",
    details: {
      blockedVoiceUserIdsCount: blockedUsers.length
    },
    mustNotify: true
  });
  return true;
}

const memberVoiceChannel = message.member.voice?.channel;
if (!memberVoiceChannel) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "requester_not_in_voice",
    details: {},
    mustNotify: true
  });
  return true;
}

const targetVoiceChannelId = String(memberVoiceChannel.id);
const blockedChannels = [...(voiceChannelPolicy.blockedChannelIds || [])].map((value) => String(value));
const allowedChannels = [...(voiceChannelPolicy.allowedChannelIds || [])].map((value) => String(value));

if (blockedChannels.includes(targetVoiceChannelId)) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "channel_blocked",
    details: {
      targetVoiceChannelId
    },
    mustNotify: true
  });
  return true;
}

if (allowedChannels.length > 0 && !allowedChannels.includes(targetVoiceChannelId)) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "channel_not_allowlisted",
    details: {
      targetVoiceChannelId,
      allowlistedChannelCount: allowedChannels.length
    },
    mustNotify: true
  });
  return true;
}

const maxSessionsPerDay = clamp(Number(voiceSessionLimits.maxSessionsPerDay) || 0, 0, 120);
if (maxSessionsPerDay > 0) {
  const since24h = new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString();
  const startedLastDay = manager.store.countActionsSince("voice_session_start", since24h);
  if (startedLastDay >= maxSessionsPerDay) {
    await sendOperationalMessage(manager, {
      channel: message.channel,
      settings,
      guildId,
      channelId: message.channelId,
      userId,
      messageId: message.id,
      event: "voice_join_request",
      reason: "max_sessions_per_day_reached",
      details: {
        startedLastDay,
        maxSessionsPerDay
      },
      mustNotify: true
    });
    return true;
  }
}

const existing = manager.sessions.get(guildId);
if (existing) {
  if (existing.voiceChannelId === targetVoiceChannelId) {
    manager.touchActivity(guildId, settings);
    await sendOperationalMessage(manager, {
      channel: message.channel,
      settings,
      guildId,
      channelId: message.channelId,
      userId,
      messageId: message.id,
      event: "voice_join_request",
      reason: "already_in_channel",
      details: {
        voiceChannelId: targetVoiceChannelId
      },
      mustNotify: false
    });
    return true;
  }

  await manager.endSession({
    guildId,
    reason: "switch_channel",
    requestedByUserId: userId,
    announceChannel: message.channel,
    announcement: "switching voice channels.",
    settings,
    messageId: message.id
  });
}

const runtimeMode = resolveVoiceRuntimeMode(settings);
const voiceConversation = getVoiceConversationPolicy(settings);
const replyPath = String(voiceConversation.replyPath || "brain").trim().toLowerCase();
const ttsMode = String(voiceConversation.ttsMode || "realtime").trim().toLowerCase();
const transcriptionMethod = String(
  voiceRuntime.openaiRealtime?.transcriptionMethod || "realtime_bridge"
)
  .trim()
  .toLowerCase();
const transcriberProvider = resolveTranscriberProvider(settings);
const voiceApiTtsProvider = resolveVoiceApiTtsProvider(settings);
const usesFileTurnTranscription = replyPath !== "native" && transcriptionMethod === "file_wav";
const usesRealtimeBridgeAsr =
  replyPath !== "native" &&
  transcriptionMethod === "realtime_bridge" &&
  !voiceConversation.textOnlyMode;
const usesApiTts = replyPath === "brain" && ttsMode === "api";
const needsOpenAiAudioApi =
  usesRealtimeBridgeAsr ||
  (usesFileTurnTranscription && transcriberProvider === "openai") ||
  (usesApiTts && voiceApiTtsProvider === "openai");
const needsElevenLabsAudioApi =
  (usesFileTurnTranscription && transcriberProvider === "elevenlabs") ||
  (usesApiTts && voiceApiTtsProvider === "elevenlabs");
if (runtimeMode === "voice_agent" && !manager.appConfig?.xaiApiKey) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "xai_api_key_missing",
    details: {
      mode: runtimeMode
    },
    mustNotify: true
  });
  return true;
}
if (runtimeMode === "openai_realtime" && !manager.appConfig?.openaiApiKey) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "openai_api_key_missing",
    details: {
      mode: runtimeMode
    },
    mustNotify: true
  });
  return true;
}
if (runtimeMode === "gemini_realtime" && !manager.appConfig?.geminiApiKey) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "gemini_api_key_missing",
    details: {
      mode: runtimeMode
    },
    mustNotify: true
  });
  return true;
}
if (runtimeMode === "elevenlabs_realtime" && !manager.appConfig?.elevenLabsApiKey) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "elevenlabs_api_key_missing",
    details: {
      mode: runtimeMode
    },
    mustNotify: true
  });
  return true;
}
if (needsElevenLabsAudioApi && !manager.appConfig?.elevenLabsApiKey) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "elevenlabs_api_key_missing",
    details: {
      mode: runtimeMode,
      transcriptionMethod,
      replyPath,
      ttsMode,
      providerRole:
        usesApiTts && voiceApiTtsProvider === "elevenlabs"
          ? "tts"
          : usesFileTurnTranscription && transcriberProvider === "elevenlabs"
            ? "transcription"
            : "voice_runtime"
    },
    mustNotify: true
  });
  return true;
}
if (runtimeMode === "elevenlabs_realtime") {
  const elevenLabsSettings = voiceRuntime.elevenLabsRealtime;
  const elevenLabsVoiceId = String(elevenLabsSettings?.voiceId || "").trim();
  if (replyPath !== "brain") {
    await sendOperationalMessage(manager, {
      channel: message.channel,
      settings,
      guildId,
      channelId: message.channelId,
      userId,
      messageId: message.id,
      event: "voice_join_request",
      reason: "elevenlabs_full_brain_required",
      details: {
        mode: runtimeMode,
        replyPath
      },
      mustNotify: true
    });
    return true;
  }
  if (!elevenLabsVoiceId) {
    await sendOperationalMessage(manager, {
      channel: message.channel,
      settings,
      guildId,
      channelId: message.channelId,
      userId,
      messageId: message.id,
      event: "voice_join_request",
      reason: "elevenlabs_voice_id_missing",
      details: {
        mode: runtimeMode,
        ttsMode
      },
      mustNotify: true
    });
    return true;
  }
}
if (transcriberProvider === "elevenlabs" && replyPath !== "brain") {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "elevenlabs_full_brain_required",
    details: {
      mode: runtimeMode,
      replyPath,
      providerRole: "transcription"
    },
    mustNotify: true
  });
  return true;
}
if (transcriberProvider === "elevenlabs" && transcriptionMethod !== "file_wav") {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "elevenlabs_file_asr_required",
    details: {
      mode: runtimeMode,
      transcriptionMethod,
      transcriberProvider
    },
    mustNotify: true
  });
  return true;
}
if (needsOpenAiAudioApi && runtimeMode !== "openai_realtime" && !manager.appConfig?.openaiApiKey) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "openai_audio_api_key_missing",
    details: {
      mode: runtimeMode,
      transcriptionMethod,
      ttsMode
    },
    mustNotify: true
  });
  return true;
}
if (usesFileTurnTranscription && !manager.llm?.isAsrReady?.(transcriberProvider)) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "voice_file_asr_unavailable",
    details: {
      mode: runtimeMode,
      transcriptionMethod,
      transcriberProvider
    },
    mustNotify: true
  });
  return true;
}
if (usesApiTts && !manager.llm?.isSpeechSynthesisReady?.(voiceApiTtsProvider)) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "voice_api_tts_unavailable",
    details: {
      mode: runtimeMode,
      ttsMode,
      ttsProvider: voiceApiTtsProvider
    },
    mustNotify: true
  });
  return true;
}
if (replyPath === "brain" && typeof manager.generateVoiceTurn !== "function") {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "voice_brain_unavailable",
    details: {
      mode: runtimeMode,
      replyPath
    },
    mustNotify: true
  });
  return true;
}

const missingPermissionInfo = manager.getMissingJoinPermissionInfo({
  guild: message.guild,
  voiceChannel: memberVoiceChannel
});
if (missingPermissionInfo) {
  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: missingPermissionInfo.reason,
    details: {
      missingPermissions: missingPermissionInfo.missingPermissions || []
    },
    mustNotify: true
  });
  return true;
}

const realtimeToolOwnership = resolveRealtimeToolOwnership({
  settings,
  mode: runtimeMode
});
const maxSessionMinutesCap = runtimeMode === "openai_realtime"
  ? OPENAI_REALTIME_MAX_SESSION_MINUTES
  : MAX_MAX_SESSION_MINUTES;
const maxSessionMinutes = clamp(
  Number(voiceSessionLimits.maxSessionMinutes) || 30,
  MIN_MAX_SESSION_MINUTES,
  maxSessionMinutesCap
);

let voxClient: ClankvoxClient | null = null;
let subprocessSpawnPromise: Promise<ClankvoxClient> | null = null;
let realtimeClient = null;
let reservedConcurrencySlot = false;
let realtimeInputSampleRateHz = 24000;
let realtimeOutputSampleRateHz = 24000;
let perUserAsrEnabled = false;
let sharedAsrEnabled = false;
let openAiPerUserAsrModel = OPENAI_REALTIME_DEFAULT_TRANSCRIPTION_MODEL;
let openAiPerUserAsrLanguage = "";
let openAiPerUserAsrPrompt = "";

try {
  const maxConcurrentSessions = clamp(Number(voiceSessionLimits.maxConcurrentSessions) || 1, 1, 3);
  if (!existing) {
    const activeOrPendingSessions = manager.sessions.size + manager.pendingSessionGuildIds.size;
    if (activeOrPendingSessions >= maxConcurrentSessions) {
      await sendOperationalMessage(manager, {
        channel: message.channel,
        settings,
        guildId,
        channelId: message.channelId,
        userId,
        messageId: message.id,
        event: "voice_join_request",
        reason: "max_concurrent_sessions_reached",
        details: {
          activeOrPendingSessions,
          maxConcurrentSessions
        },
        mustNotify: true
      });
      return true;
    }

    manager.pendingSessionGuildIds.add(guildId);
    reservedConcurrencySlot = true;
  }

  // --- Spawn subprocess early so it boots in parallel with API connect ---
  subprocessSpawnPromise = ClankvoxClient.spawn(
    String(message.guild.id),
    String(memberVoiceChannel.id),
    message.guild,
    { selfDeaf: false, selfMute: false }
  );

  // --- Pre-warm: connect realtime API while subprocess boots ---
  const initialSoundboardCandidateInfo = await resolveSoundboardCandidates(manager, {
    settings,
    guild: message.guild
  });
  const initialSoundboardCandidates = Array.isArray(initialSoundboardCandidateInfo?.candidates)
    ? initialSoundboardCandidateInfo.candidates
    : [];
  const baseVoiceInstructions = buildVoiceInstructions(settings, {
    soundboardCandidates: initialSoundboardCandidates
  });
  const realtimeRuntimeLogger = createRealtimeRuntimeLogger(manager, {
    guildId,
    channelId: message.channelId,
    botUserId: manager.client.user?.id || null
  });
  const openAiRealtimeSettings = voiceRuntime.openaiRealtime;
  const voiceAsrGuidance = resolveVoiceAsrLanguageGuidance(settings);
  if (runtimeMode === "voice_agent") {
    realtimeClient = new XaiRealtimeClient({
      apiKey: manager.appConfig.xaiApiKey,
      logger: realtimeRuntimeLogger
    });

    const xaiSettings = voiceRuntime.xai;
    realtimeInputSampleRateHz = Number(xaiSettings?.sampleRateHz) || 24000;
    realtimeOutputSampleRateHz = Number(xaiSettings?.sampleRateHz) || 24000;
    await realtimeClient.connect({
      voice: xaiSettings?.voice || "Rex",
      instructions: baseVoiceInstructions,
      region: xaiSettings?.region || "us-east-1",
      inputAudioFormat: xaiSettings?.audioFormat || "audio/pcm",
      outputAudioFormat: xaiSettings?.audioFormat || "audio/pcm",
      inputSampleRateHz: realtimeInputSampleRateHz,
      outputSampleRateHz: realtimeOutputSampleRateHz,
      tools:
        realtimeToolOwnership === "provider_native"
          ? buildRealtimeFunctionTools(manager, {
            session: null,
            settings,
            target: runtimeMode
          })
          : [],
      toolChoice: "auto"
    });
  } else if (runtimeMode === "openai_realtime") {
    realtimeClient = new OpenAiRealtimeClient({
      apiKey: manager.appConfig.openaiApiKey,
      logger: realtimeRuntimeLogger
    });

    realtimeInputSampleRateHz = 24000;
    realtimeOutputSampleRateHz = 24000;
    await realtimeClient.connect({
      model: normalizeOpenAiRealtimeSessionModel(
        openAiRealtimeSettings?.model,
        OPENAI_REALTIME_DEFAULT_SESSION_MODEL
      ),
      voice: String(openAiRealtimeSettings?.voice || "alloy").trim() || "alloy",
      instructions: baseVoiceInstructions,
      inputAudioFormat: String(openAiRealtimeSettings?.inputAudioFormat || "pcm16").trim() || "pcm16",
      outputAudioFormat: String(openAiRealtimeSettings?.outputAudioFormat || "pcm16").trim() || "pcm16",
      inputTranscriptionModel:
        normalizeOpenAiRealtimeTranscriptionModel(
          openAiRealtimeSettings?.inputTranscriptionModel,
          OPENAI_REALTIME_DEFAULT_TRANSCRIPTION_MODEL
        ),
      inputTranscriptionLanguage: voiceAsrGuidance.language,
      inputTranscriptionPrompt: voiceAsrGuidance.prompt,
      tools:
        realtimeToolOwnership === "provider_native"
          ? buildRealtimeFunctionTools(manager, {
            session: null,
            settings,
            target: runtimeMode
          })
          : [],
      toolChoice: "auto"
    });
  } else if (runtimeMode === "gemini_realtime") {
    const geminiRealtimeSettings = voiceRuntime.geminiRealtime;
    realtimeClient = new GeminiRealtimeClient({
      apiKey: manager.appConfig.geminiApiKey,
      baseUrl:
        String(geminiRealtimeSettings?.apiBaseUrl || "https://generativelanguage.googleapis.com").trim() ||
        "https://generativelanguage.googleapis.com",
      logger: realtimeRuntimeLogger
    });

    realtimeInputSampleRateHz = Number(geminiRealtimeSettings?.inputSampleRateHz) || 16000;
    realtimeOutputSampleRateHz = Number(geminiRealtimeSettings?.outputSampleRateHz) || 24000;
    await realtimeClient.connect({
      model:
        String(geminiRealtimeSettings?.model || "gemini-2.5-flash-native-audio-preview-12-2025").trim() ||
        "gemini-2.5-flash-native-audio-preview-12-2025",
      voice: String(geminiRealtimeSettings?.voice || "Aoede").trim() || "Aoede",
      instructions: baseVoiceInstructions,
      inputSampleRateHz: realtimeInputSampleRateHz,
      outputSampleRateHz: realtimeOutputSampleRateHz
    });
  } else if (runtimeMode === "elevenlabs_realtime") {
    const elevenLabsRealtimeSettings = voiceRuntime.elevenLabsRealtime;
    realtimeInputSampleRateHz = Number(elevenLabsRealtimeSettings?.inputSampleRateHz) || 16000;
    realtimeOutputSampleRateHz = Number(elevenLabsRealtimeSettings?.outputSampleRateHz) || 24000;
    const { ElevenLabsRealtimeClient } = await import("./elevenLabsRealtimeClient.ts");
    const elevenLabsClient = new ElevenLabsRealtimeClient({
      apiKey: manager.appConfig?.elevenLabsApiKey || "",
      baseUrl: String(elevenLabsRealtimeSettings?.apiBaseUrl || "").trim() || null,
      logger: realtimeRuntimeLogger
    });
    const elevenLabsVoiceId = String(elevenLabsRealtimeSettings?.voiceId || "").trim();
    const elevenLabsModel = String(elevenLabsRealtimeSettings?.ttsModel || "eleven_multilingual_v2").trim() || "eleven_multilingual_v2";
    const outputFormat = `pcm_${realtimeOutputSampleRateHz}`;
    await elevenLabsClient.connect({
      voiceId: elevenLabsVoiceId,
      model: elevenLabsModel,
      outputFormat,
      outputSampleRateHz: realtimeOutputSampleRateHz,
      chunkLengthSchedule: [50, 120, 200, 260]
    });
    realtimeClient = elevenLabsClient;
  }

  // --- ASR bridge setup (provider-agnostic) ---
  // Realtime bridge ASR uses OpenAI when the configured transcriber
  // provider is openai and the runtime supports those bridge modes.
  if (transcriberProvider === "openai" && manager.appConfig?.openaiApiKey && isRealtimeMode(runtimeMode)) {
    const transcriptionMethod = String(
      openAiRealtimeSettings?.transcriptionMethod || "realtime_bridge"
    )
      .trim()
      .toLowerCase();
    const usesRealtimeTranscriptionBridge = transcriptionMethod !== "file_wav";
    const perUserAsrBridgeEnabled = Boolean(openAiRealtimeSettings?.usePerUserAsrBridge);
    const usePerUser = usesRealtimeTranscriptionBridge &&
      providerSupports(runtimeMode, "perUserAsr") &&
      perUserAsrBridgeEnabled;
    const useShared = providerSupports(runtimeMode, "sharedAsr") && !usePerUser;
    perUserAsrEnabled = usePerUser;
    sharedAsrEnabled = usesRealtimeTranscriptionBridge && useShared;
    openAiPerUserAsrModel = normalizeOpenAiRealtimeTranscriptionModel(
      openAiRealtimeSettings?.inputTranscriptionModel,
      OPENAI_REALTIME_DEFAULT_TRANSCRIPTION_MODEL
    );
    openAiPerUserAsrLanguage = voiceAsrGuidance.language;
    openAiPerUserAsrPrompt = voiceAsrGuidance.prompt;
  }

  // --- Await subprocess that was spawning in parallel with API connect ---
  voxClient = await subprocessSpawnPromise;
  if (voxClient) {
    voxClient.logAction = (action) => manager.store.logAction(action);
  }

  const now = Date.now();
  const session: VoiceSession = {
    id: randomUUID(),
    guildId,
    voiceChannelId: targetVoiceChannelId,
    textChannelId: String(message.channelId),
    requestedByUserId: userId,
    mode: runtimeMode,
    realtimeProvider: resolveRealtimeProvider(runtimeMode),
    realtimeToolOwnership,
    realtimeInputSampleRateHz,
    realtimeOutputSampleRateHz,
    recentVoiceTurns: [],
    transcriptTurns: [],
    compactedContextSummary: null,
    compactedContextLastAt: 0,
    compactedContextCoveredThroughTurn: null,
    compactedContextCursor: 0,
    compactedContextInFlight: false,
    pendingCompactionNotes: [],
    durableContext: [],
    modelContextSummary: {
      generation: null,
      decider: null
    },
    voxClient,
    realtimeClient,
    startedAt: now,
    lastActivityAt: now,
    maxEndsAt: null,
    inactivityEndsAt: null,
    maxTimer: null,
    inactivityTimer: null,
    botTurnResetTimer: null,
    botTurnOpen: false,
    bargeInSuppressionUntil: 0,
    bargeInSuppressedAudioChunks: 0,
    bargeInSuppressedAudioBytes: 0,
    lastBotActivityTouchAt: 0,
    responseFlushTimer: null,
    responseWatchdogTimer: null,
    responseDoneGraceTimer: null,
    botDisconnectTimer: null,
    lastResponseRequestAt: 0,
    lastAudioDeltaAt: 0,
    lastAssistantReplyAt: 0,
    lastDirectAddressAt: 0,
    lastDirectAddressUserId: null,
    musicWakeLatchedUntil: 0,
    musicWakeLatchedByUserId: null,
    lastInboundAudioAt: 0,
    realtimeReplySupersededCount: 0,
    pendingRealtimeInputBytes: 0,
    nextResponseRequestId: 0,
    pendingResponse: null,
    activeReplyInterruptionPolicy: null,
    deferredVoiceActions: {},
    deferredVoiceActionTimers: {},
    lastRequestedRealtimeUtterance: null,
    interruptedAssistantReply: null,
    pendingRealtimeAssistantUtterances: [],
    realtimeAssistantUtteranceBackpressureActive: false,
    pendingFileAsrTurns: 0,
    fileAsrTurnDrainActive: false,
    pendingFileAsrTurnsQueue: [],
    realtimeTurnDrainActive: false,
    pendingRealtimeTurns: [],
    activeRealtimeTurn: null,
    interruptOverlapBurst: null,
    interruptDecisionsByUtteranceId: new Map(),
    pendingSpeechStartedInterrupts: new Map(),
    pendingInterruptBridgeTurns: new Map(),
    nextInterruptBurstId: 0,
    openAiAsrSessions: new Map(),
    perUserAsrEnabled,
    sharedAsrEnabled,
    openAiSharedAsrState: null,
    openAiPerUserAsrModel,
    openAiPerUserAsrLanguage,
    openAiPerUserAsrPrompt,
    lastRealtimeAssistantAudioItemId: null,
    lastRealtimeAssistantAudioItemContentIndex: 0,
    lastRealtimeAssistantAudioItemReceivedMs: 0,
    lastRealtimeToolCallerUserId: null,
    toolCallEvents: [],
    mcpStatus: getVoiceMcpServerStatuses(manager),
    toolMusicTrackCatalog: new Map(),
    memoryWriteWindow: [],
    warmMemory: createWarmMemoryState(),
    behavioralFactCache: null,
    conversationHistoryCaches: null,
    ...(realtimeToolOwnership === "provider_native"
      ? {
          realtimePendingToolCalls: new Map(),
          realtimeToolCallExecutions: new Map(),
          realtimeToolResponseDebounceTimer: null,
          realtimeCompletedToolCallIds: new Map(),
          realtimeToolDefinitions: [],
          lastRealtimeToolHash: "",
          lastRealtimeToolRefreshAt: 0,
          awaitingToolOutputs: false
        }
      : {}),
    factProfiles: new Map(),
    guildFactProfile: null,
    voiceCommandState: null,
    musicQueueState: {
      guildId,
      voiceChannelId: targetVoiceChannelId,
      tracks: [],
      nowPlayingIndex: null,
      isPaused: false,
      volume: 1
    },
    assistantOutput: createAssistantOutputState({ now, trigger: "session_start" }),
    thoughtLoopTimer: null,
    thoughtLoopBusy: false,
    nextThoughtAt: 0,
    lastThoughtAttemptAt: 0,
    lastThoughtSpokenAt: 0,
    pendingAmbientThought: null,
    userCaptures: new Map(),
    streamWatch: {
      active: false,
      targetUserId: null,
      requestedByUserId: null,
      lastFrameAt: 0,
      lastCommentaryAt: 0,
      lastCommentaryNote: null,
      lastMemoryRecapAt: 0,
      lastMemoryRecapText: null,
      lastMemoryRecapDurableSaved: false,
      lastMemoryRecapReason: null,
      lastNoteAt: 0,
      lastNoteProvider: null,
      lastNoteModel: null,
      noteEntries: [],
      ingestedFrameCount: 0,
      acceptedFrameCountInWindow: 0,
      frameWindowStartedAt: 0,
      latestFrameMimeType: null,
      latestFrameDataBase64: "",
      latestFrameAt: 0,
      latestChangeScore: 0,
      latestEmaChangeScore: 0,
      latestIsSceneCut: false
    },
    nativeScreenShare: createNativeDiscordScreenShareState(),
    goLiveStream: createGoLiveStreamState(),
    goLiveStreams: new Map(),
    streamPublish: createStreamPublishState(),
    music: {
      phase: "idle",
      ducked: false,
      pauseReason: null,
      startedAt: 0,
      stoppedAt: 0,
      provider: null,
      source: null,
      lastTrackId: null,
      lastTrackTitle: null,
      lastTrackArtists: [],
      lastTrackUrl: null,
      lastPlaybackUrl: null,
      lastPlaybackResolvedDirectUrl: false,
      lastQuery: null,
      lastRequestedByUserId: null,
      lastRequestText: null,
      lastCommandAt: 0,
      lastCommandReason: null,
      pendingQuery: null,
      pendingPlatform: "auto",
      pendingAction: "play_now",
      pendingResults: [],
      pendingRequestedByUserId: null,
      pendingRequestedAt: 0
    },
    soundboard: {
      playCount: 0,
      lastPlayedAt: 0,
      catalogCandidates:
        String(initialSoundboardCandidateInfo?.source || "") === "guild_catalog"
          ? initialSoundboardCandidates.slice(0, SOUNDBOARD_MAX_CANDIDATES)
          : [],
      catalogFetchedAt:
        String(initialSoundboardCandidateInfo?.source || "") === "guild_catalog" ||
          String(initialSoundboardCandidateInfo?.source || "") === "none"
          ? now
          : 0,
      lastDirectiveKey: "",
      lastDirectiveAt: 0
    },
    latencyStages: [],
    membershipEvents: [],
    voiceChannelEffects: [],
    baseVoiceInstructions,
    lastRealtimeInstructions: "",
    lastRealtimeInstructionsAt: 0,
    realtimeInstructionRefreshTimer: null,
    realtimeTurnContextRefreshState: null,
    settingsSnapshot: settings,
    cleanupHandlers: [],
    ending: false,
    livePromptState: createEmptyVoiceLivePromptState(),
    playerState: null,
    botTurnOpenAt: 0
  };

  manager.sessions.set(guildId, session);
  manager.primeSessionFactProfiles(session);

  // ── Seed pre-existing Go Live streams ─────────────────────
  // When the bot joins a channel where someone is already streaming,
  // the GUILD_CREATE / VOICE_STATE_UPDATE events that detected the
  // stream fired before this session existed and were discarded.
  // Scan two sources to catch pre-existing streams:
  //
  // 1. streamDiscovery.streams — retains entries from STREAM_CREATE
  //    events that arrived while no session existed.
  // 2. Voice channel members — discord.js exposes `streaming` on
  //    each member's voice state (mirrors self_stream).
  //
  // Source 1 gives full stream metadata (streamKey, rtcServerId);
  // source 2 gives only userId+channelId but covers cases where
  // STREAM_CREATE wasn't received (e.g., stream started before the
  // gateway connected).
  const botUserId = String(manager.client.user?.id || "").trim();
  if (manager.streamDiscovery) {
    for (const stream of manager.streamDiscovery.streams.values()) {
      if (
        stream.guildId === guildId &&
        stream.channelId === targetVoiceChannelId &&
        String(stream.userId || "").trim() !== botUserId
      ) {
        upsertSessionGoLiveStream(session, buildGoLiveStreamStateFromStream(stream));
        manager.store.logAction({
          kind: "stream_discovery",
          guildId,
          channelId: targetVoiceChannelId,
          userId: stream.userId,
          content: `stream_discovery_pre_existing_stream_seeded: streamKey=${stream.streamKey}`,
          metadata: {
            sessionId: session.id,
            streamKey: stream.streamKey,
            source: "stream_discovery_streams_map"
          }
        });
      }
    }
  }
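  // Fallback: also scan discord.js voice states for streaming members,
  // covering streams that are missing from streamDiscovery (e.g., bot
  // connected after the stream was already live and STREAM_CREATE was
  // missed). NOTE: this loop does not check whether a member was already
  // seeded above; it upserts and logs for every streaming member.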
  if (memberVoiceChannel?.members) {
    for (const [memberId, member] of memberVoiceChannel.members) {
      if (
        String(memberId).trim() !== botUserId &&
        member.voice?.streaming === true
      ) {
        const streamKey = `guild:${guildId}:${targetVoiceChannelId}:${memberId}`;
        upsertSessionGoLiveStream(session, {
          ...createGoLiveStreamState(),
          streamKey,
          targetUserId: String(memberId),
          guildId,
          channelId: targetVoiceChannelId,
          discoveredAt: Date.now(),
        });
        manager.store.logAction({
          kind: "stream_discovery",
          guildId,
          channelId: targetVoiceChannelId,
          userId: String(memberId),
          content: `stream_discovery_pre_existing_stream_seeded: streamKey=${streamKey}`,
          metadata: {
            sessionId: session.id,
            streamKey,
            source: "voice_channel_member_streaming"
          }
        });
      }
    }
  }
  syncPrimaryGoLiveStream(session);

  // Record the bot's own join as a membership event so the classifier
  // history shows "[botName] joined" as the first event.
  const botName = getPromptBotName(settings);
  manager.recordVoiceMembershipEvent({
    session,
    userId: manager.client.user?.id || "",
    eventType: "join",
    displayName: botName
  });

  await manager.sessionLifecycle.attachSessionRuntime({
    session,
    settings,
    initialSpeakerUserId: userId
  });

  manager.store.logAction({
    kind: "voice_session_start",
    guildId,
    channelId: message.channelId,
    userId,
    content: `voice_joined:${targetVoiceChannelId}`,
    metadata: {
      sessionId: session.id,
      mode: runtimeMode,
      requestedByUserId: userId,
      voiceChannelId: targetVoiceChannelId,
      maxSessionMinutes,
      inactivityLeaveSeconds: clamp(
        Number(voiceSessionLimits.inactivityLeaveSeconds) || 300,
        MIN_INACTIVITY_SECONDS,
        MAX_INACTIVITY_SECONDS
      ),
      intentConfidence
    }
  });

  // Fire the bot's own join through the classifier → generation pipeline
  // so the bot can greet if eagerness allows it.
  void manager.fireVoiceRuntimeEvent({
    session,
    settings,
    userId: manager.client.user?.id || "",
    transcript: "[YOU joined the voice channel]",
    source: "bot_join_greeting",
    runtimeEventContext: {
      category: "membership",
      eventType: "join",
      actorUserId: manager.client.user?.id || "",
      actorDisplayName: botName,
      actorRole: "self"
    }
  });

  return true;
} catch (error) {
  const errorText = String(error?.message || error);
  const connectDiagnostics = getRealtimeConnectErrorDiagnostics(error);
  manager.store.logAction({
    kind: "voice_error",
    guildId,
    channelId: message.channelId,
    userId,
    content: `voice_join_failed: ${errorText}`,
    metadata: connectDiagnostics
      ? {
        connectDiagnostics
      }
      : undefined
  });

  if (realtimeClient) {
    try {
      await realtimeClient.close();
    } catch (closeError) {
      manager.store.logAction({
        kind: "voice_error",
        guildId,
        channelId: message.channelId,
        userId,
        content: `voice_join_realtime_client_close_failed: ${String(closeError?.message || closeError)}`,
        metadata: {
          failedAfter: "voice_join_failed"
        }
      });
    }
  }

  // If the realtime API connect failed, the subprocess may still be
  // booting in the background. Await and clean it up to avoid leaks.
  if (!voxClient && subprocessSpawnPromise) {
    try {
      const spawnedClient = await subprocessSpawnPromise;
      await spawnedClient.destroy();
    } catch {
      // Either the subprocess spawn also failed (nothing to clean up) or
      // destroy() rejected; both are dropped since the join already failed.
    }
  }

  if (voxClient) {
    try {
      await voxClient.destroy();
    } catch {
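      // Best-effort cleanup: the join failure was already logged above,
      // so a destroy() error is deliberately dropped here.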
    }
  }

  await sendOperationalMessage(manager, {
    channel: message.channel,
    settings,
    guildId,
    channelId: message.channelId,
    userId,
    messageId: message.id,
    event: "voice_join_request",
    reason: "join_failed",
    details: {
      error: shortError(errorText)
    },
    mustNotify: true
  });
  return true;
} finally {
  if (reservedConcurrencySlot) {
    manager.pendingSessionGuildIds.delete(guildId);
  }
}

}); }