import { clamp } from "../utils.ts"; import { executeVoiceBrowserBrowseTool, executeVoiceMinecraftTaskTool, executeVoiceShareBrowserSessionTool, executeVoiceStopVideoShareTool } from "./voiceToolCallAgents.ts"; import { executeVoiceConversationSearchTool, executeVoiceMemoryWriteTool } from "./voiceToolCallMemory.ts"; import { executeVoiceMusicPlayTool, executeVoiceMusicQueueAddTool, executeVoiceMusicQueueNextTool, executeVoiceMusicNowPlayingTool, executeVoiceMusicPauseTool, executeVoiceMusicSearchTool, executeVoiceMusicReplyHandoffTool, executeVoiceMusicResumeTool, executeVoiceMusicSkipTool, executeVoiceMusicStopTool, executeVoiceStreamVisualizerTool, executeVoiceVideoPlayTool, executeVoiceVideoSearchTool } from "./voiceToolCallMusic.ts"; import { executeVoiceWebScrapeTool, executeVoiceWebSearchTool } from "./voiceToolCallWeb.ts"; import { normalizeInlineText } from "./voiceSessionHelpers.ts"; import { maybeTriggerAssistantDirectedSoundboard } from "./voiceSoundboard.ts"; import type { VoiceRealtimeToolDescriptor, VoiceRealtimeToolSettings, VoiceSession, VoiceSessionSoundboardState, VoiceToolRuntimeSessionLike } from "./voiceSessionTypes.ts"; import type { VoiceToolCallArgs, VoiceToolCallManager } from "./voiceToolCallTypes.ts"; import { throwIfAborted } from "../tools/abortError.ts";
// A session accepted by the voice tool runtime: either a full VoiceSession or
// the lighter VoiceToolRuntimeSessionLike shape.
type ToolRuntimeSession = VoiceSession | VoiceToolRuntimeSessionLike;
/**
 * Type guard: narrows a session to one carrying the identifying fields that
 * soundboard directives require (ending flag plus mode/guild/channel/id strings).
 */
function hasSoundboardSessionContext(
  session: ToolRuntimeSession | null | undefined
): session is ToolRuntimeSession & {
  ending: boolean;
  mode: string;
  guildId: string;
  textChannelId: string;
  id: string;
} {
  if (!session) return false;
  return (
    typeof session.ending === "boolean" &&
    typeof session.mode === "string" &&
    typeof session.guildId === "string" &&
    typeof session.textChannelId === "string" &&
    typeof session.id === "string"
  );
}
/**
 * Lazily attaches a zeroed soundboard state to the session and returns the
 * live (mutable) state object; subsequent calls return the same instance.
 */
function ensureSoundboardState(session: ToolRuntimeSession): VoiceSessionSoundboardState {
  if (session.soundboard) {
    return session.soundboard;
  }
  const freshState: VoiceSessionSoundboardState = {
    playCount: 0,
    lastPlayedAt: 0,
    catalogCandidates: [],
    catalogFetchedAt: 0,
    lastDirectiveKey: "",
    lastDirectiveAt: 0
  };
  session.soundboard = freshState;
  return freshState;
}
/**
 * Validates and enriches a session for soundboard directives: guarantees a
 * settings snapshot and a soundboard state object are attached to it, or
 * returns null when the session lacks the required identifying fields.
 * Note: mutates the session in place (settingsSnapshot/soundboard).
 */
function resolveSoundboardDirectiveSession(
  session: ToolRuntimeSession | null | undefined,
  settings: VoiceRealtimeToolSettings | null | undefined
): (ToolRuntimeSession & {
  ending: boolean;
  mode: string;
  guildId: string;
  textChannelId: string;
  id: string;
  settingsSnapshot: VoiceRealtimeToolSettings | null;
  soundboard: VoiceSessionSoundboardState;
}) | null {
  if (!hasSoundboardSessionContext(session)) return null;
  // Prefer a snapshot already frozen on the session; fall back to live settings.
  const snapshot = session.settingsSnapshot ?? settings ?? null;
  const soundboard = ensureSoundboardState(session);
  session.settingsSnapshot = snapshot;
  return Object.assign(session, { settingsSnapshot: snapshot, soundboard });
}
// Options for dispatching a locally-implemented voice tool by name.
type LocalVoiceToolCallOptions = { session?: ToolRuntimeSession | null; settings?: VoiceRealtimeToolSettings | null; toolName: string; args?: VoiceToolCallArgs; signal?: AbortSignal; };
// Options for proxying a tool call to a remote MCP server; `toolDescriptor`
// names both the server and the tool to invoke.
type McpVoiceToolCallOptions = { session?: ToolRuntimeSession | null; settings?: VoiceRealtimeToolSettings | null; toolDescriptor: VoiceRealtimeToolDescriptor | null | undefined; args?: VoiceToolCallArgs; signal?: AbortSignal; };
const LOCAL_VOICE_TOOL_HANDLERS: Record< string, (manager: VoiceToolCallManager, opts: LocalVoiceToolCallOptions) => Promise<Record<string, unknown>>
= { memory_write: async (manager, opts) => await executeVoiceMemoryWriteTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), conversation_search: async (manager, opts) => await executeVoiceConversationSearchTool(manager, { session: opts.session, args: opts.args, signal: opts.signal }), music_search: async (manager, opts) => await executeVoiceMusicSearchTool(manager, { session: opts.session, args: opts.args, signal: opts.signal }), music_play: async (manager, opts) => await executeVoiceMusicPlayTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), video_search: async (manager, opts) => await executeVoiceVideoSearchTool(manager, { session: opts.session, args: opts.args, signal: opts.signal }), video_play: async (manager, opts) => await executeVoiceVideoPlayTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), music_queue_add: async (manager, opts) => await executeVoiceMusicQueueAddTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), music_queue_next: async (manager, opts) => await executeVoiceMusicQueueNextTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), media_stop: async (manager, opts) => await executeVoiceMusicStopTool(manager, { session: opts.session, settings: opts.settings, signal: opts.signal }), media_pause: async (manager, opts) => await executeVoiceMusicPauseTool(manager, { session: opts.session, settings: opts.settings, signal: opts.signal }), media_reply_handoff: async (manager, opts) => await executeVoiceMusicReplyHandoffTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), media_resume: async (manager, opts) => await executeVoiceMusicResumeTool(manager, { session: opts.session, signal: opts.signal }), media_skip: async (manager, opts) => await 
executeVoiceMusicSkipTool(manager, { session: opts.session, settings: opts.settings, signal: opts.signal }), media_now_playing: async (manager, opts) => await executeVoiceMusicNowPlayingTool(manager, { session: opts.session, signal: opts.signal }), play_soundboard: async (manager, opts) => await executeVoicePlaySoundboardTool(manager, { session: opts.session, settings: opts.settings, args: opts.args }), start_screen_watch: async (manager, opts) => { throwIfAborted(opts.signal, "Voice tool cancelled"); return await executeStartScreenWatchTool(manager, { session: opts.session, settings: opts.settings, args: opts.args }); }, see_screenshare_snapshot: async (_manager, opts) => { throwIfAborted(opts.signal, "Voice tool cancelled"); const sw = (opts.session as { streamWatch?: { active?: boolean; latestFrameDataBase64?: string; latestFrameMimeType?: string; latestFrameAt?: number; targetUserId?: string; } } | null)?.streamWatch; if (!sw?.active) { return { ok: false, error: "No active screen watch." }; } const dataBase64 = String(sw.latestFrameDataBase64 || "").trim(); if (!dataBase64) { return { ok: false, error: "No recent frame available." 
}; } return { ok: true, streamerName: sw.targetUserId || null, frameAgeMs: Math.max(0, Date.now() - Number(sw.latestFrameAt || 0)), mimeType: String(sw.latestFrameMimeType || "image/jpeg"), dataBase64 }; }, web_search: async (manager, opts) => await executeVoiceWebSearchTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), web_scrape: async (manager, opts) => await executeVoiceWebScrapeTool(manager, { session: opts.session, args: opts.args, signal: opts.signal }), browser_browse: async (manager, opts) => await executeVoiceBrowserBrowseTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), stream_visualizer: async (manager, opts) => await executeVoiceStreamVisualizerTool(manager, { session: opts.session, args: opts.args, signal: opts.signal }), share_browser_session: async (manager, opts) => await executeVoiceShareBrowserSessionTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), stop_video_share: async (manager, opts) => await executeVoiceStopVideoShareTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), minecraft_task: async (manager, opts) => await executeVoiceMinecraftTaskTool(manager, { session: opts.session, settings: opts.settings, args: opts.args, signal: opts.signal }), leave_voice_channel: async (manager, opts) => { throwIfAborted(opts.signal, "Voice tool cancelled"); scheduleLeaveVoiceChannel(manager, { session: opts.session, settings: opts.settings }); return { ok: true, status: "leaving" }; } };
/**
 * Starts a screen watch on behalf of the user who issued the realtime tool
 * call. Uses that requester's most recent voice transcript as context and
 * flattens the manager's result into a serializable payload.
 */
async function executeStartScreenWatchTool(
  manager: VoiceToolCallManager,
  { session, settings, args }: { session?: ToolRuntimeSession | null; settings?: VoiceRealtimeToolSettings | null; args?: VoiceToolCallArgs | null; }
) {
  const requesterUserId = normalizeInlineText(session?.lastRealtimeToolCallerUserId, 80) || null;
  if (!requesterUserId || !session?.guildId) {
    return { ok: false, started: false, error: "screen_watch_context_unavailable" };
  }
  const target = normalizeInlineText(args?.target, 120) || null;
  // Most recent "user" turn from the requester (scanned newest-first) supplies the transcript hint.
  const turns = Array.isArray(session.recentVoiceTurns) ? session.recentVoiceTurns : [];
  const latestRequesterTurn = [...turns].reverse().find(
    (turn) => String(turn?.role || "") === "user" && String(turn?.userId || "") === requesterUserId
  );
  const transcript = latestRequesterTurn ? normalizeInlineText(latestRequesterTurn?.text, 220) || "" : "";
  const result = await manager.startVoiceScreenWatch({
    settings,
    guildId: session.guildId,
    channelId: session.textChannelId || null,
    requesterUserId,
    target,
    transcript,
    source: "voice_realtime_tool_call"
  });
  const started = Boolean(result?.started || result?.reused);
  const transport = result?.transport === "native" || result?.transport === "link" ? result.transport : null;
  const expiresRaw = Number(result?.expiresInMinutes);
  return {
    ok: started,
    started,
    reused: Boolean(result?.reused),
    transport,
    reason: normalizeInlineText(result?.reason, 120) || null,
    targetUserId: normalizeInlineText(result?.targetUserId, 80) || null,
    frameReady: Boolean(result?.frameReady),
    linkUrl: normalizeInlineText(result?.linkUrl, 320) || null,
    expiresInMinutes: Number.isFinite(expiresRaw) ? Math.max(0, Math.round(expiresRaw)) : null
  };
}
function scheduleLeaveVoiceChannel(
manager: VoiceToolCallManager,
{ session, settings }: { session?: ToolRuntimeSession | null; settings?: VoiceRealtimeToolSettings | null }
) {
setTimeout(async () => {
if (!session || session.ending) return;
await manager.waitForLeaveDirectivePlayback({
session,
expectRealtimeAudio: true,
source: "realtime_tool_leave_directive"
});
await manager.endSession({
guildId: session.guildId,
reason: "assistant_leave_directive",
requestedByUserId: manager.client.user?.id || null,
settings,
announcement: "wrapping up vc."
}).catch((error) => {
manager.store.logAction({
kind: "voice_error",
guildId: session.guildId,
channelId: session.textChannelId,
userId: manager.client.user?.id || null,
content: assistant_leave_directive_end_session_failed: ${String(error instanceof Error ? error.message : error)},
metadata: {
sessionId: session.id,
reason: "assistant_leave_directive"
}
});
});
}, 0);
}
/**
 * Plays up to 10 soundboard entries on behalf of the assistant. A ref counts
 * as "played" when the session's soundboard playCount increases after the
 * trigger call. Returns the refs that actually played.
 */
async function executeVoicePlaySoundboardTool(
  manager: VoiceToolCallManager,
  { session, settings, args }: { session?: ToolRuntimeSession | null; settings?: VoiceRealtimeToolSettings | null; args?: VoiceToolCallArgs; }
) {
  const directiveSession = resolveSoundboardDirectiveSession(session, settings);
  if (!directiveSession || directiveSession.ending) {
    return { ok: false, played: [], error: "soundboard_session_unavailable" };
  }
  const rawRefs = Array.isArray(args?.refs) ? args.refs : [];
  const refs = rawRefs
    .map((entry) => normalizeInlineText(entry, 180))
    .filter(Boolean)
    .slice(0, 10);
  if (!refs.length) {
    return { ok: false, played: [], error: "soundboard_refs_required" };
  }
  const played: string[] = [];
  for (const ref of refs) {
    const before = Math.max(0, Number(directiveSession.soundboard.playCount || 0));
    await maybeTriggerAssistantDirectedSoundboard(manager, {
      session: directiveSession,
      settings,
      userId: manager.client.user?.id || null,
      transcript: "",
      requestedRef: ref,
      source: "voice_realtime_tool_play_soundboard"
    });
    const after = Math.max(0, Number(directiveSession.soundboard.playCount || 0));
    if (after > before) {
      played.push(ref);
    }
  }
  const anyPlayed = played.length > 0;
  return { ok: anyPlayed, played, error: anyPlayed ? null : "soundboard_refs_unresolved" };
}
export async function executeLocalVoiceToolCall(
manager: VoiceToolCallManager,
opts: LocalVoiceToolCallOptions
): Promise<Record<string, unknown>> {
const normalizedToolName = normalizeInlineText(opts.toolName, 120);
if (!normalizedToolName) {
throw new Error("missing_tool_name");
}
const handler = LOCAL_VOICE_TOOL_HANDLERS[normalizedToolName];
if (!handler) {
throw new Error(unsupported_tool:${normalizedToolName});
}
return await handler(manager, opts);
}
export async function executeMcpVoiceToolCall( manager: VoiceToolCallManager, { session, settings: _settings, toolDescriptor, args, signal }: McpVoiceToolCallOptions ) { void _settings; throwIfAborted(signal, "Voice MCP tool cancelled"); const serverName = normalizeInlineText(toolDescriptor?.serverName, 80); const toolName = normalizeInlineText(toolDescriptor?.name, 120); if (!serverName || !toolName) { throw new Error("invalid_mcp_tool_descriptor"); }
const serverStatus = (Array.isArray(session?.mcpStatus) ? session.mcpStatus : [])
.find((entry) => String(entry?.serverName || "") === serverName) || null;
if (!serverStatus) {
throw new Error(mcp_server_not_found:${serverName});
}
const baseUrl = String(serverStatus.baseUrl || "").trim().replace(//+$/, "");
const toolPath = String(serverStatus.toolPath || "/tools/call").trim() || "/tools/call";
const targetUrl = ${baseUrl}${toolPath.startsWith("/") ? "" : "/"}${toolPath};
const timeoutMs = clamp(Math.floor(Number(serverStatus.timeoutMs || 10_000)), 500, 60_000);
const headers = {
"content-type": "application/json",
...(serverStatus.headers && typeof serverStatus.headers === "object" ? serverStatus.headers : {})
};
try { const response = await fetch(targetUrl, { method: "POST", headers, body: JSON.stringify({ toolName, arguments: args && typeof args === "object" ? args : {} }), signal: signal ? AbortSignal.any([signal, AbortSignal.timeout(timeoutMs)]) : AbortSignal.timeout(timeoutMs) }); const bodyText = await response.text().catch(() => ""); let payload: Record<string, unknown> | null = null; if (bodyText) { try { payload = JSON.parse(bodyText); } catch { payload = { output: bodyText }; } }
if (!response.ok) {
const errorMessage = normalizeInlineText(payload?.error || payload?.message || bodyText, 400) || `HTTP_${response.status}`;
manager.updateVoiceMcpStatus(session, serverName, {
connected: false,
lastError: errorMessage,
lastCallAt: new Date().toISOString()
});
throw new Error(errorMessage);
}
manager.updateVoiceMcpStatus(session, serverName, {
connected: true,
lastError: null,
lastCallAt: new Date().toISOString(),
lastConnectedAt: new Date().toISOString()
});
return {
ok: payload?.ok === false ? false : true,
output: Object.hasOwn(payload || {}, "output") ? payload?.output : payload,
error: payload?.error || null
};
} catch (error) { const message = String(error?.message || error); manager.updateVoiceMcpStatus(session, serverName, { connected: false, lastError: message, lastCallAt: new Date().toISOString() }); throw error; } }
