import { formatBehaviorMemoryFacts, formatConversationParticipantMemory, formatConversationWindows } from "../prompts/promptFormatters.ts"; import { buildWebToolRoutingPolicyLine, BROWSER_BROWSE_POLICY_LINE, BROWSER_SCREENSHOT_POLICY_LINE, CONVERSATION_SEARCH_POLICY_LINE, IMMEDIATE_WEB_SEARCH_POLICY_LINE } from "../prompts/toolPolicy.ts"; import { buildActiveMusicReplyGuidanceLines, MUSIC_ACTIVE_AUTONOMY_POLICY_LINE, MUSIC_REPLY_HANDOFF_POLICY_LINE } from "../prompts/voiceLivePolicy.ts"; import { buildSingleTurnPromptLog } from "../promptLogging.ts"; import { loadSharedVoiceMemoryContext } from "./voiceMemoryContext.ts"; import { REALTIME_CONTEXT_MEMBER_LIMIT, REALTIME_CONTEXT_TRANSCRIPT_MAX_CHARS, REALTIME_INSTRUCTION_REFRESH_DEBOUNCE_MS, STT_TRANSCRIPT_MAX_CHARS, VOICE_CHANNEL_EFFECT_EVENT_PROMPT_LIMIT, VOICE_MEMBERSHIP_EVENT_PROMPT_LIMIT } from "./voiceSessionManager.constants.ts"; import { formatVoiceChannelEffectSummary, inspectAsrTranscript } from "./voiceSessionHelpers.ts"; import { buildVoiceInstructions, isTransportOnlySession, shouldHandleRealtimeFunctionCalls, shouldRegisterRealtimeTools } from "./voiceConfigResolver.ts"; import { getVoiceStreamWatchSettings } from "../settings/agentStack.ts"; import { getScreenWatchCommentaryTier } from "../prompts/voiceAdmissionPolicy.ts"; import { buildSharedVoiceTurnContext, type SharedVoiceTurnContext } from "./voiceTurnContext.ts"; import type { RealtimeInstructionMemorySlice, RealtimeTurnContextRefreshState, VoiceRealtimeToolSettings, VoiceSession } from "./voiceSessionTypes.ts"; import { refreshRealtimeTools } from "./voiceToolCallInfra.ts"; import type { VoiceToolCallManager } from "./voiceToolCallTypes.ts"; import { providerSupports } from "./voiceModes.ts";
// Settings snapshot consulted while building instructions; null when none are available.
type InstructionSettings = VoiceRealtimeToolSettings | null;
// Minimal persistence-store surface this manager relies on: settings lookup, action
// logging, and (optionally) conversation/lookup search used for memory context.
interface InstructionStoreLike { getSettings: () => InstructionSettings; logAction: (entry: { kind: string; guildId?: string | null; channelId?: string | null; userId?: string | null; content: string; metadata?: Record<string, unknown>; }) => void; searchLookupContext?: (payload: { guildId: string; channelId: string | null; queryText: string; limit?: number; maxAgeHours?: number; }) => Promise<unknown[]> | unknown[]; searchConversationWindows?: (payload: { guildId: string; channelId: string | null; queryText: string; limit?: number; maxAgeHours?: number; before?: number; after?: number; }) => Promise<unknown[]> | unknown[]; }
// Screen/stream-watch notes surfaced into the prompt; `active` distinguishes a live
// watch from retained notes of an earlier one.
interface StreamWatchPromptContext { prompt?: string; notes?: string[]; active?: boolean; }
/**
 * Coerces an untyped value into a list of plain-object rows for prompt formatting.
 * A non-array input yields an empty list; array entries that are not plain objects
 * (null, primitives, nested arrays) are dropped.
 */
function toPromptRecordRows(rows: unknown): Array<Record<string, unknown>> {
  if (!Array.isArray(rows)) {
    return [];
  }
  const records: Array<Record<string, unknown>> = [];
  for (const entry of rows) {
    if (entry && typeof entry === "object" && !Array.isArray(entry)) {
      records.push(entry as Record<string, unknown>);
    }
  }
  return records;
}
// Screen-watch capability snapshot: `available` = can start now, `supported` = exists
// in this session at all, `reason` = why it is unavailable (when it is).
interface ScreenShareCapabilityLike { available?: boolean; supported?: boolean; reason?: string | null; }
// A human currently present in the voice channel.
interface VoiceChannelParticipant { userId: string; displayName: string; }
// A recent join/leave event rendered into the prompt, with age relative to "now".
interface VoiceMembershipPromptEntry { userId: string; displayName: string; eventType: string; at: number; ageMs: number; summary: string follows in the effect entry below. }
interface VoiceMembershipPromptEntry { userId: string; displayName: string; eventType: string; at: number; ageMs: number; }
// A recent voice-channel effect (soundboard sound, emoji reaction, animation) with a
// preformatted `summary` used for prompt rendering.
interface VoiceChannelEffectPromptEntry { userId: string; displayName: string; channelId: string; guildId: string; effectType: string; soundId: string | null; soundName: string | null; soundVolume: number | null; emoji: string | null; animationType: number | null; animationId: number | null; at: number; ageMs: number; summary: string; }
// Active command-only session lock: which speaker/domain/intent holds the floor and when it expires.
interface VoiceCommandStateLike { userId: string | null; domain: string | null; intent: string | null; startedAt: number; expiresAt: number; }
// Pending music disambiguation the model should resolve with the user (pick one of `options`).
interface MusicDisambiguationPromptContext { active?: boolean; action?: string | null; query?: string | null; options?: Array<{ title?: string; artist?: string; id?: string; }>; }
// Current music playback snapshot rendered into the "Music playback" prompt section.
interface MusicPromptContext { playbackState: "playing" | "paused" | "stopped" | "idle"; replyHandoffMode: "duck" | "pause" | null; currentTrack: { id: string | null; title: string; artists: string[] } | null; lastTrack: { id: string | null; title: string; artists: string[] } | null; queueLength: number; upcomingTracks: Array<{ id: string | null; title: string; artist: string | null }>; lastAction: "play_now" | "stop" | "pause" | "resume" | "skip" | null; lastQuery: string | null; }
// Argument shapes for the queue/prepare/refresh/build entry points below.
interface QueueRealtimeTurnContextRefreshArgs { session: VoiceSession; settings?: InstructionSettings; userId?: string | null; transcript?: string; captureReason?: string; }
interface PrepareRealtimeTurnContextArgs { session: VoiceSession; settings?: InstructionSettings; userId?: string | null; transcript?: string; captureReason?: string; }
interface RefreshRealtimeInstructionsArgs { session: VoiceSession; settings?: InstructionSettings; reason?: string; speakerUserId?: string | null; transcript?: string; memorySlice?: RealtimeInstructionMemorySlice | null; }
interface BuildRealtimeInstructionsArgs { session: VoiceSession; settings?: InstructionSettings; speakerUserId?: string | null; transcript?: string; memorySlice?: RealtimeInstructionMemorySlice | null; sharedTurnContext?: SharedVoiceTurnContext | null; }
interface BuildRealtimeMemorySliceArgs { session: VoiceSession; settings?: InstructionSettings; userId?: string | null; transcript?: string; }
// Host surface the InstructionManager needs on top of VoiceToolCallManager: store access,
// speaker-name resolution, and the per-session context getters used to build prompts.
type InstructionManagerHost = VoiceToolCallManager & { store: InstructionStoreLike; resolveVoiceSpeakerName: (session: VoiceSession, userId?: string | null) => string; getStreamWatchNotesForPrompt: ( session: VoiceSession, settings?: InstructionSettings ) => StreamWatchPromptContext | null; getVoiceScreenWatchCapability: (args?: { settings?: InstructionSettings; guildId?: string | null; channelId?: string | null; requesterUserId?: string | null; }) => ScreenShareCapabilityLike | null; getVoiceChannelParticipants: (session: VoiceSession) => VoiceChannelParticipant[]; getRecentVoiceMembershipEvents: ( session: VoiceSession, args?: { now?: number; maxItems?: number } ) => VoiceMembershipPromptEntry[]; getRecentVoiceChannelEffectEvents: ( session: VoiceSession, args?: { now?: number; maxItems?: number } ) => VoiceChannelEffectPromptEntry[]; ensureVoiceCommandState: (session: VoiceSession) => VoiceCommandStateLike | null; getMusicDisambiguationPromptContext: ( session: VoiceSession ) => MusicDisambiguationPromptContext | null; getMusicPromptContext: (session: VoiceSession) => MusicPromptContext | null; getSessionFactProfileSlice?: (payload: { session: VoiceSession; userId?: string | null; }) => { participantProfiles?: unknown[]; selfFacts?: unknown[]; loreFacts?: unknown[]; userFacts: unknown[]; relevantFacts: unknown[]; guidanceFacts?: unknown[]; }; };
export class InstructionManager { constructor(private readonly host: InstructionManagerHost) {}
private sanitizeRealtimeContextTranscript({ session, userId, transcript = "", maxChars, stage, captureReason = null }: { session: VoiceSession; userId?: string | null; transcript?: string; maxChars: number; stage: string; captureReason?: string | null; }) { const transcriptGuard = inspectAsrTranscript(transcript, maxChars); if (!transcriptGuard.malformed) return transcriptGuard.transcript;
this.store.logAction({
kind: "voice_runtime",
guildId: session.guildId,
channelId: session.textChannelId,
userId: String(userId || "").trim() || null,
content: "openai_realtime_turn_context_control_token_transcript_dropped",
metadata: {
sessionId: session.id,
stage,
captureReason: captureReason ? String(captureReason || "stream_end") : null,
transcript: transcriptGuard.transcript,
controlTokenCount: transcriptGuard.controlTokenCount,
reservedAudioMarkerCount: transcriptGuard.reservedAudioMarkerCount
}
});
return "";
}
queueRealtimeTurnContextRefresh({ session, settings, userId, transcript = "", captureReason = "stream_end" }: QueueRealtimeTurnContextRefreshArgs) { if (!session || session.ending) return; if (!providerSupports(session.mode || "", "updateInstructions")) return;
const pendingRefreshState = this.ensureTurnContextRefreshState(session);
pendingRefreshState.pending = {
settings: settings || session.settingsSnapshot || this.store.getSettings(),
userId: String(userId || "").trim() || null,
transcript: this.sanitizeRealtimeContextTranscript({
session,
userId,
transcript,
maxChars: REALTIME_CONTEXT_TRANSCRIPT_MAX_CHARS,
stage: "queue_realtime_turn_context_refresh",
captureReason
}),
captureReason: String(captureReason || "stream_end")
};
if (pendingRefreshState.inFlight) return;
pendingRefreshState.inFlight = true;
const runQueuedRefresh = async () => {
let nextRefresh = null;
try {
while (!session.ending) {
const queued = pendingRefreshState.pending;
pendingRefreshState.pending = null;
if (!queued) break;
await this.prepareRealtimeTurnContext({
session,
settings: queued.settings,
userId: queued.userId,
transcript: queued.transcript,
captureReason: queued.captureReason
});
}
} catch (error) {
this.store.logAction({
kind: "voice_error",
guildId: session.guildId,
channelId: session.textChannelId,
userId: this.host.client.user?.id || null,
content: `openai_realtime_turn_context_refresh_failed: ${String((error as Error)?.message || error)}`,
metadata: {
sessionId: session.id,
source: "queued_turn_context_refresh"
}
});
} finally {
pendingRefreshState.inFlight = false;
if (session.ending) {
if (session.realtimeTurnContextRefreshState === pendingRefreshState) {
session.realtimeTurnContextRefreshState = null;
}
} else if (pendingRefreshState.pending) {
nextRefresh = pendingRefreshState.pending;
} else if (session.realtimeTurnContextRefreshState === pendingRefreshState) {
session.realtimeTurnContextRefreshState = null;
}
}
if (nextRefresh) {
this.queueRealtimeTurnContextRefresh({
session,
settings: nextRefresh.settings,
userId: nextRefresh.userId,
transcript: nextRefresh.transcript,
captureReason: nextRefresh.captureReason
});
}
};
void runQueuedRefresh();
}
async prepareRealtimeTurnContext({ session, settings, userId, transcript = "", captureReason: _captureReason = "stream_end" }: PrepareRealtimeTurnContextArgs) { if (!session || session.ending) return; if (!providerSupports(session.mode || "", "updateInstructions")) return;
const normalizedTranscript = this.sanitizeRealtimeContextTranscript({
session,
userId,
transcript,
maxChars: REALTIME_CONTEXT_TRANSCRIPT_MAX_CHARS,
stage: "prepare_realtime_turn_context",
captureReason: _captureReason
});
const transportOnly = isTransportOnlySession({ session, settings });
const memorySlice = transportOnly
? null
: await this.buildRealtimeMemorySlice({
session,
settings,
userId,
transcript: normalizedTranscript
});
session.lastRealtimeMemorySlice = memorySlice;
await this.refreshRealtimeInstructions({
session,
settings,
reason: "turn_context",
speakerUserId: userId,
transcript: normalizedTranscript,
memorySlice
});
}
async buildRealtimeMemorySlice({ session, settings, userId, transcript = "" }: BuildRealtimeMemorySliceArgs): Promise { const normalizedTranscript = this.sanitizeRealtimeContextTranscript({ session, userId, transcript, maxChars: STT_TRANSCRIPT_MAX_CHARS, stage: "build_realtime_memory_slice" }); const normalizedUserId = String(userId || "").trim() || null; const loaded = await loadSharedVoiceMemoryContext({ searchConversationWindows: this.store.searchConversationWindows, getSessionFactProfileSlice: typeof this.host.getSessionFactProfileSlice === "function" ? ({ userId: memoryUserId }) => this.host.getSessionFactProfileSlice?.({ session, userId: memoryUserId }) || null : undefined, searchDurableFacts: typeof this.host.memory?.searchDurableFacts === "function" ? (payload) => this.host.memory.searchDurableFacts(payload) : null, loadBehavioralFactsForPrompt: typeof this.host.memory?.loadBehavioralFactsForPrompt === "function" ? async (payload) => await this.host.memory.loadBehavioralFactsForPrompt(payload) : null }, { session, settings: settings || session.settingsSnapshot || this.store.getSettings(), userId: normalizedUserId, transcript: normalizedTranscript, continuitySource: "voice_realtime_instruction_context", behavioralSource: "voice_realtime_behavioral_memory:instruction_refresh" }); this.store.logAction({ kind: "voice_runtime", guildId: session.guildId, channelId: session.textChannelId, userId: normalizedUserId || this.host.client.user?.id || null, content: "voice_realtime_instruction_memory_loaded", metadata: { sessionId: session.id, memorySource: "voice_realtime_instruction_context", transcriptChars: normalizedTranscript.length, continuityLoadMs: loaded.continuityLoadMs, behavioralMemoryLoadMs: loaded.behavioralMemoryLoadMs, totalLoadMs: loaded.totalLoadMs, usedCachedBehavioralFacts: loaded.usedCachedBehavioralFacts, participantProfileCount: Array.isArray(loaded.memorySlice.participantProfiles) ? 
loaded.memorySlice.participantProfiles.length : 0, userFactCount: Array.isArray(loaded.memorySlice.userFacts) ? loaded.memorySlice.userFacts.length : 0, relevantFactCount: Array.isArray(loaded.memorySlice.relevantFacts) ? loaded.memorySlice.relevantFacts.length : 0, guidanceFactCount: Array.isArray(loaded.memorySlice.guidanceFacts) ? loaded.memorySlice.guidanceFacts.length : 0, behavioralFactCount: Array.isArray(loaded.memorySlice.behavioralFacts) ? loaded.memorySlice.behavioralFacts.length : 0, recentConversationHistoryCount: Array.isArray(loaded.memorySlice.recentConversationHistory) ? loaded.memorySlice.recentConversationHistory.length : 0 } }); return loaded.memorySlice; }
scheduleRealtimeInstructionRefresh({ session, settings, reason = "voice_context_refresh", speakerUserId = null, transcript = "", memorySlice = null }: RefreshRealtimeInstructionsArgs) { if (!session || session.ending) return; if (!providerSupports(session.mode || "", "updateInstructions")) return;
if (session.realtimeInstructionRefreshTimer) {
clearTimeout(session.realtimeInstructionRefreshTimer);
session.realtimeInstructionRefreshTimer = null;
}
session.realtimeInstructionRefreshTimer = setTimeout(() => {
session.realtimeInstructionRefreshTimer = null;
this.spawnRealtimeInstructionRefresh({
session,
settings: settings || session.settingsSnapshot || this.store.getSettings(),
reason,
speakerUserId,
transcript,
memorySlice
});
}, REALTIME_INSTRUCTION_REFRESH_DEBOUNCE_MS);
}
private spawnRealtimeInstructionRefresh({
session,
settings,
reason = "voice_context_refresh",
speakerUserId = null,
transcript = "",
memorySlice = null
}: RefreshRealtimeInstructionsArgs) {
void this.refreshRealtimeInstructions({
session,
settings,
reason,
speakerUserId,
transcript,
memorySlice
}).catch((error: unknown) => {
this.store.logAction({
kind: "voice_error",
guildId: session.guildId,
channelId: session.textChannelId,
userId: speakerUserId || this.host.client.user?.id || null,
content: openai_realtime_instruction_refresh_failed: ${String((error as Error)?.message || error)},
metadata: {
sessionId: session.id,
reason: String(reason || "voice_context_refresh")
}
});
});
}
async refreshRealtimeInstructions({ session, settings, reason = "voice_context_refresh", speakerUserId = null, transcript = "", memorySlice = null }: RefreshRealtimeInstructionsArgs) { if (!session || session.ending) return; if (!providerSupports(session.mode || "", "updateInstructions")) return; if (!session.realtimeClient) return; const updateInstructions = "updateInstructions" in session.realtimeClient && typeof session.realtimeClient.updateInstructions === "function" ? session.realtimeClient.updateInstructions.bind(session.realtimeClient) : null; if (!updateInstructions) return;
const resolvedSettings = settings || session.settingsSnapshot || this.store.getSettings();
if (shouldRegisterRealtimeTools({ session, settings: resolvedSettings })) {
await refreshRealtimeTools(this.host, {
session,
settings: resolvedSettings,
reason
});
}
const effectiveMemorySlice = memorySlice ?? session.lastRealtimeMemorySlice ?? null;
session.lastRealtimeMemorySlice = effectiveMemorySlice;
const sharedTurnContext = buildSharedVoiceTurnContext(this.host, {
session,
settings: resolvedSettings,
speakerUserId,
maxParticipants: REALTIME_CONTEXT_MEMBER_LIMIT,
maxMembershipEvents: VOICE_MEMBERSHIP_EVENT_PROMPT_LIMIT,
maxVoiceEffects: VOICE_CHANNEL_EFFECT_EVENT_PROMPT_LIMIT
});
const instructions = this.buildRealtimeInstructions({
session,
settings: resolvedSettings,
speakerUserId,
transcript,
memorySlice: effectiveMemorySlice,
sharedTurnContext
});
if (!instructions) return;
if (instructions === session.lastRealtimeInstructions) return;
try {
updateInstructions(instructions);
session.lastRealtimeInstructions = instructions;
session.lastRealtimeInstructionsAt = Date.now();
this.store.logAction({
kind: "voice_runtime",
guildId: session.guildId,
channelId: session.textChannelId,
userId: this.host.client.user?.id || null,
content: "openai_realtime_instructions_updated",
metadata: {
sessionId: session.id,
reason: String(reason || "voice_context_refresh"),
speakerUserId: speakerUserId ? String(speakerUserId) : null,
participantCount: sharedTurnContext.participantRoster.length,
transcriptChars: transcript ? String(transcript).length : 0,
userFactCount: Array.isArray(effectiveMemorySlice?.userFacts) ? effectiveMemorySlice.userFacts.length : 0,
relevantFactCount: Array.isArray(effectiveMemorySlice?.relevantFacts) ? effectiveMemorySlice.relevantFacts.length : 0,
conversationWindowCount: Array.isArray(effectiveMemorySlice?.recentConversationHistory)
? effectiveMemorySlice.recentConversationHistory.length
: 0,
toolNames: Array.isArray(session.realtimeToolDefinitions)
? session.realtimeToolDefinitions.map((tool) => String(tool?.name || "")).filter(Boolean)
: [],
recentToolOutcomeCount: sharedTurnContext.recentToolOutcomes.length,
nativeDiscordSharerCount: sharedTurnContext.nativeDiscordSharers.length,
screenWatchAvailable: sharedTurnContext.screenWatchCapability.available,
screenWatchActive: Boolean(sharedTurnContext.streamWatchNotes?.active),
instructionsChars: instructions.length,
replyPrompts: buildSingleTurnPromptLog({
systemPrompt: instructions,
userPrompt: ""
})
}
});
} catch (error) {
this.store.logAction({
kind: "voice_error",
guildId: session.guildId,
channelId: session.textChannelId,
userId: this.host.client.user?.id || null,
content: `openai_realtime_instruction_update_failed: ${String((error as Error)?.message || error)}`,
metadata: {
sessionId: session.id,
reason: String(reason || "voice_context_refresh")
}
});
}
}
buildRealtimeInstructions({
session,
settings,
speakerUserId = null,
transcript = "",
memorySlice = null,
sharedTurnContext = null
}: BuildRealtimeInstructionsArgs) {
const baseInstructions = String(session?.baseVoiceInstructions || buildVoiceInstructions(settings)).trim();
const speakerName = this.host.resolveVoiceSpeakerName(session, speakerUserId);
const normalizedTranscript = this.sanitizeRealtimeContextTranscript({
session,
userId: speakerUserId,
transcript,
maxChars: REALTIME_CONTEXT_TRANSCRIPT_MAX_CHARS,
stage: "build_realtime_instructions"
});
const effectiveMemorySlice = memorySlice ?? session.lastRealtimeMemorySlice ?? null;
const resolvedTurnContext = sharedTurnContext || buildSharedVoiceTurnContext(this.host, {
session,
settings,
speakerUserId,
maxParticipants: REALTIME_CONTEXT_MEMBER_LIMIT,
maxMembershipEvents: VOICE_MEMBERSHIP_EVENT_PROMPT_LIMIT,
maxVoiceEffects: VOICE_CHANNEL_EFFECT_EVENT_PROMPT_LIMIT
});
const streamWatchNotes = resolvedTurnContext.streamWatchNotes;
const hasScreenFrameContext =
Array.isArray(streamWatchNotes?.notes) && streamWatchNotes.notes.length > 0;
const hasActiveScreenFrameContext = hasScreenFrameContext && Boolean(streamWatchNotes?.active);
const hasRecentScreenFrameMemory = hasScreenFrameContext && !streamWatchNotes?.active;
const screenShareCapability = resolvedTurnContext.screenWatchCapability;
const nativeDiscordSharers = resolvedTurnContext.nativeDiscordSharers;
const participants = resolvedTurnContext.participantRoster;
const recentMembershipEvents = resolvedTurnContext.recentMembershipEvents;
const recentVoiceChannelEffects = resolvedTurnContext.recentVoiceEffectEvents;
const guild = this.host.client.guilds.cache.get(String(session?.guildId || "")) || null;
const voiceChannel = guild?.channels?.cache?.get(String(session?.voiceChannelId || "")) || null;
const roster =
participants.length > 0
? participants
.slice(0, REALTIME_CONTEXT_MEMBER_LIMIT)
.map((participant) => participant.displayName)
.join(", ")
: "unknown";
const membershipSummary = recentMembershipEvents.length
? recentMembershipEvents
.map((entry) => {
const action = entry.eventType === "join" ? "joined" : "left";
return ${entry.displayName} ${action} (${Math.max(0, Math.round(entry.ageMs))}ms ago);
})
.join(" | ")
: "none";
const effectSummary = recentVoiceChannelEffects.length
? recentVoiceChannelEffects
.map((entry) => formatVoiceChannelEffectSummary(entry, { includeTiming: true }))
.join(" | ")
: "none";
const participantMemory = formatConversationParticipantMemory({
participantProfiles: toPromptRecordRows(effectiveMemorySlice?.participantProfiles),
selfFacts: toPromptRecordRows(effectiveMemorySlice?.selfFacts),
loreFacts: toPromptRecordRows(effectiveMemorySlice?.loreFacts)
});
const recentConversationHistory = formatConversationWindows(effectiveMemorySlice?.recentConversationHistory);
const guidanceFacts = formatBehaviorMemoryFacts(effectiveMemorySlice?.guidanceFacts, 8);
const behavioralFacts = formatBehaviorMemoryFacts(effectiveMemorySlice?.behavioralFacts, 8);
const compactedSessionSummary = resolvedTurnContext.compactedSessionSummary;
const recentToolOutcomeLines = resolvedTurnContext.recentToolOutcomeLines;
const activeVoiceCommandState = this.host.ensureVoiceCommandState(session);
const musicDisambiguation = this.host.getMusicDisambiguationPromptContext(session);
const sections = [baseInstructions];
sections.push(
[
"Live server context:",
`- Server: ${String(guild?.name || "unknown").trim() || "unknown"}`,
`- Voice channel: ${String(voiceChannel?.name || "unknown").trim() || "unknown"}`,
`- Humans currently in channel: ${roster}`,
`- Recent membership changes: ${membershipSummary}`,
`- Recent voice effects: ${effectSummary}`,
"- If someone recently joined, a quick natural greeting is usually good.",
"- If someone recently left, a brief natural goodbye/acknowledgement is usually good."
].join("
") );
if (speakerName || normalizedTranscript) {
sections.push(
[
"Current turn context:",
speakerName ? `- Active speaker: ${speakerName}` : null,
normalizedTranscript ? `- Latest speaker transcript: ${normalizedTranscript}` : null
]
.filter(Boolean)
.join("
") ); }
if (
Array.isArray(effectiveMemorySlice?.participantProfiles) && effectiveMemorySlice.participantProfiles.length > 0 ||
Array.isArray(effectiveMemorySlice?.selfFacts) && effectiveMemorySlice.selfFacts.length > 0 ||
Array.isArray(effectiveMemorySlice?.loreFacts) && effectiveMemorySlice.loreFacts.length > 0
) {
sections.push(
[
"People in this conversation:",
participantMemory
]
.filter(Boolean)
.join("
") ); }
if (Array.isArray(effectiveMemorySlice?.guidanceFacts) && effectiveMemorySlice.guidanceFacts.length > 0) {
sections.push(
[
"Behavior guidance:",
"- These are standing guidance facts that should shape how you act in this conversation.",
guidanceFacts
].join("
") ); }
if (Array.isArray(effectiveMemorySlice?.recentConversationHistory) && effectiveMemorySlice.recentConversationHistory.length > 0) {
sections.push(
[
"Recent conversation continuity:",
"- These windows come from persisted shared text/voice history.",
recentConversationHistory
].join("
") ); }
if (compactedSessionSummary?.text) {
sections.push(
[
"Session conversation summary:",
`- ${compactedSessionSummary.text}`
].join("
") ); }
if (Array.isArray(effectiveMemorySlice?.behavioralFacts) && effectiveMemorySlice.behavioralFacts.length > 0) {
sections.push(
[
"Relevant behavioral memory:",
"- These behavior memories were retrieved because they match the current turn. Follow them when relevant.",
behavioralFacts
].join("
") ); }
if (recentToolOutcomeLines.length > 0) {
sections.push(
[
"Recent tool outcomes:",
"- Treat these as recent room state and prior action context.",
...recentToolOutcomeLines.map((line) => `- ${line}`)
].join("
") ); }
if (activeVoiceCommandState || musicDisambiguation) {
sections.push(
[
"Active command session:",
activeVoiceCommandState?.userId
? `- Locked speaker user ID: ${activeVoiceCommandState.userId}`
: null,
activeVoiceCommandState?.domain
? `- Domain: ${activeVoiceCommandState.domain}`
: null,
activeVoiceCommandState?.intent
? `- Intent: ${activeVoiceCommandState.intent}`
: null,
activeVoiceCommandState
? `- Command session expires in about ${Math.max(0, Math.round((activeVoiceCommandState.expiresAt - Date.now()) / 1000))} seconds.`
: null,
"- In command-only mode, a follow-up from the locked speaker does not need the wake word again.",
musicDisambiguation?.active
? `- Pending music action: ${musicDisambiguation.action}`
: null,
musicDisambiguation?.query
? `- Pending music query: ${musicDisambiguation.query}`
: null,
...(musicDisambiguation?.options || []).slice(0, 5).map((option, index) =>
`- Music option ${index + 1}: ${String(option?.title || "").trim()} - ${String(option?.artist || "").trim()} [${String(option?.id || "").trim()}]`
)
]
.filter(Boolean)
.join("
") ); }
const musicContext = resolvedTurnContext.musicContext;
if (
musicContext && (
musicContext.currentTrack?.title ||
musicContext.lastTrack?.title ||
musicContext.queueLength > 0 ||
musicContext.lastAction ||
musicContext.lastQuery
)
) {
const musicDisplayState =
musicContext.playbackState === "idle" &&
(musicContext.currentTrack?.title || musicContext.lastTrack?.title)
? "stopped"
: musicContext.playbackState;
const musicLines = ["Music playback:"];
musicLines.push(`- Status: ${musicDisplayState}`);
if (musicContext.currentTrack) {
const artists = musicContext.currentTrack.artists.length
? musicContext.currentTrack.artists.join(", ")
: "unknown artist";
musicLines.push(
`- Current song: ${musicContext.currentTrack.title} by ${artists} (${musicDisplayState})${musicContext.currentTrack.id ? ` [selection_id: ${musicContext.currentTrack.id}]` : ""}`
);
}
if (
musicContext.lastTrack && (
!musicContext.currentTrack ||
musicContext.currentTrack.title !== musicContext.lastTrack.title ||
musicContext.currentTrack.artists.join(" | ") !== musicContext.lastTrack.artists.join(" | ")
)
) {
const artists = musicContext.lastTrack.artists.length
? musicContext.lastTrack.artists.join(", ")
: "unknown artist";
musicLines.push(
`- Last played: ${musicContext.lastTrack.title} by ${artists}${musicContext.lastTrack.id ? ` [selection_id: ${musicContext.lastTrack.id}]` : ""}`
);
}
if (musicContext.queueLength > 0) {
musicLines.push(`- Queue: ${musicContext.queueLength} track(s)`);
for (const [index, track] of musicContext.upcomingTracks.entries()) {
musicLines.push(
`- Queue item ${index + 1}: ${track.title}${track.artist ? ` - ${track.artist}` : ""}${track.id ? ` [selection_id: ${track.id}]` : ""}`
);
}
}
if (musicContext.lastAction) {
musicLines.push(`- Last action: ${musicContext.lastAction}`);
}
if (musicContext.lastQuery) {
musicLines.push(`- Last music query: ${musicContext.lastQuery}`);
}
if (musicContext.replyHandoffMode === "pause") {
musicLines.push("- Your next spoken reply can take the floor: music is paused now and auto-resumes when you finish or stay silent.");
} else if (musicContext.replyHandoffMode === "duck") {
musicLines.push("- Your next spoken reply can take the floor: music stays live and ducks under your voice, then unducks when you finish.");
}
musicLines.push(...buildActiveMusicReplyGuidanceLines(musicContext));
sections.push(musicLines.join("
")); }
const configuredTools = Array.isArray(session.realtimeToolDefinitions) ? session.realtimeToolDefinitions : [];
if (shouldHandleRealtimeFunctionCalls({ session, settings }) && configuredTools.length > 0) {
const localToolNames = configuredTools
.filter((tool) => tool?.toolType !== "mcp")
.map((tool) => String(tool?.name || "").trim())
.filter(Boolean)
.slice(0, 16);
const localToolNameSet = new Set(localToolNames);
const hasWebSearchTool = localToolNameSet.has("web_search");
const hasWebScrapeTool = localToolNameSet.has("web_scrape");
const hasBrowserBrowseTool = localToolNameSet.has("browser_browse");
const hasConversationSearchTool = localToolNameSet.has("conversation_search");
const hasMemoryWriteTool = localToolNameSet.has("memory_write");
const mcpToolNames = configuredTools
.filter((tool) => tool?.toolType === "mcp")
.map((tool) => String(tool?.name || "").trim())
.filter(Boolean)
.slice(0, 16);
sections.push(
[
"Tooling policy:",
localToolNames.length > 0 ? `- Local tools: ${localToolNames.join(", ")}` : null,
mcpToolNames.length > 0 ? `- MCP tools: ${mcpToolNames.join(", ")}` : null,
"- Use tools when they improve factuality or action execution. Always call the tool — never just say you will.",
hasWebSearchTool || hasWebScrapeTool
? `- ${buildWebToolRoutingPolicyLine({ includeBrowserBrowse: hasBrowserBrowseTool })}`
: hasBrowserBrowseTool
? `- ${BROWSER_BROWSE_POLICY_LINE}`
: null,
hasBrowserBrowseTool ? `- ${BROWSER_SCREENSHOT_POLICY_LINE}` : null,
hasWebSearchTool ? `- ${IMMEDIATE_WEB_SEARCH_POLICY_LINE} Do not respond with only audio saying you will search.` : null,
hasConversationSearchTool ? `- ${CONVERSATION_SEARCH_POLICY_LINE}` : null,
hasMemoryWriteTool ? "- For memory writes, only store concise durable facts and avoid secrets. Write facts from your own perspective — use 'me'/'my' instead of your name." : null,
"- For music controls, use music_play to start or replace playback now. It searches internally and may return disambiguation options.",
"- If music_play returns choices, ask which one they want and then call music_play again with selection_id.",
"- For YouTube video playback, use video_play. It resolves YouTube results and uses outbound stream publish when that runtime path is available.",
"- If video_play returns choices, ask which one they want and then call video_play again with selection_id.",
"- Omit selection_id unless you are reusing an exact one already shown in prompt context or a prior tool result. Never invent placeholder or markup tokens.",
"- Use music_search only for explicit browsing requests or when the user wants options. Ordinary play and queue requests can resolve directly from query text.",
"- Use video_search only for explicit video options. If thumbnails, page layout, or browsing the YouTube site would help, browser_browse may fit better.",
"- For a fresh play request, pass query to music_play. For a followup choice after disambiguation, call music_play with selection_id.",
"- For a fresh video request, pass query to video_play. For a followup choice after disambiguation, call video_play with selection_id.",
"- If Music playback context already shows a selection_id for the exact track you want, reuse that selection_id with music_play and include the matching query text instead of re-searching.",
"- Use music_queue_next to place a track after the current one and music_queue_add to append. Both can take direct query text or exact prior IDs.",
"- For requests like \"play X, then queue Y\", call music_play for X first and music_queue_next for Y second in the same tool response.",
"- Do not claim a track is queued or added until music_queue_next or music_queue_add succeeds.",
"- Use media_stop to stop playback.",
"- Do not emulate play-now by chaining music_queue_add and media_skip.",
"- Do not use media_skip as a substitute for media_stop.",
`- ${MUSIC_ACTIVE_AUTONOMY_POLICY_LINE}`,
`- ${MUSIC_REPLY_HANDOFF_POLICY_LINE}`,
"- If a tool fails, explain the failure briefly and continue naturally."
]
.filter(Boolean)
.join("
") ); }
const rawScreenShareReason = String(screenShareCapability?.reason || "").trim().toLowerCase();
const screenShareReason = rawScreenShareReason || "unavailable";
const screenShareAvailable = Boolean(screenShareCapability?.available);
const screenShareSupported = Boolean(screenShareCapability?.supported);
const commentaryEagerness = Math.max(0, Math.min(100,
Number(getVoiceStreamWatchSettings(settings).commentaryEagerness) || 60
));
if (hasActiveScreenFrameContext) {
sections.push(
[
"Visual context:",
"- You currently have screen-watch frame snapshots for this conversation.",
"- You may comment only on what those snapshots show.",
"- Do not imply you have a continuous live view beyond the provided frame context.",
`Screen watch commentary eagerness: ${commentaryEagerness}/100.`,
getScreenWatchCommentaryTier(commentaryEagerness)
].join("
")
);
} else if (hasRecentScreenFrameMemory) {
sections.push(
[
"Visual context:",
"- You do not currently see the user's screen.",
"- You do retain notes from an earlier screen-watch in this conversation.",
"- If asked, answer only from those earlier notes and make clear they are not a live view."
].join("
")
);
} else {
if (screenShareAvailable) {
sections.push(
[
"Visual context:",
"- You do not currently see the user's screen.",
"- Do not claim to see, watch, or react to on-screen content until actual frame context is provided.",
"- If the speaker asks you to see/watch/share their screen or stream, call start_screen_watch.",
"- The runtime chooses the best available watch method automatically."
].join("
")
);
} else if (screenShareSupported) {
sections.push(
[
"Visual context:",
"- You do not currently see the user's screen.",
"- Screen watch exists but is unavailable right now.",
- Current unavailability reason: ${screenShareReason}.,
"- If asked, say screen watch is unavailable right now.",
"- Do not claim to see or watch the screen."
].join("
")
);
} else {
sections.push(
[
"Visual context:",
"- You do not currently see the user's screen.",
"- Do not claim to see, watch, or react to on-screen content.",
"- If asked about screen watching, explain that you need active frame context before you can comment on what is on screen."
].join("
")
);
}
}
if (nativeDiscordSharers.length > 0) {
const nativeStreamActionLine = screenShareAvailable
? "- If watching one of them would help, call start_screen_watch to request actual frame context."
: screenShareSupported
? "- Screen watch start is unavailable right now, so do not call start_screen_watch yet."
: "- Screen watch is unavailable in this session, so do not call start_screen_watch.";
const nativeStreamTargetLine = screenShareAvailable
? "- If more than one share is live and you want a specific one, pass { target: \"display name\" }."
: "- If more than one share is live, keep track of who is sharing but do not request a watch until it becomes available.";
sections.push(
[
"Native Discord streams live right now:",
...nativeDiscordSharers.slice(0, 6).map((entry) => {
const details = [
entry.streamType,
entry.codec,
entry.width && entry.height ? `${entry.width}x${entry.height}` : null
]
.filter(Boolean)
.join(", ");
return `- ${entry.displayName}${details ? ` (${details})` : ""}`;
}),
"- You do not automatically see those shares just because they are active.",
nativeStreamActionLine,
nativeStreamTargetLine
].join("
") ); }
if (hasScreenFrameContext) {
sections.push(
[
hasActiveScreenFrameContext ? "Screen-watch frame context:" : "Recent screen-watch memory:",
`- Guidance: ${String(streamWatchNotes?.prompt || "").trim()}`,
...(streamWatchNotes?.notes || []).slice(-8).map((note) => `- ${note}`),
hasActiveScreenFrameContext
? "- Treat these notes as snapshots, not a continuous feed."
: "- Treat these notes as earlier snapshots, not a current live view."
]
.filter(Boolean)
.join("
") ); }
return sections.join("
").slice(0, 5200); }
private ensureTurnContextRefreshState( session: VoiceSession ): RealtimeTurnContextRefreshState { const current = session.realtimeTurnContextRefreshState; if (current && typeof current === "object") { return current; } const nextState: RealtimeTurnContextRefreshState = { inFlight: false, pending: null }; session.realtimeTurnContextRefreshState = nextState; return nextState; }
private get store() { return this.host.store; } }
