JCV's Portfolio

src/voice/elevenLabsRealtimeClient.ts

import { EventEmitter } from "node:events"; import WebSocket from "ws"; import { buildCommonRealtimeState, closeRealtimeSocket, compactObject, handleRealtimeSocketClose, handleRealtimeSocketError, markRealtimeConnected, openRealtimeSocket, safeJsonPreview, sendRealtimePayload } from "./realtimeClientCore.ts"; import type { RealtimeInterruptAcceptanceMode } from "./realtimeInterruptAcceptance.ts"; import { normalizeElevenLabsBaseUrl } from "./realtimeProviderNormalization.ts";

/**
 * ElevenLabs WebSocket TTS streaming client.
 *
 * This is a TTS-only realtime client: the brain LLM (Claude etc.) generates
 * text via the normal generation pipeline, and this client converts it to
 * streaming audio via the ElevenLabs WebSocket text-to-speech API.
 *
 * Protocol:
 *   Connect -> wss://{host}/v1/text-to-speech/{voice_id}/stream-input?...
 *   Init    -> { text: " ", voice_settings, generation_config }
 *   Text    -> { text: "...", flush: true }
 *   Recv    <- { audio: "<base64>" } | { isFinal: true }
 *   Close   -> { text: "" }
 *
 * Audio output is emitted as "audio_delta" events (base64-encoded PCM),
 * which sessionLifecycle.ts pipes to clankvox -> Discord.
 */

/**
 * Fallback `chunk_length_schedule` sent in the init message's
 * `generation_config` when the caller does not supply its own schedule.
 * NOTE(review): the values are presumably buffered-character thresholds as
 * defined by the ElevenLabs stream-input API — confirm against provider docs.
 */
const DEFAULT_CHUNK_SCHEDULE: number[] = [
  50,
  120,
  200,
  260
];

/**
 * Streaming text-to-speech client for the ElevenLabs `stream-input` WebSocket.
 *
 * The client is output-only: text goes in via `requestPlaybackUtterance` /
 * `requestTextUtterance`, and audio comes back as events on this emitter:
 *
 *   - "audio_delta"   (audio: string)      base64 audio chunk from the server
 *   - "response_done" (event: object)      server sent `{ isFinal: true }`
 *   - "error_event"   ({ message, event }) server sent an error/message string
 *
 * The audio-input methods exist only to satisfy the shared realtime client
 * interface and are no-ops.
 */
export class ElevenLabsRealtimeClient extends EventEmitter {
  // Connection/bookkeeping fields. Several of these are also handed to the
  // shared helpers from realtimeClientCore (which receive `this`), so their
  // names must stay stable.
  apiKey;
  baseUrl;
  logger;
  ws: WebSocket | null;
  connectedAt;
  lastEventAt;
  lastError;
  sessionId;
  lastCloseCode;
  lastCloseReason;
  lastOutboundEventType;
  lastOutboundEventAt;
  lastOutboundEvent;
  recentOutboundEvents;
  sessionConfig;
  activeResponseId;
  activeResponseStatus;
  audioBase64Buffer: Buffer | null;

  private _responseInProgress = false;
  private _utteranceCounter = 0;

  /**
   * @param apiKey  ElevenLabs API key; coerced to a trimmed string.
   * @param baseUrl Optional API base URL, passed through
   *                `normalizeElevenLabsBaseUrl`.
   * @param logger  Optional logging callback; anything that is not a function
   *                is treated as "no logger".
   */
  constructor({ apiKey, baseUrl = null, logger = null }) {
    super();
    this.apiKey = String(apiKey || "").trim();
    this.baseUrl = normalizeElevenLabsBaseUrl(baseUrl);
    this.logger = typeof logger === "function" ? logger : null;
    this.ws = null;
    this.connectedAt = 0;
    this.lastEventAt = 0;
    this.lastError = null;
    this.sessionId = null;
    this.lastCloseCode = null;
    this.lastCloseReason = null;
    this.lastOutboundEventType = null;
    this.lastOutboundEventAt = 0;
    this.lastOutboundEvent = null;
    this.recentOutboundEvents = [];
    this.sessionConfig = null;
    this.activeResponseId = null;
    this.activeResponseStatus = null;
    this.audioBase64Buffer = null;
  }

  /**
   * Open the TTS WebSocket (if not already open), wire up socket handlers,
   * record the session configuration and send the init message.
   *
   * @param voiceId             Required voice id; becomes part of the URL path.
   * @param model               Sent as the `model_id` query parameter.
   * @param outputFormat        Sent as the `output_format` query parameter.
   * @param outputSampleRateHz  Stored in `sessionConfig` (falls back to 24000).
   * @param voiceSettings       Optional settings forwarded as `voice_settings`.
   * @param chunkLengthSchedule Forwarded as `generation_config.chunk_length_schedule`
   *                            after dropping non-finite entries.
   * @returns The client state snapshot from `getState()`.
   * @throws Error when the API key or voice id is missing. Failures from
   *         `openRealtimeSocket` or the init `send` also propagate.
   */
  async connect({
    voiceId = "",
    model = "eleven_multilingual_v2",
    outputFormat = "pcm_24000",
    outputSampleRateHz = 24000,
    voiceSettings = null as { stability?: number; similarity_boost?: number; style?: number; speed?: number } | null,
    chunkLengthSchedule = DEFAULT_CHUNK_SCHEDULE as number[]
  } = {}) {
    if (!this.apiKey) {
      throw new Error("Missing ELEVENLABS_API_KEY for ElevenLabs realtime TTS.");
    }
    const resolvedVoiceId = String(voiceId || "").trim();
    if (!resolvedVoiceId) {
      throw new Error("ElevenLabs realtime TTS requires a voice_id.");
    }

    // Already connected: connecting again is a no-op.
    if (this.ws && this.ws.readyState === WebSocket.OPEN) {
      return this.getState();
    }

    const resolvedModel = String(model || "eleven_multilingual_v2").trim();
    const resolvedOutputFormat = String(outputFormat || "pcm_24000").trim();

    // Build WebSocket URL with query params
    const httpBaseUrl = normalizeElevenLabsBaseUrl(this.baseUrl);
    const wsBaseUrl = httpBaseUrl.replace(/^https:/, "wss:").replace(/^http:/, "ws:");
    const wsUrl = new URL(`/v1/text-to-speech/${encodeURIComponent(resolvedVoiceId)}/stream-input`, wsBaseUrl);
    wsUrl.searchParams.set("model_id", resolvedModel);
    wsUrl.searchParams.set("output_format", resolvedOutputFormat);
    // Keep the connection alive for the entire voice session.  ElevenLabs
    // bills per character, not per connection time, so an idle WS is free.
    // 3600s (1 hour) prevents the idle-timeout disconnect that previously
    // caused jarring audio cutouts during screen-watch and listen-only periods.
    wsUrl.searchParams.set("inactivity_timeout", "3600");

    const ws = await openRealtimeSocket({
      url: wsUrl.toString(),
      headers: {
        "xi-api-key": this.apiKey
      },
      timeoutMessage: "Timed out connecting to ElevenLabs TTS WebSocket after 10000ms.",
      connectErrorPrefix: "ElevenLabs TTS WebSocket connection failed"
    });
    markRealtimeConnected(this, ws);

    ws.on("message", (payload) => {
      this.lastEventAt = Date.now();
      this.handleIncoming(payload);
    });

    ws.on("error", (error) => {
      handleRealtimeSocketError(this, error, {
        logEvent: "elevenlabs_realtime_ws_error"
      });
    });

    ws.on("close", (code, reasonBuffer) => {
      handleRealtimeSocketClose(this, code, reasonBuffer, {
        logEvent: "elevenlabs_realtime_ws_closed"
      });
    });

    this.sessionConfig = {
      voiceId: resolvedVoiceId,
      model: resolvedModel,
      outputFormat: resolvedOutputFormat,
      outputSampleRateHz: Number(outputSampleRateHz) || 24000,
      // Copy so later caller-side mutation cannot alter the stored config.
      voiceSettings: voiceSettings && typeof voiceSettings === "object"
        ? { ...voiceSettings }
        : null,
      chunkLengthSchedule: Array.isArray(chunkLengthSchedule)
        ? chunkLengthSchedule.map(Number).filter(Number.isFinite)
        : DEFAULT_CHUNK_SCHEDULE.slice()
    };

    // Initialize the connection: first message must have text=" " (space)
    const initPayload: Record<string, unknown> = {
      text: " "
    };
    if (this.sessionConfig.voiceSettings) {
      initPayload.voice_settings = compactObject(this.sessionConfig.voiceSettings);
    }
    if (this.sessionConfig.chunkLengthSchedule.length > 0) {
      initPayload.generation_config = {
        chunk_length_schedule: this.sessionConfig.chunkLengthSchedule
      };
    }
    this.send(initPayload, "init");

    return this.getState();
  }

  /**
   * Convert a raw WebSocket frame into text. Handles every shape a frame may
   * arrive in: a string, a single Buffer, an ArrayBuffer, or an array of
   * Buffer fragments. A plain `String(...)` coercion would corrupt the last
   * two (comma-joined fragments / "[object ArrayBuffer]").
   */
  private static decodeIncomingPayload(payload: unknown): string {
    if (typeof payload === "string") return payload;
    if (Buffer.isBuffer(payload)) return payload.toString("utf8");
    if (payload instanceof ArrayBuffer) return Buffer.from(payload).toString("utf8");
    if (Array.isArray(payload) && payload.every((part) => Buffer.isBuffer(part))) {
      return Buffer.concat(payload).toString("utf8");
    }
    return String(payload || "");
  }

  /**
   * Parse one server frame and translate it into emitter events.
   * Frames that are not valid JSON objects are ignored silently.
   *
   * @param payload Raw frame as delivered by the socket "message" event.
   */
  handleIncoming(payload: unknown) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(ElevenLabsRealtimeClient.decodeIncomingPayload(payload));
    } catch {
      return;
    }

    if (!parsed || typeof parsed !== "object") return;
    const event = parsed as Record<string, unknown>;

    // Audio output chunk: { audio: "<base64>", alignment?: ..., normalizedAlignment?: ... }
    if (typeof event.audio === "string" && event.audio) {
      this.emit("audio_delta", event.audio);
      return;
    }

    // Final marker: { isFinal: true }
    if (event.isFinal === true) {
      this._responseInProgress = false;
      this.activeResponseStatus = "completed";
      this.emit("response_done", event);
      return;
    }

    // Error from the server
    if (typeof event.error === "string" || typeof event.message === "string") {
      const errorMessage = String(event.error || event.message || "Unknown ElevenLabs TTS error");
      this.lastError = errorMessage;
      this.log("warn", "elevenlabs_realtime_error_event", {
        error: errorMessage,
        event: safeJsonPreview(event)
      });
      // Emitted as "error_event" (not "error") so an unhandled listener set
      // does not make EventEmitter throw.
      this.emit("error_event", {
        message: errorMessage,
        event
      });
    }
  }

  /**
   * Send text to be spoken via the WebSocket.
   * This is the primary method for TTS-only mode.
   * The text is sent with flush=true so ElevenLabs generates audio immediately
   * for whatever text has been buffered.
   *
   * Blank/whitespace-only text is ignored. If `send` throws (e.g. because the
   * socket is not open), the response-tracking state is restored to what it
   * was before the call and the error is rethrown, so the client never
   * reports an in-progress response for text that was not transmitted.
   *
   * @param promptText Text to synthesize.
   */
  requestPlaybackUtterance(promptText: string) {
    const text = String(promptText || "").trim();
    if (!text) return;

    const previous = {
      inProgress: this._responseInProgress,
      responseId: this.activeResponseId,
      status: this.activeResponseStatus
    };

    // The counter is intentionally not rolled back on failure, so utterance
    // ids are never reused.
    this._utteranceCounter += 1;
    const utteranceId = `el_utt_${this._utteranceCounter}`;
    this._responseInProgress = true;
    this.activeResponseId = utteranceId;
    this.activeResponseStatus = "in_progress";

    try {
      // Send text with flush to trigger immediate generation.
      // ElevenLabs expects text to end with a trailing space for best continuity.
      this.send({
        text: text + " ",
        flush: true
      }, "text");
    } catch (error) {
      this._responseInProgress = previous.inProgress;
      this.activeResponseId = previous.responseId;
      this.activeResponseStatus = previous.status;
      throw error;
    }
  }

  /**
   * For TTS-only providers, requestTextUtterance is identical to
   * requestPlaybackUtterance — we just render the text as speech.
   */
  requestTextUtterance(promptText: string) {
    this.requestPlaybackUtterance(promptText);
  }

  // --- Methods required by the interface but mostly no-ops for TTS-only ---

  /**
   * ElevenLabs TTS doesn't accept audio input. No-op.
   */
  appendInputAudioPcm(_audioBuffer: Buffer) {
    // ElevenLabs TTS WebSocket is output-only — audio input goes through
    // a separate ASR bridge (OpenAI transcription, etc.)
  }

  /** No-op for TTS-only provider. */
  appendInputAudioBase64(_audioBase64: string) {
    // No-op for TTS-only provider
  }

  /** No-op for TTS-only provider. */
  commitInputAudioBuffer() {
    // No-op for TTS-only provider
  }

  /** No-op — TTS is driven by requestPlaybackUtterance/requestTextUtterance. */
  createAudioResponse() {
    // No-op — TTS is driven by requestPlaybackUtterance/requestTextUtterance
  }

  /**
   * Mark the active response as cancelled locally.
   *
   * @returns Always `false`: nothing is cancelled on the server side.
   */
  cancelActiveResponse(): boolean {
    // ElevenLabs WebSocket doesn't support cancelling mid-stream.
    // The best we can do is close and reconnect, but that's handled at
    // the session level via barge-in suppression.
    this._responseInProgress = false;
    this.activeResponseStatus = "cancelled";
    return false;
  }

  /** @returns Whether an utterance has been sent and not yet finished/cleared. */
  isResponseInProgress(): boolean {
    return this._responseInProgress;
  }

  /** @returns The interrupt acceptance mode used for this provider. */
  getInterruptAcceptanceMode(): RealtimeInterruptAcceptanceMode {
    // ElevenLabs can't cancel in-flight audio via the API, so we use
    // local cut with async confirmation (same as Gemini).
    return "local_cut_async_confirmation";
  }

  /**
   * Forget the active response.
   *
   * @param status Optional final status to record; blank strings become null.
   */
  clearActiveResponse(status: string | null = null) {
    this._responseInProgress = false;
    this.activeResponseId = null;
    this.activeResponseStatus = status ? String(status).trim() || null : null;
  }

  /**
   * Send a JSON payload over the socket via the shared `sendRealtimePayload`
   * helper, which also receives the outbound summarizer and log event name.
   *
   * @param payload   Message body.
   * @param eventType Label for the outbound event; defaults to "text".
   */
  send(payload: Record<string, unknown>, eventType: string | null = null) {
    const resolvedType = eventType || "text";
    sendRealtimePayload(this, {
      payload,
      eventType: resolvedType,
      summarizeOutboundPayload: summarizeElevenLabsOutbound,
      skipHistoryEventType: null,
      skipLogEventType: null,
      logEvent: "elevenlabs_realtime_client_event_sent",
      socketNotOpenMessage: "ElevenLabs TTS WebSocket is not open."
    });
  }

  /**
   * Close the socket, first sending the protocol's close message (empty
   * text) when the socket is still open. Local state (socket reference,
   * audio buffer, active response) is reset in a `finally`, so it is cleared
   * even when `closeRealtimeSocket` rejects or the socket was already closed.
   */
  async close() {
    const socket = this.ws;
    if (!socket) return;

    try {
      if (socket.readyState === WebSocket.CLOSED) return;

      // Send the close-connection message (empty text)
      try {
        if (socket.readyState === WebSocket.OPEN) {
          socket.send(JSON.stringify({ text: "" }));
        }
      } catch {
        // Best-effort: the socket is being torn down regardless.
      }

      await closeRealtimeSocket(socket);
    } finally {
      this.ws = null;
      this.audioBase64Buffer = null;
      this.clearActiveResponse();
    }
  }

  /**
   * @returns The common realtime state from `buildCommonRealtimeState`, plus
   *          the active response id/status, utterance count and model.
   */
  getState() {
    return {
      ...buildCommonRealtimeState(this),
      activeResponseId: this.activeResponseId || null,
      activeResponseStatus: this.activeResponseStatus || null,
      utteranceCount: this._utteranceCounter,
      model: this.sessionConfig?.model || null
    };
  }

  /**
   * Forward a structured log record to the configured logger, if any.
   */
  log(level: string, event: string, metadata: Record<string, unknown> | null = null) {
    if (!this.logger) return;
    this.logger({ level, event, metadata });
  }
}

/**
 * Build a compact, log-friendly description of an outbound message without
 * including the text itself.
 *
 * @param payload Outbound message body.
 * @returns `null` for a missing/non-object payload; `{ type: "close" }` for
 *          the empty-text close message; an "init" summary when the text is a
 *          single space and voice settings and/or generation config are
 *          attached; otherwise a "text" summary with the trimmed character
 *          count and the flush flag. The init and text summaries are passed
 *          through `compactObject`.
 */
function summarizeElevenLabsOutbound(payload: Record<string, unknown> | null) {
  if (typeof payload !== "object" || payload === null) {
    return null;
  }

  const {
    text: rawText,
    flush: rawFlush,
    voice_settings: voiceSettings,
    generation_config: generationConfig
  } = payload;

  const text = typeof rawText === "string" ? rawText : null;

  // The empty string is the protocol's close message.
  if (text === "") {
    return { type: "close" };
  }

  const hasVoiceSettings = Boolean(voiceSettings);
  const hasGenerationConfig = Boolean(generationConfig);
  const looksLikeInit = text === " " && (hasVoiceSettings || hasGenerationConfig);
  if (looksLikeInit) {
    return compactObject({ type: "init", hasVoiceSettings, hasGenerationConfig });
  }

  let textChars = 0;
  if (text) {
    textChars = text.trim().length;
  }
  return compactObject({ type: "text", textChars, flush: Boolean(rawFlush) });
}