/**
 * Shared vision utility for captioning images via a vision-capable LLM.
 *
 * Used by:
 * - Image caption cache (message history images)
 * - Browser agent (page screenshots)
 * - Embed thumbnail captioning
 */
import { applyOrchestratorOverrideSettings, getResolvedVisionBinding } from "../settings/agentStack.ts";
// Instruction sent to the vision model when the caller supplies no prompt.
const DEFAULT_CAPTION_PROMPT = "Describe this image in one concise sentence for search and conversation context. Focus on the main subject, action, and any text visible.";
// Default cap on caption length, in model output tokens.
const DEFAULT_CAPTION_MAX_OUTPUT_TOKENS = 150;
// Limits applied when fetching remote image bytes before captioning.
const IMAGE_FETCH_TIMEOUT_MS = 8_000; const IMAGE_FETCH_MAX_BYTES = 10 * 1024 * 1024; // 10 MB
// Fallback provider/model pairs, tried in order when no explicit vision binding is configured.
// NOTE(review): model ids presumably track currently-available vision models — verify against provider catalogs.
const VISION_PROVIDER_CANDIDATES = [ { provider: "anthropic", model: "claude-haiku-4-5" }, { provider: "xai", model: "grok-2-vision-latest" } ];
/**
 * Resolve the best available vision provider from the LLM service.
 *
 * Prefers the explicitly configured vision binding from settings; otherwise
 * falls back to the first configured entry in VISION_PROVIDER_CANDIDATES.
 * Mirrors the pattern in voiceStreamWatch.ts resolveStreamWatchVisionProviderSettings.
 *
 * @param llm - LLM service exposing isProviderConfigured(provider).
 * @param settings - Optional settings object passed to getResolvedVisionBinding.
 * @returns {{provider, model, temperature, maxOutputTokens} | null} Provider
 *   settings, or null when no configured vision provider is available.
 */
export function resolveVisionProviderSettings(llm, settings = null) {
  if (!llm || typeof llm.isProviderConfigured !== "function") return null;

  // Build the common settings shape for whichever provider/model wins.
  const toSettings = (provider, model) => ({
    provider,
    model,
    temperature: 0.2,
    maxOutputTokens: DEFAULT_CAPTION_MAX_OUTPUT_TOKENS,
  });

  const binding = getResolvedVisionBinding(settings);
  const preferredProvider = String(binding.provider || "").trim().toLowerCase();
  const preferredModel = String(binding.model || "").trim();
  if (preferredProvider && preferredModel && llm.isProviderConfigured(preferredProvider)) {
    return toSettings(preferredProvider, preferredModel);
  }

  const fallback = VISION_PROVIDER_CANDIDATES.find((candidate) =>
    llm.isProviderConfigured(candidate.provider)
  );
  return fallback ? toSettings(fallback.provider, fallback.model) : null;
}
/**
 * Fetch image bytes from a URL with a timeout and a hard size limit.
 *
 * The size limit is enforced while streaming the response body, so an
 * oversized response (or one with a missing/lying content-length header)
 * is aborted mid-download instead of being fully buffered into memory
 * before the check, as the previous arrayBuffer() approach did.
 *
 * @param url - Image URL to fetch.
 * @param options - Optional { timeoutMs, maxBytes } overrides.
 * @returns {Promise<{dataBase64, mimeType} | null>} Base64 payload plus MIME
 *   type, or null on any failure (best-effort contract: never throws).
 */
export async function fetchImageAsBase64(url, { timeoutMs = IMAGE_FETCH_TIMEOUT_MS, maxBytes = IMAGE_FETCH_MAX_BYTES } = {}) {
  const normalizedUrl = String(url || "").trim();
  if (!normalizedUrl) return null;
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
  try {
    const response = await fetch(normalizedUrl, { signal: controller.signal, headers: { Accept: "image/*" } });
    if (!response.ok) return null;
    const contentType = String(response.headers.get("content-type") || "").toLowerCase();
    // Fast reject when the server declares an oversized payload up front.
    const contentLength = Number(response.headers.get("content-length") || 0);
    if (contentLength > maxBytes) return null;
    let buffer;
    if (response.body && typeof response.body.getReader === "function") {
      // Stream the body so the cap protects memory even without a trustworthy
      // content-length; abort the transfer as soon as the cap is exceeded.
      const reader = response.body.getReader();
      const chunks = [];
      let totalBytes = 0;
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        totalBytes += value.byteLength;
        if (totalBytes > maxBytes) {
          controller.abort();
          return null;
        }
        chunks.push(value);
      }
      buffer = Buffer.concat(chunks);
    } else {
      // Fallback for environments whose response bodies are not web streams.
      const arrayBuffer = await response.arrayBuffer();
      if (arrayBuffer.byteLength > maxBytes) return null;
      buffer = Buffer.from(arrayBuffer);
    }
    const dataBase64 = buffer.toString("base64");
    // Trust a declared image/* content type (parameters stripped); otherwise assume JPEG.
    const mimeType = contentType.startsWith("image/") ? contentType.split(";")[0].trim() : "image/jpeg";
    return { dataBase64, mimeType };
  } catch {
    // Best-effort: network errors, timeouts, and aborts all yield null.
    return null;
  } finally {
    clearTimeout(timeout);
  }
}
/**
 * Caption an image using a vision-capable LLM.
 *
 * Accepts either:
 * - url: image URL (fetched locally and converted to base64)
 * - dataBase64 + mimeType: raw image data
 *
 * @returns {Promise<{caption, provider, model} | null>} Caption details, or
 *   null when no vision provider is available or captioning failed.
 */
export async function captionImage({ llm, settings = null, mimeType = "", dataBase64 = "", url = "", prompt = "", maxOutputTokens = 0, trace = null }) {
  if (!llm || typeof llm.generate !== "function") return null;

  const providerSettings = resolveVisionProviderSettings(llm, settings);
  if (!providerSettings) return null;

  const inlineData = String(dataBase64 || "").trim();
  const targetUrl = String(url || "").trim();
  const fallbackMime = String(mimeType || "").trim().toLowerCase() || "image/jpeg";

  // Build image input — prefer caller-supplied bytes, otherwise fetch URL bytes locally.
  // This avoids provider-side fetch failures for Discord CDN URLs that the bot can still reach.
  let imageInput = null;
  if (inlineData) {
    imageInput = { mediaType: fallbackMime, dataBase64: inlineData };
  } else if (targetUrl) {
    const fetched = await fetchImageAsBase64(targetUrl);
    if (!fetched?.dataBase64) return null;
    imageInput = { mediaType: fetched.mimeType || fallbackMime, dataBase64: fetched.dataBase64 };
  }
  if (!imageInput) return null;

  const userPrompt = String(prompt || DEFAULT_CAPTION_PROMPT).trim();
  // Floor of 50 tokens keeps the caption from being truncated mid-sentence.
  const tokenBudget = Math.max(50, Number(maxOutputTokens) || DEFAULT_CAPTION_MAX_OUTPUT_TOKENS);

  try {
    const tunedSettings = applyOrchestratorOverrideSettings(settings, {
      provider: providerSettings.provider,
      model: providerSettings.model,
      temperature: providerSettings.temperature,
      maxOutputTokens: tokenBudget,
    });
    const generated = await llm.generate({
      settings: tunedSettings,
      systemPrompt: "You are an image captioning assistant. Respond with only the description, no preamble.",
      userPrompt,
      imageInputs: [imageInput],
      contextMessages: [],
      trace: trace || { guildId: null, channelId: null, userId: null, source: "image_caption" },
    });
    const caption = String(generated?.text || "").trim();
    if (!caption) return null;
    return {
      caption,
      provider: generated?.provider || providerSettings.provider || null,
      model: generated?.model || providerSettings.model || null,
    };
  } catch {
    // Best-effort: any provider/transport failure yields null.
    return null;
  }
}
