// src/vision/captionImage.ts

/**
 * Shared vision utility for captioning images via a vision-capable LLM.
 *
 * Used by:
 *   - Image caption cache (message history images)
 *   - Browser agent (page screenshots)
 *   - Embed thumbnail captioning
 */

import { applyOrchestratorOverrideSettings, getResolvedVisionBinding } from "../settings/agentStack.ts";

/** Prompt used when the caller does not supply one. */
const DEFAULT_CAPTION_PROMPT =
  "Describe this image in one concise sentence for search and conversation context. Focus on the main subject, action, and any text visible.";

/** Output-token budget used when the caller does not supply a usable one. */
const DEFAULT_CAPTION_MAX_OUTPUT_TOKENS = 150;

/** Default time allowed for downloading an image, in milliseconds. */
const IMAGE_FETCH_TIMEOUT_MS = 8_000;

/** Default largest image payload accepted when downloading: 10 MB. */
const IMAGE_FETCH_MAX_BYTES = 10 * 1024 * 1024;

/** Provider/model pairs tried in order when no preferred vision binding is usable. */
const VISION_PROVIDER_CANDIDATES = [
  { provider: "anthropic", model: "claude-haiku-4-5" },
  { provider: "xai", model: "grok-2-vision-latest" }
];

/**
 * Resolve the best available vision provider from the LLM service.
 *
 * Mirrors the pattern in voiceStreamWatch.ts resolveStreamWatchVisionProviderSettings.
 *
 * Resolution order:
 *   1. The provider/model from `getResolvedVisionBinding(settings)`, when both
 *      are non-empty and the provider is configured on `llm`.
 *   2. The first entry of `VISION_PROVIDER_CANDIDATES` whose provider is configured.
 *
 * @param llm LLM service; must expose an `isProviderConfigured` function.
 * @param settings Settings object handed to `getResolvedVisionBinding`.
 * @returns `{ provider, model, temperature, maxOutputTokens }`, or `null` when
 *   `llm` is unusable or no vision provider is configured.
 */
export function resolveVisionProviderSettings(llm, settings = null) {
  const canQueryProviders = Boolean(llm) && typeof llm.isProviderConfigured === "function";
  if (!canQueryProviders) return null;

  // Every result carries the same captioning defaults; only provider/model vary.
  const toCaptionSettings = (provider: string, model: string) => ({
    provider,
    model,
    temperature: 0.2,
    maxOutputTokens: DEFAULT_CAPTION_MAX_OUTPUT_TOKENS
  });

  const binding = getResolvedVisionBinding(settings);
  const wantedProvider = String(binding.provider || "").trim().toLowerCase();
  const wantedModel = String(binding.model || "").trim();

  // The provider is only queried once both preferred values are known to be non-empty.
  if (wantedProvider !== "" && wantedModel !== "" && llm.isProviderConfigured(wantedProvider)) {
    return toCaptionSettings(wantedProvider, wantedModel);
  }

  const fallback = VISION_PROVIDER_CANDIDATES.find(({ provider }) => llm.isProviderConfigured(provider));
  return fallback ? toCaptionSettings(fallback.provider, fallback.model) : null;
}

/**
 * Fetch image bytes from a URL with a timeout and a size limit.
 *
 * The body is read incrementally and the download is cancelled as soon as more
 * than `maxBytes` have arrived, so a missing or understated Content-Length
 * header cannot force the whole payload into memory.
 *
 * @param url Image URL; coerced to a string and trimmed. Blank values yield `null`.
 * @param options.timeoutMs Abort the request (including the body read) after this many milliseconds.
 * @param options.maxBytes Largest accepted payload size in bytes.
 * @returns `{ dataBase64, mimeType }`, or `null` on any failure (blank URL,
 *   non-OK status, oversize payload, timeout, or fetch error). `mimeType` is
 *   the response Content-Type without parameters when it starts with "image/",
 *   otherwise "image/jpeg".
 */
export async function fetchImageAsBase64(url, { timeoutMs = IMAGE_FETCH_TIMEOUT_MS, maxBytes = IMAGE_FETCH_MAX_BYTES } = {}) {
  const normalizedUrl = String(url || "").trim();
  if (!normalizedUrl) return null;

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(normalizedUrl, {
      signal: controller.signal,
      headers: { Accept: "image/*" }
    });
    if (!response.ok) return null;

    const contentType = String(response.headers.get("content-type") || "").toLowerCase();
    const contentLength = Number(response.headers.get("content-length") || 0);
    if (contentLength > maxBytes) {
      // Declared size already exceeds the cap: drop the body instead of leaving it pending.
      if (response.body) await response.body.cancel();
      return null;
    }

    let imageBytes;
    if (response.body) {
      // Stream the body so the cap is enforced while downloading, not after.
      const reader = response.body.getReader();
      const chunks: Uint8Array[] = [];
      let received = 0;
      for (;;) {
        const { done, value } = await reader.read();
        if (done) break;
        received += value.byteLength;
        if (received > maxBytes) {
          await reader.cancel();
          return null;
        }
        chunks.push(value);
      }
      imageBytes = Buffer.concat(chunks, received);
    } else {
      // No readable stream exposed; fall back to buffering the whole body.
      const arrayBuffer = await response.arrayBuffer();
      if (arrayBuffer.byteLength > maxBytes) return null;
      imageBytes = Buffer.from(arrayBuffer);
    }

    const dataBase64 = imageBytes.toString("base64");
    const mimeType = contentType.startsWith("image/") ? contentType.split(";")[0].trim() : "image/jpeg";

    return { dataBase64, mimeType };
  } catch {
    // Best-effort helper: timeouts, aborts and network errors all map to null.
    return null;
  } finally {
    clearTimeout(timeout);
  }
}

/**
 * Caption an image using a vision-capable LLM.
 *
 * Accepts either:
 *   - `url`: image URL, fetched locally via `fetchImageAsBase64` and sent as base64
 *   - `dataBase64` + `mimeType`: raw image data, used as-is
 * Caller-supplied bytes take precedence when both are given.
 *
 * @param options.llm LLM service; must expose a `generate` function.
 * @param options.settings Settings used for provider resolution and passed to
 *   `applyOrchestratorOverrideSettings`.
 * @param options.mimeType Media type for `dataBase64`; "image/jpeg" when blank.
 *   Also the fallback when a fetched image reports no media type.
 * @param options.dataBase64 Base64-encoded image bytes.
 * @param options.url Image URL, used only when `dataBase64` is blank.
 * @param options.prompt Caption prompt; the default prompt is used when blank
 *   or whitespace-only.
 * @param options.maxOutputTokens Output-token budget; never below 50, and the
 *   default budget is used when the value is missing, zero or not a number.
 * @param options.trace Trace object forwarded to `llm.generate`; a generic
 *   "image_caption" trace is used when omitted.
 * @returns `{ caption, provider, model }`, or `null` if captioning failed
 *   (unusable `llm`, no vision provider, no image, fetch failure, empty
 *   response, or an error thrown while generating).
 */
export async function captionImage({
  llm,
  settings = null,
  mimeType = "",
  dataBase64 = "",
  url = "",
  prompt = "",
  maxOutputTokens = 0,
  trace = null
}) {
  if (!llm || typeof llm.generate !== "function") return null;

  const providerSettings = resolveVisionProviderSettings(llm, settings);
  if (!providerSettings) return null;

  // Build image input — prefer caller-supplied bytes, otherwise fetch URL bytes locally.
  // This avoids provider-side fetch failures for Discord CDN URLs that the bot can still reach.
  const normalizedBase64 = String(dataBase64 || "").trim();
  const normalizedUrl = String(url || "").trim();
  const normalizedMimeType = String(mimeType || "").trim().toLowerCase() || "image/jpeg";

  let imageInput;
  if (normalizedBase64) {
    imageInput = { mediaType: normalizedMimeType, dataBase64: normalizedBase64 };
  } else if (normalizedUrl) {
    const fetched = await fetchImageAsBase64(normalizedUrl);
    if (!fetched?.dataBase64) return null;
    imageInput = {
      mediaType: fetched.mimeType || normalizedMimeType,
      dataBase64: fetched.dataBase64
    };
  } else {
    return null;
  }

  // Trim before falling back so a whitespace-only prompt does not become an empty prompt.
  const resolvedPrompt = String(prompt || "").trim() || DEFAULT_CAPTION_PROMPT;
  const resolvedMaxTokens = Math.max(
    50,
    Number(maxOutputTokens) || DEFAULT_CAPTION_MAX_OUTPUT_TOKENS
  );

  try {
    const tunedSettings = applyOrchestratorOverrideSettings(settings, {
      provider: providerSettings.provider,
      model: providerSettings.model,
      temperature: providerSettings.temperature,
      maxOutputTokens: resolvedMaxTokens
    });

    const generated = await llm.generate({
      settings: tunedSettings,
      systemPrompt:
        "You are an image captioning assistant. Respond with only the description, no preamble.",
      userPrompt: resolvedPrompt,
      imageInputs: [imageInput],
      contextMessages: [],
      trace: trace || {
        guildId: null,
        channelId: null,
        userId: null,
        source: "image_caption"
      }
    });

    const caption = String(generated?.text || "").trim();
    if (!caption) return null;

    return {
      caption,
      provider: generated?.provider || providerSettings.provider || null,
      model: generated?.model || providerSettings.model || null
    };
  } catch {
    // Captioning is best-effort: any settings or generation error yields null.
    return null;
  }
}