import OpenAI from "openai"; import { normalizeDiscoveryUrl } from "./discovery.ts"; import { getResearchRuntimeConfig, getResolvedOrchestratorBinding, resolveAgentStack } from "../settings/agentStack.ts"; import { assertPublicUrl } from "./urlSafety.ts"; import { clamp } from "../utils.ts"; import { normalizeProviderOrder } from "../store/normalize/primitives.ts"; import { normalizeWhitespaceText } from "../normalization/text.ts"; import { sleep } from "../normalization/time.ts"; import { throwIfAborted } from "../tools/abortError.ts"; import { getRetryDelayMs, isRetryableFetchError, shouldRetryHttpStatus, withAttemptCount } from "../retry.ts";
const BRAVE_SEARCH_API_URL = "https://api.search.brave.com/res/v1/web/search"; const SERPAPI_SEARCH_API_URL = "https://serpapi.com/search.json"; const SEARCH_TIMEOUT_MS = 5_000; const FAST_FETCH_TIMEOUT_MS = 8_000; const MAX_RESPONSE_BYTES = 16 * 1024 * 1024; const SEARCH_RETRY_ATTEMPTS = 2; const FETCH_RETRY_ATTEMPTS = 2; const SEARCH_USER_AGENT = "clanky/0.2 (+web-search-v2; https://github.com/Volpestyle/clanky)"; type ProviderSearchInput = { query: string; maxResults: number; recencyDays: number; safeSearch: boolean; signal?: AbortSignal; };
type ProviderSearchRow = { url: string; provider?: string | null; [key: string]: string | number | boolean | null | undefined; };
type ProviderSearchResult = { results: ProviderSearchRow[]; };
class AttemptError extends Error { attempts;
constructor(message, attempts) { super(message); this.attempts = Number(attempts || 1); } }
export class WebSearchService { store; providers; openai;
constructor({ appConfig, store }) { this.store = store; this.providers = buildProviders(appConfig); this.openai = String(appConfig?.openaiApiKey || "").trim() ? new OpenAI({ apiKey: String(appConfig?.openaiApiKey || "").trim() }) : null; }
isConfigured() { return Boolean(this.openai) || this.providers.some((provider) => provider.isConfigured()); }
async searchAndRead({ settings, query, trace = { guildId: null, channelId: null, userId: null, source: null }, signal = undefined as AbortSignal | undefined }) { const config = normalizeWebSearchConfig(getResearchRuntimeConfig(settings).localExternalSearch); const resolvedStack = resolveAgentStack(settings); const normalizedQuery = sanitizeExternalText(query, 220); if (!normalizedQuery) { return { query: "", results: [], fetchedPages: 0, providerUsed: null, providerFallbackUsed: false, summaryText: "" }; }
throwIfAborted(signal, "Web search cancelled");
if (resolvedStack.researchRuntime === "openai_native_web_search") {
return await this.searchWithOpenAiHostedWebSearch({
settings,
query: normalizedQuery,
trace,
signal
});
}
const providers = resolveProviderOrder(this.providers, config.providerOrder);
const primaryProvider = providers[0] || null;
const secondaryProvider = providers[1] || null;
if (!primaryProvider) {
throw new Error("Live search is not configured. Set BRAVE_SEARCH_API_KEY and/or SERPAPI_API_KEY.");
}
const started = Date.now();
let providerUsed = primaryProvider.name;
let providerFallbackUsed = false;
try {
let searchData;
try {
searchData = await primaryProvider.search({
query: normalizedQuery,
maxResults: config.maxResults,
recencyDays: config.recencyDaysDefault,
safeSearch: config.safeSearch,
signal
});
} catch (error) {
if (!secondaryProvider) throw error;
providerFallbackUsed = true;
providerUsed = secondaryProvider.name;
searchData = await secondaryProvider.search({
query: normalizedQuery,
maxResults: config.maxResults,
recencyDays: config.recencyDaysDefault,
safeSearch: config.safeSearch,
signal
});
}
const readCandidates = searchData.results.slice(0, config.maxPagesToRead);
const pageSummaries = await mapConcurrent(readCandidates, config.maxConcurrentFetches, async (item) => {
throwIfAborted(signal, "Web search cancelled");
try {
return await this.readPageSummary(item.url, config.maxCharsPerPage, signal);
} catch (error) {
this.logSearchError({
trace,
query: normalizedQuery,
provider: providerUsed,
stage: "fetch",
attempts: Number(error?.attempts || 1),
error
});
return { error: String(error?.message || error), attempts: Number(error?.attempts || 1) };
}
});
const summaryByUrl = new Map();
for (let index = 0; index < readCandidates.length; index += 1) {
summaryByUrl.set(readCandidates[index].url, pageSummaries[index]);
}
const results = searchData.results.map((item) => {
const page = summaryByUrl.get(item.url);
return {
...item,
provider: item.provider || providerUsed,
pageTitle: page?.title || null,
pageSummary: page?.summary || null,
pageError: page?.error || null,
extractionMethod: page?.extractionMethod || null
};
});
const fetchedPages = results.filter((row) => row.pageSummary).length;
this.store.logAction({
kind: "search_call",
guildId: trace.guildId,
channelId: trace.channelId,
userId: trace.userId,
content: normalizedQuery,
metadata: {
query: normalizedQuery,
source: trace.source || "unknown",
maxResults: config.maxResults,
returnedResults: results.length,
pageReadsRequested: readCandidates.length,
pageReadsSucceeded: fetchedPages,
providerUsed,
fallbackUsed: providerFallbackUsed,
latencyMs: Date.now() - started
}
});
return {
query: normalizedQuery,
results,
fetchedPages,
providerUsed,
providerFallbackUsed,
summaryText: ""
};
} catch (error) {
this.logSearchError({
trace,
query: normalizedQuery,
provider: providerUsed,
stage: "provider",
attempts: Number(error?.attempts || 1),
error
});
throw error;
}
}
async searchWithOpenAiHostedWebSearch({ settings, query, trace, signal = undefined as AbortSignal | undefined }) { if (!this.openai) { throw new Error("OpenAI native web search requires OPENAI_API_KEY."); }
const researchConfig = getResearchRuntimeConfig(settings);
const nativeConfig = researchConfig.openaiNativeWebSearch as {
userLocation?: string;
allowedDomains?: readonly string[];
};
const tool = {
type: "web_search_preview",
...(buildOpenAiWebSearchUserLocation(nativeConfig.userLocation)
? { user_location: buildOpenAiWebSearchUserLocation(nativeConfig.userLocation) }
: {}),
...(normalizeAllowedDomains(nativeConfig.allowedDomains).length
? {
filters: {
allowed_domains: normalizeAllowedDomains(nativeConfig.allowedDomains)
}
}
: {})
};
const orchestrator = getResolvedOrchestratorBinding(settings);
const model =
String(orchestrator?.provider || "").trim() === "openai" && String(orchestrator?.model || "").trim()
? String(orchestrator.model).trim()
: "gpt-5.2";
const started = Date.now();
const response = await this.openai.responses.create({
model,
input: [{
role: "user",
content: [{
type: "input_text",
text: query
}]
}],
tools: [tool],
include: ["web_search_call.action.sources"]
}, signal ? { signal } : undefined);
const summaryText = normalizeWhitespaceText(String(response.output_text || "").trim(), {
maxLen: 6_000
});
const results = extractOpenAiWebSearchResults(response).slice(
0,
Math.max(1, Number(researchConfig.localExternalSearch?.maxResults) || 5)
);
const fetchedPages = results.filter((row) => row.pageSummary).length;
this.store.logAction({
kind: "search_call",
guildId: trace.guildId,
channelId: trace.channelId,
userId: trace.userId,
content: query,
metadata: {
query,
source: trace.source || "unknown",
runtime: "openai_native_web_search",
returnedResults: results.length,
pageReadsRequested: 0,
pageReadsSucceeded: fetchedPages,
providerUsed: "openai_native_web_search",
fallbackUsed: false,
latencyMs: Date.now() - started
}
});
return {
query,
results,
fetchedPages,
providerUsed: "openai_native_web_search",
providerFallbackUsed: false,
summaryText
};
}
async readPageSummary(url, maxChars, signal = undefined as AbortSignal | undefined) {
const safeUrl = normalizeDiscoveryUrl(url);
if (!safeUrl) {
throw new Error(blocked or invalid page URL: ${url});
}
throwIfAborted(signal, "Web page read cancelled");
await assertPublicUrl(safeUrl);
const { response, attempts } = await fetchWithRetry({
request: () =>
fetch(safeUrl, {
method: "GET",
redirect: "follow",
headers: {
"user-agent": SEARCH_USER_AGENT,
accept: "text/html,text/plain;q=0.9,*/*;q=0.2"
},
signal: combineAbortSignal(signal, FAST_FETCH_TIMEOUT_MS)
}),
shouldRetryResponse: (res) => !res.ok && shouldRetryHttpStatus(res.status),
maxAttempts: FETCH_RETRY_ATTEMPTS
});
if (!response.ok) {
throw new AttemptError(`page fetch HTTP ${response.status}`, attempts);
}
const finalUrl = normalizeDiscoveryUrl(response.url);
if (!finalUrl) {
throw new AttemptError(`redirected to blocked URL: ${response.url}`, attempts);
}
await assertPublicUrl(finalUrl);
const contentType = String(response.headers.get("content-type") || "").toLowerCase();
if (
contentType &&
!contentType.includes("text/html") &&
!contentType.includes("text/plain")
) {
throw new AttemptError(`unsupported content type: ${contentType || "unknown"}`, attempts);
}
const { text: raw, truncated } = await readResponseBodyLimited(response, MAX_RESPONSE_BYTES);
if (!raw) {
throw new AttemptError("empty page response", attempts);
}
if (contentType.includes("text/plain")) {
const summary = sanitizeExternalText(raw, maxChars);
if (!summary) {
throw new AttemptError("page text had no usable content", attempts);
}
return {
title: null,
summary,
attempts,
extractionMethod: truncated ? "fast_truncated" : "fast"
};
}
const extraction = extractReadableContent(raw, maxChars);
if (!extraction.summary) {
throw new AttemptError("HTML page had no usable text", attempts);
}
return {
title: extraction.title,
summary: extraction.summary,
attempts,
extractionMethod: truncated ? "fast_truncated" : "fast"
};
}
logSearchError({ trace, query, provider, stage, attempts, error }) { this.store.logAction({ kind: "search_error", guildId: trace.guildId, channelId: trace.channelId, userId: trace.userId, content: String(error?.message || error), metadata: { query, source: trace.source || "unknown", provider, stage, attempts, maxAttemptsPerRequest: Math.max(SEARCH_RETRY_ATTEMPTS, FETCH_RETRY_ATTEMPTS) } }); } }
function buildProviders(appConfig) { return [ new BraveSearchProvider(appConfig), new SerpApiSearchProvider(appConfig) ]; }
function normalizeAllowedDomains(value) { return Array.isArray(value) ? value.map((entry) => String(entry || "").trim().toLowerCase()).filter(Boolean) : []; }
function buildOpenAiWebSearchUserLocation(value) { const raw = String(value || "").trim(); if (!raw) return null; const parts = raw.split(",").map((entry) => entry.trim()).filter(Boolean); if (!parts.length) return null; const [city = "", region = "", country = ""] = parts; return { type: "approximate", ...(city ? { city } : {}), ...(region ? { region } : {}), ...(country ? { country } : {}) }; }
function extractOpenAiWebSearchResults(response) { const output = Array.isArray(response?.output) ? response.output : []; const results = []; for (const item of output) { if (!item || typeof item !== "object") continue; if (item.type !== "web_search_call") continue; const sources = Array.isArray(item?.action?.sources) ? item.action.sources : []; for (const source of sources) { if (!source || typeof source !== "object") continue; const url = normalizeDiscoveryUrl(String(source.url || "").trim()); if (!url) continue; let domain = ""; try { domain = new URL(url).hostname.replace(/^www./, ""); } catch { domain = ""; } results.push({ title: String(source.title || domain || "untitled").trim() || "untitled", url, domain, snippet: normalizeWhitespaceText(String(source.description || source.snippet || "").trim(), { maxLen: 500 }), provider: "openai_native_web_search" }); } } return dedupeOpenAiWebSearchResults(results); }
function dedupeOpenAiWebSearchResults(results) { const deduped = []; const seen = new Set(); for (const result of Array.isArray(results) ? results : []) { const url = String(result?.url || "").trim(); if (!url || seen.has(url)) continue; seen.add(url); deduped.push(result); } return deduped; }
function resolveProviderOrder(providers = [], configuredOrder) { const desired = Array.isArray(configuredOrder) && configuredOrder.length ? configuredOrder : ["brave", "serpapi"]; const byName = new Map(providers.map((provider) => [provider.name, provider])); const ordered = []; for (const key of desired) { const provider = byName.get(key); if (provider?.isConfigured()) ordered.push(provider); } for (const provider of providers) { if (provider.isConfigured() && !ordered.includes(provider)) { ordered.push(provider); } } return ordered; }
class BraveSearchProvider { name; apiKey;
constructor(appConfig) { this.name = "brave"; this.apiKey = String(appConfig?.braveSearchApiKey || "").trim(); }
isConfigured() { return Boolean(this.apiKey); }
async search(input: ProviderSearchInput): Promise {
const endpoint = new URL(BRAVE_SEARCH_API_URL);
endpoint.searchParams.set("q", input.query);
endpoint.searchParams.set("count", String(clamp(Number(input.maxResults) || 5, 1, 10)));
if (input.recencyDays) {
endpoint.searchParams.set("freshness", ${clamp(Number(input.recencyDays) || 30, 1, 365)}d);
}
endpoint.searchParams.set("safesearch", input.safeSearch ? "strict" : "off");
const payload = await fetchSearchPayload({
endpoint,
headers: {
"x-subscription-token": this.apiKey,
accept: "application/json",
"user-agent": SEARCH_USER_AGENT
},
requestLabel: "Brave Search",
invalidJsonMessage: "Brave Search returned invalid JSON.",
signal: input.signal
});
const rawItems = Array.isArray(payload?.web?.results) ? payload.web.results : [];
return { results: normalizeProviderResults(rawItems, "brave", input.maxResults) };
} }
class SerpApiSearchProvider { name; apiKey;
constructor(appConfig) { this.name = "serpapi"; this.apiKey = String(appConfig?.serpApiKey || "").trim(); }
isConfigured() { return Boolean(this.apiKey); }
async search(input: ProviderSearchInput): Promise {
const endpoint = new URL(SERPAPI_SEARCH_API_URL);
endpoint.searchParams.set("engine", "google");
endpoint.searchParams.set("q", input.query);
endpoint.searchParams.set("api_key", this.apiKey);
endpoint.searchParams.set("num", String(clamp(Number(input.maxResults) || 5, 1, 10)));
endpoint.searchParams.set("safe", input.safeSearch ? "active" : "off");
if (input.recencyDays) {
endpoint.searchParams.set("tbs", qdr:d${clamp(Number(input.recencyDays) || 30, 1, 365)});
}
const payload = await fetchSearchPayload({
endpoint,
headers: {
accept: "application/json",
"user-agent": SEARCH_USER_AGENT
},
requestLabel: "SerpApi",
invalidJsonMessage: "SerpApi returned invalid JSON.",
signal: input.signal
});
const rawItems = Array.isArray(payload?.organic_results) ? payload.organic_results : [];
return { results: normalizeProviderResults(rawItems, "serpapi", input.maxResults) };
} }
function combineAbortSignal(signal: AbortSignal | undefined, timeoutMs: number) { return signal ? AbortSignal.any([signal, AbortSignal.timeout(timeoutMs)]) : AbortSignal.timeout(timeoutMs); }
async function fetchSearchPayload({ endpoint, headers, requestLabel, invalidJsonMessage, signal = undefined as AbortSignal | undefined }) { const { response, attempts } = await fetchWithRetry({ request: () => fetch(endpoint, { method: "GET", headers, signal: combineAbortSignal(signal, SEARCH_TIMEOUT_MS) }), shouldRetryResponse: (res) => !res.ok && shouldRetryHttpStatus(res.status), maxAttempts: SEARCH_RETRY_ATTEMPTS });
if (!response.ok) {
throw new AttemptError(${String(requestLabel || "Search")} HTTP ${response.status}, attempts);
}
return await safeJson(response, attempts, invalidJsonMessage); }
function normalizeProviderResults(rawItems, provider, maxResults) { const seen = new Set(); const normalized = []; for (const entry of rawItems) { const normalizedUrl = normalizeDiscoveryUrl(entry?.url || entry?.link || ""); if (!normalizedUrl || seen.has(normalizedUrl)) continue; seen.add(normalizedUrl);
normalized.push({
rank: normalized.length + 1,
title: sanitizeExternalText(entry?.title || "untitled", 180),
url: normalizedUrl,
domain: extractDomain(normalizedUrl),
snippet: sanitizeExternalText(entry?.description || entry?.snippet || "", 320),
published: entry?.age || entry?.date || null,
provider
});
} return normalized.slice(0, clamp(Number(maxResults) || 5, 1, 10)); }
function normalizeWebSearchConfig(rawConfig) { const cfg = rawConfig && typeof rawConfig === "object" ? rawConfig : {}; const maxResultsRaw = Number(cfg.maxResults); const maxPagesRaw = Number(cfg.maxPagesToRead); const maxCharsRaw = Number(cfg.maxCharsPerPage); const maxConcurrentFetches = Number(cfg.maxConcurrentFetches);
return { maxResults: clamp(Number.isFinite(maxResultsRaw) ? maxResultsRaw : 5, 1, 10), maxPagesToRead: clamp(Number.isFinite(maxPagesRaw) ? maxPagesRaw : 3, 0, 5), maxCharsPerPage: clamp(Number.isFinite(maxCharsRaw) ? maxCharsRaw : 6000, 350, 24000), safeSearch: cfg.safeSearch !== undefined ? Boolean(cfg.safeSearch) : true, recencyDaysDefault: clamp(Number(cfg.recencyDaysDefault) || 30, 1, 365), providerOrder: normalizeProviderOrder(cfg.providerOrder), maxConcurrentFetches: clamp(Number.isFinite(maxConcurrentFetches) ? maxConcurrentFetches : 5, 1, 10) }; }
function extractDomain(rawUrl) { try { return new URL(rawUrl).hostname.toLowerCase(); } catch { return "unknown"; } }
function sanitizeExternalText(value, maxLen = 240) { return normalizeWhitespaceText(value, { maxLen, ellipsis: true }); }
async function fetchWithRetry({ request, shouldRetryResponse, maxAttempts }) { let attempt = 0; while (attempt < maxAttempts) { attempt += 1; try { const response = await request(); if (!shouldRetryResponse(response) || attempt >= maxAttempts) { return { response, attempts: attempt }; } } catch (error) { if (!isRetryableFetchError(error) || attempt >= maxAttempts) { throw withAttemptCount(error, attempt); } }
await sleep(getRetryDelayMs(attempt));
}
throw withAttemptCount(new Error("Web fetch failed after retries."), maxAttempts); }
async function safeJson(response, attempts, errorMessage) { try { return await response.json(); } catch { throw new AttemptError(errorMessage, attempts); } }
async function readResponseBodyLimited(response, maxBytes) { if (!response.body) { return { text: "", truncated: false }; } const reader = response.body.getReader(); let size = 0; const chunks = []; let truncated = false;
try { while (true) { const { done, value } = await reader.read(); if (done) break; if (!value) continue; if (size >= maxBytes) { truncated = true; break; } const remaining = Math.max(0, maxBytes - size); if (value.byteLength > remaining) { chunks.push(value.subarray(0, remaining)); size += remaining; truncated = true; break; } size += value.byteLength; chunks.push(value); } } finally { try { await reader.cancel(); } catch (error) { this.store.logAction({kind: "search_service", content: "search_reader_cancel_failed", metadata: { error: String(error?.message || error) }}); } }
const buffer = Buffer.concat(chunks.map((chunk) => Buffer.from(chunk))); return { text: buffer.toString("utf8"), truncated }; }
function extractReadableContent(html, maxChars) { const title = sanitizeExternalText(extractTitle(html), 120) || null; const body = String(html || "") .replace(/<script[\s\S]?</script>/gi, " ") .replace(/<style[\s\S]?</style>/gi, " ") .replace(/<noscript[\s\S]?</noscript>/gi, " ") .replace(/<svg[\s\S]?</svg>/gi, " ") .replace(/<template[\s\S]?</template>/gi, " ") .replace(/</\s(p|div|article|section|h1|h2|h3|h4|h5|h6|li|tr|blockquote|pre|br)\s*>/gi, " ") .replace(/<[^>]+>/g, " ") .replace(/ /gi, " ") .replace(/&/gi, "&") .replace(/"/gi, '"') .replace(/'/g, "'") .replace(/</gi, "<") .replace(/>/gi, ">") .replace(/&#(\d+);/g, (_m, code) => { const num = Number(code); return Number.isFinite(num) ? String.fromCharCode(num) : ""; }) .replace(/&#x([0-9a-f]+);/gi, (_m, hex) => { const num = Number.parseInt(hex, 16); return Number.isFinite(num) ? String.fromCharCode(num) : ""; }) .replace(/\s+/g, " ") .trim(); const summary = sanitizeExternalText(body, maxChars); return { title, summary }; }
function extractTitle(html) { const match = String(html || "").match(/<title[^>]>([\s\S]?)</title>/i); return String(match?.[1] || "").replace(/\s+/g, " ").trim(); }
async function mapConcurrent(items, limit, mapper) { const max = Math.max(1, Number(limit) || 1); const results = new Array(items.length); let cursor = 0;
// Safe because mapper is always async (does I/O), so cursor is only // read/incremented synchronously between awaits on the single JS thread. async function worker() { while (cursor < items.length) { const current = cursor; cursor += 1; results[current] = await mapper(items[current], current); } }
await Promise.all(Array.from({ length: Math.min(max, items.length) }, () => worker())); return results; }
