// src/services/discovery.ts

import { assertPublicUrl, isBlockedHost } from "./urlSafety.ts"; import { clamp } from "../utils.ts"; import { getDiscoverySettings } from "../settings/agentStack.ts"; import { normalizeWhitespaceText } from "../normalization/text.ts"; import { isRedirectStatus } from "../retry.ts";

/** Timeout, in milliseconds, applied to each individual discovery HTTP request. */
const DISCOVERY_TIMEOUT_MS = 9_000;

/** Default cap on how many redirects a single discovery fetch will follow manually. */
const DISCOVERY_MAX_REDIRECTS = 5;

/** Value of the `user-agent` header sent with discovery requests. */
const DISCOVERY_USER_AGENT = "clanky/0.1 (+discovery-posts; https://github.com/Volpestyle/clanky)";

/** Query parameters whose lowercased name starts with one of these prefixes are dropped during URL normalization. */
const TRACKING_QUERY_PREFIXES = ["utm_"];

/** Query parameters whose lowercased name equals one of these keys are dropped during URL normalization. */
const TRACKING_QUERY_KEYS = new Set([
  "fbclid",
  "gclid",
  "igshid",
  "mc_cid",
  "mc_eid",
  "si",
  "spm"
]);

/** Lowercase tokens ignored when deriving topic seeds from channel text. */
const STOP_WORDS = new Set([
  "about",
  "after",
  "again",
  "also",
  "been",
  "before",
  "being",
  "cant",
  "could",
  "didnt",
  "dont",
  "from",
  "gonna",
  "have",
  "just",
  "like",
  "make",
  "more",
  "only",
  "really",
  "some",
  "that",
  "their",
  "there",
  "they",
  "this",
  "thing",
  "very",
  "want",
  "with",
  "would",
  "your"
]);

/** Per-source multiplier used when scoring candidates; sources not listed here fall back to 0.9 in the scorer. */
const SOURCE_WEIGHTS = {
  reddit: 0.94,
  hackernews: 1,
  youtube: 0.96,
  rss: 0.92,
  x: 0.9
};

/**
 * Gathers link candidates from the enabled discovery sources (Reddit, Hacker News,
 * YouTube feeds, RSS feeds and X handles served as RSS), then de-duplicates, filters,
 * scores and selects them.
 */
export class DiscoveryService {
  /** Store dependency; this class only calls `wasLinkSharedSince(url, sinceIso)` on it. */
  store;

  /**
   * @param options - Constructor options.
   * @param options.store - Store consulted to skip links that were already shared.
   */
  constructor({ store }) {
    this.store = store;
  }

  /**
   * Runs every enabled source fetcher in parallel and turns the results into ranked
   * candidates plus a final selection.
   *
   * @param params.settings - Raw settings passed to `getDiscoverySettings`.
   * @param params.guildId - Echoed back in `summary`.
   * @param params.channelId - Echoed back in `summary`.
   * @param params.channelName - Used, with `recentMessages`, to derive topic seeds.
   * @param params.recentMessages - Messages whose `content` feeds topic seeding.
   * @returns When no source is enabled: `{ enabled: false, ... }` with empty lists and
   *   `dedupeSinceIso: null`. Otherwise `{ enabled: true, topics, candidates, selected,
   *   reports, reportBySource, errors, dedupeSinceIso, summary }`.
   */
  async collect({ settings, guildId, channelId, channelName, recentMessages }) {
    const config = normalizeDiscoveryConfig(getDiscoverySettings(settings));
    const { sources } = config;

    // Single description of which fetchers run; the source name travels with the task so
    // a rejected fetch can still be attributed to the right source.
    const sourcePlan = [
      {
        source: "reddit",
        enabled: Boolean(sources.reddit && config.redditSubreddits.length),
        run: () => this.fetchReddit(config)
      },
      {
        source: "hackernews",
        enabled: Boolean(sources.hackerNews),
        run: () => this.fetchHackerNews(config)
      },
      {
        source: "youtube",
        enabled: Boolean(sources.youtube && config.youtubeChannelIds.length),
        run: () => this.fetchYoutube(config)
      },
      {
        source: "rss",
        enabled: Boolean(sources.rss && config.rssFeeds.length),
        run: () => this.fetchRss(config)
      },
      {
        source: "x",
        enabled: Boolean(sources.x && config.xHandles.length),
        run: () => this.fetchX(config)
      }
    ].filter((entry) => entry.enabled);

    if (!sourcePlan.length) {
      return {
        enabled: false,
        topics: [],
        candidates: [],
        selected: [],
        reports: [],
        errors: [],
        dedupeSinceIso: null
      };
    }

    const topics = buildTopicSeeds({
      recentMessages,
      channelName
    });
    const sinceIso = new Date(Date.now() - config.dedupeHours * 60 * 60_000).toISOString();

    const settled = await Promise.allSettled(sourcePlan.map((entry) => entry.run()));
    const reports = [];
    const errors = [];
    const rawCandidates = [];

    for (const [index, result] of settled.entries()) {
      if (result.status === "fulfilled") {
        reports.push(result.value.report);
        rawCandidates.push(...result.value.items);
        continue;
      }

      const message = String(result.reason?.message || result.reason || "unknown discovery error");
      errors.push(message);
      reports.push({
        // Previously always "unknown", which hid the failing source and made several
        // failures overwrite each other in `reportBySource`.
        source: sourcePlan[index].source,
        fetched: 0,
        accepted: 0,
        error: message
      });
    }

    const seen = new Set();
    const filtered = [];
    const now = Date.now();
    const freshnessMs = config.freshnessHours * 60 * 60_000;

    for (const item of rawCandidates) {
      const normalizedUrl = normalizeDiscoveryUrl(item.url);
      if (!normalizedUrl) continue;
      if (seen.has(normalizedUrl)) continue;
      seen.add(normalizedUrl);

      if (!config.allowNsfw && item.nsfw) continue;
      if (item.publishedAt) {
        // Items with an unparseable date are kept; only provably stale ones are dropped.
        const publishedTs = Date.parse(item.publishedAt);
        if (Number.isFinite(publishedTs) && now - publishedTs > freshnessMs) continue;
      }
      if (this.store.wasLinkSharedSince(normalizedUrl, sinceIso)) continue;

      const score = scoreCandidate({
        item,
        topics,
        freshnessHours: config.freshnessHours,
        randomness: config.randomness
      });
      filtered.push({
        ...item,
        url: normalizedUrl,
        score
      });
    }

    filtered.sort((a, b) => b.score - a.score);
    const candidates = filtered.slice(0, config.maxCandidatesForPrompt).map(toPromptCandidate);
    const selected = pickSelectedCandidates(
      candidates,
      config.maxLinksPerPost,
      config.randomness
    );

    const reportBySource = reports.reduce((acc, report) => {
      const key = String(report.source || "unknown");
      acc[key] = report;
      return acc;
    }, {});

    return {
      enabled: true,
      topics,
      candidates,
      selected,
      reports,
      reportBySource,
      errors,
      dedupeSinceIso: sinceIso,
      summary: {
        guildId,
        channelId,
        sourceCount: sourcePlan.length,
        fetchedCount: rawCandidates.length,
        candidateCount: candidates.length,
        selectedCount: selected.length
      }
    };
  }

  /**
   * Reads the "hot" listing of up to six configured subreddits, one after another.
   * A failing subreddit is recorded in the report's `error` string and skipped.
   *
   * @param config - Normalized discovery config (`redditSubreddits`, `sourceFetchLimit`).
   * @returns `{ report, items }` where `report.source` is `"reddit"`.
   */
  async fetchReddit(config) {
    const items = [];
    let fetched = 0;
    const selectedSubs = config.redditSubreddits.slice(0, 6);
    const errors = [];

    for (const subreddit of selectedSubs) {
      const url = `https://www.reddit.com/r/${encodeURIComponent(subreddit)}/hot.json?limit=${config.sourceFetchLimit}&raw_json=1`;

      let payload;
      try {
        payload = await readJson(url);
      } catch (error) {
        errors.push(`r/${subreddit}: ${String(error?.message || error)}`);
        continue;
      }

      const children = payload?.data?.children;
      if (!Array.isArray(children)) continue;

      for (const child of children) {
        const post = child?.data;
        if (!post || post.stickied) continue;
        fetched += 1;

        // Prefer the outbound link; fall back to the Reddit permalink when the outbound
        // URL is missing or rejected by normalization.
        const outbound = post.url_overridden_by_dest || post.url || "";
        const permalink = post.permalink
          ? `https://www.reddit.com${post.permalink}`
          : "";
        const candidateUrl = normalizeDiscoveryUrl(outbound) || normalizeDiscoveryUrl(permalink);
        if (!candidateUrl) continue;

        items.push({
          source: "reddit",
          sourceLabel: `r/${subreddit}`,
          title: sanitizeExternalText(post.title, 180),
          url: candidateUrl,
          excerpt: sanitizeExternalText(post.selftext || post.link_flair_text || "", 200),
          popularity: Number(post.ups || 0) + Number(post.num_comments || 0),
          publishedAt: Number.isFinite(post.created_utc)
            ? new Date(post.created_utc * 1000).toISOString()
            : null,
          nsfw: Boolean(post.over_18)
        });
      }
    }

    return {
      report: {
        source: "reddit",
        fetched,
        accepted: items.length,
        error: errors.length ? errors.join(" | ") : null
      },
      items
    };
  }

  /**
   * Reads the Hacker News top-story id list, then loads up to
   * `min(30, sourceFetchLimit * 2)` stories in parallel. Stories whose item request
   * fails are silently skipped.
   *
   * @param config - Normalized discovery config (`sourceFetchLimit`).
   * @returns `{ report, items }` where `report.source` is `"hackernews"`.
   */
  async fetchHackerNews(config) {
    let topIds = [];
    try {
      topIds = await readJson("https://hacker-news.firebaseio.com/v0/topstories.json");
    } catch (error) {
      return {
        report: {
          source: "hackernews",
          fetched: 0,
          accepted: 0,
          error: String(error?.message || error)
        },
        items: []
      };
    }

    if (!Array.isArray(topIds) || !topIds.length) {
      return {
        report: {
          source: "hackernews",
          fetched: 0,
          accepted: 0,
          error: "no story ids returned"
        },
        items: []
      };
    }

    const ids = topIds.slice(0, Math.min(30, config.sourceFetchLimit * 2));
    const rows = await Promise.all(
      ids.map((id) =>
        readJson(`https://hacker-news.firebaseio.com/v0/item/${encodeURIComponent(String(id))}.json`).catch(
          () => null
        )
      )
    );

    const items = [];
    let fetched = 0;
    for (const row of rows) {
      if (!row || row.type !== "story" || !row.title) continue;
      fetched += 1;

      // Stories without an external URL link to their Hacker News discussion page.
      const externalUrl = normalizeDiscoveryUrl(row.url || "");
      const fallback = normalizeDiscoveryUrl(
        `https://news.ycombinator.com/item?id=${encodeURIComponent(String(row.id || ""))}`
      );
      const url = externalUrl || fallback;
      if (!url) continue;

      items.push({
        source: "hackernews",
        sourceLabel: "Hacker News",
        title: sanitizeExternalText(row.title, 180),
        url,
        excerpt: "",
        popularity: Number(row.score || 0) + Number(row.descendants || 0),
        publishedAt: Number.isFinite(row.time) ? new Date(row.time * 1000).toISOString() : null,
        nsfw: false
      });
    }

    return {
      report: {
        source: "hackernews",
        fetched,
        accepted: items.length,
        error: null
      },
      items
    };
  }

  /**
   * Reads the video feed of up to eight configured YouTube channel ids.
   *
   * @param config - Normalized discovery config (`youtubeChannelIds`).
   * @returns `{ report, items }` where `report.source` is `"youtube"`.
   */
  async fetchYoutube(config) {
    return await this.fetchFeedSources({
      source: "youtube",
      inputs: config.youtubeChannelIds.slice(0, 8),
      config,
      excerptMaxLen: 180,
      buildUrl: (channelId) =>
        `https://www.youtube.com/feeds/videos.xml?channel_id=${encodeURIComponent(String(channelId || ""))}`,
      buildSourceLabel: ({ input, parsed }) => parsed.feedTitle || `YouTube ${input}`
    });
  }

  /**
   * Reads up to eight configured RSS/Atom feed URLs.
   *
   * @param config - Normalized discovery config (`rssFeeds`).
   * @returns `{ report, items }` where `report.source` is `"rss"`.
   */
  async fetchRss(config) {
    return await this.fetchFeedSources({
      source: "rss",
      inputs: config.rssFeeds.slice(0, 8),
      config,
      excerptMaxLen: 180,
      buildUrl: (feedUrl) => String(feedUrl || ""),
      buildSourceLabel: ({ input, parsed }) => parsed.feedTitle || String(input || "")
    });
  }

  /**
   * Reads up to six configured X handles through `<xNitterBaseUrl>/<handle>/rss`.
   *
   * @param config - Normalized discovery config (`xHandles`, `xNitterBaseUrl`).
   * @returns `{ report, items }` where `report.source` is `"x"`.
   */
  async fetchX(config) {
    // Drop trailing slashes so the joined URL has exactly one separator.
    const baseUrl = config.xNitterBaseUrl.replace(/\/+$/, "");
    return await this.fetchFeedSources({
      source: "x",
      inputs: config.xHandles.slice(0, 6),
      config,
      excerptMaxLen: 200,
      buildUrl: (handle) => `${baseUrl}/${encodeURIComponent(String(handle || ""))}/rss`,
      buildSourceLabel: ({ input }) => `@${input}`
    });
  }

  /**
   * Shared implementation for feed-based sources: downloads each input's feed
   * sequentially, parses it and maps entries to candidate items. A failing feed is
   * recorded in the report's `error` string and skipped.
   *
   * @param params.source - Source name stamped on items and the report (defaults to "rss").
   * @param params.inputs - Raw inputs (channel ids, feed URLs, handles); blanks are skipped.
   * @param params.config - Normalized discovery config (`sourceFetchLimit`).
   * @param params.buildUrl - Maps a trimmed input to the feed URL to download.
   * @param params.buildSourceLabel - Produces the human-readable label for an entry.
   * @param params.excerptMaxLen - Maximum excerpt length passed to the sanitizer.
   * @returns `{ report, items }`.
   */
  async fetchFeedSources({ source, inputs, config, buildUrl, buildSourceLabel, excerptMaxLen = 180 }) {
    const normalizedSource = String(source || "rss");
    const items = [];
    let fetched = 0;
    const errors = [];

    for (const input of Array.isArray(inputs) ? inputs : []) {
      const inputValue = String(input || "").trim();
      if (!inputValue) continue;

      const url = String(buildUrl?.(inputValue) || "").trim();
      if (!url) continue;

      let xml = "";
      try {
        xml = await readText(url);
      } catch (error) {
        errors.push(`${inputValue}: ${String(error?.message || error)}`);
        continue;
      }

      const parsed = parseFeed(xml, { maxItems: config.sourceFetchLimit });
      fetched += parsed.items.length;

      for (const entry of parsed.items) {
        if (!entry.link) continue;
        items.push({
          source: normalizedSource,
          sourceLabel: String(
            buildSourceLabel?.({ input: inputValue, parsed, entry, url }) || parsed.feedTitle || inputValue
          ),
          title: sanitizeExternalText(entry.title, 180),
          url: entry.link,
          excerpt: sanitizeExternalText(entry.summary || "", excerptMaxLen),
          popularity: 0,
          publishedAt: entry.publishedAt,
          nsfw: false
        });
      }
    }

    return {
      report: {
        source: normalizedSource,
        fetched,
        accepted: items.length,
        error: errors.length ? errors.join(" | ") : null
      },
      items
    };
  }
}

/**
 * Projects a scored discovery item onto the compact shape handed to the prompt:
 * sanitized text fields, a trimmed URL, a numeric score and the publish date (or null).
 *
 * @param item - Scored candidate item.
 * @returns A new object with `title`, `url`, `source`, `sourceLabel`, `excerpt`, `score`, `publishedAt`.
 */
function toPromptCandidate(item) {
  const title = sanitizeExternalText(item.title || "", 180);
  const url = String(item.url || "").trim();
  const source = String(item.source || "web");
  const sourceLabel = sanitizeExternalText(item.sourceLabel || item.source || "web", 60);
  const excerpt = sanitizeExternalText(item.excerpt || "", 220);
  const score = Number(item.score || 0);
  const publishedAt = item.publishedAt || null;

  return { title, url, source, sourceLabel, excerpt, score, publishedAt };
}

/**
 * Coerces raw discovery settings into a fully-populated config object: numeric options
 * are defaulted and clamped, list options are cleaned via `stringList`, and every source
 * toggle gets an explicit boolean.
 *
 * @param rawConfig - Settings object of unknown shape; non-objects are treated as `{}`.
 * @returns The normalized config consumed by `DiscoveryService`.
 */
function normalizeDiscoveryConfig(rawConfig) {
  const cfg = rawConfig && typeof rawConfig === "object" ? rawConfig : {};
  const sources = cfg.sources && typeof cfg.sources === "object" ? cfg.sources : {};

  /** Uses `fallback` only when the setting is absent; any present value is coerced to boolean. */
  const booleanOr = (value, fallback) => (value !== undefined ? Boolean(value) : fallback);

  // `randomness` is the only numeric option whose valid range includes 0, so the usual
  // `Number(x) || default` idiom would silently turn an explicit 0 into the default.
  // Only a real number 0 or a non-blank numeric string counts as an explicit zero;
  // null, "" and other falsy inputs keep falling back to 55 as before.
  const rawRandomness = cfg.randomness;
  const parsedRandomness = Number(rawRandomness);
  const isExplicitZero =
    parsedRandomness === 0 &&
    (typeof rawRandomness === "number" ||
      (typeof rawRandomness === "string" && rawRandomness.trim() !== ""));
  const randomness = isExplicitZero ? 0 : parsedRandomness || 55;

  return {
    maxLinksPerPost: clamp(Number(cfg.maxLinksPerPost) || 2, 1, 4),
    maxCandidatesForPrompt: clamp(Number(cfg.maxCandidatesForPrompt) || 6, 1, 12),
    freshnessHours: clamp(Number(cfg.freshnessHours) || 96, 1, 24 * 14),
    dedupeHours: clamp(Number(cfg.dedupeHours) || 168, 1, 24 * 45),
    randomness: clamp(randomness, 0, 100),
    sourceFetchLimit: clamp(Number(cfg.sourceFetchLimit) || 10, 2, 30),
    allowNsfw: Boolean(cfg.allowNsfw),
    allowSelfCuration: booleanOr(cfg.allowSelfCuration, true),
    maxSourcesPerType: clamp(Number(cfg.maxSourcesPerType) || 10, 1, 50),
    redditSubreddits: stringList(cfg.redditSubreddits, 20, 40)
      // Accept "r/name" as well as bare "name".
      .map((entry) => entry.replace(/^r\//i, ""))
      .filter(Boolean),
    youtubeChannelIds: stringList(cfg.youtubeChannelIds, 20, 80),
    // Feeds that fail URL normalization (invalid, non-http(s) or blocked host) are dropped.
    rssFeeds: stringList(cfg.rssFeeds, 30, 240).filter((url) => Boolean(normalizeDiscoveryUrl(url))),
    xHandles: stringList(cfg.xHandles, 20, 40)
      // Accept "@handle" as well as bare "handle".
      .map((entry) => entry.replace(/^@/, ""))
      .filter(Boolean),
    xNitterBaseUrl: normalizeNitterBase(cfg.xNitterBaseUrl),
    sources: {
      reddit: booleanOr(sources.reddit, true),
      hackerNews: booleanOr(sources.hackerNews, true),
      youtube: booleanOr(sources.youtube, true),
      rss: booleanOr(sources.rss, true),
      // X is the only source that is off unless explicitly enabled.
      x: booleanOr(sources.x, false)
    }
  };
}

/**
 * Reduces a configured Nitter base URL to `protocol//host`, discarding any path, query
 * or fragment. Blank, unparseable or non-http(s) values yield `https://nitter.net`.
 *
 * @param value - Raw configured base URL.
 * @returns An origin-style base URL without a trailing slash.
 */
function normalizeNitterBase(value) {
  const fallbackBase = "https://nitter.net";
  const raw = String(value || "").trim() || fallbackBase;

  try {
    const parsed = new URL(raw);
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
      return fallbackBase;
    }
    return `${parsed.protocol}//${parsed.host}`;
  } catch {
    return fallbackBase;
  }
}

/**
 * Builds a cleaned list of strings from an array or a delimited string.
 *
 * Entries are stringified and trimmed, empties are dropped, duplicates are removed
 * (first occurrence wins), the list is capped at `maxItems`, and only then is each
 * entry truncated to `maxLen` characters.
 *
 * @param input - An array of values, a string split on spaces and commas, or anything else (treated as empty).
 * @param maxItems - Maximum number of entries to keep.
 * @param maxLen - Maximum length of each kept entry.
 * @returns The cleaned list.
 */
function stringList(input, maxItems, maxLen) {
  let rawValues = [];
  if (Array.isArray(input)) {
    rawValues = input;
  } else if (typeof input === "string") {
    rawValues = input.split(/[ ,]/g);
  }

  const unique = new Set();
  for (const rawValue of rawValues) {
    const text = String(rawValue || "").trim();
    if (text) unique.add(text);
  }

  const kept = Array.from(unique).slice(0, maxItems);
  return kept.map((text) => text.slice(0, maxLen));
}

/**
 * Derives up to eight topic keywords from the channel name and recent message contents.
 *
 * The combined text is lowercased and tokenized into 4–25 character words that start
 * with a letter; stop words are ignored and the remaining tokens are ranked by frequency.
 *
 * @param params.recentMessages - Messages whose `content` is scanned (may be null/undefined).
 * @param params.channelName - Channel name, scanned together with the messages.
 * @returns The most frequent tokens, most frequent first.
 */
function buildTopicSeeds({ recentMessages, channelName }) {
  const messageTexts = (recentMessages || []).map((msg) => String(msg.content || ""));
  const corpus = [String(channelName || ""), ...messageTexts].join(" ").toLowerCase();
  const tokens = corpus.match(/[a-z][a-z0-9_-]{3,24}/g) || [];

  const frequency = new Map();
  for (const token of tokens) {
    if (!STOP_WORDS.has(token)) {
      frequency.set(token, Number(frequency.get(token) || 0) + 1);
    }
  }

  // Map keys are already unique and at most eight survive the slice, so no further
  // de-duplication or capping is needed.
  const byCountDescending = Array.from(frequency.entries());
  byCountDescending.sort((left, right) => right[1] - left[1]);
  return byCountDescending.slice(0, 8).map((pair) => pair[0]);
}

/**
 * Scores a candidate as a weighted blend of source weight (30%), topic overlap (40%),
 * freshness (20%) and popularity (10%), plus a random skew scaled by `randomness`.
 *
 * @param params.item - Candidate with `source`, `title`, `excerpt`, `publishedAt`, `popularity`.
 * @param params.topics - Topic keywords matched against the title and excerpt.
 * @param params.freshnessHours - Freshness window used to decay the freshness score.
 * @param params.randomness - 0–100; scales the random skew (up to ±0.25 at 100).
 * @returns The score rounded to four decimal places.
 */
function scoreCandidate({ item, topics, freshnessHours, randomness }) {
  const source = String(item.source || "web");
  const sourceWeight = Number(SOURCE_WEIGHTS[source]) || 0.9;

  const titleText = `${item.title || ""} ${item.excerpt || ""}`.toLowerCase();
  const topicMatches = topics.reduce(
    (count, topic) => (topic && titleText.includes(topic.toLowerCase()) ? count + 1 : count),
    0
  );
  // Three matching topics (or all of them, when fewer exist) already earn the full topic score.
  const topicScore = clamp(topicMatches / Math.max(1, Math.min(topics.length, 3)), 0, 1);

  // Items without a usable publish date get a fixed middling freshness.
  let freshnessScore = 0.35;
  if (item.publishedAt) {
    const publishedTs = Date.parse(item.publishedAt);
    if (Number.isFinite(publishedTs)) {
      const ageHours = Math.max(0, (Date.now() - publishedTs) / 3_600_000);
      freshnessScore = clamp(1 - ageHours / Math.max(1, freshnessHours * 1.1), 0, 1);
    }
  }

  const popularity = Math.max(0, Number(item.popularity) || 0);
  // Log scale: a popularity of 9,999 or more saturates the score at 1.
  const popularityScore = clamp(Math.log10(popularity + 1) / 4, 0, 1);
  const randomSkew = (Math.random() - 0.5) * (clamp(randomness, 0, 100) / 100) * 0.5;

  return Number(
    (sourceWeight * 0.3 + topicScore * 0.4 + freshnessScore * 0.2 + popularityScore * 0.1 + randomSkew).toFixed(4)
  );
}

/**
 * Chooses up to `maxLinks` candidates from the head of an already-ranked list.
 *
 * The pool is the first `max(maxLinks * 3, 6)` candidates. Each pool entry is re-ranked
 * by its score plus random jitter (grows with `randomness`) minus a rank penalty
 * (shrinks with `randomness`). One entry per distinct source is taken first; remaining
 * slots are then filled with the best entries not yet chosen.
 *
 * @param candidates - Ranked candidates; a non-array or empty value yields `[]`.
 * @param maxLinks - Maximum number of candidates to return.
 * @param randomness - 0–100 randomness setting.
 * @returns The chosen candidates, without the internal `weightedScore` field.
 */
function pickSelectedCandidates(candidates, maxLinks, randomness) {
  if (!Array.isArray(candidates) || candidates.length === 0) {
    return [];
  }

  const poolSize = Math.min(candidates.length, Math.max(maxLinks * 3, 6));
  const randomness01 = clamp(randomness, 0, 100) / 100;

  const ranked = candidates.slice(0, poolSize).map((entry, position) => {
    const jitter = (Math.random() - 0.5) * 0.35 * randomness01;
    const rankPenalty = position * 0.03 * (1 - randomness01);
    return { ...entry, weightedScore: Number(entry.score || 0) + jitter - rankPenalty };
  });
  ranked.sort((left, right) => right.weightedScore - left.weightedScore);

  const chosen = [];

  // Pass 1: favor variety by taking at most one entry per source.
  const sourcesTaken = new Set();
  for (const entry of ranked) {
    if (chosen.length >= maxLinks) break;
    if (!sourcesTaken.has(entry.source)) {
      chosen.push(entry);
      sourcesTaken.add(entry.source);
    }
  }

  // Pass 2: fill any remaining slots with entries whose URL is not already chosen.
  for (const entry of ranked) {
    if (chosen.length >= maxLinks) break;
    const alreadyChosen = chosen.some((picked) => picked.url === entry.url);
    if (!alreadyChosen) {
      chosen.push(entry);
    }
  }

  return chosen.map((entry) => {
    const { weightedScore: _internalScore, ...publicFields } = entry;
    return publicFields;
  });
}

/**
 * Canonicalizes a URL for discovery use, or rejects it.
 *
 * Returns `null` for blank or unparseable input, non-http(s) protocols and hosts
 * rejected by `isBlockedHost`. Otherwise the fragment is removed, tracking query
 * parameters are stripped, an explicit default port is cleared and a single trailing
 * slash on the serialized URL is dropped.
 *
 * @param rawUrl - Candidate URL of any type; it is stringified and trimmed first.
 * @returns The normalized URL string, or `null` when the URL is not acceptable.
 */
export function normalizeDiscoveryUrl(rawUrl) {
  const trimmed = String(rawUrl || "").trim();
  if (!trimmed) {
    return null;
  }

  let target;
  try {
    target = new URL(trimmed);
  } catch {
    return null;
  }

  const isHttpFamily = target.protocol === "http:" || target.protocol === "https:";
  if (!isHttpFamily) {
    return null;
  }
  if (isBlockedHost(target.hostname)) {
    return null;
  }

  target.hash = "";

  // Decide on a snapshot of the keys, since deleting mutates the live parameter list.
  const trackingKeys = [...target.searchParams.keys()].filter((key) => {
    const lowered = key.toLowerCase();
    return (
      TRACKING_QUERY_PREFIXES.some((prefix) => lowered.startsWith(prefix)) ||
      TRACKING_QUERY_KEYS.has(lowered)
    );
  });
  for (const key of trackingKeys) {
    target.searchParams.delete(key);
  }

  const defaultPort = target.protocol === "http:" ? "80" : "443";
  if (target.port === defaultPort) {
    target.port = "";
  }

  const serialized = target.toString();
  if (serialized.endsWith("/")) {
    return serialized.slice(0, -1);
  }
  return serialized;
}

/**
 * Downloads `url` via `readText` (requesting `application/json`) and parses the body as JSON.
 *
 * @param url - URL to fetch.
 * @returns The parsed JSON value.
 * @throws Error `invalid JSON from <url>` when the body is not valid JSON; errors from
 *   `readText` propagate unchanged.
 */
async function readJson(url) {
  const raw = await readText(url, "application/json");

  try {
    return JSON.parse(raw);
  } catch {
    throw new Error(`invalid JSON from ${url}`);
  }
}

/**
 * Downloads `url` as text after validating it with `normalizeDiscoveryUrl`.
 *
 * @param url - URL to fetch.
 * @param accept - Value of the `accept` request header; defaults to common feed/XML/JSON types.
 * @returns The response body as text.
 * @throws Error when the URL is blocked or invalid, or when the final response is not `ok`.
 */
async function readText(url, accept = "application/xml,text/xml,application/rss+xml,text/plain,application/json") {
  const safeUrl = normalizeDiscoveryUrl(url);
  if (!safeUrl) {
    throw new Error(`blocked or invalid discovery URL: ${url}`);
  }

  const { response, finalUrl } = await fetchDiscoveryResponse({ url: safeUrl, accept });

  if (!response.ok) {
    throw new Error(`HTTP ${response.status} for ${finalUrl}`);
  }

  return response.text();
}

/**
 * Performs a GET request while following redirects manually, so every hop is
 * re-validated with `assertPublicUrl` and `normalizeDiscoveryUrl` before it is requested.
 *
 * @param params.url - Starting URL.
 * @param params.accept - Value of the `accept` request header.
 * @param params.maxRedirects - Maximum redirects to follow (defaults to `DISCOVERY_MAX_REDIRECTS`).
 * @returns `{ response, finalUrl }` for the first non-redirect response.
 * @throws Error when a redirect lacks a `location` header, when any hop is blocked or
 *   invalid, or when the redirect limit is exceeded. Rejections from `assertPublicUrl`
 *   and `fetch` (including the per-request timeout) propagate unchanged.
 */
async function fetchDiscoveryResponse({ url, accept, maxRedirects = DISCOVERY_MAX_REDIRECTS }) {
  let currentUrl = String(url || "");

  for (let redirects = 0; redirects <= maxRedirects; redirects += 1) {
    await assertPublicUrl(currentUrl);
    const response = await fetch(currentUrl, {
      method: "GET",
      // Redirects are handled below so each target can be checked before it is requested.
      redirect: "manual",
      headers: {
        "user-agent": DISCOVERY_USER_AGENT,
        accept
      },
      signal: AbortSignal.timeout(DISCOVERY_TIMEOUT_MS)
    });

    if (isRedirectStatus(response.status)) {
      const location = String(response.headers.get("location") || "").trim();
      if (!location) {
        throw new Error(`redirect missing location for ${currentUrl}`);
      }

      // A malformed Location header makes `new URL` throw; report it with the same
      // message as any other rejected redirect target instead of a raw TypeError.
      let resolvedLocation = "";
      try {
        resolvedLocation = new URL(location, currentUrl).toString();
      } catch {
        resolvedLocation = "";
      }

      const nextUrl = normalizeDiscoveryUrl(resolvedLocation);
      if (!nextUrl) {
        throw new Error(`blocked or invalid discovery redirect URL: ${location}`);
      }
      currentUrl = nextUrl;
      continue;
    }

    const finalUrl = normalizeDiscoveryUrl(response.url || currentUrl);
    if (!finalUrl) {
      throw new Error(`blocked or invalid discovery URL: ${response.url || currentUrl}`);
    }
    await assertPublicUrl(finalUrl);
    return {
      response,
      finalUrl
    };
  }

  throw new Error(`too many redirects for discovery URL: ${url}`);
}

/**
 * Extracts a feed title and entries from RSS or Atom XML using regex-based tag matching.
 *
 * RSS `<item>` blocks are used when present; otherwise Atom `<entry>` blocks. Only the
 * first `maxItems` blocks are examined, and entries whose link fails URL normalization
 * are dropped from the result.
 *
 * @param xml - Raw feed document.
 * @param options.maxItems - Maximum number of blocks to examine (default 10).
 * @returns `{ feedTitle, items }` where each item has `title`, `summary`, `link`, `publishedAt`.
 */
function parseFeed(xml, { maxItems = 10 } = {}) {
  const text = String(xml || "");

  // Prefer the <channel>'s own title; fall back to the first <title> in the document.
  const channelBlock = extractFirstTag(text, "channel");
  const channelTitle = channelBlock ? extractTag(channelBlock, "title") : "";
  const feedTitle =
    sanitizeExternalText(decodeXmlEntities(channelTitle), 80) ||
    sanitizeExternalText(decodeXmlEntities(extractTag(text, "title")), 80);

  const rssItems = matchAllBlocks(text, "item");
  const blocks = rssItems.length ? rssItems : matchAllBlocks(text, "entry");

  const decodedTag = (block, tagName) => decodeXmlEntities(extractTag(block, tagName) || "");

  const parsedItems = blocks.slice(0, maxItems).map((block) => {
    const title = decodedTag(block, "title");
    const summary =
      decodedTag(block, "description") || decodedTag(block, "summary") || decodedTag(block, "content");
    // RSS carries the link as element text; Atom carries it in an href attribute.
    const link = decodedTag(block, "link") || decodeXmlEntities(extractAtomHref(block) || "");
    const publishedAtRaw =
      extractTag(block, "pubDate") || extractTag(block, "published") || extractTag(block, "updated");
    const publishedTs = publishedAtRaw ? Date.parse(publishedAtRaw) : NaN;

    return {
      title: sanitizeExternalText(title, 180),
      summary: sanitizeExternalText(summary, 260),
      link: normalizeDiscoveryUrl(link),
      publishedAt: Number.isFinite(publishedTs) ? new Date(publishedTs).toISOString() : null
    };
  });

  return {
    feedTitle,
    items: parsedItems.filter((item) => Boolean(item.link))
  };
}

/**
 * Returns the inner content of the first `<tagName>…</tagName>` block in `input`.
 *
 * @param input - Text to search.
 * @param tagName - Element name to look for.
 * @returns The first block's inner content, or `""` when there is none.
 */
function extractFirstTag(input, tagName) {
  const [firstBlock] = matchAllBlocks(input, tagName);
  return firstBlock || "";
}

/**
 * Finds every `<tagName …>…</tagName>` block in `input` (case-insensitive, non-greedy)
 * and returns the raw inner content of each.
 *
 * @param input - Text to search.
 * @param tagName - Element name; it is regex-escaped before use.
 * @returns Inner contents in document order; empty when nothing matches.
 */
function matchAllBlocks(input, tagName) {
  const tag = escapeRegex(tagName);
  const pattern = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)<\\/${tag}>`, "gi");
  return [...String(input || "").matchAll(pattern)].map((match) => String(match[1] || ""));
}

/**
 * Returns the inner content of the first `<tagName …>…</tagName>` element in `input`,
 * passed through `stripTagMarkup`.
 *
 * @param input - Text to search.
 * @param tagName - Element name; it is regex-escaped before use.
 * @returns The cleaned inner text, or `""` when the element is missing or empty.
 */
function extractTag(input, tagName) {
  const tag = escapeRegex(tagName);
  const pattern = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)<\\/${tag}>`, "i");
  const match = String(input || "").match(pattern);
  return match?.[1] ? stripTagMarkup(match[1]) : "";
}

/**
 * Finds the entry URL in Atom-style `<link href="…">` tags.
 *
 * Returns the `href` of the first `<link>` tag that either has no `rel` attribute or
 * has `rel="alternate"`; tags without a quoted `href` are ignored.
 *
 * @param input - Entry markup to scan.
 * @returns The matching `href` value, or `""` when no suitable link exists.
 */
function extractAtomHref(input) {
  const linkTagPattern = /<link\b([^>]*?)\/?>/gi;
  const hrefPattern = /\bhref\s*=\s*['"]([^'"]+)['"]/i;
  const relPattern = /\brel\s*=\s*['"]([^'"]+)['"]/i;

  for (const match of String(input || "").matchAll(linkTagPattern)) {
    const attrs = String(match[1] || "");
    const href = attrs.match(hrefPattern)?.[1];
    if (!href) continue;

    // Links such as rel="self" or rel="enclosure" point at the feed or media, not the entry.
    const rel = attrs.match(relPattern)?.[1];
    if (!rel || rel.toLowerCase() === "alternate") {
      return href;
    }
  }
  return "";
}

/**
 * Reduces a fragment of feed markup to plain text: CDATA sections are unwrapped, any
 * remaining tags are replaced by a space, whitespace runs collapse to one space and the
 * result is trimmed.
 *
 * @param value - Markup fragment of any type; it is stringified first.
 * @returns The plain-text content.
 */
function stripTagMarkup(value) {
  return String(value || "")
    .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/gi, "$1")
    .replace(/<[^>]+>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Decodes the predefined XML entities (amp, lt, gt, quot, apos) and the numeric
 * apostrophe references (#39 and #x27, with `x` in either case).
 *
 * Decoding happens in a single pass, so text produced by decoding one entity is never
 * re-read as the start of another entity within the same call.
 *
 * @param value - Text of any type; it is stringified first.
 * @returns The text with the supported entities replaced by their characters.
 */
function decodeXmlEntities(value) {
  const characterByEntity = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["#39", "'"],
    ["#x27", "'"]
  ]);

  return String(value || "").replace(
    /&(amp|lt|gt|quot|apos|#39|#[xX]27);/g,
    (match, name) => characterByEntity.get(String(name).toLowerCase()) ?? match
  );
}

/**
 * Prepares externally-sourced text for display: entities are decoded, then the text is
 * passed to `normalizeWhitespaceText` with a length cap, ellipsis enabled, and a
 * replacement that turns markdown links `[label](target)` into just `label`.
 *
 * @param value - Text of any type; it is stringified first.
 * @param maxLen - Maximum length forwarded to `normalizeWhitespaceText` (default 180).
 * @returns Whatever `normalizeWhitespaceText` returns for the decoded text.
 */
function sanitizeExternalText(value, maxLen = 180) {
  return normalizeWhitespaceText(decodeXmlEntities(String(value || "")), {
    maxLen,
    ellipsis: true,
    // Keep the link label (2–80 chars) and drop the target so raw URLs do not leak into text.
    replacements: [{ pattern: /\[([^\]]{2,80})\]\([^)]+\)/g, replacement: "$1" }]
  });
}

/**
 * Escapes every regular-expression metacharacter in `value` so it can be embedded in a
 * `RegExp` source and match literally.
 *
 * @param value - Text of any type; it is stringified first.
 * @returns The text with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash.
 */
function escapeRegex(value) {
  return String(value || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}