/* discover.jsx — real brand discovery from a domain.
 *
 * Steps, all real:
 *  1. Fetch https://{domain}/ (CORS direct → proxy fallback)
 *  2. Parse <title>, <meta>, <link rel=icon>, OpenGraph, Twitter card
 *  3. Parse all JSON-LD blocks — extract Organization, LocalBusiness, WebSite, etc.
 *  4. (Planned — not implemented in this file yet) If we found a contact/about link,
 *     fetch it for an address fallback
 *  5. Look up Wikidata (real API, CORS-friendly)
 *  6. Optionally fetch /robots.txt and /sitemap.xml for the live audit
 *  7. Use Claude (via window.claude.complete OR user's anthropic key) to
 *     classify industry + propose ~5 plausible competitors + suggest
 *     vertical-specific scan prompts.
 *  8. Return a rich brand object the rest of the app uses.
 */

// Timeout budgets, all in milliseconds.
const DISCOVER_TIMEOUT_MS = 12_000; // per-proxy attempt (was 18s — too long if 3 proxies queue up)
const WIKIDATA_TIMEOUT_MS = 8_000;  // Wikidata entity search
const CLAUDE_TIMEOUT_MS = 30_000;   // a single Claude completion call

/**
 * Race a promise against a timeout.
 *
 * @param {Promise<*>|*} p  Promise (or plain value) to wait for.
 * @param {number} ms       Timeout in milliseconds.
 * @param {string} [msg]    Message of the Error used when the timeout wins.
 * @returns {Promise<*>} Settles like `p`, or rejects with `new Error(msg)` after `ms`.
 *
 * The timer is cleared as soon as the race settles, so a fast `p` does not
 * leave a pending timeout (and its closure) alive for the remaining `ms`.
 * Note that `p` itself is not cancelled when the timeout wins.
 */
function _timed(p, ms, msg = 'timeout') {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(msg)), ms);
  });
  return Promise.race([p, timeout]).finally(() => clearTimeout(timer));
}

// Public CORS proxies, tried in order. Different proxies fail on different sites.
// Each entry maps a target URL to the proxied URL (target is URI-component encoded).
const CORS_PROXIES = [
  'https://corsproxy.io/?',
  'https://api.allorigins.win/raw?url=',
  'https://api.codetabs.com/v1/proxy/?quest=',
].map((prefix) => (target) => `${prefix}${encodeURIComponent(target)}`);

/**
 * GET `url` and resolve with its body text, aborting the request after `ms`.
 *
 * @param {string} url  URL to fetch (redirects are followed).
 * @param {number} ms   Abort deadline in milliseconds.
 * @returns {Promise<string>} Response body.
 * @throws {Error} `HTTP <status>` for non-2xx responses, `empty response` when the
 *   body is missing or shorter than 50 characters, or whatever `fetch` rejects with
 *   (including the abort error when the deadline passes).
 */
async function _fetchOne(url, ms) {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), ms);
  try {
    const response = await fetch(url, { redirect: 'follow', signal: controller.signal });
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const body = await response.text();
    // Bodies this short are treated as "nothing useful came back".
    const hasContent = Boolean(body) && body.length >= 50;
    if (!hasContent) {
      throw new Error('empty response');
    }
    return body;
  } finally {
    clearTimeout(abortTimer);
  }
}

/* Server-proxied fetch via `${AISO_API_BASE}/fetch?url=…` (managed mode only).
 *
 * Resolves with the proxied page body (string). Throws when managed mode is off
 * ('managed mode disabled'), on a non-2xx reply ('server HTTP <status>'), when the
 * JSON envelope is not `ok` (its `error` text, or 'server fetch failed'), or when
 * the body is missing / shorter than 50 characters ('empty response').
 * The request is aborted after `ms` milliseconds. */
async function _fetchViaServer(url, ms = 15_000) {
  const managed = typeof window !== 'undefined' && window.aisoEnabled && window.aisoEnabled();
  if (!managed) {
    throw new Error('managed mode disabled');
  }

  // Drop one trailing slash from the configured base so the path joins cleanly.
  const apiBase = window.AISO_API_BASE.replace(/\/$/, '');
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), ms);
  try {
    const endpoint = `${apiBase}/fetch?url=${encodeURIComponent(url)}`;
    const response = await fetch(endpoint, { signal: controller.signal });
    if (!response.ok) {
      throw new Error(`server HTTP ${response.status}`);
    }
    const payload = await response.json();
    if (!payload?.ok) {
      throw new Error(payload?.error || 'server fetch failed');
    }
    const pageBody = payload.body;
    if (!pageBody || pageBody.length < 50) {
      throw new Error('empty response');
    }
    return pageBody;
  } finally {
    clearTimeout(abortTimer);
  }
}

/* Try server first (managed mode), then direct, then each proxy, then www. variant.
 *
 * Never rejects for transport failures: every failed attempt is recorded and the
 * next transport is tried.
 *
 * @param {string} url  Absolute http(s) URL to fetch.
 * @param {object} [options]
 * @param {boolean} [options.textOk=true]  Accepted for backward compatibility; currently unused.
 * @returns {Promise<object>} On success `{ ok: true, text, via, url }`, where `via` is
 *   'server' | 'direct' | 'proxy-N' (prefixed with 'www-' when the www. variant
 *   answered) and `url` is the URL that actually responded. On failure
 *   `{ ok: false, error, errorShort, attempts, url }`: `error` joins every attempt as
 *   "via: message", `errorShort` is the last attempt's message. */
async function _fetchTry(url, { textOk = true } = {}) {
  const errors = [];
  const isManaged = () =>
    typeof window !== 'undefined' && window.aisoEnabled && window.aisoEnabled();

  // Run one transport; on failure record [via, message] and yield null.
  const attempt = async (via, target, run) => {
    try {
      const text = await run();
      return { ok: true, text, via, url: target };
    } catch (e) {
      errors.push([via, e.message || e.name]);
      return null;
    }
  };

  // The full transport chain for one target URL: server → direct → each CORS proxy.
  const tryTransports = async (target, prefix) => {
    if (isManaged()) {
      const viaServer = await attempt(`${prefix}server`, target, () => _fetchViaServer(target, 15_000));
      if (viaServer) return viaServer;
    }

    // Direct is fast when the site allows CORS, so it gets a shorter deadline.
    const direct = await attempt(`${prefix}direct`, target, () => _fetchOne(target, 6000));
    if (direct) return direct;

    for (let i = 0; i < CORS_PROXIES.length; i++) {
      // The proxy URL is built inside the attempt so a throwing builder is just
      // another failed attempt instead of rejecting the whole lookup.
      const viaProxy = await attempt(
        `${prefix}proxy-${i + 1}`,
        target,
        () => _fetchOne(CORS_PROXIES[i](target), DISCOVER_TIMEOUT_MS),
      );
      if (viaProxy) return viaProxy;
    }
    return null;
  };

  const primary = await tryTransports(url, '');
  if (primary) return primary;

  // Some sites only answer on the www. host; retry the whole chain there.
  const hostAndPath = url.replace(/^https?:\/\//, '');
  if (!hostAndPath.startsWith('www.')) {
    const altUrl = url.replace(/^(https?:\/\/)/, '$1www.');
    const alternate = await tryTransports(altUrl, 'www-');
    if (alternate) return alternate;
  }

  return {
    ok: false,
    error: errors.map(([k, v]) => `${k}: ${v}`).join(' · '),
    errorShort: errors[errors.length - 1]?.[1] || 'unreachable',
    attempts: errors.length,
    url,
  };
}

/** Collapse every run of whitespace to one space and trim; falsy input becomes ''. */
function _clean(s) {
  return s ? String(s).replace(/\s+/g, ' ').trim() : '';
}

/**
 * Resolve an href found in page markup against the site's domain.
 *
 * @param {string} href        Raw attribute value (absolute, protocol-relative, root-relative or relative).
 * @param {string} baseDomain  Bare host, e.g. "example.com".
 * @returns {string|null} Absolute URL, or null when `href` is empty.
 *
 * `data:` URIs (commonly used for inline favicons) are already self-contained and are
 * returned untouched; previously they were mangled into `https://<domain>/data:…`.
 * Relative paths are resolved against the site root, not the current page path.
 */
function _absoluteUrl(href, baseDomain) {
  if (!href) return null;
  if (/^https?:\/\//i.test(href)) return href;
  if (/^data:/i.test(href)) return href;
  if (href.startsWith('//')) return `https:${href}`;
  if (href.startsWith('/')) return `https://${baseDomain}${href}`;
  return `https://${baseDomain}/${href.replace(/^\.\//, '')}`;
}

/**
 * Reduce a domain or URL to its first host label:
 * strip the http(s) scheme, a leading "www.", any path, then keep the text before the first dot.
 */
function _domainBare(domain) {
  const withoutScheme = String(domain || '').replace(/^https?:\/\//, '');
  const withoutWww = withoutScheme.replace(/^www\./, '');
  const [host] = withoutWww.split('/');
  const [firstLabel] = host.split('.');
  return firstLabel;
}

/**
 * Capitalise the first letter of each word; words are separated by whitespace or hyphens
 * and re-joined with single spaces ("acme-corp" → "Acme Corp").
 *
 * Empty segments (from a leading or trailing separator) are dropped. Previously such a
 * segment produced the literal text "undefined" because `undefined + ''` stringifies.
 *
 * @param {string} s
 * @returns {string} Title-cased text; falsy input is returned unchanged.
 */
function _titleCase(s) {
  if (!s) return s;
  return s
    .split(/[\s-]+/)
    .filter(Boolean)
    .map((word) => word[0].toUpperCase() + word.slice(1))
    .join(' ');
}

// Detect bot-challenge / interstitial pages (Cloudflare "Just a moment…", etc.) so we
// don't mistake the challenge page's <title> for the brand name (e.g. visa.com).
// Two signals: a <title> that starts with a known challenge phrase, or a known
// challenge marker somewhere in the first 6000 characters of the markup.
function _isChallengePage(html) {
  if (!html) return false;

  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  const pageTitle = ((titleMatch && titleMatch[1]) || '').trim();
  const challengeTitle = /^(just a moment|attention required|checking your browser|verifying you are human|access denied|please wait|one more step)/i;
  if (challengeTitle.test(pageTitle)) {
    return true;
  }

  const challengeMarkup = /cf-browser-verification|\/cdn-cgi\/challenge-platform|challenge-platform\/h\/|enable javascript and cookies to continue|checking if the site connection is secure|ddos protection by cloudflare/i;
  const leadingMarkup = html.slice(0, 6000);
  return challengeMarkup.test(leadingMarkup);
}

/* ---------- meta + jsonld parsing ---------- */

/**
 * Extract brand-relevant metadata from raw homepage HTML by scanning tags (no DOM).
 *
 * Tags are tokenised and their attributes parsed individually, so:
 *  - attribute order does not matter (`content` before `name`, `href` before `rel`);
 *  - a value quoted with "…" may contain apostrophes (and vice versa) without being truncated;
 *  - unquoted values are supported;
 *  - basic HTML character references (&amp; &lt; &gt; &quot; &apos; &nbsp; and numeric) are decoded.
 * A meta key is matched case-insensitively against either the `name` or the `property`
 * attribute, since sites mix the two.
 *
 * @param {string} html        Raw page markup.
 * @param {string} baseDomain  Bare host used to absolutise og:image / canonical / favicon.
 * @returns {object} `{}` for empty input, otherwise
 *   `{ title, description, ogTitle, ogDescription, ogImage, ogSiteName, ogType, ogLocale,
 *      twitterSite, canonical, favicon, themeColor, social, h1, lang, pageSize }`.
 *   Text fields default to ''; `ogImage`/`canonical` are null when absent; `favicon`
 *   falls back to `/favicon.ico`; `social` maps network → first matching profile URL.
 */
function parseMeta(html, baseDomain) {
  if (!html) return {};

  const NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'", nbsp: '\u00a0' };
  // Single pass, so an escaped ampersand followed by "lt;" is not decoded twice.
  const decode = (s) => String(s).replace(/&(#x[0-9a-f]+|#\d+|amp|lt|gt|quot|apos|nbsp);/gi, (whole, ref) => {
    if (ref[0] !== '#') return NAMED_ENTITIES[ref.toLowerCase()];
    const isHex = ref[1] === 'x' || ref[1] === 'X';
    const code = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
  });

  const firstGroup = (re) => (html.match(re) || [])[1] || '';

  // Every start tag with the given name. Quoted attribute values may contain ">".
  const tagsOf = (name) =>
    html.match(new RegExp(`<${name}\\b(?:[^>"']|"[^"]*"|'[^']*')*>`, 'gi')) || [];

  // Attributes of one start tag as a lowercase-keyed map (first occurrence wins).
  const parseAttrs = (tag) => {
    const attrs = Object.create(null);
    const attrRe = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;
    let m;
    while ((m = attrRe.exec(tag)) !== null) {
      const key = m[1].toLowerCase();
      if (!(key in attrs)) attrs[key] = decode(m[2] ?? m[3] ?? m[4] ?? '');
    }
    return attrs;
  };

  const metas = tagsOf('meta').map(parseAttrs);
  const links = tagsOf('link').map(parseAttrs);

  // Content of the first <meta> whose name/property equals `key` and has non-empty content.
  const metaContent = (key) => {
    const hit = metas.find((a) =>
      a.content && [a.name, a.property].some((v) => v && v.toLowerCase() === key));
    return hit ? hit.content : '';
  };

  // href of the first <link> whose rel token list contains one of `rels`.
  const linkHref = (...rels) => {
    const hit = links.find((a) =>
      a.href && (a.rel || '').toLowerCase().split(/\s+/).some((token) => rels.includes(token)));
    return hit ? hit.href : '';
  };

  const title = _clean(decode(firstGroup(/<title[^>]*>([\s\S]*?)<\/title>/i)));
  const description = _clean(metaContent('description'));
  const ogTitle = _clean(metaContent('og:title'));
  const ogDescription = _clean(metaContent('og:description'));
  const ogImage = _absoluteUrl(metaContent('og:image'), baseDomain);
  const ogSiteName = _clean(metaContent('og:site_name'));
  const ogType = _clean(metaContent('og:type'));
  const ogLocale = _clean(metaContent('og:locale'));
  const twitterSite = _clean(metaContent('twitter:site'));
  const themeColor = _clean(metaContent('theme-color'));

  const htmlTag = tagsOf('html')[0];
  const htmlAttrs = htmlTag ? parseAttrs(htmlTag) : {};
  const lang = _clean(htmlAttrs.lang || htmlAttrs['xml:lang'] || '');

  // canonical & favicon (best-effort)
  const canonical = _absoluteUrl(linkHref('canonical'), baseDomain);
  const favicon = _absoluteUrl(linkHref('icon', 'apple-touch-icon') || '/favicon.ico', baseDomain);

  // socials — first matching link to common networks, anywhere in the markup
  const social = {};
  const socialMap = {
    twitter: /https?:\/\/(?:www\.)?(?:twitter\.com|x\.com)\/([^"'\/?#\s]+)/i,
    linkedin: /https?:\/\/(?:www\.)?linkedin\.com\/(?:company|in)\/([^"'\/?#\s]+)/i,
    instagram: /https?:\/\/(?:www\.)?instagram\.com\/([^"'\/?#\s]+)/i,
    facebook: /https?:\/\/(?:www\.)?facebook\.com\/([^"'\/?#\s]+)/i,
    youtube: /https?:\/\/(?:www\.)?youtube\.com\/(?:@|channel\/|user\/|c\/)([^"'\/?#\s]+)/i,
    github: /https?:\/\/(?:www\.)?github\.com\/([^"'\/?#\s]+)/i,
  };
  for (const [network, re] of Object.entries(socialMap)) {
    const m = html.match(re);
    // Over-long "handles" are almost certainly not profile links.
    if (m && m[1] && m[1].length < 60) social[network] = m[0];
  }

  // First <h1>: strip inner tags before decoding, so escaped markup stays text.
  const h1 = _clean(decode(firstGroup(/<h1[^>]*>([\s\S]*?)<\/h1>/i).replace(/<[^>]+>/g, ' ')));

  return {
    title, description, ogTitle, ogDescription, ogImage, ogSiteName, ogType, ogLocale,
    twitterSite, canonical, favicon, themeColor, social, h1, lang,
    pageSize: html.length,
  };
}

/**
 * Parse JSON without throwing. Tries the text as-is, then once more with block comments
 * and trailing commas stripped (both show up in hand-written JSON-LD).
 * Returns the parsed value, or null when neither attempt parses.
 */
function _safeParseJson(s) {
  // Candidates are built lazily so a failure while cleaning is swallowed like a parse failure.
  const candidates = [
    () => s,
    () => s.replace(/\/\*[\s\S]*?\*\//g, '').replace(/,(\s*[}\]])/g, '$1'),
  ];
  for (const makeCandidate of candidates) {
    try {
      return JSON.parse(makeCandidate());
    } catch {
      // fall through to the next candidate
    }
  }
  return null;
}

/**
 * Flatten a parsed JSON-LD value into a list of every object node it contains.
 *
 * Order: for each object, the members of its `@graph` (if any) come first, then the
 * object itself, then objects nested in its other properties (depth-first).
 *
 * `@graph` is skipped in the generic property walk because it has already been
 * visited; previously every `@graph` node was emitted twice.
 *
 * @param {*} node          Parsed JSON-LD value (object, array or primitive).
 * @param {object[]} [out]  Accumulator; appended to and returned.
 * @returns {object[]} The accumulator.
 */
function _flattenLd(node, out = []) {
  if (!node || typeof node !== 'object') return out;
  if (Array.isArray(node)) {
    for (const item of node) _flattenLd(item, out);
    return out;
  }
  if (node['@graph']) _flattenLd(node['@graph'], out);
  out.push(node);
  // walk values too (nested @types within properties)
  for (const [key, value] of Object.entries(node)) {
    if (key === '@graph') continue;
    if (value && typeof value === 'object') _flattenLd(value, out);
  }
  return out;
}

/**
 * Find every `<script type="application/ld+json">` block in the markup, parse it, and
 * index the contained nodes by `@type`.
 *
 * @param {string} html  Raw page markup.
 * @returns {{ byType: Object<string, object[]>, raw: Array, summary: object, types: string[] }}
 *   `raw` holds each successfully parsed block; `byType` maps a type name to its nodes;
 *   `types` lists the keys of `byType`; `summary` carries the promoted entities
 *   (`organization`, `localBusiness`, `website`, `product`) when present.
 *   The same shape — including `types` — is returned for empty input.
 *
 * A node whose `@type` is an array is indexed under every listed type, so an entity
 * typed `["Organization", "LocalBusiness"]` is found by both lookups. Type names come
 * from untrusted markup, so only own string keys are used.
 */
function parseJsonLd(html) {
  if (!html) return { byType: {}, raw: [], summary: {}, types: [] };
  const blockRe = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const raw = [];
  for (const block of html.matchAll(blockRe)) {
    const parsed = _safeParseJson(block[1].trim());
    if (parsed) raw.push(parsed);
  }
  const flat = [];
  raw.forEach((r) => _flattenLd(r, flat));

  const byType = {};
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  for (const node of flat) {
    const nodeTypes = new Set([].concat(node['@type'] ?? []));
    for (const t of nodeTypes) {
      // '__proto__' cannot be stored as a plain own key; inherited names such as
      // 'constructor' must not be mistaken for an existing bucket.
      if (typeof t !== 'string' || !t || t === '__proto__') continue;
      if (!hasOwn(byType, t)) byType[t] = [];
      byType[t].push(node);
    }
  }

  // Promote useful entities into a summary
  const summary = {};
  const take = (types) => {
    for (const t of types) {
      if (hasOwn(byType, t) && byType[t][0]) return byType[t][0];
    }
    return null;
  };
  const org = take(['Organization', 'Corporation', 'NGO', 'EducationalOrganization', 'Brand']);
  const local = take(['LocalBusiness', 'Restaurant', 'CafeOrCoffeeShop', 'FoodEstablishment', 'Store', 'ProfessionalService', 'MedicalOrganization', 'Dentist', 'AutoDealer', 'Hotel']);
  const site = take(['WebSite']);
  const product = take(['Product']);

  if (org) {
    summary.organization = {
      name: _clean(org.name),
      description: _clean(org.description),
      logo: typeof org.logo === 'string' ? org.logo : org.logo?.url,
      sameAs: [].concat(org.sameAs || []).filter(Boolean),
      foundingDate: org.foundingDate,
    };
  }
  if (local) {
    // `address` may be a list of addresses; use the first one.
    const firstAddress = Array.isArray(local.address) ? local.address[0] : local.address;
    const addr = firstAddress || {};
    summary.localBusiness = {
      name: _clean(local.name),
      type: local['@type'],
      addressLocality: _clean(addr.addressLocality || ''),
      addressRegion: _clean(addr.addressRegion || ''),
      addressCountry: typeof addr.addressCountry === 'string' ? addr.addressCountry : (addr.addressCountry?.name || ''),
      postalCode: _clean(addr.postalCode || ''),
      streetAddress: _clean(addr.streetAddress || ''),
      telephone: _clean(local.telephone || ''),
      priceRange: _clean(local.priceRange || ''),
      aggregateRating: local.aggregateRating ? {
        ratingValue: local.aggregateRating.ratingValue,
        reviewCount: local.aggregateRating.reviewCount || local.aggregateRating.ratingCount,
      } : null,
    };
  }
  if (site) summary.website = { name: _clean(site.name), description: _clean(site.description) };
  if (product) summary.product = { name: _clean(product.name) };

  return { byType, raw, summary, types: Object.keys(byType) };
}

/* ---------- Wikidata lookup (real, with same-vertical filter) ----------
 * Wikidata's wbsearchentities is fuzzy and returns ANY matching label —
 * "bekon" matches a record producer (Q108649783). We filter so we only
 * accept entities that plausibly describe a company/website/brand.
 *
 * Heuristic:
 *  1. Drop entities whose description matches person/biographical patterns.
 *  2. Require the label to be a close-match to the search term (no random
 *     2-word brand → 1-word famous person collisions).
 *  3. Prefer entities whose description suggests business/tech/product.
 */

// Description patterns that suggest a Wikidata hit is NOT a brand: people and their
// occupations, birth/death markers, families/dynasties, fictional characters, taxa and
// places. Used by _wdLooksLikeBrand, which tests it against the lowercased description.
const _WD_DROP_RX = /\b(person|musician|songwriter|producer|singer|rapper|composer|actor|actress|athlete|footballer|politician|writer|author|poet|journalist|painter|sculptor|director|filmmaker|screenwriter|saint|monarch|king|queen|emperor|botanist|biologist|physicist|mathematician|surgeon|physician|judge|lawyer|priest|monk|nun|abbot|bishop|cardinal|pope|noble|baron|earl|duke|princess|prince|historian|philosopher|theologian|economist|sociologist|psychologist|astronomer|chemist|engineer (?:born|who)|born \d|died \d|\b(?:b\.|d\.) ?\d| family| dynasty|fictional character|species of|genus of|village in|town in|commune in|river in|mountain in)\b/i;

// Description patterns that suggest a company / product / organization. In
// _wdLooksLikeBrand a match here overrides a _WD_DROP_RX match; in lookupWikidata a
// match adds to the candidate's score.
const _WD_KEEP_RX = /\b(company|corporation|business|startup|startup company|brand|website|web site|web application|web platform|application|software|product|service|SaaS|platform|app|tool|technology company|tech company|firm|agency|organization|nonprofit|magazine|publisher|publication|store|retailer|restaurant|cafe|coffee|hotel|clinic|hospital|university|school|institute|studio|label|consultancy|consulting|enterprise|industry|manufacturer|maker of|provider of|provider|developer of|online (?:store|platform|service)|app for|software for)\b/i;

/**
 * Decide whether a Wikidata search hit plausibly refers to the brand being searched for.
 *
 * @param {{label?: string, description?: string}} entity  One wbsearchentities result.
 * @param {string} searchTerm                              Term that was searched.
 * @returns {boolean}
 *
 * Fast accept: the label equals the term, or one contains the other and they are at
 * least 85% the same length — regardless of the description.
 * Otherwise: reject person/place-like descriptions (unless they also look like a
 * business), require one string to contain the other, and require a length ratio of
 * at least 0.6.
 */
function _wdLooksLikeBrand(entity, searchTerm) {
  if (!entity?.label) return false;

  const label = String(entity.label).toLowerCase();
  const term = String(searchTerm || '').toLowerCase();
  if (label === term) return true;

  const [longer, shorter] = label.length >= term.length ? [label, term] : [term, label];
  const contained = longer.includes(shorter);
  const lengthRatio = shorter.length / longer.length;
  if (contained && lengthRatio >= 0.85) return true;

  // Looser path from here on.
  const desc = String(entity.description || '').toLowerCase();
  const looksLikePersonOrPlace = _WD_DROP_RX.test(desc) && !_WD_KEEP_RX.test(desc);
  if (looksLikePersonOrPlace) return false;
  if (longer !== shorter && !contained) return false;
  return lengthRatio >= 0.6;
}

/**
 * Search Wikidata for an entity that plausibly matches a brand name.
 *
 * @param {string} searchTerm  Brand name (or bare domain label) to search for.
 * @returns {Promise<{id: string, label: string, description: string,
 *   alternatives: Array<{id: string, label: string, description: string}>}|null>}
 *   The best-scoring entity plus up to four runners-up, or null when the term is
 *   empty, the request fails or times out, or no candidate scores at least 8.
 *
 * The whole exchange — request and body read — is bounded by WIKIDATA_TIMEOUT_MS via
 * an AbortController, so a stalled response is cancelled rather than left running.
 * Best-effort by design: every failure resolves to null instead of rejecting.
 */
async function lookupWikidata(searchTerm) {
  if (!searchTerm) return null;
  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), WIKIDATA_TIMEOUT_MS);
  try {
    const url = `https://www.wikidata.org/w/api.php?action=wbsearchentities&search=${encodeURIComponent(searchTerm)}&language=en&format=json&origin=*&limit=10`;
    const r = await fetch(url, { signal: ctrl.signal });
    if (!r.ok) return null;
    const data = await r.json();
    if (!data.search || data.search.length === 0) return null;

    // Score: exact label > brand-shaped description > inclusive label > nothing
    const wanted = String(searchTerm).toLowerCase();
    const scored = data.search.map((entity) => {
      const looksLike = _wdLooksLikeBrand(entity, searchTerm);
      const keepMatch = _WD_KEEP_RX.test(String(entity.description || ''));
      const labelExact = String(entity.label).toLowerCase() === wanted;
      let score = 0;
      if (labelExact) score += 15;
      if (looksLike) score += 8;
      if (keepMatch) score += 4;
      return { entity, score };
    });
    // Sorts in place; sort is stable, so ties keep Wikidata's own ranking.
    scored.sort((a, b) => b.score - a.score);
    const best = scored[0];
    if (!best || best.score < 8) return null;

    const top = best.entity;
    return {
      id: top.id,
      label: top.label,
      description: top.description,
      alternatives: scored.slice(1, 5).map((s) => ({ id: s.entity.id, label: s.entity.label, description: s.entity.description })),
    };
  } catch {
    return null;
  } finally {
    clearTimeout(timer);
  }
}

/* ---------- Claude enrichment (uses BYO key if present, else window.claude) ---------- */

/**
 * Send one prompt to Claude and resolve with the reply text, or null when no backend
 * is available or the built-in backend fails.
 *
 * Backend order:
 *  1. `window.callModel('anthropic', …)` — used when a BYO key is supplied or managed
 *     mode is on (managed mode has no BYO key; keys live on the server). maxTokens is
 *     raised to 1500 so the classification JSON isn't truncated into unparseable output.
 *     Any failure falls through to the built-in backend.
 *  2. `window.claude.complete(prompt)` — the built-in backend.
 * Each call is bounded by CLAUDE_TIMEOUT_MS.
 */
async function _claudeCall(prompt, anthropicKey) {
  const withTimeout = async (start) => _timed(start(), CLAUDE_TIMEOUT_MS, 'claude timeout');

  const isManaged = Boolean(window.aisoEnabled && window.aisoEnabled());
  const canUseCallModel = typeof window.callModel === 'function' && (anthropicKey || isManaged);
  if (canUseCallModel) {
    try {
      const reply = await withTimeout(
        () => window.callModel('anthropic', anthropicKey || '', prompt, { maxTokens: 1500 }),
      );
      return reply.text;
    } catch (e) {
      // fall through to built-in
    }
  }

  if (typeof window.claude?.complete !== 'function') {
    return null;
  }
  try {
    return await withTimeout(() => window.claude.complete(prompt));
  } catch {
    return null;
  }
}

/**
 * Pull a JSON object or array out of free-form model output.
 *
 * @param {string} text  Raw model reply (may wrap the JSON in prose or a code fence).
 * @returns {*} The parsed value, or null when nothing parseable is found.
 *
 * Strategy:
 *  1. Greedy match from the first opener to the last matching closer — handles the
 *     common "JSON wrapped in a fence / short preamble" case.
 *  2. If that does not parse (e.g. prose after the JSON contains braces), scan from
 *     each opener for the first bracket-balanced span, skipping brackets inside string
 *     literals, and return the first span that parses.
 */
function _extractJsonFromText(text) {
  if (!text) return null;
  // grab the first { ... } or [ ... ] block
  const m = text.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
  if (!m) return null;
  const greedy = _safeParseJson(m[1]);
  if (greedy !== null) return greedy;

  // Index of the closer that balances the opener at `start`, or -1.
  const balancedEnd = (start) => {
    let depth = 0;
    let inString = false;
    let escaped = false;
    for (let i = start; i < text.length; i++) {
      const ch = text[i];
      if (inString) {
        if (escaped) escaped = false;
        else if (ch === '\\') escaped = true;
        else if (ch === '"') inString = false;
        continue;
      }
      if (ch === '"') {
        inString = true;
      } else if (ch === '{' || ch === '[') {
        depth += 1;
      } else if (ch === '}' || ch === ']') {
        depth -= 1;
        if (depth === 0) return i;
      }
    }
    return -1;
  };

  for (let i = m.index; i < text.length; i++) {
    if (text[i] !== '{' && text[i] !== '[') continue;
    const end = balancedEnd(i);
    if (end === -1) continue;
    const parsed = _safeParseJson(text.slice(i, end + 1));
    if (parsed !== null) return parsed;
  }
  return null;
}

/**
 * Ask Claude to classify the brand (industry, location, competitors, scan prompts)
 * from the signals scraped so far.
 *
 * @param {object} args
 * @param {string} args.domain          Bare domain being audited.
 * @param {object} [args.meta]          parseMeta() result ({} when the homepage was unreadable).
 * @param {object} [args.jsonld]        parseJsonLd() result.
 * @param {object|null} args.wikidata   lookupWikidata() result, if any.
 * @param {object|null} args.robots     robots.txt summary (reads `userAgents`, `mentionedAI`).
 * @param {object|null} args.sitemap    sitemap summary (reads `sampleUrls`).
 * @param {boolean} args.degraded       True when the homepage could not be read.
 * @param {string} [args.anthropicKey]  BYO key forwarded to _claudeCall.
 * @returns {Promise<object|null>} The parsed classification object; null when no reply
 *   was obtained; or `{ raw, error }` when a reply arrived but did not contain a JSON
 *   object. The `error` field lets callers that check `enrichment.error` treat
 *   unparseable output as a failure instead of as a successful classification with
 *   missing fields.
 */
async function enrichWithClaude({ domain, meta = {}, jsonld = {}, wikidata, robots, sitemap, degraded, anthropicKey }) {
  const summary = {
    domain,
    homepage_reachable: !degraded,
    title: meta.title || null,
    h1: meta.h1 || null,
    description: meta.description || meta.ogDescription || null,
    ogSiteName: meta.ogSiteName || null,
    jsonld_types: jsonld.types || [],
    jsonld_organization: jsonld.summary?.organization || null,
    jsonld_localBusiness: jsonld.summary?.localBusiness || null,
    wikidata: wikidata ? { id: wikidata.id, label: wikidata.label, description: wikidata.description } : null,
    socials: Object.keys(meta.social || {}),
    robots_signals: robots ? { user_agents: robots.userAgents, ai_bots_named: robots.mentionedAI } : null,
    sitemap_urls_sample: sitemap?.sampleUrls?.slice(0, 8) || null,
  };

  // Hard count of evidence we actually have. If zero / one, we're guessing.
  const evidenceCount =
    (summary.title ? 1 : 0) +
    (summary.description ? 1 : 0) +
    (summary.jsonld_types?.length ? 1 : 0) +
    (summary.wikidata ? 1 : 0) +
    (summary.sitemap_urls_sample?.length ? 1 : 0);

  const prompt = `You are classifying a website for an AI-visibility audit. Be calibrated and honest about what you can know.

Scraped signals from ${domain}:
\`\`\`json
${JSON.stringify(summary, null, 2)}
\`\`\`

EVIDENCE COUNT: ${evidenceCount} of 5 (title, description, jsonld, wikidata, sitemap)

CRITICAL CALIBRATION RULES — read these:
1. If you do NOT clearly recognize the brand "${domain}" from your training data AND evidence_count < 2: set confidence ≤ 0.2, brandName to the literal domain (e.g. "${domain.split('.')[0]}"), industry: "unknown", location: "unknown", and competitors: []. DO NOT invent a vertical. DO NOT pattern-match on partial words (e.g. don't claim "cratox" is "Cratoxylum / a plant association"). DO NOT speculate about etymology, latin roots, or what the name "might suggest". If you don't know, say you don't know.
2. Confidence > 0.7 ONLY if BOTH: (a) you clearly know this exact brand from training, AND (b) the scraped signals are consistent with what you know.
3. If confidence < 0.3, return scanPrompts: [] and competitors: [] — there's no point guessing.
4. Brand names are proper nouns. Treat the domain as a literal string. Don't decompose it into roots.

Return ONLY a JSON object (no markdown, no prose):

{
  "brandName": "what people search for. If unknown, the literal domain root.",
  "tagline": "<=12 words OR empty string if unknown",
  "industry": "lowercase 1-4 words OR 'unknown'",
  "vertical": "broader category OR 'unknown'",
  "location": "City, Region/Country OR 'Online' OR 'unknown'",
  "audience": "who they serve OR ''",
  "isLocalBusiness": true|false,
  "confidence": 0.0-1.0,
  "knowsFromTraining": true|false,
  "uncertaintyNote": "if confidence < 0.5, ONE short sentence explaining what's missing",
  "competitors": [
    {"name": "Competitor brand", "reason": "concrete reason — same vertical, same geo, same offer"}
  ],
  "scanPrompts": [
    "..." (only if confidence >= 0.3)
  ]
}`;

  const text = await _claudeCall(prompt, anthropicKey);
  if (!text) return null;
  const parsed = _extractJsonFromText(text);
  // The contract is a single JSON object; anything else (nothing parseable, an array,
  // a bare primitive) is reported as an error with the raw reply attached.
  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return { raw: text, error: 'Claude reply did not contain a JSON object' };
  }
  return parsed;
}

/* ---------- main discovery ---------- */
/**
 * Run the full discovery pipeline for a domain and resolve with the assembled brand object.
 *
 * Pipeline: homepage + robots.txt + sitemap.xml are fetched concurrently → meta / Open
 * Graph and JSON-LD are parsed from the homepage (skipped in degraded mode) → Wikidata
 * lookup and Claude enrichment run in parallel → final fallbacks fill name, industry
 * and location.
 *
 * @param {string} domain  Bare domain or URL; the scheme and any path are stripped.
 * @param {object} [options]
 * @param {function} [options.onProgress]  Called with `{ step, status, label, detail? }`
 *   where `status` is 'running' | 'pass' | 'fail'. Steps emitted: homepage, robots,
 *   sitemap, meta, jsonld, wikidata, claude, done.
 * @param {string} [options.anthropicKey]  Forwarded to enrichWithClaude.
 * @param {{aborted: boolean}} [options.signal]  Polled (`signal.aborted`) at checkpoints
 *   and inside the async callbacks; it is not passed to the underlying fetches.
 * @returns {Promise<object>} The `out` object initialised below.
 * @throws {Error} 'domain required' when the domain is empty; 'aborted' when the signal
 *   is found aborted at one of the two checkpoints.
 */
async function discoverBrand(domain, { onProgress = () => {}, anthropicKey, signal } = {}) {
  // Normalise: drop the scheme, then everything from the first "/" onwards.
  const dom = String(domain || '').replace(/^https?:\/\//, '').replace(/\/.*$/, '').trim();
  if (!dom) throw new Error('domain required');

  // Result object. Filled in progressively; some fields are written from async callbacks.
  const out = {
    domain: dom,
    discoveredAt: Date.now(),
    homepage: null,
    meta: null,
    jsonld: null,
    robots: null,
    sitemap: null,
    wikidata: null,
    enrichment: null,
    name: '',
    tagline: '',
    description: '',
    industry: '',
    vertical: '',
    location: '',
    audience: '',
    competitors: [],
    scanPrompts: [],
    logo: null,
    favicon: null,
    address: null,
    social: {},
    socialLinks: {},
    sources: [],
    confidence: 0,
    degraded: false,
  };

  // 1. Homepage + robots + sitemap — fired in parallel, each emits its own progress
  //    as soon as it resolves (so robots/sitemap don't block on a slow homepage).
  onProgress({ step: 'homepage', status: 'running', label: `Fetching https://${dom}/` });
  onProgress({ step: 'robots', status: 'running', label: 'Probing robots.txt' });
  onProgress({ step: 'sitemap', status: 'running', label: 'Probing sitemap.xml' });

  const hpFuture = _fetchTry(`https://${dom}/`);
  const robotsFuture = _fetchTry(`https://${dom}/robots.txt`);
  const sitemapFuture = _fetchTry(`https://${dom}/sitemap.xml`);

  // robots — settles independently.
  // A response that starts like an HTML document is treated as "no robots.txt"
  // (the site answered the path with a page rather than a robots file).
  // NOTE(review): only the degraded branch below awaits robotsFuture/sitemapFuture.
  // On the normal path `out.robots` / `out.sitemap` may still be null when enrichment
  // reads them, or even when this function returns.
  robotsFuture.then((res) => {
    if (signal?.aborted) return;
    if (res.ok && !/^\s*<!?(doctype|html)/i.test(res.text)) {
      const lines = res.text.split('\n').map(l => l.trim()).filter(Boolean);
      const userAgents = lines.filter(l => /^user-agent:/i.test(l));
      const sitemaps = lines.filter(l => /^sitemap:/i.test(l)).map(l => l.replace(/^sitemap:\s*/i, ''));
      // AI crawler names we report on; matched as plain substrings of the file.
      const aiBots = ['GPTBot', 'ClaudeBot', 'PerplexityBot', 'CCBot', 'Google-Extended', 'OAI-SearchBot', 'anthropic-ai'];
      const mentionedAI = aiBots.filter(b => res.text.includes(b));
      out.robots = { lines: lines.length, userAgents: userAgents.length, sitemaps, mentionedAI, preview: res.text.slice(0, 320) };
      onProgress({ step: 'robots', status: 'pass', label: `robots.txt found`, detail: `${lines.length} lines · ${userAgents.length} user-agent block(s)${mentionedAI.length ? ` · ${mentionedAI.join(', ')}` : ''}` });
    } else {
      onProgress({ step: 'robots', status: 'fail', label: 'No robots.txt' });
    }
  });

  // sitemap — settles independently (same HTML-response guard as robots).
  sitemapFuture.then((res) => {
    if (signal?.aborted) return;
    if (res.ok && !/^\s*<!?(doctype|html)/i.test(res.text)) {
      // `urls` counts every <loc>; for a sitemap index these are sub-sitemaps.
      const urls = (res.text.match(/<loc>/gi) || []).length;
      const isIndex = /<sitemapindex/i.test(res.text);
      const sampleUrls = [...res.text.matchAll(/<loc>([^<]+)<\/loc>/gi)].slice(0, 12).map(m => m[1]);
      out.sitemap = { urls, isIndex, sampleUrls };
      onProgress({ step: 'sitemap', status: 'pass', label: `sitemap.xml found`, detail: isIndex ? `index with ${urls} sub-sitemaps` : `${urls} URL${urls === 1 ? '' : 's'}` });
    } else {
      onProgress({ step: 'sitemap', status: 'fail', label: 'No sitemap.xml' });
    }
  });

  // homepage — we MUST wait for this before parsing meta/JSON-LD.
  // Hard-cap total time at 35s — if it hasn't resolved by then, give up and continue degraded.
  // NOTE(review): the 35s timer is not cleared when the homepage wins the race, and the
  // homepage fetch itself keeps running after the cap wins.
  const hpCap = new Promise((resolve) => setTimeout(() => resolve({ ok: false, errorShort: 'global timeout 35s', timedOut: true }), 35000));
  const hp = await Promise.race([hpFuture, hpCap]);

  if (signal?.aborted) throw new Error('aborted');

  // homepage — but reject bot-challenge interstitials (Cloudflare "Just a moment…"),
  // otherwise we'd treat the challenge page's <title> as the brand name.
  const challenged = hp.ok && _isChallengePage(hp.text);
  if (hp.ok && !challenged) {
    out.homepage = { ok: true, via: hp.via, length: hp.text.length, finalUrl: hp.url };
    onProgress({ step: 'homepage', status: 'pass', label: `Read homepage`, detail: `${(hp.text.length / 1024).toFixed(0)}kb · ${hp.via}` });
    out.sources.push({ source: 'homepage', via: hp.via });

    // 2. Meta
    onProgress({ step: 'meta', status: 'running', label: 'Reading meta + Open Graph' });
    const meta = parseMeta(hp.text, dom);
    out.meta = meta;
    // `social` and `socialLinks` carry the same map.
    out.social = meta.social || {};
    out.socialLinks = meta.social || {};
    out.favicon = meta.favicon;
    out.logo = meta.ogImage || meta.favicon;
    // Name: og:site_name, else the part of <title> before the first separator.
    out.name = meta.ogSiteName || (meta.title && meta.title.split(/[—–|·:]/)[0].trim()) || '';
    out.description = meta.description || meta.ogDescription || '';
    const metaDetail = [
      meta.title ? `“${meta.title.length > 50 ? meta.title.slice(0, 50) + '…' : meta.title}”` : 'no title',
      meta.description ? 'description ✓' : 'no description',
      meta.ogImage ? 'og:image ✓' : null,
      Object.keys(meta.social || {}).length ? `${Object.keys(meta.social).length} social link(s)` : null,
    ].filter(Boolean).join(' · ');
    onProgress({ step: 'meta', status: 'pass', label: 'Parsed page meta', detail: metaDetail });

    // 3. JSON-LD — fills gaps left by meta; an Organization logo overrides og:image/favicon.
    onProgress({ step: 'jsonld', status: 'running', label: 'Scanning structured data' });
    const jsonld = parseJsonLd(hp.text);
    out.jsonld = jsonld;
    if (jsonld.summary.organization?.name && !out.name) out.name = jsonld.summary.organization.name;
    if (jsonld.summary.organization?.description && !out.description) out.description = jsonld.summary.organization.description;
    if (jsonld.summary.organization?.logo) out.logo = _absoluteUrl(jsonld.summary.organization.logo, dom);
    const lb = jsonld.summary.localBusiness;
    if (lb) {
      const parts = [lb.streetAddress, lb.addressLocality, lb.addressRegion, lb.postalCode, lb.addressCountry].filter(Boolean);
      out.address = {
        line: parts.join(', '),
        city: lb.addressLocality,
        region: lb.addressRegion,
        country: lb.addressCountry,
        phone: lb.telephone,
      };
      if (lb.addressLocality && !out.location) {
        out.location = [lb.addressLocality, lb.addressRegion || lb.addressCountry].filter(Boolean).join(', ');
      }
    }
    onProgress({
      step: 'jsonld',
      status: jsonld.types.length > 0 ? 'pass' : 'fail',
      label: jsonld.types.length > 0 ? `Found structured data` : 'No JSON-LD on homepage',
      detail: jsonld.types.length > 0 ? jsonld.types.join(', ') : 'AI models will rely on prose only',
    });
  } else {
    // homepage failed (unreachable, timed out, or a bot-challenge page) — degraded mode
    out.degraded = true;
    // robots/sitemap were fired in parallel above; wait for their real results so we
    // can tell "site is down" apart from "site is alive but blocks the homepage".
    const [robotsRes, sitemapRes] = await Promise.all([robotsFuture, sitemapFuture]);
    const siteAlive = robotsRes.ok || sitemapRes.ok;
    onProgress({
      step: 'homepage',
      status: 'fail',
      label: challenged ? 'Bot-challenge page (Cloudflare) — not the real site' : (siteAlive ? 'Homepage blocked — likely bot challenge' : 'Couldn’t reach homepage'),
      detail: challenged
        ? 'Got Cloudflare’s “Just a moment…” interstitial, not the homepage — using the domain + robots/sitemap + Claude, not the challenge page’s title.'
        : (siteAlive
          ? `${hp.errorShort} · but robots/sitemap responded, so the site is alive. Falling back to limited data.`
          : (hp.errorShort || 'no signal — site may be down or domain misspelled')),
    });
    onProgress({ step: 'meta', status: 'fail', label: 'Skipped — no homepage HTML' });
    onProgress({ step: 'jsonld', status: 'fail', label: 'Skipped — no homepage HTML' });
  }

  if (signal?.aborted) throw new Error('aborted');

  // 4 + 5 run in PARALLEL — Wikidata and Claude don't depend on each other.
  const searchTerm = out.name || _domainBare(dom);

  // Both are started now and report progress per step. Enrichment gives Wikidata a
  // short head start (see below) so its result can be included in the prompt.
  onProgress({ step: 'wikidata', status: 'running', label: `Looking up Wikidata · "${searchTerm}"` });
  onProgress({ step: 'claude', status: 'running', label: 'Classifying with Claude' });

  const wikidataPromise = lookupWikidata(searchTerm).then((wd) => {
    if (signal?.aborted) return null;
    if (wd) {
      onProgress({ step: 'wikidata', status: 'pass', label: `${wd.id} · ${wd.label}`, detail: wd.description || 'matched entity' });
      if (!out.description && wd.description) out.description = wd.description;
    } else {
      onProgress({ step: 'wikidata', status: 'fail', label: 'No matching Wikidata entity', detail: 'common for smaller / younger brands' });
    }
    out.wikidata = wd;
    return wd;
  });

  const enrichmentPromise = (async () => {
    // wait briefly (up to 4s) so Wikidata can inform Claude if it returns first; otherwise proceed
    const wdRace = await Promise.race([
      wikidataPromise.catch(() => null),
      new Promise((r) => setTimeout(() => r('timeout'), 4000)),
    ]);
    const wdForClaude = wdRace === 'timeout' ? null : wdRace;
    try {
      return await enrichWithClaude({
        domain: dom,
        meta: out.meta || {},
        jsonld: out.jsonld || { types: [], summary: {} },
        wikidata: wdForClaude,
        robots: out.robots,
        sitemap: out.sitemap,
        degraded: out.degraded,
        anthropicKey,
      });
    } catch (e) {
      return { error: e.message || 'enrichment failed' };
    }
  })().then((enrichment) => {
    if (signal?.aborted) return null;
    out.enrichment = enrichment;
    if (enrichment && !enrichment.error) {
      // A reply without a numeric confidence is assumed to be 0.5.
      const conf = typeof enrichment.confidence === 'number' ? enrichment.confidence : 0.5;
      // Treated as known unless the reply says `knowsFromTraining: false` explicitly.
      const knowsIt = enrichment.knowsFromTraining !== false;
      out.confidence = conf;
      out.knowsFromTraining = knowsIt;
      out.uncertaintyNote = enrichment.uncertaintyNote || '';

      // Only adopt enrichment fields if confidence is meaningful.
      // For low-confidence: keep literal domain-derived name + 'unknown' fields.
      if (conf >= 0.3) {
        // A name scraped from the page wins over Claude's brandName.
        if (enrichment.brandName) out.name = out.name || enrichment.brandName;
        if (enrichment.tagline) out.tagline = enrichment.tagline;
        if (enrichment.industry && enrichment.industry !== 'unknown') out.industry = enrichment.industry;
        if (enrichment.vertical && enrichment.vertical !== 'unknown') out.vertical = enrichment.vertical;
        if (enrichment.location && enrichment.location !== 'unknown' && !out.location) out.location = enrichment.location;
        if (enrichment.audience) out.audience = enrichment.audience;
        if (Array.isArray(enrichment.competitors)) out.competitors = enrichment.competitors;
        if (Array.isArray(enrichment.scanPrompts)) out.scanPrompts = enrichment.scanPrompts;
      } else {
        // Low-confidence: drop competitors + prompts (don't propagate guesses)
        out.competitors = [];
        out.scanPrompts = [];
      }

      const detail = conf < 0.3
        ? `low confidence (${Math.round(conf * 100)}%) — ${enrichment.uncertaintyNote || 'brand not in training data'}`
        : [out.industry, out.location, enrichment.competitors?.length ? `${enrichment.competitors.length} competitors` : null].filter(Boolean).join(' · ');

      onProgress({
        step: 'claude',
        status: conf >= 0.3 ? 'pass' : 'fail',
        label: conf >= 0.3 ? 'Inferred industry + competitors' : 'Brand not recognized',
        detail,
      });
    } else {
      // No reply at all (null) or an explicit error object.
      const reason = enrichment?.error || 'wire an Anthropic key in Settings to enable';
      onProgress({ step: 'claude', status: 'fail', label: 'Claude enrichment unavailable', detail: reason });
    }
    return enrichment;
  });

  await Promise.all([wikidataPromise, enrichmentPromise]);

  // Final fallbacks — below 0.3 confidence we say 'unknown' rather than guess.
  if (!out.name) out.name = _titleCase(_domainBare(dom));
  if (!out.industry) out.industry = out.confidence < 0.3 ? 'unknown' : 'business';
  if (!out.location) out.location = out.confidence < 0.3 ? 'unknown' : 'Online';

  onProgress({ step: 'done', status: 'pass', label: 'Discovery complete', detail: `${out.name} · ${out.industry}${out.confidence < 0.3 ? ' · low confidence' : ''}` });

  return out;
}

// Expose the public discovery API as window globals.
window.discoverBrand = discoverBrand;
window.parseMeta = parseMeta;
window.parseJsonLd = parseJsonLd;
window.lookupWikidata = lookupWikidata;
window.enrichWithClaude = enrichWithClaude;
