/* (file header, continued) …, <meta>, <link rel=icon>, OpenGraph, Twitter card
 *   3. Parse all JSON-LD blocks — extract Organization, LocalBusiness, WebSite, etc.
 *   4. If we found a contact/about link, fetch it for an address fallback
 *   5. Look up Wikidata (real API, CORS-friendly)
 *   6. Optionally fetch /robots.txt and /sitemap.xml for the live audit
 *   7. Use Claude (via window.claude.complete OR user's anthropic key) to
 *      classify industry + propose ~5 plausible competitors + suggest
 *      vertical-specific scan prompts.
 *   8. Return a rich brand object the rest of the app uses.
 */

const DISCOVER_TIMEOUT_MS = 12000; // per-proxy attempt (was 18s — too long if 3 proxies queue up)
const WIKIDATA_TIMEOUT_MS = 8000;
const CLAUDE_TIMEOUT_MS = 30000;

/**
 * Race a promise against a deadline.
 * @param {Promise<*>} p - work to wait for
 * @param {number} ms - deadline in milliseconds
 * @param {string} [msg='timeout'] - message of the Error used on expiry
 * @returns {Promise<*>} settles like `p`, or rejects with Error(msg) after `ms`
 */
function _timed(p, ms, msg = 'timeout') {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(msg)), ms);
  });
  // Clear the timer once the race settles so a fast `p` does not leave a
  // pending timeout behind.
  return Promise.race([p, deadline]).finally(() => clearTimeout(timer));
}

// Public CORS proxies, tried in order. Different proxies fail on different sites.
const CORS_PROXIES = [
  (u) => 'https://corsproxy.io/?' + encodeURIComponent(u),
  (u) => 'https://api.allorigins.win/raw?url=' + encodeURIComponent(u),
  (u) => 'https://api.codetabs.com/v1/proxy/?quest=' + encodeURIComponent(u),
];

/**
 * Fetch `url` as text, aborting after `ms` milliseconds.
 * @throws {Error} on a non-2xx status, an abort, or a body shorter than 50 chars
 */
async function _fetchOne(url, ms) {
  const ctrl = new AbortController();
  const t = setTimeout(() => ctrl.abort(), ms);
  try {
    const r = await fetch(url, { redirect: 'follow', signal: ctrl.signal });
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const text = await r.text();
    if (!text || text.length < 50) throw new Error('empty response');
    return text;
  } finally {
    clearTimeout(t);
  }
}

/** True when running in a browser whose `window.aisoEnabled()` reports managed mode. */
function _managedMode() {
  return typeof window !== 'undefined'
    && typeof window.aisoEnabled === 'function'
    && !!window.aisoEnabled();
}

/* Server-proxied fetch via /api/fetch?url=… (managed mode). Returns body string
 * or throws. Tries the server first; callers fall back to direct/CORS proxies.
 */
async function _fetchViaServer(url, ms = 15_000) {
  if (!_managedMode()) {
    throw new Error('managed mode disabled');
  }
  if (typeof window.AISO_API_BASE !== 'string') {
    // Explicit error instead of "cannot read properties of undefined".
    throw new Error('AISO_API_BASE not configured');
  }
  const base = window.AISO_API_BASE.replace(/\/$/, '');
  const ctrl = new AbortController();
  const t = setTimeout(() => ctrl.abort(), ms);
  try {
    const r = await fetch(`${base}/fetch?url=${encodeURIComponent(url)}`, { signal: ctrl.signal });
    if (!r.ok) throw new Error(`server HTTP ${r.status}`);
    const data = await r.json();
    if (!data?.ok) throw new Error(data?.error || 'server fetch failed');
    if (!data.body || data.body.length < 50) throw new Error('empty response');
    return data.body;
  } finally {
    clearTimeout(t);
  }
}

/**
 * Run the server → direct → proxy ladder for one URL.
 * @param {string} target - URL to fetch
 * @param {string} prefix - prepended to every `via` label ('' or 'www-')
 * @param {Array<[string, string]>} errors - failure log, appended to in place
 * @returns {Promise<object|null>} success result, or null when every attempt failed
 */
async function _fetchVariant(target, prefix, errors) {
  const attempts = [];
  if (_managedMode()) attempts.push(['server', () => _fetchViaServer(target, 15_000)]);
  attempts.push(['direct', () => _fetchOne(target, 6000)]);
  CORS_PROXIES.forEach((wrap, i) => {
    attempts.push([`proxy-${i + 1}`, () => _fetchOne(wrap(target), DISCOVER_TIMEOUT_MS)]);
  });
  for (const [label, run] of attempts) {
    const via = prefix + label;
    try {
      const text = await run();
      return { ok: true, text, via, url: target };
    } catch (e) {
      errors.push([via, e.message || e.name]);
    }
  }
  return null;
}

/* Try server first (managed mode), then direct, then each proxy, then www. variant. */
/**
 * @param {string} url - absolute URL to fetch
 * @param {object} [options] - `textOk` is accepted for compatibility but not read
 * @returns {Promise<object>} `{ ok: true, text, via, url }` on success, otherwise
 *   `{ ok: false, error, errorShort, attempts, url }`; failures are not thrown.
 */
async function _fetchTry(url, { textOk = true } = {}) {
  const errors = [];

  // 1-3) server proxy, direct, then each CORS proxy
  const primary = await _fetchVariant(url, '', errors);
  if (primary) return primary;

  // 4) www. variant fallback
  const hostAndPath = url.replace(/^https?:\/\//, '');
  if (!hostAndPath.startsWith('www.')) {
    const altUrl = url.replace(/^(https?:\/\/)/, '$1www.');
    const alt = await _fetchVariant(altUrl, 'www-', errors);
    if (alt) return alt;
  }

  return {
    ok: false,
    error: errors.map(([k, v]) => `${k}: ${v}`).join(' · '),
    errorShort: errors[errors.length - 1]?.[1] || 'unreachable',
    attempts: errors.length,
    url,
  };
}

/** Collapse whitespace runs to single spaces and trim; falsy input yields ''. */
function _clean(s) {
  if (!s) return '';
  return String(s).replace(/\s+/g, ' ').trim();
}

/**
 * Resolve an href found in page markup against `https://{baseDomain}`.
 * @returns {string|null} absolute URL, or null for an empty href or a
 *   non-http(s) scheme other than `data:`
 */
function _absoluteUrl(href, baseDomain) {
  if (!href) return null;
  if (/^https?:\/\//i.test(href)) return href;
  if (href.startsWith('//')) return 'https:' + href;
  // Inline icons/images are already complete; do not glue them onto the domain.
  if (/^data:/i.test(href)) return href;
  // mailto:, javascript:, tel: … are not fetchable assets.
  if (/^[a-z][a-z0-9+.-]*:/i.test(href)) return null;
  if (href.startsWith('/')) return `https://${baseDomain}${href}`;
  return `https://${baseDomain}/${href.replace(/^\.\//, '')}`;
}

/** First DNS label of a domain/URL after dropping scheme, `www.` and path. */
function _domainBare(domain) {
  return String(domain || '')
    .replace(/^https?:\/\//, '')
    .replace(/^www\./, '')
    .split('/')[0]
    .split('.')[0];
}

/** Capitalise each whitespace/hyphen-separated word and join with spaces. */
function _titleCase(s) {
  if (!s) return s;
  return s
    .split(/[\s-]+/)
    .filter(Boolean) // a leading/trailing separator yields '' which used to render as "undefined"
    .map((w) => w[0].toUpperCase() + w.slice(1))
    .join(' ');
}

// Detect bot-challenge / interstitial pages (Cloudflare "Just a moment…", etc.) so we
// don't mistake the challenge page's <title> for the brand name (e.g. visa.com).
/**
 * Heuristic check for an anti-bot interstitial instead of the real page.
 * Looks at the <title> prefix and at known challenge markers in the first
 * 6000 characters of the document.
 * @param {string} html
 * @returns {boolean}
 */
function _isChallengePage(html) {
  if (!html) return false;
  const title = ((html.match(/<title[^>]*>([\s\S]*?)<\/title>/i) || [])[1] || '').trim();
  if (/^(just a moment|attention required|checking your browser|verifying you are human|access denied|please wait|one more step)/i.test(title)) return true;
  const head = html.slice(0, 6000);
  return /cf-browser-verification|\/cdn-cgi\/challenge-platform|challenge-platform\/h\/|enable javascript and cookies to continue|checking if the site connection is secure|ddos protection by cloudflare/i.test(head);
}

/* ---------- meta + jsonld parsing ---------- */

/**
 * Read the `content` of the first <meta> tag whose `attr` equals `key`.
 * The value is delimited by its own quote character, so an apostrophe inside
 * a double-quoted value (content="Tom's shop") is kept, and both attribute
 * orders (name…content and content…name) are recognised.
 * @param {string} html
 * @param {string} attr - 'name' or 'property'
 * @param {string} key - e.g. 'description', 'og:title'
 * @returns {string} raw attribute value, or '' when absent
 */
function _metaContent(html, attr, key) {
  const k = key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const value = `(?:"([^"]*)"|'([^']*)')`;
  const forward = new RegExp(`<meta[^>]+${attr}=["']${k}["'][^>]+content=${value}`, 'i');
  const reverse = new RegExp(`<meta[^>]+content=${value}[^>]*\\s${attr}=["']${k}["']`, 'i');
  const m = html.match(forward) || html.match(reverse);
  if (!m) return '';
  return m[1] ?? m[2] ?? '';
}

/**
 * Extract title, description, Open Graph / Twitter fields, canonical URL,
 * favicon, social profile links and the first <h1> from homepage HTML.
 * @param {string} html
 * @param {string} baseDomain - bare host used to absolutise relative URLs
 * @returns {object} parsed fields; `{}` when `html` is empty
 */
function parseMeta(html, baseDomain) {
  if (!html) return {};
  const $ = (re, group = 1) => (html.match(re) || [])[group] || '';
  const named = (key) => _clean(_metaContent(html, 'name', key));
  const og = (key) => _clean(_metaContent(html, 'property', key));

  const title = _clean($(/<title[^>]*>([\s\S]*?)<\/title>/i));
  const description = named('description');
  const ogTitle = og('og:title');
  const ogDescription = og('og:description');
  const ogImage = _absoluteUrl(_metaContent(html, 'property', 'og:image'), baseDomain);
  const ogSiteName = og('og:site_name');
  const ogType = og('og:type');
  const ogLocale = og('og:locale');
  const twitterSite = named('twitter:site');
  const lang = _clean($(/<html[^>]+lang=["']([^"']+)["']/i));
  const themeColor = named('theme-color');

  // canonical & favicon (best-effort)
  const canonical = _absoluteUrl($(/<link[^>]+rel=["']canonical["'][^>]+href=["']([^"']+)["']/i), baseDomain);
  const iconHref = $(/<link[^>]+rel=["'](?:icon|shortcut icon|apple-touch-icon)["'][^>]+href=["']([^"']+)["']/i)
    || $(/<link[^>]+href=["']([^"']+)["'][^>]+rel=["'](?:icon|shortcut icon|apple-touch-icon)["']/i);
  const favicon = _absoluteUrl(iconHref || '/favicon.ico', baseDomain);

  // socials — first matching link to common networks
  const social = {};
  const socialMap = {
    twitter: /https?:\/\/(?:www\.)?(?:twitter\.com|x\.com)\/([^"'\/?#\s]+)/i,
    linkedin: /https?:\/\/(?:www\.)?linkedin\.com\/(?:company|in)\/([^"'\/?#\s]+)/i,
    instagram: /https?:\/\/(?:www\.)?instagram\.com\/([^"'\/?#\s]+)/i,
    facebook: /https?:\/\/(?:www\.)?facebook\.com\/([^"'\/?#\s]+)/i,
    youtube: /https?:\/\/(?:www\.)?youtube\.com\/(?:@|channel\/|user\/|c\/)([^"'\/?#\s]+)/i,
    github: /https?:\/\/(?:www\.)?github\.com\/([^"'\/?#\s]+)/i,
  };
  for (const [k, re] of Object.entries(socialMap)) {
    const m = html.match(re);
    if (m && m[1] && m[1].length < 60) social[k] = m[0];
  }

  // h1 (one of)
  const h1 = _clean($(/<h1[^>]*>([\s\S]*?)<\/h1>/i).replace(/<[^>]+>/g, ' '));

  return {
    title, description, ogTitle, ogDescription, ogImage, ogSiteName, ogType, ogLocale,
    twitterSite, canonical, favicon, themeColor, social, h1, lang,
    pageSize: html.length,
  };
}

/** JSON.parse that returns null instead of throwing, with one lenient retry. */
function _safeParseJson(s) {
  try { return JSON.parse(s); } catch {}
  // sometimes JSON-LD has comments or trailing commas; try a light clean
  try {
    const cleaned = s.replace(/\/\*[\s\S]*?\*\//g, '').replace(/,(\s*[}\]])/g, '$1');
    return JSON.parse(cleaned);
  } catch {}
  return null;
}

/**
 * Collect every object reachable from a parsed JSON-LD value into `out`.
 * `@graph` members are emitted before their container; each object is
 * collected once.
 * @param {*} node
 * @param {object[]} [out]
 * @returns {object[]} `out`
 */
function _flattenLd(node, out = []) {
  if (!node) return out;
  if (Array.isArray(node)) {
    for (const item of node) _flattenLd(item, out);
    return out;
  }
  if (typeof node !== 'object') return out;
  if (node['@graph']) _flattenLd(node['@graph'], out);
  out.push(node);
  // walk values too (nested @types within properties)
  for (const [key, value] of Object.entries(node)) {
    if (key === '@graph') continue; // already walked above — a second pass duplicated every graph node
    if (value && typeof value === 'object') _flattenLd(value, out);
  }
  return out;
}

/**
 * Parse every <script type="application/ld+json"> block in `html`.
 * @param {string} html
 * @returns {{byType: object, raw: Array, summary: object, types?: string[]}}
 *   nodes grouped by their (first) @type, the parsed blocks, and a summary of
 *   the first organization / local business / website / product found.
 */
function parseJsonLd(html) {
  if (!html) return { byType: {}, raw: [], summary: {} };
  const blockRe = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const raw = [];
  let m;
  while ((m = blockRe.exec(html)) !== null) {
    const parsed = _safeParseJson(m[1].trim());
    if (parsed) raw.push(parsed);
  }
  const flat = [];
  raw.forEach((r) => _flattenLd(r, flat));

  // Prototype-less map: @type comes from the page, and a value such as
  // "constructor" or "__proto__" would otherwise hit an inherited property.
  const byType = Object.create(null);
  for (const node of flat) {
    let t = node['@type'];
    if (Array.isArray(t)) t = t[0];
    if (!t || typeof t !== 'string') continue;
    byType[t] = byType[t] || [];
    byType[t].push(node);
  }

  // Promote useful entities into a summary
  const summary = {};
  const take = (types) => {
    for (const t of types) if (byType[t]?.[0]) return byType[t][0];
    return null;
  };
  const org = take(['Organization', 'Corporation', 'NGO', 'EducationalOrganization', 'Brand']);
  const local = take(['LocalBusiness', 'Restaurant', 'CafeOrCoffeeShop', 'FoodEstablishment', 'Store', 'ProfessionalService', 'MedicalOrganization', 'Dentist', 'AutoDealer', 'Hotel']);
  const site = take(['WebSite']);
  const product = take(['Product']);

  if (org) {
    summary.organization = {
      name: _clean(org.name),
      description: _clean(org.description),
      logo: typeof org.logo === 'string' ? org.logo : org.logo?.url,
      sameAs: [].concat(org.sameAs || []).filter(Boolean),
      foundingDate: org.foundingDate,
    };
  }
  if (local) {
    const addr = local.address || {};
    summary.localBusiness = {
      name: _clean(local.name),
      type: local['@type'],
      addressLocality: _clean(addr.addressLocality || ''),
      addressRegion: _clean(addr.addressRegion || ''),
      addressCountry: typeof addr.addressCountry === 'string' ? addr.addressCountry : (addr.addressCountry?.name || ''),
      postalCode: _clean(addr.postalCode || ''),
      streetAddress: _clean(addr.streetAddress || ''),
      telephone: _clean(local.telephone || ''),
      priceRange: _clean(local.priceRange || ''),
      aggregateRating: local.aggregateRating ? {
        ratingValue: local.aggregateRating.ratingValue,
        reviewCount: local.aggregateRating.reviewCount || local.aggregateRating.ratingCount,
      } : null,
    };
  }
  if (site) summary.website = { name: _clean(site.name), description: _clean(site.description) };
  if (product) summary.product = { name: _clean(product.name) };

  return { byType, raw, summary, types: Object.keys(byType) };
}

/* ---------- Wikidata lookup (real, with same-vertical filter) ----------
 * Wikidata's wbsearchentities is fuzzy and returns ANY matching label —
 * "bekon" matches a record producer (Q108649783). We filter so we only
 * accept entities that plausibly describe a company/website/brand.
 *
 * Heuristic:
 *   1. Drop entities whose description matches person/biographical patterns
 *      (unless it also contains a business keyword).
 *   2. Require the label to be a close-match to the search term (no random
 *      2-word brand → 1-word famous person collisions).
 *   3. Prefer entities whose description suggests business/tech/product.
 */
const _WD_DROP_RX = /\b(person|musician|songwriter|producer|singer|rapper|composer|actor|actress|athlete|footballer|politician|writer|author|poet|journalist|painter|sculptor|director|filmmaker|screenwriter|saint|monarch|king|queen|emperor|botanist|biologist|physicist|mathematician|surgeon|physician|judge|lawyer|priest|monk|nun|abbot|bishop|cardinal|pope|noble|baron|earl|duke|princess|prince|historian|philosopher|theologian|economist|sociologist|psychologist|astronomer|chemist|engineer (?:born|who)|born \d|died \d|\b(?:b\.|d\.) ?\d| family| dynasty|fictional character|species of|genus of|village in|town in|commune in|river in|mountain in)\b/i;
const _WD_KEEP_RX = /\b(company|corporation|business|startup|startup company|brand|website|web site|web application|web platform|application|software|product|service|SaaS|platform|app|tool|technology company|tech company|firm|agency|organization|nonprofit|magazine|publisher|publication|store|retailer|restaurant|cafe|coffee|hotel|clinic|hospital|university|school|institute|studio|label|consultancy|consulting|enterprise|industry|manufacturer|maker of|provider of|provider|developer of|online (?:store|platform|service)|app for|software for)\b/i;

/**
 * Decide whether a Wikidata search hit plausibly is the brand searched for.
 * @param {{label?: string, description?: string}} entity
 * @param {string} searchTerm
 * @returns {boolean}
 */
function _wdLooksLikeBrand(entity, searchTerm) {
  if (!entity?.label) return false;
  const label = String(entity.label).toLowerCase();
  const term = String(searchTerm || '').toLowerCase();
  const desc = String(entity.description || '').toLowerCase();

  // Biographical/geographic description with no business keyword: reject even
  // on an exact label match. An exact match used to bypass this check, which
  // let the "bekon → record producer" collision through.
  if (_WD_DROP_RX.test(desc) && !_WD_KEEP_RX.test(desc)) return false;

  // Label must contain (or be contained in) the term and cover ≥ 60% of the
  // longer string; an exact match trivially satisfies both.
  const longer = label.length >= term.length ? label : term;
  const shorter = label.length >= term.length ? term : label;
  return longer.includes(shorter) && shorter.length / longer.length >= 0.6;
}

/**
 * Search Wikidata for `searchTerm` and return the best brand-like entity.
 * Best-effort: network errors, timeouts and non-OK responses yield null.
 * @param {string} searchTerm
 * @returns {Promise<{id, label, description, alternatives: object[]}|null>}
 */
async function lookupWikidata(searchTerm) {
  if (!searchTerm) return null;
  try {
    const url = `https://www.wikidata.org/w/api.php?action=wbsearchentities&search=${encodeURIComponent(searchTerm)}&language=en&format=json&origin=*&limit=10`;
    const r = await _timed(fetch(url), WIKIDATA_TIMEOUT_MS, 'wikidata timeout');
    if (!r.ok) return null;
    const data = await r.json();
    if (!data.search || data.search.length === 0) return null;

    // Score: exact label > brand-shaped description > inclusive label > nothing.
    // The label bonuses only count for entities that pass the brand filter, so
    // a rejected entity can never reach the acceptance threshold of 8.
    const wanted = String(searchTerm).toLowerCase();
    const scored = data.search.map((entity) => {
      const looksLike = _wdLooksLikeBrand(entity, searchTerm);
      const keepMatch = _WD_KEEP_RX.test(String(entity.description || ''));
      const labelExact = String(entity.label).toLowerCase() === wanted;
      let score = 0;
      if (looksLike) {
        score += 8;
        if (labelExact) score += 15;
      }
      if (keepMatch) score += 4;
      return { entity, score };
    });
    scored.sort((a, b) => b.score - a.score);
    const best = scored[0];
    if (!best || best.score < 8) return null;
    const top = best.entity;
    return {
      id: top.id,
      label: top.label,
      description: top.description,
      alternatives: scored.slice(1, 5).map((s) => ({ id: s.entity.id, label: s.entity.label, description: s.entity.description })),
    };
  } catch {
    return null;
  }
}

/* ---------- Claude enrichment (uses BYO key if present, else window.claude) ---------- */

/**
 * Send `prompt` to Claude and return the completion text, or null when no
 * transport is available or the call fails / times out.
 */
async function _claudeCall(prompt, anthropicKey) {
  const tryWith = async (fn) => _timed(fn(), CLAUDE_TIMEOUT_MS, 'claude timeout');
  // Managed mode has no BYO key — keys live on the server. Route through callModel
  // whenever managed mode is on (not only when a BYO key is present), and give the
  // classification JSON enough room so it isn't truncated into unparseable output.
  const managed = !!(window.aisoEnabled && window.aisoEnabled());
  if (typeof window.callModel === 'function' && (anthropicKey || managed)) {
    try {
      const r = await tryWith(() => window.callModel('anthropic', anthropicKey || '', prompt, { maxTokens: 1500 }));
      return r.text;
    } catch (e) {
      // fall through to built-in
    }
  }
  if (typeof window.claude?.complete === 'function') {
    try {
      return await tryWith(() => window.claude.complete(prompt));
    } catch {
      return null;
    }
  }
  return null;
}

/** Parse the outermost {...} or [...] span found in `text`; null if none parses. */
function _extractJsonFromText(text) {
  if (!text) return null;
  // grab the first { ... } or [ ... ] block
  const m = text.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
  if (!m) return null;
  return _safeParseJson(m[1]);
}

/**
 * Ask Claude to classify the brand from the scraped signals.
 * @returns {Promise<object|null>} the parsed JSON reply, `{ raw }` when the
 *   reply could not be parsed as JSON, or null when no reply was obtained.
 */
async function enrichWithClaude({ domain, meta, jsonld, wikidata, robots, sitemap, degraded, anthropicKey }) {
  const summary = {
    domain,
    homepage_reachable: !degraded,
    title: meta.title || null,
    h1: meta.h1 || null,
    description: meta.description || meta.ogDescription || null,
    ogSiteName: meta.ogSiteName || null,
    jsonld_types: jsonld.types || [],
    jsonld_organization: jsonld.summary?.organization || null,
    jsonld_localBusiness: jsonld.summary?.localBusiness || null,
    wikidata: wikidata ? { id: wikidata.id, label: wikidata.label, description: wikidata.description } : null,
    socials: Object.keys(meta.social || {}),
    robots_signals: robots ? { user_agents: robots.userAgents, ai_bots_named: robots.mentionedAI } : null,
    sitemap_urls_sample: sitemap?.sampleUrls?.slice(0, 8) || null,
  };

  // Hard count of evidence we actually have. If zero / one, we're guessing.
  const evidenceCount =
    (summary.title ? 1 : 0) +
    (summary.description ? 1 : 0) +
    (summary.jsonld_types?.length ? 1 : 0) +
    (summary.wikidata ? 1 : 0) +
    (summary.sitemap_urls_sample?.length ? 1 : 0);

  const prompt = `You are classifying a website for an AI-visibility audit. Be calibrated and honest about what you can know.

Scraped signals from ${domain}:
\`\`\`json
${JSON.stringify(summary, null, 2)}
\`\`\`

EVIDENCE COUNT: ${evidenceCount} of 5 (title, description, jsonld, wikidata, sitemap)

CRITICAL CALIBRATION RULES — read these:

1. If you do NOT clearly recognize the brand "${domain}" from your training data AND evidence_count < 2:
   set confidence ≤ 0.2, brandName to the literal domain (e.g. "${domain.split('.')[0]}"), industry: "unknown", location: "unknown", and competitors: [].
   DO NOT invent a vertical. DO NOT pattern-match on partial words (e.g. don't claim "cratox" is "Cratoxylum / a plant association"). DO NOT speculate about etymology, latin roots, or what the name "might suggest". If you don't know, say you don't know.

2. Confidence > 0.7 ONLY if BOTH: (a) you clearly know this exact brand from training, AND (b) the scraped signals are consistent with what you know.

3. If confidence < 0.3, return scanPrompts: [] and competitors: [] — there's no point guessing.

4. Brand names are proper nouns. Treat the domain as a literal string. Don't decompose it into roots.

Return ONLY a JSON object (no markdown, no prose):
{
  "brandName": "what people search for. If unknown, the literal domain root.",
  "tagline": "<=12 words OR empty string if unknown",
  "industry": "lowercase 1-4 words OR 'unknown'",
  "vertical": "broader category OR 'unknown'",
  "location": "City, Region/Country OR 'Online' OR 'unknown'",
  "audience": "who they serve OR ''",
  "isLocalBusiness": true|false,
  "confidence": 0.0-1.0,
  "knowsFromTraining": true|false,
  "uncertaintyNote": "if confidence < 0.5, ONE short sentence explaining what's missing",
  "competitors": [
    {"name": "Competitor brand", "reason": "concrete reason — same vertical, same geo, same offer"}
  ],
  "scanPrompts": [
    "..." (only if confidence >= 0.3)
  ]
}`;

  const text = await _claudeCall(prompt, anthropicKey);
  if (!text) return null;
  const parsed = _extractJsonFromText(text);
  if (!parsed) return { raw: text };
  return parsed;
}

/* ---------- main discovery ---------- */

/**
 * Discover a brand profile from a domain: homepage meta + JSON-LD, robots.txt,
 * sitemap.xml, Wikidata and a Claude classification.
 * @param {string} domain - bare domain or URL; scheme and path are stripped
 * @param {object} [options]
 * @param {function} [options.onProgress] - called with `{ step, status, label, detail? }`
 * @param {string} [options.anthropicKey] - forwarded to the Claude call
 * @param {AbortSignal} [options.signal] - checked between stages
 * @returns {Promise<object>} the populated brand object
 * @throws {Error} 'domain required' for an empty domain, 'aborted' when the signal fired
 */
async function discoverBrand(domain, { onProgress = () => {}, anthropicKey, signal } = {}) {
  const dom = String(domain || '').replace(/^https?:\/\//, '').replace(/\/.*$/, '').trim();
  if (!dom) throw new Error('domain required');

  const out = {
    domain: dom,
    discoveredAt: Date.now(),
    homepage: null,
    meta: null,
    jsonld: null,
    robots: null,
    sitemap: null,
    wikidata: null,
    enrichment: null,
    name: '',
    tagline: '',
    description: '',
    industry: '',
    vertical: '',
    location: '',
    audience: '',
    competitors: [],
    scanPrompts: [],
    logo: null,
    favicon: null,
    address: null,
    social: {},
    socialLinks: {},
    sources: [],
    confidence: 0,
    degraded: false,
  };

  // 1. Homepage + robots + sitemap — fired in parallel, each emits its own progress
  //    as soon as it resolves (so robots/sitemap don't block on a slow homepage).
  onProgress({ step: 'homepage', status: 'running', label: `Fetching https://${dom}/` });
  onProgress({ step: 'robots', status: 'running', label: 'Probing robots.txt' });
  onProgress({ step: 'sitemap', status: 'running', label: 'Probing sitemap.xml' });

  const hpFuture = _fetchTry(`https://${dom}/`);
  const robotsFuture = _fetchTry(`https://${dom}/robots.txt`);
  const sitemapFuture = _fetchTry(`https://${dom}/sitemap.xml`);

  // robots — settles independently; the handler promise is kept so the function
  // can wait for it before returning (see the final Promise.all).
  const robotsDone = robotsFuture.then((res) => {
    if (signal?.aborted) return;
    if (res.ok && !/^\s*<!?(doctype|html)/i.test(res.text)) {
      const lines = res.text.split('\n').map((l) => l.trim()).filter(Boolean);
      const userAgents = lines.filter((l) => /^user-agent:/i.test(l));
      const sitemaps = lines.filter((l) => /^sitemap:/i.test(l)).map((l) => l.replace(/^sitemap:\s*/i, ''));
      const aiBots = ['GPTBot', 'ClaudeBot', 'PerplexityBot', 'CCBot', 'Google-Extended', 'OAI-SearchBot', 'anthropic-ai'];
      const mentionedAI = aiBots.filter((b) => res.text.includes(b));
      out.robots = { lines: lines.length, userAgents: userAgents.length, sitemaps, mentionedAI, preview: res.text.slice(0, 320) };
      onProgress({
        step: 'robots', status: 'pass', label: `robots.txt found`,
        detail: `${lines.length} lines · ${userAgents.length} user-agent block(s)${mentionedAI.length ? ` · ${mentionedAI.join(', ')}` : ''}`,
      });
    } else {
      onProgress({ step: 'robots', status: 'fail', label: 'No robots.txt' });
    }
  });

  // sitemap — settles independently
  const sitemapDone = sitemapFuture.then((res) => {
    if (signal?.aborted) return;
    if (res.ok && !/^\s*<!?(doctype|html)/i.test(res.text)) {
      const urls = (res.text.match(/<loc>/gi) || []).length;
      const isIndex = /<sitemapindex/i.test(res.text);
      const sampleUrls = [...res.text.matchAll(/<loc>([^<]+)<\/loc>/gi)].slice(0, 12).map((m) => m[1]);
      out.sitemap = { urls, isIndex, sampleUrls };
      onProgress({
        step: 'sitemap', status: 'pass', label: `sitemap.xml found`,
        detail: isIndex ? `index with ${urls} sub-sitemaps` : `${urls} URL${urls === 1 ? '' : 's'}`,
      });
    } else {
      onProgress({ step: 'sitemap', status: 'fail', label: 'No sitemap.xml' });
    }
  });

  // homepage — we MUST wait for this before parsing meta/JSON-LD.
  // Hard-cap total time at 35s — if it hasn't resolved by then, give up and continue degraded.
  let hpCapTimer;
  const hpCap = new Promise((resolve) => {
    hpCapTimer = setTimeout(() => resolve({ ok: false, errorShort: 'global timeout 35s', timedOut: true }), 35000);
  });
  const hp = await Promise.race([hpFuture, hpCap]);
  clearTimeout(hpCapTimer);
  if (signal?.aborted) throw new Error('aborted');

  // homepage — but reject bot-challenge interstitials (Cloudflare "Just a moment…"),
  // otherwise we'd treat the challenge page's <title> as the brand name.
  const challenged = hp.ok && _isChallengePage(hp.text);
  if (hp.ok && !challenged) {
    out.homepage = { ok: true, via: hp.via, length: hp.text.length, finalUrl: hp.url };
    onProgress({ step: 'homepage', status: 'pass', label: `Read homepage`, detail: `${(hp.text.length / 1024).toFixed(0)}kb · ${hp.via}` });
    out.sources.push({ source: 'homepage', via: hp.via });

    // 2. Meta
    onProgress({ step: 'meta', status: 'running', label: 'Reading meta + Open Graph' });
    const meta = parseMeta(hp.text, dom);
    out.meta = meta;
    out.social = meta.social || {};
    out.socialLinks = meta.social || {};
    out.favicon = meta.favicon;
    out.logo = meta.ogImage || meta.favicon;
    out.name = meta.ogSiteName || (meta.title && meta.title.split(/[—–|·:]/)[0].trim()) || '';
    out.description = meta.description || meta.ogDescription || '';
    const metaDetail = [
      meta.title ? `“${meta.title.length > 50 ? meta.title.slice(0, 50) + '…' : meta.title}”` : 'no title',
      meta.description ? 'description ✓' : 'no description',
      meta.ogImage ? 'og:image ✓' : null,
      Object.keys(meta.social || {}).length ? `${Object.keys(meta.social).length} social link(s)` : null,
    ].filter(Boolean).join(' · ');
    onProgress({ step: 'meta', status: 'pass', label: 'Parsed page meta', detail: metaDetail });

    // 3. JSON-LD
    onProgress({ step: 'jsonld', status: 'running', label: 'Scanning structured data' });
    const jsonld = parseJsonLd(hp.text);
    out.jsonld = jsonld;
    if (jsonld.summary.organization?.name && !out.name) out.name = jsonld.summary.organization.name;
    if (jsonld.summary.organization?.description && !out.description) out.description = jsonld.summary.organization.description;
    if (jsonld.summary.organization?.logo) out.logo = _absoluteUrl(jsonld.summary.organization.logo, dom);
    const lb = jsonld.summary.localBusiness;
    if (lb) {
      const parts = [lb.streetAddress, lb.addressLocality, lb.addressRegion, lb.postalCode, lb.addressCountry].filter(Boolean);
      out.address = {
        line: parts.join(', '),
        city: lb.addressLocality,
        region: lb.addressRegion,
        country: lb.addressCountry,
        phone: lb.telephone,
      };
      if (lb.addressLocality && !out.location) {
        out.location = [lb.addressLocality, lb.addressRegion || lb.addressCountry].filter(Boolean).join(', ');
      }
    }
    onProgress({
      step: 'jsonld',
      status: jsonld.types.length > 0 ? 'pass' : 'fail',
      label: jsonld.types.length > 0 ? `Found structured data` : 'No JSON-LD on homepage',
      detail: jsonld.types.length > 0 ? jsonld.types.join(', ') : 'AI models will rely on prose only',
    });
  } else {
    // homepage failed — degraded mode
    out.degraded = true;
    // robots/sitemap were fired in parallel above; await their real results to
    // decide whether the site is alive at all.
    const [robotsRes, sitemapRes] = await Promise.all([robotsFuture, sitemapFuture]);
    const siteAlive = robotsRes.ok || sitemapRes.ok;
    onProgress({
      step: 'homepage', status: 'fail',
      label: challenged
        ? 'Bot-challenge page (Cloudflare) — not the real site'
        : (siteAlive ? 'Homepage blocked — likely bot challenge' : 'Couldn’t reach homepage'),
      detail: challenged
        ? 'Got Cloudflare’s “Just a moment…” interstitial, not the homepage — using the domain + robots/sitemap + Claude, not the challenge page’s title.'
        : (siteAlive
          ? `${hp.errorShort} · but robots/sitemap responded, so the site is alive. Falling back to limited data.`
          : (hp.errorShort || 'no signal — site may be down or domain misspelled')),
    });
    onProgress({ step: 'meta', status: 'fail', label: 'Skipped — no homepage HTML' });
    onProgress({ step: 'jsonld', status: 'fail', label: 'Skipped — no homepage HTML' });
  }

  if (signal?.aborted) throw new Error('aborted');

  // 4 + 5 run in PARALLEL — Wikidata and Claude don't depend on each other.
  const searchTerm = out.name || _domainBare(dom);
  onProgress({ step: 'wikidata', status: 'running', label: `Looking up Wikidata · "${searchTerm}"` });
  onProgress({ step: 'claude', status: 'running', label: 'Classifying with Claude' });

  const wikidataPromise = lookupWikidata(searchTerm).then((wd) => {
    if (signal?.aborted) return null;
    if (wd) {
      onProgress({ step: 'wikidata', status: 'pass', label: `${wd.id} · ${wd.label}`, detail: wd.description || 'matched entity' });
      if (!out.description && wd.description) out.description = wd.description;
    } else {
      onProgress({ step: 'wikidata', status: 'fail', label: 'No matching Wikidata entity', detail: 'common for smaller / younger brands' });
    }
    out.wikidata = wd;
    return wd;
  });

  const enrichmentPromise = (async () => {
    // wait briefly so Wikidata can inform Claude if it returns first; otherwise proceed
    const wdRace = await Promise.race([
      wikidataPromise.catch(() => null),
      new Promise((r) => setTimeout(() => r('timeout'), 4000)),
    ]);
    const wdForClaude = wdRace === 'timeout' ? null : wdRace;
    try {
      return await enrichWithClaude({
        domain: dom,
        meta: out.meta || {},
        jsonld: out.jsonld || { types: [], summary: {} },
        wikidata: wdForClaude,
        robots: out.robots,
        sitemap: out.sitemap,
        degraded: out.degraded,
        anthropicKey,
      });
    } catch (e) {
      return { error: e.message || 'enrichment failed' };
    }
  })().then((enrichment) => {
    if (signal?.aborted) return null;
    out.enrichment = enrichment;

    // `{ raw }` means the model answered but the answer was not JSON. An array
    // is not a classification either. Neither may be read as a 0.5-confidence
    // result.
    const unparsed = !!enrichment && typeof enrichment.raw === 'string' && enrichment.brandName === undefined;
    const usable = !!enrichment && typeof enrichment === 'object' && !Array.isArray(enrichment)
      && !enrichment.error && !unparsed;

    if (usable) {
      const conf = typeof enrichment.confidence === 'number' ? enrichment.confidence : 0.5;
      const knowsIt = enrichment.knowsFromTraining !== false;
      out.confidence = conf;
      out.knowsFromTraining = knowsIt;
      out.uncertaintyNote = enrichment.uncertaintyNote || '';

      // Only adopt enrichment fields if confidence is meaningful.
      // For low-confidence: keep literal domain-derived name + 'unknown' fields.
      if (conf >= 0.3) {
        if (enrichment.brandName) out.name = out.name || enrichment.brandName;
        if (enrichment.tagline) out.tagline = enrichment.tagline;
        if (enrichment.industry && enrichment.industry !== 'unknown') out.industry = enrichment.industry;
        if (enrichment.vertical && enrichment.vertical !== 'unknown') out.vertical = enrichment.vertical;
        if (enrichment.location && enrichment.location !== 'unknown' && !out.location) out.location = enrichment.location;
        if (enrichment.audience) out.audience = enrichment.audience;
        if (Array.isArray(enrichment.competitors)) out.competitors = enrichment.competitors;
        if (Array.isArray(enrichment.scanPrompts)) out.scanPrompts = enrichment.scanPrompts;
      } else {
        // Low-confidence: drop competitors + prompts (don't propagate guesses)
        out.competitors = [];
        out.scanPrompts = [];
      }

      const detail = conf < 0.3
        ? `low confidence (${Math.round(conf * 100)}%) — ${enrichment.uncertaintyNote || 'brand not in training data'}`
        : [out.industry, out.location, enrichment.competitors?.length ? `${enrichment.competitors.length} competitors` : null].filter(Boolean).join(' · ');
      onProgress({
        step: 'claude',
        status: conf >= 0.3 ? 'pass' : 'fail',
        label: conf >= 0.3 ? 'Inferred industry + competitors' : 'Brand not recognized',
        detail,
      });
    } else {
      const reason = enrichment?.error
        || (enrichment ? 'Claude returned output that could not be parsed as JSON' : 'wire an Anthropic key in Settings to enable');
      onProgress({ step: 'claude', status: 'fail', label: 'Claude enrichment unavailable', detail: reason });
    }
    return enrichment;
  });

  // Also wait for the robots/sitemap handlers: otherwise the function could
  // return while those steps are still "running" and `out` would be mutated
  // after the caller already received it.
  await Promise.all([wikidataPromise, enrichmentPromise, robotsDone, sitemapDone]);

  // Final fallbacks
  if (!out.name) out.name = _titleCase(_domainBare(dom));
  if (!out.industry) out.industry = out.confidence < 0.3 ? 'unknown' : 'business';
  if (!out.location) out.location = out.confidence < 0.3 ? 'unknown' : 'Online';

  onProgress({
    step: 'done', status: 'pass', label: 'Discovery complete',
    detail: `${out.name} · ${out.industry}${out.confidence < 0.3 ? ' · low confidence' : ''}`,
  });
  return out;
}

// Expose the public API on the page; guarded like the other `typeof window`
// checks so loading the file outside a browser does not throw.
if (typeof window !== 'undefined') {
  Object.assign(window, {
    discoverBrand,
    parseMeta,
    parseJsonLd,
    lookupWikidata,
    enrichWithClaude,
  });
}

/* discover.jsx — real brand discovery from a domain.
 *
 * Steps, all real:
 *   1. Fetch https://{domain}/ (CORS direct → proxy fallback)
 *   2. Parse <title>, <meta>, <link rel=icon>, OpenGraph, Twitter card
 *   3. Parse all JSON-LD blocks — extract Organization, LocalBusiness, WebSite, etc.
 *   4. If we found a contact/about link, fetch it for an address fallback
 *   5. Look up Wikidata (real API, CORS-friendly)
 *   6. Optionally fetch /robots.txt and /sitemap.xml for the live audit
 *   7. Use Claude (via window.claude.complete OR user's anthropic key) to
 *      classify industry + propose ~5 plausible competitors + suggest
 *      vertical-specific scan prompts.
 *   8. Return a rich brand object the rest of the app uses.
 */

const DISCOVER_TIMEOUT_MS = 12000; // per-proxy attempt (was 18s — too long if 3 proxies queue up)
const WIKIDATA_TIMEOUT_MS = 8000;
const CLAUDE_TIMEOUT_MS = 30000;

/* Race promise `p` against a deadline of `ms` milliseconds.
 * Resolves/rejects with `p` if it settles first, otherwise rejects with Error(msg).
 * The deadline timer is always cleared once the race settles, so a fast `p`
 * does not leave a pending timer behind. */
function _timed(p, ms, msg = 'timeout') {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(msg)), ms);
  });
  return Promise.race([p, deadline]).finally(() => clearTimeout(timer));
}

// Public CORS proxies, tried in order. Different proxies fail on different sites.
const CORS_PROXIES = [
  (u) => 'https://corsproxy.io/?' + encodeURIComponent(u),
  (u) => 'https://api.allorigins.win/raw?url=' + encodeURIComponent(u),
  (u) => 'https://api.codetabs.com/v1/proxy/?quest=' + encodeURIComponent(u),
];

/* Fetch `url` as text, aborting after `ms` milliseconds.
 * Throws on a non-2xx status, on abort, and on bodies shorter than 50 characters
 * (treated as an empty response). */
async function _fetchOne(url, ms) {
  const ctrl = new AbortController();
  const t = setTimeout(() => ctrl.abort(), ms);
  try {
    const r = await fetch(url, { redirect: 'follow', signal: ctrl.signal });
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const text = await r.text();
    if (!text || text.length < 50) throw new Error('empty response');
    return text;
  } finally {
    clearTimeout(t);
  }
}

/* Server-proxied fetch via /api/fetch?url=… (managed mode). Returns body string
 * or throws. Callers try this first and fall back to direct/CORS proxies.
 * Throws 'managed mode disabled' when window.aisoEnabled() is absent or false,
 * and a descriptive error when window.AISO_API_BASE is not set. */
async function _fetchViaServer(url, ms = 15_000) {
  if (typeof window === 'undefined' || !window.aisoEnabled || !window.aisoEnabled()) {
    throw new Error('managed mode disabled');
  }
  if (window.AISO_API_BASE == null) {
    // Previously surfaced as an opaque "Cannot read properties of undefined".
    throw new Error('AISO_API_BASE not configured');
  }
  const base = String(window.AISO_API_BASE).replace(/\/$/, '');
  const ctrl = new AbortController();
  const t = setTimeout(() => ctrl.abort(), ms);
  try {
    const r = await fetch(`${base}/fetch?url=${encodeURIComponent(url)}`, { signal: ctrl.signal });
    if (!r.ok) throw new Error(`server HTTP ${r.status}`);
    const data = await r.json();
    if (!data?.ok) throw new Error(data?.error || 'server fetch failed');
    if (!data.body || data.body.length < 50) throw new Error('empty response');
    return data.body;
  } finally {
    clearTimeout(t);
  }
}

/* Run one full attempt ladder against `targetUrl`: server proxy (managed mode only),
 * then direct, then each CORS proxy in order. Returns the first success as
 * { ok, text, via, url } or null when every attempt failed. Each failure is
 * appended to `errors` as [label, message]; `prefix` is prepended to every label. */
async function _fetchAttempts(targetUrl, prefix, errors) {
  const managed = typeof window !== 'undefined' && window.aisoEnabled && window.aisoEnabled();
  const plan = [];
  if (managed) plan.push([`${prefix}server`, () => _fetchViaServer(targetUrl, 15_000)]);
  // direct is fast when CORS allows, so it gets a shorter timeout than the proxies
  plan.push([`${prefix}direct`, () => _fetchOne(targetUrl, 6000)]);
  CORS_PROXIES.forEach((toProxyUrl, i) => {
    plan.push([`${prefix}proxy-${i + 1}`, () => _fetchOne(toProxyUrl(targetUrl), DISCOVER_TIMEOUT_MS)]);
  });

  for (const [via, run] of plan) {
    try {
      const text = await run();
      return { ok: true, text, via, url: targetUrl };
    } catch (e) {
      errors.push([via, e.message || e.name]);
    }
  }
  return null;
}

/* Try server first (managed mode), then direct, then each proxy, then the same
 * ladder again against the www. variant of the URL.
 *
 * Never rejects. Resolves to { ok: true, text, via, url } on success, where `url`
 * is the URL that actually answered, or to
 * { ok: false, error, errorShort, attempts, url } when everything failed.
 * The `textOk` option is accepted for backward compatibility and is currently unused. */
async function _fetchTry(url, { textOk = true } = {}) {
  const errors = [];

  const primary = await _fetchAttempts(url, '', errors);
  if (primary) return primary;

  // www. variant fallback — only when the host does not already start with www.
  const hostAndPath = url.replace(/^https?:\/\//, '');
  if (!hostAndPath.startsWith('www.')) {
    const altUrl = url.replace(/^(https?:\/\/)/, '$1www.');
    const alternate = await _fetchAttempts(altUrl, 'www-', errors);
    if (alternate) return alternate;
  }

  return {
    ok: false,
    error: errors.map(([k, v]) => `${k}: ${v}`).join(' · '),
    errorShort: errors[errors.length - 1]?.[1] || 'unreachable',
    attempts: errors.length,
    url,
  };
}

/* Collapse runs of whitespace to single spaces and trim; falsy input → ''. */
function _clean(s) {
  if (!s) return '';
  return String(s).replace(/\s+/g, ' ').trim();
}

/* Resolve `href` against https://{baseDomain}. Returns null for empty input.
 * Any href that already carries a URL scheme (http:, https:, data:, mailto:, blob:, …)
 * is returned untouched — previously only http(s) was recognised, so an inline
 * `data:` favicon became the bogus `https://{domain}/data:…`. */
function _absoluteUrl(href, baseDomain) {
  const ref = String(href || '').trim();
  if (!ref) return null;
  if (/^[a-z][a-z0-9+.-]*:/i.test(ref)) return ref;
  if (ref.startsWith('//')) return 'https:' + ref;
  if (ref.startsWith('/')) return `https://${baseDomain}${ref}`;
  return `https://${baseDomain}/${ref.replace(/^\.\//, '')}`;
}

/* First DNS label of a domain/URL after stripping the scheme and a leading "www."
 * (e.g. "https://www.acme.co.uk/x" → "acme"). */
function _domainBare(domain) {
  return String(domain || '').replace(/^https?:\/\//, '').replace(/^www\./, '').split('/')[0].split('.')[0];
}

/* Capitalise each whitespace/hyphen separated word and join with spaces.
 * Empty segments (leading/trailing/double separators) are dropped; they used to
 * render as the literal text "undefined". */
function _titleCase(s) {
  if (!s) return s;
  return s
    .split(/[\s-]+/)
    .filter(Boolean)
    .map((w) => w[0].toUpperCase() + w.slice(1))
    .join(' ');
}

// Detect bot-challenge / interstitial pages (Cloudflare "Just a moment…", etc.) so we
// don't mistake the challenge page's <title> for the brand name (e.g. visa.com).
function _isChallengePage(html) {
  if (!html) return false;
  // Known interstitial titles (prefix match, case-insensitive).
  const title = ((html.match(/<title[^>]*>([\s\S]*?)<\/title>/i) || [])[1] || '').trim();
  if (/^(just a moment|attention required|checking your browser|verifying you are human|access denied|please wait|one more step)/i.test(title)) return true;
  // Otherwise look for challenge markers near the top of the document only.
  const head = html.slice(0, 6000);
  return /cf-browser-verification|\/cdn-cgi\/challenge-platform|challenge-platform\/h\/|enable javascript and cookies to continue|checking if the site connection is secure|ddos protection by cloudflare/i.test(head);
}

/* ---------- meta + jsonld parsing ---------- */

/* Decode the handful of HTML entities that routinely show up in titles and meta
 * content (&amp; &lt; &gt; &quot; &apos; &nbsp; plus numeric references).
 * Unknown entities are left as written. */
function _decodeEntities(s) {
  const named = new Map([['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"], ['nbsp', '\u00a0']]);
  return String(s || '').replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole, body) => {
    if (body[0] === '#') {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const code = isHex ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10);
      return Number.isFinite(code) && code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    }
    return named.get(body.toLowerCase()) ?? whole;
  });
}

/* Parse the attributes of one HTML start tag into a { lowercased-name: value } map.
 * Handles double-quoted, single-quoted and unquoted values; first occurrence wins. */
function _tagAttrs(tag) {
  const attrs = Object.create(null); // no prototype → attribute names can't collide with Object members
  const attrRe = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;
  let m;
  while ((m = attrRe.exec(tag)) !== null) {
    const name = m[1].toLowerCase();
    if (!(name in attrs)) attrs[name] = m[2] ?? m[3] ?? m[4] ?? '';
  }
  return attrs;
}

/* Return the entity-decoded value of `wantAttr` from the first <tagName …> tag whose
 * attributes satisfy `accept(attrs)` and whose `wantAttr` is non-empty; '' if none.
 * Attribute order inside the tag does not matter, and a quote character of the other
 * kind inside a value (content="Don't stop") no longer truncates it. */
function _findTagAttr(html, tagName, wantAttr, accept = () => true) {
  const tagRe = new RegExp(`<${tagName}\\b(?:"[^"]*"|'[^']*'|[^'">])*>`, 'gi');
  let m;
  while ((m = tagRe.exec(html)) !== null) {
    const attrs = _tagAttrs(m[0]);
    if (attrs[wantAttr] && accept(attrs)) return _decodeEntities(attrs[wantAttr]);
  }
  return '';
}

/* Extract page-level metadata from homepage HTML.
 *
 * Returns {} for empty input, otherwise an object with: title, description,
 * ogTitle, ogDescription, ogImage, ogSiteName, ogType, ogLocale, twitterSite,
 * canonical, favicon, themeColor, social (network → first matching URL), h1, lang
 * and pageSize (html.length). Relative URLs are resolved against `baseDomain`;
 * favicon falls back to /favicon.ico. Parsing is regex-based and best-effort. */
function parseMeta(html, baseDomain) {
  if (!html) return {};
  const $ = (re, group = 1) => (html.match(re) || [])[group] || '';
  const attrIs = (attr, value) => (attrs) => String(attrs[attr] || '').trim().toLowerCase() === value;
  const metaName = (name) => _clean(_findTagAttr(html, 'meta', 'content', attrIs('name', name)));
  const metaProperty = (prop) => _clean(_findTagAttr(html, 'meta', 'content', attrIs('property', prop)));

  const title = _clean(_decodeEntities($(/<title[^>]*>([\s\S]*?)<\/title>/i)));
  const description = metaName('description');
  const ogTitle = metaProperty('og:title');
  const ogDescription = metaProperty('og:description');
  const ogImage = _absoluteUrl(_findTagAttr(html, 'meta', 'content', attrIs('property', 'og:image')), baseDomain);
  const ogSiteName = metaProperty('og:site_name');
  const ogType = metaProperty('og:type');
  const ogLocale = metaProperty('og:locale');
  const twitterSite = metaName('twitter:site');
  const lang = _clean(_findTagAttr(html, 'html', 'lang'));
  const themeColor = metaName('theme-color');

  // canonical & favicon (best-effort)
  const canonical = _absoluteUrl(_findTagAttr(html, 'link', 'href', attrIs('rel', 'canonical')), baseDomain);
  const isIconRel = (attrs) => {
    const tokens = String(attrs.rel || '').toLowerCase().split(/\s+/);
    return tokens.includes('icon') || tokens.includes('apple-touch-icon');
  };
  const iconHref = _findTagAttr(html, 'link', 'href', isIconRel);
  const favicon = _absoluteUrl(iconHref || '/favicon.ico', baseDomain);

  // socials — first matching link to common networks
  const social = {};
  const socialMap = {
    twitter: /https?:\/\/(?:www\.)?(?:twitter\.com|x\.com)\/([^"'\/?#\s]+)/i,
    linkedin: /https?:\/\/(?:www\.)?linkedin\.com\/(?:company|in)\/([^"'\/?#\s]+)/i,
    instagram: /https?:\/\/(?:www\.)?instagram\.com\/([^"'\/?#\s]+)/i,
    facebook: /https?:\/\/(?:www\.)?facebook\.com\/([^"'\/?#\s]+)/i,
    youtube: /https?:\/\/(?:www\.)?youtube\.com\/(?:@|channel\/|user\/|c\/)([^"'\/?#\s]+)/i,
    github: /https?:\/\/(?:www\.)?github\.com\/([^"'\/?#\s]+)/i,
  };
  for (const [k, re] of Object.entries(socialMap)) {
    const m = html.match(re);
    // an implausibly long "handle" is almost certainly not a profile link
    if (m && m[1] && m[1].length < 60) social[k] = m[0];
  }

  // h1 (first one), with inner markup stripped
  const h1 = _clean(_decodeEntities($(/<h1[^>]*>([\s\S]*?)<\/h1>/i).replace(/<[^>]+>/g, ' ')));

  return {
    title, description, ogTitle, ogDescription, ogImage, ogSiteName, ogType, ogLocale,
    twitterSite, canonical, favicon, themeColor, social, h1, lang,
    pageSize: html.length,
  };
}

/* JSON.parse that returns null instead of throwing. On failure it retries once
 * after stripping block comments and trailing commas. */
function _safeParseJson(s) {
  try { return JSON.parse(s); } catch {}
  // sometimes JSON-LD has comments or trailing commas; try a light clean
  try {
    const cleaned = s.replace(/\/\*[\s\S]*?\*\//g, '').replace(/,(\s*[}\]])/g, '$1');
    return JSON.parse(cleaned);
  } catch {}
  return null;
}

/* Depth-first flatten of a JSON-LD value into `out` (a list of every object node).
 * Order: @graph members first, then the node itself, then nodes nested in its other
 * properties. @graph is skipped in the property walk — it used to be visited twice,
 * which duplicated every graph node. */
function _flattenLd(node, out = []) {
  if (!node) return out;
  if (Array.isArray(node)) {
    node.forEach((n) => _flattenLd(n, out));
    return out;
  }
  if (typeof node !== 'object') return out;
  if (node['@graph']) _flattenLd(node['@graph'], out);
  out.push(node);
  // walk values too (nested @types within properties)
  for (const [key, value] of Object.entries(node)) {
    if (key === '@graph') continue;
    if (value && typeof value === 'object') _flattenLd(value, out);
  }
  return out;
}

/* Parse every <script type="application/ld+json"> block in `html`.
 *
 * Returns { byType, raw, summary, types }:
 *   byType  — map of @type → nodes carrying that type (a node with an array @type
 *             is listed under each of its types)
 *   raw     — the parsed top-level blocks
 *   summary — optional organization / localBusiness / website / product digests
 *   types   — Object.keys(byType)
 * Empty input returns { byType: {}, raw: [], summary: {} } (no `types` key). */
function parseJsonLd(html) {
  if (!html) return { byType: {}, raw: [], summary: {} };
  const blockRe = /<script[^>]+type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;
  const raw = [];
  let m;
  while ((m = blockRe.exec(html)) !== null) {
    const parsed = _safeParseJson(m[1].trim());
    if (parsed) raw.push(parsed);
  }
  const flat = [];
  raw.forEach((r) => _flattenLd(r, flat));

  const byType = {};
  const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
  for (const node of flat) {
    const declared = [].concat(node['@type'] || []);
    for (const t of declared) {
      // @type comes from the page: ignore non-strings and keys that would hit
      // Object.prototype (e.g. "constructor" used to crash on .push).
      if (typeof t !== 'string' || !t || t === '__proto__') continue;
      if (!hasOwn(byType, t)) byType[t] = [];
      byType[t].push(node);
    }
  }

  // Promote useful entities into a summary
  const summary = {};
  const take = (types) => {
    for (const t of types) {
      if (hasOwn(byType, t) && byType[t][0]) return byType[t][0];
    }
    return null;
  };
  const org = take(['Organization', 'Corporation', 'NGO', 'EducationalOrganization', 'Brand']);
  const local = take(['LocalBusiness', 'Restaurant', 'CafeOrCoffeeShop', 'FoodEstablishment', 'Store', 'ProfessionalService', 'MedicalOrganization', 'Dentist', 'AutoDealer', 'Hotel']);
  const site = take(['WebSite']);
  const product = take(['Product']);

  if (org) {
    const logoValue = typeof org.logo === 'string' ? org.logo : org.logo?.url;
    summary.organization = {
      name: _clean(org.name),
      description: _clean(org.description),
      // only ever expose a string URL; callers pass this straight to URL helpers
      logo: typeof logoValue === 'string' ? logoValue : undefined,
      sameAs: [].concat(org.sameAs || []).filter(Boolean),
      foundingDate: org.foundingDate,
    };
  }
  if (local) {
    // schema.org allows address to be text, one PostalAddress, or a list of them
    let addr = local.address || {};
    if (Array.isArray(addr)) addr = addr[0] || {};
    if (typeof addr === 'string') addr = { streetAddress: addr };
    summary.localBusiness = {
      name: _clean(local.name),
      type: local['@type'],
      addressLocality: _clean(addr.addressLocality || ''),
      addressRegion: _clean(addr.addressRegion || ''),
      addressCountry: typeof addr.addressCountry === 'string' ? addr.addressCountry : (addr.addressCountry?.name || ''),
      postalCode: _clean(addr.postalCode || ''),
      streetAddress: _clean(addr.streetAddress || ''),
      telephone: _clean(local.telephone || ''),
      priceRange: _clean(local.priceRange || ''),
      aggregateRating: local.aggregateRating ? {
        ratingValue: local.aggregateRating.ratingValue,
        reviewCount: local.aggregateRating.reviewCount || local.aggregateRating.ratingCount,
      } : null,
    };
  }
  if (site) summary.website = { name: _clean(site.name), description: _clean(site.description) };
  if (product) summary.product = { name: _clean(product.name) };

  return { byType, raw, summary, types: Object.keys(byType) };
}

/* ---------- Wikidata lookup (real, with same-vertical filter) ----------
 * Wikidata's wbsearchentities is fuzzy and returns ANY matching label —
 * "bekon" matches a record producer (Q108649783). We filter so we only
 * accept entities that plausibly describe a company/website/brand.
 *
 * Heuristic:
 *   1. Drop entities whose description matches person/biographical patterns.
 *   2. Require the label to be a close-match to the search term (no random
 *      2-word brand → 1-word famous person collisions).
 *   3. Prefer entities whose description suggests business/tech/product.
 */
const _WD_DROP_RX = /\b(person|musician|songwriter|producer|singer|rapper|composer|actor|actress|athlete|footballer|politician|writer|author|poet|journalist|painter|sculptor|director|filmmaker|screenwriter|saint|monarch|king|queen|emperor|botanist|biologist|physicist|mathematician|surgeon|physician|judge|lawyer|priest|monk|nun|abbot|bishop|cardinal|pope|noble|baron|earl|duke|princess|prince|historian|philosopher|theologian|economist|sociologist|psychologist|astronomer|chemist|engineer (?:born|who)|born \d|died \d|\b(?:b\.|d\.) ?\d| family| dynasty|fictional character|species of|genus of|village in|town in|commune in|river in|mountain in)\b/i;
const _WD_KEEP_RX = /\b(company|corporation|business|startup|startup company|brand|website|web site|web application|web platform|application|software|product|service|SaaS|platform|app|tool|technology company|tech company|firm|agency|organization|nonprofit|magazine|publisher|publication|store|retailer|restaurant|cafe|coffee|hotel|clinic|hospital|university|school|institute|studio|label|consultancy|consulting|enterprise|industry|manufacturer|maker of|provider of|provider|developer of|online (?:store|platform|service)|app for|software for)\b/i;

/* Decide whether a wbsearchentities hit plausibly is the brand we searched for.
 * Comparison is case-insensitive on entity.label vs searchTerm. */
function _wdLooksLikeBrand(entity, searchTerm) {
  if (!entity?.label) return false;
  const label = String(entity.label).toLowerCase();
  const term = String(searchTerm || '').toLowerCase();
  const desc = String(entity.description || '').toLowerCase();

  // Exact / near-exact label match: accept immediately, even if description looks persony.
  // (Catches Apple → Q312 "American multinational technology company".)
  if (label === term) return true;
  const longer = label.length >= term.length ? label : term;
  const shorter = label.length >= term.length ? term : label;
  if (longer.includes(shorter) && shorter.length / longer.length >= 0.85) return true;

  // Looser path: require label-overlap + the description doesn't scream "person".
  if (_WD_DROP_RX.test(desc) && !_WD_KEEP_RX.test(desc)) return false;
  if (longer !== shorter && !longer.includes(shorter)) return false;
  if (shorter.length / longer.length < 0.6) return false;
  return true;
}

/* Search Wikidata for `searchTerm` and return the best brand-like entity as
 * { id, label, description, alternatives } or null (no term, HTTP error, timeout,
 * no results, or best score below 8). Never throws. */
async function lookupWikidata(searchTerm) {
  if (!searchTerm) return null;
  try {
    const url = `https://www.wikidata.org/w/api.php?action=wbsearchentities&search=${encodeURIComponent(searchTerm)}&language=en&format=json&origin=*&limit=10`;
    const r = await _timed(fetch(url), WIKIDATA_TIMEOUT_MS, 'wikidata timeout');
    if (!r.ok) return null;
    const data = await r.json();
    if (!data.search || data.search.length === 0) return null;

    // Score: exact label > brand-shaped description > inclusive label > nothing
    const scored = data.search.map((entity) => {
      const looksLike = _wdLooksLikeBrand(entity, searchTerm);
      const keepMatch = _WD_KEEP_RX.test(String(entity.description || ''));
      const labelExact = String(entity.label).toLowerCase() === String(searchTerm).toLowerCase();
      let score = 0;
      if (labelExact) score += 15;
      if (looksLike) score += 8;
      if (keepMatch) score += 4;
      return { entity, score };
    });
    const best = scored.sort((a, b) => b.score - a.score)[0];
    if (!best || best.score < 8) return null;
    const top = best.entity;
    return {
      id: top.id,
      label: top.label,
      description: top.description,
      // runners-up in score order (scored was sorted in place above)
      alternatives: scored.slice(1, 5).map((s) => ({ id: s.entity.id, label: s.entity.label, description: s.entity.description })),
    };
  } catch {
    return null; // best-effort lookup: any network/parse failure means "no match"
  }
}

/* ---------- Claude enrichment (uses BYO key if present, else window.claude) ---------- */

/* Send `prompt` to Claude and return the response text, or null when no transport
 * is available or every transport failed/timed out. Never throws. */
async function _claudeCall(prompt, anthropicKey) {
  // Same guard as the fetch helpers: outside a browser there is no transport at all.
  if (typeof window === 'undefined') return null;
  const tryWith = async (fn) => _timed(fn(), CLAUDE_TIMEOUT_MS, 'claude timeout');

  // Managed mode has no BYO key — keys live on the server. Route through callModel
  // whenever managed mode is on (not only when a BYO key is present), and give the
  // classification JSON enough room so it isn't truncated into unparseable output.
  const managed = !!(window.aisoEnabled && window.aisoEnabled());
  if (typeof window.callModel === 'function' && (anthropicKey || managed)) {
    try {
      const r = await tryWith(() => window.callModel('anthropic', anthropicKey || '', prompt, { maxTokens: 1500 }));
      return r.text;
    } catch {
      // fall through to built-in
    }
  }
  if (typeof window.claude?.complete === 'function') {
    try {
      return await tryWith(() => window.claude.complete(prompt));
    } catch {
      return null;
    }
  }
  return null;
}

/* Starting at an opening brace/bracket at text[start], return the substring up to
 * its matching closer (string literals are skipped), or null if it never closes. */
function _balancedJsonSlice(text, start) {
  const open = text[start];
  const close = open === '{' ? '}' : ']';
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = start; i < text.length; i++) {
    const ch = text[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (ch === '\\') escaped = true;
      else if (ch === '"') inString = false;
      continue;
    }
    if (ch === '"') {
      inString = true;
    } else if (ch === open) {
      depth++;
    } else if (ch === close) {
      depth--;
      if (depth === 0) return text.slice(start, i + 1);
    }
  }
  return null;
}

/* Pull the first JSON object/array out of free-form model output; null if none parses.
 * First tries the widest { … } / [ … ] span (handles fenced or bare JSON). If that
 * fails — typically because prose after the JSON contains another brace — falls back
 * to the first balanced block that parses. */
function _extractJsonFromText(text) {
  if (!text) return null;
  const widest = text.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
  if (!widest) return null;
  const whole = _safeParseJson(widest[1]);
  if (whole !== null) return whole;

  for (let i = 0; i < text.length; i++) {
    if (text[i] !== '{' && text[i] !== '[') continue;
    const candidate = _balancedJsonSlice(text, i);
    if (!candidate) continue;
    const parsed = _safeParseJson(candidate);
    if (parsed !== null) return parsed;
  }
  return null;
}

/* Ask Claude to classify the brand from the scraped signals.
 *
 * Returns:
 *   null                — no model transport available / call failed
 *   { raw, error }      — the model answered but no JSON object could be parsed
 *                         (`error` is set so callers do not treat it as a result)
 *   parsed JSON object  — brandName, industry, confidence, competitors, …
 */
async function enrichWithClaude({ domain, meta, jsonld, wikidata, robots, sitemap, degraded, anthropicKey }) {
  const summary = {
    domain,
    homepage_reachable: !degraded,
    title: meta.title || null,
    h1: meta.h1 || null,
    description: meta.description || meta.ogDescription || null,
    ogSiteName: meta.ogSiteName || null,
    jsonld_types: jsonld.types || [],
    jsonld_organization: jsonld.summary?.organization || null,
    jsonld_localBusiness: jsonld.summary?.localBusiness || null,
    wikidata: wikidata ? { id: wikidata.id, label: wikidata.label, description: wikidata.description } : null,
    socials: Object.keys(meta.social || {}),
    robots_signals: robots ? { user_agents: robots.userAgents, ai_bots_named: robots.mentionedAI } : null,
    sitemap_urls_sample: sitemap?.sampleUrls?.slice(0, 8) || null,
  };

  // Hard count of evidence we actually have. If zero / one, we're guessing.
  const evidenceCount =
    (summary.title ? 1 : 0) +
    (summary.description ? 1 : 0) +
    (summary.jsonld_types?.length ? 1 : 0) +
    (summary.wikidata ? 1 : 0) +
    (summary.sitemap_urls_sample?.length ? 1 : 0);

  const prompt = `You are classifying a website for an AI-visibility audit. Be calibrated and honest about what you can know.

Scraped signals from ${domain}:
\`\`\`json
${JSON.stringify(summary, null, 2)}
\`\`\`

EVIDENCE COUNT: ${evidenceCount} of 5 (title, description, jsonld, wikidata, sitemap)

CRITICAL CALIBRATION RULES — read these:
1. If you do NOT clearly recognize the brand "${domain}" from your training data AND evidence_count < 2: set confidence ≤ 0.2, brandName to the literal domain (e.g. "${domain.split('.')[0]}"), industry: "unknown", location: "unknown", and competitors: []. DO NOT invent a vertical. DO NOT pattern-match on partial words (e.g. don't claim "cratox" is "Cratoxylum / a plant association"). DO NOT speculate about etymology, latin roots, or what the name "might suggest". If you don't know, say you don't know.
2. Confidence > 0.7 ONLY if BOTH: (a) you clearly know this exact brand from training, AND (b) the scraped signals are consistent with what you know.
3. If confidence < 0.3, return scanPrompts: [] and competitors: [] — there's no point guessing.
4. Brand names are proper nouns. Treat the domain as a literal string. Don't decompose it into roots.

Return ONLY a JSON object (no markdown, no prose):
{
  "brandName": "what people search for. If unknown, the literal domain root.",
  "tagline": "<=12 words OR empty string if unknown",
  "industry": "lowercase 1-4 words OR 'unknown'",
  "vertical": "broader category OR 'unknown'",
  "location": "City, Region/Country OR 'Online' OR 'unknown'",
  "audience": "who they serve OR ''",
  "isLocalBusiness": true|false,
  "confidence": 0.0-1.0,
  "knowsFromTraining": true|false,
  "uncertaintyNote": "if confidence < 0.5, ONE short sentence explaining what's missing",
  "competitors": [
    {"name": "Competitor brand", "reason": "concrete reason — same vertical, same geo, same offer"}
  ],
  "scanPrompts": [
    "..." (only if confidence >= 0.3)
  ]
}`;

  const text = await _claudeCall(prompt, anthropicKey);
  if (!text) return null;
  const parsed = _extractJsonFromText(text);
  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    // Without `error`, callers saw a truthy result with no confidence field and
    // reported a 50%-confidence "success" for output nobody could read.
    return { raw: text, error: 'model returned unparseable output' };
  }
  return parsed;
}

/* ---------- main discovery ---------- */

/* Discover a brand from a bare domain (scheme and path are stripped).
 *
 * Options:
 *   onProgress({ step, status, label, detail }) — called as each step starts/settles
 *   anthropicKey — optional BYO key forwarded to the Claude call
 *   signal       — optional AbortSignal; checked between stages (throws Error('aborted'))
 *
 * Resolves to the `out` brand object built below. Throws Error('domain required')
 * for an empty domain.
 *
 * NOTE(review): the robots.txt / sitemap.xml probes are only awaited in degraded
 * mode. On the normal path they may settle after Claude has been prompted, or even
 * after this function has returned, in which case `out.robots` / `out.sitemap` are
 * filled in late and their progress events arrive after 'done'. Left as-is because
 * waiting for them would add proxy-timeout latency to every discovery — confirm the
 * intended trade-off. */
async function discoverBrand(domain, { onProgress = () => {}, anthropicKey, signal } = {}) {
  const dom = String(domain || '').replace(/^https?:\/\//, '').replace(/\/.*$/, '').trim();
  if (!dom) throw new Error('domain required');

  const out = {
    domain: dom,
    discoveredAt: Date.now(),
    homepage: null,
    meta: null,
    jsonld: null,
    robots: null,
    sitemap: null,
    wikidata: null,
    enrichment: null,
    name: '',
    tagline: '',
    description: '',
    industry: '',
    vertical: '',
    location: '',
    audience: '',
    competitors: [],
    scanPrompts: [],
    logo: null,
    favicon: null,
    address: null,
    social: {},
    socialLinks: {},
    sources: [],
    confidence: 0,
    degraded: false,
  };

  // 1. Homepage + robots + sitemap — fired in parallel, each emits its own progress
  //    as soon as it resolves (so robots/sitemap don't block on a slow homepage).
  onProgress({ step: 'homepage', status: 'running', label: `Fetching https://${dom}/` });
  onProgress({ step: 'robots', status: 'running', label: 'Probing robots.txt' });
  onProgress({ step: 'sitemap', status: 'running', label: 'Probing sitemap.xml' });

  const hpFuture = _fetchTry(`https://${dom}/`);
  const robotsFuture = _fetchTry(`https://${dom}/robots.txt`);
  const sitemapFuture = _fetchTry(`https://${dom}/sitemap.xml`);

  // robots — settles independently
  robotsFuture.then((res) => {
    if (signal?.aborted) return;
    // an HTML body means we got a soft-404 / error page, not a robots file
    if (res.ok && !/^\s*<!?(doctype|html)/i.test(res.text)) {
      const lines = res.text.split('\n').map((l) => l.trim()).filter(Boolean);
      const userAgents = lines.filter((l) => /^user-agent:/i.test(l));
      const sitemaps = lines.filter((l) => /^sitemap:/i.test(l)).map((l) => l.replace(/^sitemap:\s*/i, ''));
      const aiBots = ['GPTBot', 'ClaudeBot', 'PerplexityBot', 'CCBot', 'Google-Extended', 'OAI-SearchBot', 'anthropic-ai'];
      // user-agent tokens are matched case-insensitively by crawlers, so do the same here
      const lowered = res.text.toLowerCase();
      const mentionedAI = aiBots.filter((b) => lowered.includes(b.toLowerCase()));
      out.robots = { lines: lines.length, userAgents: userAgents.length, sitemaps, mentionedAI, preview: res.text.slice(0, 320) };
      onProgress({
        step: 'robots', status: 'pass', label: `robots.txt found`,
        detail: `${lines.length} lines · ${userAgents.length} user-agent block(s)${mentionedAI.length ? ` · ${mentionedAI.join(', ')}` : ''}`,
      });
    } else {
      onProgress({ step: 'robots', status: 'fail', label: 'No robots.txt' });
    }
  });

  // sitemap — settles independently
  sitemapFuture.then((res) => {
    if (signal?.aborted) return;
    if (res.ok && !/^\s*<!?(doctype|html)/i.test(res.text)) {
      const urls = (res.text.match(/<loc>/gi) || []).length;
      const isIndex = /<sitemapindex/i.test(res.text);
      const sampleUrls = [...res.text.matchAll(/<loc>([^<]+)<\/loc>/gi)].slice(0, 12).map((m) => m[1].trim());
      out.sitemap = { urls, isIndex, sampleUrls };
      onProgress({
        step: 'sitemap', status: 'pass', label: `sitemap.xml found`,
        detail: isIndex ? `index with ${urls} sub-sitemaps` : `${urls} URL${urls === 1 ? '' : 's'}`,
      });
    } else {
      onProgress({ step: 'sitemap', status: 'fail', label: 'No sitemap.xml' });
    }
  });

  // homepage — we MUST wait for this before parsing meta/JSON-LD.
  // Hard-cap total time at 35s — if it hasn't resolved by then, give up and continue degraded.
  let hpCapTimer;
  const hpCap = new Promise((resolve) => {
    hpCapTimer = setTimeout(() => resolve({ ok: false, errorShort: 'global timeout 35s', timedOut: true }), 35000);
  });
  const hp = await Promise.race([hpFuture, hpCap]);
  clearTimeout(hpCapTimer); // don't leave the 35s timer pending once the homepage answered
  if (signal?.aborted) throw new Error('aborted');

  // homepage — but reject bot-challenge interstitials (Cloudflare "Just a moment…"),
  // otherwise we'd treat the challenge page's <title> as the brand name.
  const challenged = hp.ok && _isChallengePage(hp.text);
  if (hp.ok && !challenged) {
    out.homepage = { ok: true, via: hp.via, length: hp.text.length, finalUrl: hp.url };
    onProgress({ step: 'homepage', status: 'pass', label: `Read homepage`, detail: `${(hp.text.length / 1024).toFixed(0)}kb · ${hp.via}` });
    out.sources.push({ source: 'homepage', via: hp.via });

    // 2. Meta
    onProgress({ step: 'meta', status: 'running', label: 'Reading meta + Open Graph' });
    const meta = parseMeta(hp.text, dom);
    out.meta = meta;
    out.social = meta.social || {};
    out.socialLinks = meta.social || {};
    out.favicon = meta.favicon;
    out.logo = meta.ogImage || meta.favicon;
    // prefer og:site_name; otherwise the part of <title> before the first separator
    out.name = meta.ogSiteName || (meta.title && meta.title.split(/[—–|·:]/)[0].trim()) || '';
    out.description = meta.description || meta.ogDescription || '';
    const metaDetail = [
      meta.title ? `“${meta.title.length > 50 ? meta.title.slice(0, 50) + '…' : meta.title}”` : 'no title',
      meta.description ? 'description ✓' : 'no description',
      meta.ogImage ? 'og:image ✓' : null,
      Object.keys(meta.social || {}).length ? `${Object.keys(meta.social).length} social link(s)` : null,
    ].filter(Boolean).join(' · ');
    onProgress({ step: 'meta', status: 'pass', label: 'Parsed page meta', detail: metaDetail });

    // 3. JSON-LD
    onProgress({ step: 'jsonld', status: 'running', label: 'Scanning structured data' });
    const jsonld = parseJsonLd(hp.text);
    out.jsonld = jsonld;
    if (jsonld.summary.organization?.name && !out.name) out.name = jsonld.summary.organization.name;
    if (jsonld.summary.organization?.description && !out.description) out.description = jsonld.summary.organization.description;
    if (jsonld.summary.organization?.logo) out.logo = _absoluteUrl(jsonld.summary.organization.logo, dom);
    const lb = jsonld.summary.localBusiness;
    if (lb) {
      const parts = [lb.streetAddress, lb.addressLocality, lb.addressRegion, lb.postalCode, lb.addressCountry].filter(Boolean);
      out.address = {
        line: parts.join(', '),
        city: lb.addressLocality,
        region: lb.addressRegion,
        country: lb.addressCountry,
        phone: lb.telephone,
      };
      if (lb.addressLocality && !out.location) {
        out.location = [lb.addressLocality, lb.addressRegion || lb.addressCountry].filter(Boolean).join(', ');
      }
    }
    onProgress({
      step: 'jsonld',
      status: jsonld.types.length > 0 ? 'pass' : 'fail',
      label: jsonld.types.length > 0 ? `Found structured data` : 'No JSON-LD on homepage',
      detail: jsonld.types.length > 0 ? jsonld.types.join(', ') : 'AI models will rely on prose only',
    });
  } else {
    // homepage failed — degraded mode
    out.degraded = true;
    // robots/sitemap were fired in parallel above; await their real results so we can
    // tell "site is down" apart from "homepage is blocked but the site is alive".
    const [robotsRes, sitemapRes] = await Promise.all([robotsFuture, sitemapFuture]);
    const siteAlive = robotsRes.ok || sitemapRes.ok;
    onProgress({
      step: 'homepage',
      status: 'fail',
      label: challenged
        ? 'Bot-challenge page (Cloudflare) — not the real site'
        : (siteAlive ? 'Homepage blocked — likely bot challenge' : 'Couldn’t reach homepage'),
      detail: challenged
        ? 'Got Cloudflare’s “Just a moment…” interstitial, not the homepage — using the domain + robots/sitemap + Claude, not the challenge page’s title.'
        : (siteAlive
          ? `${hp.errorShort} · but robots/sitemap responded, so the site is alive. Falling back to limited data.`
          : (hp.errorShort || 'no signal — site may be down or domain misspelled')),
    });
    onProgress({ step: 'meta', status: 'fail', label: 'Skipped — no homepage HTML' });
    onProgress({ step: 'jsonld', status: 'fail', label: 'Skipped — no homepage HTML' });
  }

  if (signal?.aborted) throw new Error('aborted');

  // 4 + 5 run in PARALLEL — Wikidata and Claude don't depend on each other.
  const searchTerm = out.name || _domainBare(dom);

  onProgress({ step: 'wikidata', status: 'running', label: `Looking up Wikidata · "${searchTerm}"` });
  onProgress({ step: 'claude', status: 'running', label: 'Classifying with Claude' });

  const wikidataPromise = lookupWikidata(searchTerm).then((wd) => {
    if (signal?.aborted) return null;
    if (wd) {
      onProgress({ step: 'wikidata', status: 'pass', label: `${wd.id} · ${wd.label}`, detail: wd.description || 'matched entity' });
      if (!out.description && wd.description) out.description = wd.description;
    } else {
      onProgress({ step: 'wikidata', status: 'fail', label: 'No matching Wikidata entity', detail: 'common for smaller / younger brands' });
    }
    out.wikidata = wd;
    return wd;
  });

  const enrichmentPromise = (async () => {
    // wait briefly so Wikidata can inform Claude if it returns first; otherwise proceed
    const wdRace = await Promise.race([
      wikidataPromise.catch(() => null),
      new Promise((r) => setTimeout(() => r('timeout'), 4000)),
    ]);
    const wdForClaude = wdRace === 'timeout' ? null : wdRace;
    try {
      return await enrichWithClaude({
        domain: dom,
        meta: out.meta || {},
        jsonld: out.jsonld || { types: [], summary: {} },
        wikidata: wdForClaude,
        robots: out.robots,
        sitemap: out.sitemap,
        degraded: out.degraded,
        anthropicKey,
      });
    } catch (e) {
      return { error: e.message || 'enrichment failed' };
    }
  })().then((enrichment) => {
    if (signal?.aborted) return null;
    out.enrichment = enrichment;
    if (enrichment && !enrichment.error) {
      // missing/non-numeric confidence defaults to 0.5; numeric values are clamped to [0, 1]
      const conf = typeof enrichment.confidence === 'number'
        ? Math.min(1, Math.max(0, enrichment.confidence))
        : 0.5;
      const knowsIt = enrichment.knowsFromTraining !== false;
      out.confidence = conf;
      out.knowsFromTraining = knowsIt;
      out.uncertaintyNote = enrichment.uncertaintyNote || '';

      // Only adopt enrichment fields if confidence is meaningful.
      // For low-confidence: keep literal domain-derived name + 'unknown' fields.
      if (conf >= 0.3) {
        if (enrichment.brandName) out.name = out.name || enrichment.brandName;
        if (enrichment.tagline) out.tagline = enrichment.tagline;
        if (enrichment.industry && enrichment.industry !== 'unknown') out.industry = enrichment.industry;
        if (enrichment.vertical && enrichment.vertical !== 'unknown') out.vertical = enrichment.vertical;
        if (enrichment.location && enrichment.location !== 'unknown' && !out.location) out.location = enrichment.location;
        if (enrichment.audience) out.audience = enrichment.audience;
        if (Array.isArray(enrichment.competitors)) out.competitors = enrichment.competitors;
        if (Array.isArray(enrichment.scanPrompts)) out.scanPrompts = enrichment.scanPrompts;
      } else {
        // Low-confidence: drop competitors + prompts (don't propagate guesses)
        out.competitors = [];
        out.scanPrompts = [];
      }

      const detail = conf < 0.3
        ? `low confidence (${Math.round(conf * 100)}%) — ${enrichment.uncertaintyNote || 'brand not in training data'}`
        : [out.industry, out.location, enrichment.competitors?.length ? `${enrichment.competitors.length} competitors` : null].filter(Boolean).join(' · ');
      onProgress({
        step: 'claude',
        status: conf >= 0.3 ? 'pass' : 'fail',
        label: conf >= 0.3 ? 'Inferred industry + competitors' : 'Brand not recognized',
        detail,
      });
    } else {
      const reason = enrichment?.error || 'wire an Anthropic key in Settings to enable';
      onProgress({ step: 'claude', status: 'fail', label: 'Claude enrichment unavailable', detail: reason });
    }
    return enrichment;
  });

  await Promise.all([wikidataPromise, enrichmentPromise]);

  // Final fallbacks
  if (!out.name) out.name = _titleCase(_domainBare(dom));
  if (!out.industry) out.industry = out.confidence < 0.3 ? 'unknown' : 'business';
  if (!out.location) out.location = out.confidence < 0.3 ? 'unknown' : 'Online';

  onProgress({
    step: 'done',
    status: 'pass',
    label: 'Discovery complete',
    detail: `${out.name} · ${out.industry}${out.confidence < 0.3 ? ' · low confidence' : ''}`,
  });
  return out;
}

// Expose the public API to the rest of the app. Guarded like the fetch helpers so
// the file can also be loaded where there is no window (tests, SSR).
if (typeof window !== 'undefined') {
  Object.assign(window, {
    discoverBrand,
    parseMeta,
    parseJsonLd,
    lookupWikidata,
    enrichWithClaude,
  });
}