feat: add supplier scoring and UPC file analysis functionality

- Implemented supplier scoring logic in `supplier-scoring.ts` with functions to compute demand score, competition penalty, and overall supplier product score.
- Created unit tests for supplier scoring in `supplier-scoring.test.ts` to validate scoring logic against various scenarios.
- Developed UPC file analysis tool in `upc-file-analysis.ts` to process UPCs in batches, fetch product data from Keepa and SP-API, and generate supplier results.
- Added UPC input reading functionality in `upc-file-reader.ts` to handle XLSX and XLS files, including validation for UPC formats.
- Introduced a command-line tool in `upc-lookup.ts` for looking up UPCs and displaying detailed results or mappings to ASINs.
- Enhanced error handling and logging throughout the new modules for better traceability and user feedback.
This commit is contained in:
Victor Noguera
2026-05-25 00:53:47 -04:00
parent b982edd160
commit c006d87c54
36 changed files with 1905 additions and 113 deletions

777
src/integrations/searxng.ts Normal file
View File

@@ -0,0 +1,777 @@
/** SearXNG instance used when neither the options nor `SEARXNG_URL` provide one. */
const DEFAULT_SEARXNG_URL = "https://searxng.nvictor.me/";
/** Default endpoint for the Google Custom Search provider. */
const DEFAULT_GOOGLE_CUSTOM_SEARCH_URL = "https://www.googleapis.com/customsearch/v1";
/** Default endpoint for the SerpApi provider. */
const DEFAULT_SERPAPI_URL = "https://serpapi.com/search.json";
/** Request timeout applied when no option or environment override is given. */
const DEFAULT_TIMEOUT_MS = 10_000;
/** Result cap applied when no option or environment override is given. */
const DEFAULT_MAX_RESULTS = 10;
/** Whole-string ASIN shape: "B" followed by nine upper-case letters or digits. */
const ASIN_REGEX = /^B[0-9A-Z]{9}$/;
/** Finds ASIN-shaped words anywhere in a text (global, case-insensitive). */
const ASIN_MATCH_REGEX = /\bB[0-9A-Z]{9}\b/gi;
/**
 * Price labels recognised in result text, ordered from most to least preferred.
 * The order doubles as the alternation order of the labeled-price pattern, so
 * the generic "price" must stay last.
 */
const PRICE_LABELS = [
  "selling price",
  "sale price",
  "offer price",
  "current price",
  "our price",
  "list price",
  "price",
] as const;
/** Regex alternation of accepted currency codes (`US\$` matches the literal "US$"). */
const CURRENCY_CODES = "USD|US\\$|EUR|GBP|INR|CAD|AUD";
/** Currency symbols accepted directly in front of an amount. */
const CURRENCY_SYMBOLS = "$€£₹";

// Shared building blocks for the two price patterns below.
const CURRENCY_SYMBOL_CLASS = escapeForCharClass(CURRENCY_SYMBOLS);
// 1-5 leading digits, optional ",ddd" groups, optional two-digit fraction.
const PRICE_AMOUNT_PATTERN = "[0-9]{1,5}(?:,[0-9]{3})*(?:\\.[0-9]{2})?";
// Optional code, then a symbol, then the amount (e.g. "US$ 12.50", "€5").
const SYMBOL_PRICE_PATTERN = `(?:${CURRENCY_CODES})?\\s*[${CURRENCY_SYMBOL_CLASS}]\\s*${PRICE_AMOUNT_PATTERN}`;
// Mandatory code followed by the amount (e.g. "USD 12.50").
const CODE_PRICE_PATTERN = `(?:${CURRENCY_CODES})\\s*${PRICE_AMOUNT_PATTERN}`;

/**
 * Matches a price label followed, within 24 non-digit/non-symbol characters,
 * by a price. Group 1 is the label, group 2 the price text.
 */
const LABELED_PRICE_REGEX = new RegExp(
  `\\b(${PRICE_LABELS.join("|")})\\b[^${CURRENCY_SYMBOL_CLASS}0-9]{0,24}(${SYMBOL_PRICE_PATTERN}|${CODE_PRICE_PATTERN})`,
  "gi",
);
/**
 * Matches an unlabeled price: symbol-prefixed, code-prefixed, or code-suffixed.
 * Group 1 is the price text.
 */
const PRICE_REGEX = new RegExp(
  `(${SYMBOL_PRICE_PATTERN}|${CODE_PRICE_PATTERN}|${PRICE_AMOUNT_PATTERN}\\s*(?:${CURRENCY_CODES}))`,
  "gi",
);
/** One normalized, scored search hit returned by `searchProductOffers`. */
export type SearxngOfferSearchResult = {
// ASIN inferred from the query; only present when the query was a bare ASIN.
asin?: string;
// Query string that was actually sent to the search provider.
query: string;
// Result title with HTML entities decoded and whitespace collapsed.
title: string;
// Result URL, re-serialized through `URL` when it parses; otherwise trimmed.
url: string;
// Lower-cased hostname without a leading "www."; empty when the URL does not parse.
domain: string;
// Result snippet with HTML entities decoded and whitespace collapsed.
snippet: string;
// Position reported by / derived from the provider response (1-based index fallback).
rank: number;
// Heuristic relevance score; results are sorted by it in descending order.
score: number;
// First ASIN-shaped token found in title, snippet, or URL.
matchedAsin?: string;
// Numeric amount of the price detected in title/snippet text.
detectedPrice?: number;
// Currency code derived from the detected price text ("USD" when none is recognised).
detectedPriceCurrency?: string;
// Lower-cased label (e.g. "offer price") that preceded the detected price, if any.
detectedPriceLabel?: string;
// Raw matched price text after normalization.
detectedPriceText?: string;
// De-duplicated names of the engines that produced the result.
engines: string[];
};
/** Options accepted by `searchAsinOffers` and `searchProductOffers`. */
export type SearxngSearchOptions = {
// Search backend; any value other than "searxng" or "google-custom-search"
// (including omission) selects the SerpApi Google Shopping branch.
provider?: "serpapi" | "google-custom-search" | "searxng";
// Overrides the endpoint of the selected provider.
baseUrl?: string;
// Google API key; falls back to the GOOGLE_API_KEY environment variable.
googleApiKey?: string;
// Google search engine id; falls back to GOOGLE_CSE_ID, GOOGLE_CX, or GOOGLE_SEARCH_ENGINE_ID.
googleCx?: string;
// SerpApi key; falls back to the SERPAPI_API_KEY environment variable.
serpapiApiKey?: string;
// Per-request timeout; falls back to SEARXNG_TIMEOUT_MS, then 10 000 ms.
timeoutMs?: number;
// Maximum number of results returned; falls back to SEARXNG_MAX_RESULTS, then 10.
maxResults?: number;
// 1-based result page; invalid values fall back to 1.
page?: number;
// SearXNG `categories` parameter; defaults to "general". Only read by the SearXNG branch.
categories?: string;
// Comma-separated SearXNG engine names. Only read by the SearXNG branch.
engines?: string;
// When true, ASIN queries keep results whose matched ASIN differs from the query.
includeUnmatchedAsinResults?: boolean;
// Replacement for the global `fetch` (e.g. for tests).
fetchImpl?: typeof fetch;
};
/** Provider-independent result shape produced by the parse* functions. */
type RawSearchResult = {
title: string;
url: string;
snippet: string;
engines: string[];
rank: number;
};
/** Subset of the SearXNG JSON response that this module reads. */
type JsonSearchResponse = {
results?: Array<Record<string, unknown>>;
};
/** A price extracted from free text. */
type PriceDetection = {
// Parsed numeric amount (thousands separators removed).
amount: number;
// Currency code inferred from the price text.
currency: string;
// Normalized price text the amount was parsed from.
text: string;
// Lower-cased price label, when the price was found via a labeled match.
label?: string;
};
/**
 * Searches for offers of a single product identified by its ASIN.
 *
 * @param asin - ASIN to look up; surrounding whitespace and letter case are normalized.
 * @param options - Provider selection and request tuning.
 * @returns Normalized results as produced by `searchProductOffers`.
 * @throws Error when `asin` does not have the ASIN shape (surfaces as a rejection).
 */
export async function searchAsinOffers(
  asin: string,
  options: SearxngSearchOptions = {},
): Promise<SearxngOfferSearchResult[]> {
  const validatedAsin = normalizeAsin(asin);
  return searchProductOffers(validatedAsin, options);
}
/**
 * Runs a product search against the selected provider and returns normalized,
 * scored results.
 *
 * When the trimmed query is exactly an ASIN, shopping keywords are appended to
 * it and, unless `includeUnmatchedAsinResults` is set, only results whose
 * matched ASIN equals the query ASIN are kept.
 *
 * @param query - Free-text query or a bare ASIN.
 * @param options - Provider selection and request tuning.
 * @returns At most `maxResults` results, best score first (ties broken by rank).
 * @throws Error when the query is empty after trimming, or when the provider call fails.
 */
export async function searchProductOffers(
  query: string,
  options: SearxngSearchOptions = {},
): Promise<SearxngOfferSearchResult[]> {
  const trimmedQuery = query.trim();
  if (!trimmedQuery) {
    throw new Error("Search query is required.");
  }

  const inferredAsin = getAsinQuery(trimmedQuery);
  const providerQuery = inferredAsin
    ? `${inferredAsin} price sale offer buy online`
    : trimmedQuery;
  const maxResults = positiveInteger(
    options.maxResults ?? readEnvInt("SEARXNG_MAX_RESULTS", DEFAULT_MAX_RESULTS),
    DEFAULT_MAX_RESULTS,
  );

  let rawResults: RawSearchResult[];
  switch (options.provider) {
    case "searxng":
      rawResults = await fetchSearxngResults(providerQuery, options);
      break;
    case "google-custom-search":
      rawResults = await fetchGoogleCustomSearchResults(providerQuery, {
        ...options,
        maxResults,
      });
      break;
    default:
      // Anything else, including an omitted provider, goes to SerpApi.
      rawResults = await fetchSerpApiGoogleShoppingResults(providerQuery, {
        ...options,
        provider: "serpapi",
        maxResults,
      });
  }

  const offers: SearxngOfferSearchResult[] = [];
  for (const raw of rawResults) {
    const offer = normalizeResult(raw, providerQuery, inferredAsin);
    if (!offer.url) continue;
    // For ASIN queries, drop hits about a different ASIN unless the caller opted in.
    const mustMatchAsin = !!inferredAsin && !options.includeUnmatchedAsinResults;
    if (mustMatchAsin && offer.matchedAsin !== inferredAsin) continue;
    offers.push(offer);
  }

  offers.sort((left, right) => right.score - left.score || left.rank - right.rank);
  return offers.slice(0, maxResults);
}
/**
 * Trims and upper-cases an ASIN and validates its shape.
 *
 * @param value - Candidate ASIN.
 * @returns The normalized ASIN.
 * @throws Error (`Invalid ASIN: …`) when the value is not "B" plus nine letters/digits.
 */
export function normalizeAsin(value: string): string {
  const candidate = value.trim().toUpperCase();
  if (ASIN_REGEX.test(candidate)) {
    return candidate;
  }
  throw new Error(`Invalid ASIN: ${value}`);
}
/**
 * Returns the upper-cased ASIN when the whole (trimmed) value is an ASIN,
 * otherwise `undefined`. Unlike `normalizeAsin`, this never throws.
 */
function getAsinQuery(value: string): string | undefined {
  const candidate = value.trim().toUpperCase();
  if (!ASIN_REGEX.test(candidate)) {
    return undefined;
  }
  return candidate;
}
/**
 * Queries a SearXNG instance, preferring its JSON output and falling back to
 * parsing the HTML results page when the first response is not an OK JSON
 * response.
 *
 * @param query - Query text; may be prefixed with an engine bang (see `applySearxngEngineBang`).
 * @param options - Reads `baseUrl`, `timeoutMs`, `page`, `categories`, `engines`, `fetchImpl`.
 * @returns Raw results in provider order.
 * @throws Error when the HTML fallback request does not return an OK status.
 */
async function fetchSearxngResults(
  query: string,
  options: SearxngSearchOptions,
): Promise<RawSearchResult[]> {
  const instanceUrl = normalizeBaseUrl(
    options.baseUrl ?? Bun.env.SEARXNG_URL ?? DEFAULT_SEARXNG_URL,
  );
  const timeoutMs = positiveInteger(
    options.timeoutMs ?? readEnvInt("SEARXNG_TIMEOUT_MS", DEFAULT_TIMEOUT_MS),
    DEFAULT_TIMEOUT_MS,
  );
  const doFetch = options.fetchImpl ?? fetch;
  const bangQuery = applySearxngEngineBang(query, options.engines);
  // Parameters shared by the JSON attempt and the HTML fallback.
  const sharedParams = {
    categories: options.categories ?? "general",
    engines: options.engines,
    page: positiveInteger(options.page ?? 1, 1),
  };

  // First attempt: ask for JSON.
  const jsonUrl = buildSearchUrl(instanceUrl, bangQuery, {
    ...sharedParams,
    format: "json",
  });
  const jsonAttempt = await fetchWithTimeout(doFetch, jsonUrl, timeoutMs);
  if (isJsonResponse(jsonAttempt)) {
    const payload = (await jsonAttempt.json()) as JsonSearchResponse;
    return parseJsonResults(payload);
  }

  // Fallback: request the regular results page and parse its markup.
  const htmlUrl = buildSearchUrl(instanceUrl, bangQuery, sharedParams);
  const htmlAttempt = await fetchWithTimeout(doFetch, htmlUrl, timeoutMs);
  if (htmlAttempt.ok) {
    return parseHtmlResults(await htmlAttempt.text());
  }
  throw new Error(
    `SearXNG search failed: status=${htmlAttempt.status} url=${htmlUrl.toString()}`,
  );
}
/**
 * Prefixes the query with a SearXNG "!bang" when exactly one engine is
 * requested and a shortcut is known for it. Queries that already start with
 * "!" and multi-engine selections are returned unchanged.
 */
function applySearxngEngineBang(query: string, engines: string | undefined): string {
  if (!engines) return query;
  if (query.trim().startsWith("!")) return query;

  const requested: string[] = [];
  for (const part of engines.split(",")) {
    const name = part.trim().toLowerCase();
    if (name) requested.push(name);
  }

  const [onlyEngine, ...others] = requested;
  if (onlyEngine === undefined || others.length > 0) return query;

  const bang = searxngEngineShortcut(onlyEngine);
  if (!bang) return query;
  return `!${bang} ${query}`;
}
/**
 * Maps a lower-case engine name to its bang shortcut.
 * Only "google" (→ "go") is known; everything else yields `undefined`.
 */
function searxngEngineShortcut(engine: string): string | undefined {
  switch (engine) {
    case "google":
      return "go";
    default:
      return undefined;
  }
}
/**
 * True when the response has an OK status and its content type mentions
 * `application/json` (case-insensitive).
 */
function isJsonResponse(response: Response): boolean {
  if (!response.ok) return false;
  const mediaType = (response.headers.get("content-type") ?? "").toLowerCase();
  return mediaType.includes("application/json");
}
/**
 * Issues a GET-style fetch that is aborted once `timeoutMs` elapses.
 *
 * The timer is cleared as soon as `fetchImpl` settles, so it only covers the
 * time until the response object is available.
 *
 * @param fetchImpl - Fetch implementation to call.
 * @param url - Target URL.
 * @param timeoutMs - Milliseconds before the request is aborted.
 * @returns The response produced by `fetchImpl`.
 */
async function fetchWithTimeout(
  fetchImpl: typeof fetch,
  url: URL,
  timeoutMs: number,
): Promise<Response> {
  const aborter = new AbortController();
  const timer = setTimeout(() => {
    aborter.abort();
  }, timeoutMs);
  const requestInit = {
    signal: aborter.signal,
    headers: {
      accept: "application/json,text/html;q=0.9,*/*;q=0.8",
      "user-agent": "asin-check/1.0 (+https://searxng.nvictor.me/)",
    },
  };

  try {
    const response = await fetchImpl(url, requestInit);
    return response;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Builds the `search` URL below `baseUrl` for a SearXNG query.
 *
 * `engines` and `format` are only added when they are non-empty; parameters
 * are added in the order q, categories, engines, pageno, format.
 */
function buildSearchUrl(
  baseUrl: URL,
  query: string,
  params: { categories: string; engines?: string; page: number; format?: string },
): URL {
  const { categories, engines, page, format } = params;
  const searchUrl = new URL("search", baseUrl);
  const qs = searchUrl.searchParams;

  qs.set("q", query);
  qs.set("categories", categories);
  if (engines) qs.set("engines", engines);
  qs.set("pageno", String(page));
  if (format) qs.set("format", format);

  return searchUrl;
}
/**
 * Queries the Google Custom Search JSON API.
 *
 * Credentials come from the options or, failing that, from the environment.
 * The page size is capped at 10 and `start` is derived from the 1-based page.
 *
 * @param query - Query text.
 * @param options - Reads `googleApiKey`, `googleCx`, `timeoutMs`, `page`, `maxResults`, `baseUrl`, `fetchImpl`.
 * @returns Raw results in provider order.
 * @throws Error when the API key or engine id is missing, or the response status is not OK.
 */
async function fetchGoogleCustomSearchResults(
  query: string,
  options: SearxngSearchOptions,
): Promise<RawSearchResult[]> {
  const apiKey = options.googleApiKey ?? Bun.env.GOOGLE_API_KEY;
  if (!apiKey) {
    throw new Error("Missing GOOGLE_API_KEY for Google Custom Search.");
  }
  const cx =
    options.googleCx ??
    Bun.env.GOOGLE_CSE_ID ??
    Bun.env.GOOGLE_CX ??
    Bun.env.GOOGLE_SEARCH_ENGINE_ID;
  if (!cx) {
    throw new Error(
      "Missing Google Custom Search engine id. Set GOOGLE_CSE_ID, GOOGLE_CX, or GOOGLE_SEARCH_ENGINE_ID.",
    );
  }

  const timeoutMs = positiveInteger(
    options.timeoutMs ?? readEnvInt("SEARXNG_TIMEOUT_MS", DEFAULT_TIMEOUT_MS),
    DEFAULT_TIMEOUT_MS,
  );
  const pageNumber = positiveInteger(options.page ?? 1, 1);
  const pageSize = Math.min(
    10,
    positiveInteger(options.maxResults ?? DEFAULT_MAX_RESULTS, DEFAULT_MAX_RESULTS),
  );
  const startIndex = (pageNumber - 1) * pageSize + 1;
  const doFetch = options.fetchImpl ?? fetch;

  const endpoint = new URL(options.baseUrl ?? DEFAULT_GOOGLE_CUSTOM_SEARCH_URL);
  const queryParams: Array<[string, string]> = [
    ["key", apiKey],
    ["cx", cx],
    ["q", query],
    ["num", String(pageSize)],
    ["start", String(startIndex)],
  ];
  for (const [name, value] of queryParams) {
    endpoint.searchParams.set(name, value);
  }

  const response = await fetchWithTimeout(doFetch, endpoint, timeoutMs);
  if (response.ok) {
    const payload = (await response.json()) as GoogleCustomSearchResponse;
    return parseGoogleCustomSearchResults(payload);
  }

  // Include a bounded excerpt of the error body; reading it is best-effort.
  const errorBody = await response.text().catch(() => "");
  throw new Error(
    `Google Custom Search failed: status=${response.status} ${errorBody.slice(0, 300)}`,
  );
}
/** Subset of the Google Custom Search response that this module reads. */
type GoogleCustomSearchResponse = {
items?: GoogleCustomSearchItem[];
};
/** One Google Custom Search hit; only `link` is required for a result to be kept. */
type GoogleCustomSearchItem = {
title?: string;
link?: string;
snippet?: string;
displayLink?: string;
// Structured page metadata; the `offer`, `product`, and `metatags` entries are
// scanned for price fields.
pagemap?: Record<string, unknown>;
};
/** Subset of the SerpApi Google Shopping response that this module reads. */
type SerpApiShoppingResponse = {
shopping_results?: SerpApiShoppingResult[];
inline_shopping_results?: SerpApiShoppingResult[];
categorized_shopping_results?: Array<{
shopping_results?: SerpApiShoppingResult[];
}>;
// When set, the request is treated as failed even if the HTTP status was OK.
error?: string;
};
/** One SerpApi shopping hit. */
type SerpApiShoppingResult = {
// Used as the result rank when present.
position?: number;
title?: string;
// Emitted into the snippet as "merchant: …".
source?: string;
// URL candidates, tried in this order: link, product_link, serpapi_product_api.
link?: string;
product_link?: string;
serpapi_product_api?: string;
// Emitted into the snippet as "offer price: …".
price?: string;
extracted_price?: number;
// Emitted into the snippet as "list price: …".
old_price?: string;
extracted_old_price?: number;
delivery?: string;
rating?: number;
reviews?: number;
snippet?: string;
};
/**
 * Queries SerpApi's `google_shopping` engine (US / English, google.com).
 *
 * @param query - Query text.
 * @param options - Reads `serpapiApiKey`, `timeoutMs`, `page`, `baseUrl`, `fetchImpl`.
 * @returns Raw results built from all shopping result lists in the response.
 * @throws Error when the API key is missing, the HTTP status is not OK, or the
 *   response body carries an `error` field.
 */
async function fetchSerpApiGoogleShoppingResults(
  query: string,
  options: SearxngSearchOptions,
): Promise<RawSearchResult[]> {
  const apiKey = options.serpapiApiKey ?? Bun.env.SERPAPI_API_KEY;
  if (!apiKey) {
    throw new Error(
      "Missing SERPAPI_API_KEY. Google does not provide an official public Shopping-tab search API; use SerpApi's google_shopping API or another SERP provider.",
    );
  }

  const timeoutMs = positiveInteger(
    options.timeoutMs ?? readEnvInt("SEARXNG_TIMEOUT_MS", DEFAULT_TIMEOUT_MS),
    DEFAULT_TIMEOUT_MS,
  );
  const pageNumber = positiveInteger(options.page ?? 1, 1);
  const doFetch = options.fetchImpl ?? fetch;

  const endpoint = new URL(options.baseUrl ?? DEFAULT_SERPAPI_URL);
  const queryParams: Array<[string, string]> = [
    ["engine", "google_shopping"],
    ["q", query],
    ["api_key", apiKey],
    ["google_domain", "google.com"],
    ["gl", "us"],
    ["hl", "en"],
    // The offset advances in steps of 60 per page.
    ["start", String((pageNumber - 1) * 60)],
  ];
  for (const [name, value] of queryParams) {
    endpoint.searchParams.set(name, value);
  }

  const response = await fetchWithTimeout(doFetch, endpoint, timeoutMs);
  if (!response.ok) {
    // Include a bounded excerpt of the error body; reading it is best-effort.
    const errorBody = await response.text().catch(() => "");
    throw new Error(
      `SerpApi Google Shopping failed: status=${response.status} ${errorBody.slice(0, 300)}`,
    );
  }

  const payload = (await response.json()) as SerpApiShoppingResponse;
  if (payload.error) {
    throw new Error(`SerpApi Google Shopping failed: ${payload.error}`);
  }
  return parseSerpApiShoppingResults(payload);
}
/**
 * Flattens all shopping result lists of a SerpApi response into raw results.
 *
 * Items without any usable URL are skipped. Price, merchant, delivery, rating
 * and review data are folded into the snippet as labeled text so the generic
 * price detection can pick them up later.
 */
function parseSerpApiShoppingResults(
  json: SerpApiShoppingResponse,
): RawSearchResult[] {
  const items: SerpApiShoppingResult[] = [];
  items.push(...(json.shopping_results ?? []));
  items.push(...(json.inline_shopping_results ?? []));
  for (const category of json.categorized_shopping_results ?? []) {
    items.push(...(category.shopping_results ?? []));
  }

  const parsed: RawSearchResult[] = [];
  items.forEach((item, index) => {
    const url =
      optionalString(item.link) ??
      optionalString(item.product_link) ??
      optionalString(item.serpapi_product_api);
    if (!url) return;

    const snippetParts: string[] = [];
    const priceText = optionalString(item.price);
    if (priceText) snippetParts.push(`offer price: ${priceText}`);
    if (optionalString(item.old_price)) snippetParts.push(`list price: ${item.old_price}`);
    if (optionalString(item.source)) snippetParts.push(`merchant: ${item.source}`);
    const delivery = optionalString(item.delivery);
    if (delivery) snippetParts.push(delivery);
    const itemSnippet = optionalString(item.snippet);
    if (itemSnippet) snippetParts.push(itemSnippet);
    if (typeof item.rating === "number") snippetParts.push(`rating: ${item.rating}`);
    if (typeof item.reviews === "number") snippetParts.push(`reviews: ${item.reviews}`);

    parsed.push({
      title: optionalString(item.title) ?? "",
      url,
      snippet: snippetParts.join(" "),
      engines: ["serpapi google shopping"],
      // Fall back to the 1-based position in the flattened list.
      rank: item.position ?? index + 1,
    });
  });
  return parsed;
}
/**
 * Converts Google Custom Search items into raw results.
 *
 * Items without a link are skipped. Price metadata from the item's pagemap is
 * appended to the snippet. The rank is the item's 1-based position in `items`.
 */
function parseGoogleCustomSearchResults(
  json: GoogleCustomSearchResponse,
): RawSearchResult[] {
  const parsed: RawSearchResult[] = [];
  (json.items ?? []).forEach((item, index) => {
    const url = optionalString(item.link);
    if (!url) return;

    const metadataText = extractGoogleCustomSearchMetadataText(item);
    const snippetParts: string[] = [];
    const itemSnippet = optionalString(item.snippet);
    if (itemSnippet) snippetParts.push(itemSnippet);
    if (metadataText) snippetParts.push(metadataText);

    parsed.push({
      title: optionalString(item.title) ?? "",
      url,
      snippet: snippetParts.join(" "),
      engines: ["google custom search"],
      rank: index + 1,
    });
  });
  return parsed;
}
/**
 * Collects "offer price: …" fragments from the `offer`, `product`, and
 * `metatags` sections of an item's pagemap (in that order) and joins them
 * with single spaces. Returns an empty string when nothing is found.
 */
function extractGoogleCustomSearchMetadataText(
  item: GoogleCustomSearchItem,
): string {
  const pagemap = item.pagemap ?? {};
  const sections = [pagemap.offer, pagemap.product, pagemap.metatags];
  const chunks: string[] = [];
  for (const section of sections) {
    for (const entry of readPagemapObjects(section)) {
      appendPriceMetadata(chunks, entry);
    }
  }
  return chunks.join(" ");
}
/**
 * Appends an "offer price: [currency] price" fragment to `chunks` when the
 * metadata object carries a price under one of the known keys. Does nothing
 * when no price is present.
 */
function appendPriceMetadata(chunks: string[], value: Record<string, unknown>): void {
  // Returns the first non-blank value among the given keys, in order.
  const firstText = (keys: readonly string[]): string | undefined => {
    for (const key of keys) {
      const text = optionalString(value[key]);
      if (text !== undefined) return text;
    }
    return undefined;
  };

  const price = firstText([
    "price",
    "lowprice",
    "highprice",
    "product:price:amount",
    "og:price:amount",
    "twitter:data1",
  ]);
  if (price === undefined) return;

  const currency = firstText([
    "pricecurrency",
    "priceCurrency",
    "product:price:currency",
    "og:price:currency",
  ]);
  chunks.push(
    currency === undefined
      ? `offer price: ${price}`
      : `offer price: ${currency} ${price}`,
  );
}
/**
 * Returns the plain-object entries of a pagemap section.
 * Non-array input yields an empty list; `null`, primitives, and nested arrays
 * inside the array are skipped.
 */
function readPagemapObjects(value: unknown): Array<Record<string, unknown>> {
  const objects: Array<Record<string, unknown>> = [];
  if (!Array.isArray(value)) return objects;
  for (const entry of value as unknown[]) {
    if (typeof entry !== "object" || entry === null || Array.isArray(entry)) continue;
    objects.push(entry as Record<string, unknown>);
  }
  return objects;
}
/**
 * Converts the `results` array of a SearXNG JSON response into raw results.
 * Entries without a URL are skipped; the rank is the entry's 1-based position
 * in the original array (skipped entries still consume a position).
 */
function parseJsonResults(json: JsonSearchResponse): RawSearchResult[] {
  const parsed: RawSearchResult[] = [];
  (json.results ?? []).forEach((entry, index) => {
    const url = optionalString(entry.url);
    if (!url) return;
    parsed.push({
      title: optionalString(entry.title) ?? "",
      url,
      snippet: optionalString(entry.content) ?? "",
      // Prefer the `engines` list; fall back to the single `engine` field.
      engines: normalizeEngines(entry.engines ?? entry.engine),
      rank: index + 1,
    });
  });
  return parsed;
}
/**
 * Extracts search results from a SearXNG HTML results page using the
 * runtime's `HTMLRewriter`.
 *
 * One draft is built per `article.result` element; it is emitted when the
 * article's end tag is reached and a URL was captured. The rank is the
 * 1-based order in which results were emitted.
 *
 * @param html - Full HTML of the results page.
 * @returns Results in document order.
 */
async function parseHtmlResults(html: string): Promise<RawSearchResult[]> {
type Draft = {
title: string;
url: string;
snippet: string;
engines: string[];
};
const results: RawSearchResult[] = [];
// Draft for the article currently being parsed, and the field that incoming
// text chunks should be written to.
let current: Draft | null = null;
let currentTextTarget: "title" | "snippet" | "engine" | null = null;
// Routes a text chunk to the active field: whitespace is collapsed, blank
// chunks are ignored, engine names are collected as separate entries, and
// title/snippet chunks are joined with single spaces.
const appendText = (text: string) => {
if (!current || !currentTextTarget) return;
const normalized = text.replace(/\s+/g, " ").trim();
if (!normalized) return;
if (currentTextTarget === "engine") {
current.engines.push(normalized);
return;
}
current[currentTextTarget] = appendWithSpace(
current[currentTextTarget],
normalized,
);
};
const response = new HTMLRewriter()
.on("article.result", {
element(element) {
current = { title: "", url: "", snippet: "", engines: [] };
// `onEndTag` is looked up through a cast and called optionally.
// NOTE(review): if the runtime's element does not provide `onEndTag`, no
// draft is ever pushed and the function returns an empty list — confirm.
const onEndTag = (element as unknown as {
onEndTag?: (handler: () => void) => void;
}).onEndTag;
onEndTag?.call(element, () => {
// Emit the finished draft only when a URL was captured.
if (current?.url) {
results.push({ ...current, rank: results.length + 1 });
}
current = null;
currentTextTarget = null;
});
},
})
.on("article.result a.url_header", {
element(element) {
// The first anchor that supplies an href wins; see also the `h3 a` handler.
if (current && !current.url) {
current.url = element.getAttribute("href") ?? "";
}
},
})
.on("article.result h3 a", {
element(element) {
if (current && !current.url) {
current.url = element.getAttribute("href") ?? "";
}
currentTextTarget = "title";
},
text(text) {
// NOTE(review): unlike the snippet/engine handlers, the target is set in
// element() only and cleared after the first complete text node. If the
// anchor yields several text nodes (e.g. nested markup), later chunks look
// like they would be dropped — confirm against HTMLRewriter semantics.
appendText(text.text);
if (text.lastInTextNode) currentTextTarget = null;
},
})
.on("article.result p.content", {
text(text) {
currentTextTarget = "snippet";
appendText(text.text);
if (text.lastInTextNode) currentTextTarget = null;
},
})
.on("article.result .engines span", {
text(text) {
currentTextTarget = "engine";
appendText(text.text);
if (text.lastInTextNode) currentTextTarget = null;
},
})
.transform(new Response(html));
// The transformed body is read and discarded; presumably this drives the
// rewriter so the handlers above have populated `results` before returning.
await response.text();
return results;
}
/**
 * Turns a provider-specific raw result into the public result shape:
 * normalizes URL and text, detects an ASIN and a price, and computes the score.
 *
 * @param raw - Raw result from one of the parse* functions.
 * @param query - Query string that was sent to the provider.
 * @param asin - ASIN the search was for, when the query was a bare ASIN
 *   (expected upper-case, as produced by `getAsinQuery`).
 * @returns The normalized result; optional fields are omitted rather than set
 *   to `undefined`.
 */
function normalizeResult(
  raw: RawSearchResult,
  query: string,
  asin?: string,
): SearxngOfferSearchResult {
  const url = normalizeUrl(raw.url);
  const domain = extractDomain(url);
  const title = normalizeText(raw.title);
  const snippet = normalizeText(raw.snippet);

  // The ASIN pattern is case-insensitive, so ordinary 10-character words that
  // start with "B" (e.g. "Bestseller") also look like ASINs. Taking only the
  // first candidate would mis-tag a result that mentions the requested ASIN
  // later in the text, costing it its score bonus and getting it filtered out.
  // Prefer the requested ASIN whenever it is among the candidates.
  const asinHaystack = `${title} ${snippet} ${url}`;
  const asinCandidates: string[] =
    asinHaystack.toUpperCase().match(ASIN_MATCH_REGEX) ?? [];
  const matchedAsin =
    asin && asinCandidates.includes(asin) ? asin : findMatchedAsin(asinHaystack);

  const detectedPrice = detectPrice(`${title} ${snippet}`);
  const score = scoreResult({
    asin,
    matchedAsin,
    detectedPrice: detectedPrice?.amount,
    domain,
    rank: raw.rank,
  });
  return {
    ...(asin ? { asin } : {}),
    query,
    title,
    url,
    domain,
    snippet,
    rank: raw.rank,
    score,
    ...(matchedAsin ? { matchedAsin } : {}),
    ...(detectedPrice
      ? {
          detectedPrice: detectedPrice.amount,
          detectedPriceCurrency: detectedPrice.currency,
          ...(detectedPrice.label
            ? { detectedPriceLabel: detectedPrice.label }
            : {}),
          detectedPriceText: detectedPrice.text,
        }
      : {}),
    engines: dedupe(raw.engines.map(normalizeText).filter(Boolean)),
  };
}
/**
 * Computes the heuristic relevance score of a result.
 *
 * Starts at `100 - rank`, then: +80 when the matched ASIN equals the requested
 * one, +40 when an ASIN was found although none was requested, +30 when a
 * price was detected, +20 for a known non-Amazon domain, -15 for an Amazon
 * domain.
 */
function scoreResult(input: {
  asin?: string;
  matchedAsin?: string;
  detectedPrice?: number;
  domain: string;
  rank: number;
}): number {
  const { asin, matchedAsin, detectedPrice, domain, rank } = input;
  let score = 100 - rank;

  if (asin) {
    if (matchedAsin === asin) score += 80;
  } else if (matchedAsin) {
    score += 40;
  }

  if (detectedPrice != null) score += 30;

  if (isAmazonDomain(domain)) {
    score -= 15;
  } else if (domain) {
    score += 20;
  }
  return score;
}
/**
 * Parses a base URL and guarantees that its path ends with "/", so relative
 * paths resolved against it stay below that path.
 *
 * @throws TypeError when `value` is not a valid absolute URL.
 */
function normalizeBaseUrl(value: string): URL {
  const parsed = new URL(value);
  const path = parsed.pathname;
  if (path.endsWith("/")) return parsed;
  parsed.pathname = `${path}/`;
  return parsed;
}
/**
 * Re-serializes a URL through the `URL` parser; values that do not parse are
 * returned trimmed but otherwise unchanged.
 */
function normalizeUrl(value: string): string {
  let parsed: URL;
  try {
    parsed = new URL(value);
  } catch {
    return value.trim();
  }
  return parsed.toString();
}
/**
 * Returns the lower-cased hostname of a URL without a leading "www.", or an
 * empty string when the value is not a parsable URL.
 */
function extractDomain(value: string): string {
  let host: string;
  try {
    host = new URL(value).hostname;
  } catch {
    return "";
  }
  return host.replace(/^www\./i, "").toLowerCase();
}
/**
 * True when the domain contains an "amazon" label that is followed by at
 * least one more label (e.g. "amazon.com", "smile.amazon.co.uk"), compared
 * case-insensitively.
 */
function isAmazonDomain(domain: string): boolean {
  const labels = domain.toLowerCase().split(".");
  // The last label cannot qualify: "amazon" must be followed by a dot.
  return labels.slice(0, -1).includes("amazon");
}
/**
 * Returns the first ASIN-shaped word in the text (upper-cased), or `undefined`
 * when there is none.
 */
function findMatchedAsin(value: string): string | undefined {
  const candidates = value.toUpperCase().match(ASIN_MATCH_REGEX);
  if (candidates === null) return undefined;
  return candidates[0];
}
/**
 * Finds the most relevant price in a piece of text.
 *
 * Labeled prices win over unlabeled ones; among labeled prices the one with
 * the most preferred label wins (ties keep text order). Without a labeled
 * price, the first parsable unlabeled price is returned.
 */
function detectPrice(value: string): PriceDetection | undefined {
  const labeled: PriceDetection[] = [];
  for (const match of value.matchAll(LABELED_PRICE_REGEX)) {
    // Group 1 is the label, group 2 the price text.
    const detection = parsePriceMatch(match[2], match[1]);
    if (detection) labeled.push(detection);
  }
  labeled.sort(comparePriceDetections);
  const bestLabeled = labeled[0];
  if (bestLabeled) return bestLabeled;

  for (const match of value.matchAll(PRICE_REGEX)) {
    const detection = parsePriceMatch(match[1]);
    if (detection) return detection;
  }
  return undefined;
}
/**
 * Converts matched price text (and an optional label) into a `PriceDetection`.
 *
 * @param rawPrice - Price text captured by one of the price patterns.
 * @param rawLabel - Label captured in front of the price, if any.
 * @returns The detection, or `undefined` when no positive finite amount can be parsed.
 */
function parsePriceMatch(
  rawPrice: string | undefined,
  rawLabel?: string,
): PriceDetection | undefined {
  if (!rawPrice) return undefined;

  const text = normalizeText(rawPrice);
  const amountText = /[0-9]{1,5}(?:,[0-9]{3})*(?:\.[0-9]{2})?/.exec(text)?.[0];
  if (!amountText) return undefined;

  // Strip thousands separators before converting.
  const amount = Number(amountText.split(",").join(""));
  if (!(Number.isFinite(amount) && amount > 0)) return undefined;

  const detection: PriceDetection = {
    amount,
    currency: detectCurrency(text),
    text,
  };
  const label = rawLabel ? normalizeText(rawLabel).toLowerCase() : undefined;
  if (label) detection.label = label;
  return detection;
}
/** Sort comparator that orders detections by label preference (best first). */
function comparePriceDetections(a: PriceDetection, b: PriceDetection): number {
  const rankA = priceLabelRank(a.label);
  const rankB = priceLabelRank(b.label);
  return rankA - rankB;
}
/**
 * Returns the position of a label in `PRICE_LABELS` (lower is better).
 * Missing or unknown labels rank after every known label.
 */
function priceLabelRank(label: string | undefined): number {
  const unranked = PRICE_LABELS.length;
  if (!label) return unranked;
  const position = (PRICE_LABELS as readonly string[]).indexOf(label);
  return position >= 0 ? position : unranked;
}
/**
 * Infers a currency code from price text. Rules are checked in the order
 * EUR, GBP, INR, CAD, AUD; anything else (including "$" alone) is "USD".
 */
function detectCurrency(value: string): string {
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/\b(EUR)\b|€/i, "EUR"],
    [/\b(GBP)\b|£/i, "GBP"],
    [/\b(INR)\b|₹/i, "INR"],
    [/\b(CAD)\b/i, "CAD"],
    [/\b(AUD)\b/i, "AUD"],
  ];
  for (const [pattern, code] of rules) {
    if (pattern.test(value)) return code;
  }
  return "USD";
}
/**
 * Escapes the characters that are special inside a regex character class
 * (`-`, `\`, `]`, `^`) by prefixing each with a backslash.
 */
function escapeForCharClass(value: string): string {
  const special = new Set(["-", "\\", "]", "^"]);
  let escaped = "";
  for (const ch of value) {
    escaped += special.has(ch) ? `\\${ch}` : ch;
  }
  return escaped;
}
/**
 * Normalizes the engine field of a SearXNG result into a list of names.
 * Arrays are stringified element-wise with empty strings dropped; any other
 * value becomes a single-element list, or an empty list when blank or missing.
 */
function normalizeEngines(value: unknown): string[] {
  if (!Array.isArray(value)) {
    const single = optionalString(value);
    return single === undefined ? [] : [single];
  }
  const names: string[] = [];
  value.forEach((entry: unknown) => {
    const name = String(entry);
    if (name) names.push(name);
  });
  return names;
}
/**
 * Stringifies and trims a value, returning `undefined` for `null`,
 * `undefined`, and values whose string form is blank.
 */
function optionalString(value: unknown): string | undefined {
  if (value === null || value === undefined) return undefined;
  const trimmed = String(value).trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/**
 * Decodes HTML entities, collapses every whitespace run into one space, and
 * trims the result.
 */
function normalizeText(value: string): string {
  const decoded = decodeHtmlEntities(value);
  const collapsed = decoded.replace(/\s+/g, " ");
  return collapsed.trim();
}
/** Joins two strings with a single space; an empty `left` yields `right` alone. */
function appendWithSpace(left: string, right: string): string {
  if (left === "") return right;
  return [left, right].join(" ");
}
/**
 * Decodes the HTML character references that commonly appear in search
 * result text.
 *
 * Handles the named references `&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;`,
 * `&nbsp;` (decoded to a regular space) and decimal/hexadecimal numeric
 * references such as `&#39;`, `&#36;`, or `&#x20AC;`.
 *
 * Decoding happens in a single pass, so already-decoded output is never
 * decoded again: `&amp;lt;` becomes `&lt;`, not `<`. Unknown names and
 * invalid code points are left untouched.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The decoded text.
 */
function decodeHtmlEntities(value: string): string {
  const named: Record<string, string> = {
    amp: "&",
    quot: '"',
    apos: "'",
    lt: "<",
    gt: ">",
    nbsp: " ",
  };
  return value.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|(amp|quot|apos|lt|gt|nbsp));/g,
    (
      entity: string,
      decimal: string | undefined,
      hex: string | undefined,
      name: string | undefined,
    ): string => {
      if (name !== undefined) {
        return named[name] ?? entity;
      }
      const codePoint =
        decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex ?? "", 16);
      // Leave NUL, out-of-range values, and lone surrogates as written.
      const isValid =
        Number.isInteger(codePoint) &&
        codePoint > 0 &&
        codePoint <= 0x10ffff &&
        !(codePoint >= 0xd800 && codePoint <= 0xdfff);
      return isValid ? String.fromCodePoint(codePoint) : entity;
    },
  );
}
/** Returns the distinct values of the list, keeping first-occurrence order. */
function dedupe(values: string[]): string[] {
  const seen = new Set<string>();
  const unique: string[] = [];
  for (const entry of values) {
    if (seen.has(entry)) continue;
    seen.add(entry);
    unique.push(entry);
  }
  return unique;
}
/**
 * Reads a numeric setting from the environment.
 *
 * @param key - Environment variable name.
 * @param fallback - Value returned when the variable is unset, blank, or not
 *   a finite number.
 * @returns The parsed number or `fallback`. The result is not forced to be an
 *   integer here.
 */
function readEnvInt(key: string, fallback: number): number {
  const raw = Bun.env[key];
  // `Number("")` and `Number("  ")` are 0, so a blank variable must be treated
  // as "not configured" explicitly instead of silently yielding 0.
  if (raw == null || raw.trim() === "") return fallback;
  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : fallback;
}
/** Returns `value` when it is an integer greater than zero, otherwise `fallback`. */
function positiveInteger(value: number, fallback: number): number {
  if (!Number.isInteger(value)) return fallback;
  return value > 0 ? value : fallback;
}