feat: enhance README and index.ts for large file processing and output management

2026-04-11 16:57:27 -04:00
parent 0162e54007
commit 4386560964
2 changed files with 143 additions and 84 deletions
--- a/README.md
+++ b/README.md
@@ -31,6 +31,12 @@ bun run src/index.ts leads.xlsx
 bun run src/index.ts leads.csv --out results.xlsx
 ```

+Large-file behavior:
+
+- If the input has more than 50 products, processing is done in chunks of 50.
+- Each chunk is analyzed and written to a numbered output file, for example: `results_part_001.xlsx`, `results_part_002.xlsx`, ...
+- If `--out` is omitted for large files, the base output name defaults to `<input>_results.xlsx` and chunk files are still written with numbered suffixes.
+
 Quick SP-API connectivity tests:

 ```bash
@@ -90,8 +96,9 @@ Numeric parsing accepts plain numbers as well as formatted values like `$12.50`,
 3. **Sellability gate** — check all uncached ASINs against SP-API `getListingsRestrictions` (concurrency: 5 workers); immediately skip ASINs with status `not_available` and `canSell=false` (no Keepa/fees wasted)
 4. **Keepa fetch** — batch the sellable (uncached) ASINs in a single API call (up to 100 per request)
 5. **Enrich** — fetch SP-API pricing + FBA/FBM fees for sellable ASINs; combine with Keepa data and spreadsheet data
-6. **LLM analysis** — send batches of 5 sellable products to LM Studio for FBA/FBM/SKIP verdict; skipped ASINs get auto-SKIP verdict (confidence 100) and bypass LLM entirely
-7. **Output** — print results table to console (includes all ASINs), optionally write CSV/XLSX
+6. **LLM analysis** — send batches of 5 available products to LM Studio for FBA/FBM/SKIP verdict
+7. **Chunk orchestration** — if input size is greater than 50, run phases 2-6 for each 50-item chunk sequentially
+8. **Output** — print results table to console (includes all ASINs); for chunked runs, always write seriated chunk files (`*_part_001`, `*_part_002`, ...); for non-chunked runs, write a single file only when `--out` is provided

 ## Output columns

@@ -119,11 +126,10 @@ ASIN, Name, Brand, Category, Unit Cost, Current Price, Avg Price 90d, Sales Rank

 ## Notes

- **Sellability-first optimization**: SP-API `getListingsRestrictions` is checked first to filter out unsellable items before consuming Keepa tokens or running full SP-API pricing/fees queries. This saves API calls and reduces runtime for large lead lists.
+- **Available-only processing**: SP-API `getListingsRestrictions` is checked first and only ASINs with `sellabilityStatus=available` are enriched, analyzed, and included in outputs. Restricted, not_available, and unknown items are excluded.
 - **SP-API concurrency**: `fetchSellabilityBatch` limits concurrent requests to 5 workers to avoid 429 throttling. Pricing+fees fetches also use 5 concurrent workers.
 - **No batch endpoint**: Amazon SP-API does not provide batch endpoints for `getListingsRestrictions` or `getMyFeesEstimate*`. Concurrency limiting with the library's built-in `auto_request_throttled` safety net prevents overwhelming the API.
 - **Keepa rate limiting**: The client reads `tokensLeft` and `refillRate` from each API response and waits automatically when tokens are exhausted. With a Pro subscription (1 token/min), all 100 ASINs in a batch cost 1 token.
 - **Redis is optional**: If Redis is unavailable the tool runs without caching — every run re-fetches from Keepa.
 - **SP-API**: `src/sp-api.ts` provides `fetchSellability`, `fetchSellabilityBatch`, and `fetchSpApiPricingAndFees` functions. If SP-API credentials are missing or a call fails, the tool falls back to conservative fee defaults and keeps processing.
- **Skipped ASINs**: Products with `not_available` sellability status and `canSell=false` appear in output with verdict `SKIP`, confidence 100, and reasoning from the sellability check. They do not consume LLM inference.
 - **Sandbox vs production**: When `SP_API_USE_SANDBOX=true`, production ASIN calls can be denied. Use sandbox-compatible test data or set it to `false` for live marketplace connectivity.
--- a/src/index.ts
+++ b/src/index.ts
@@ -4,15 +4,18 @@ import { fetchSellabilityBatch, fetchSpApiPricingAndFees } from "./sp-api.ts";
 import { connectCache, getCache, setCache, disconnectCache } from "./cache.ts";
 import { analyzeProducts } from "./llm.ts";
 import { printResults, writeResultsCsv } from "./writer.ts";
+import path from "node:path";
 import type {
  EnrichedProduct,
  AnalysisResult,
  KeepaData,
  ProductRecord,
  SellabilityInfo,
+  SpApiData,
 } from "./types.ts";

 const LLM_BATCH_SIZE = 5;
+const INPUT_BATCH_SIZE = 50;

 function parseArgs(): { inputFile: string; outputFile?: string } {
  const args = process.argv.slice(2);
@@ -29,41 +32,67 @@ function parseArgs(): { inputFile: string; outputFile?: string } {
  return { inputFile, outputFile };
 }

-async function main() {
-  const { inputFile, outputFile } = parseArgs();
-
-  console.log("Connecting to Redis...");
-  await connectCache();
-
-  // Phase 1: Read input file
-  console.log(`\nReading ${inputFile}...`);
-  const products = readProducts(inputFile);
-
-  if (products.length === 0) {
-    console.error("No valid products found in input file.");
-    process.exit(1);
+function chunkArray<T>(items: T[], chunkSize: number): T[][] {
+  const chunks: T[][] = [];
+  for (let i = 0; i < items.length; i += chunkSize) {
+    chunks.push(items.slice(i, i + chunkSize));
  }
+  return chunks;
+}

-  // Phase 2: Check cache for all ASINs
+function resolveBaseOutputPath(inputFile: string, outputFile?: string): string {
+  if (outputFile) return outputFile;
+
+  const parsedInput = path.parse(inputFile);
+  return path.join(parsedInput.dir, `${parsedInput.name}_results.xlsx`);
+}
+
+function buildChunkOutputPath(
+  baseOutputPath: string,
+  chunkIndex: number,
+): string {
+  const parsed = path.parse(baseOutputPath);
+  const extension = parsed.ext || ".xlsx";
+  const chunkSuffix = String(chunkIndex + 1).padStart(3, "0");
+  return path.join(
+    parsed.dir,
+    `${parsed.name}_part_${chunkSuffix}${extension}`,
+  );
+}
+
+async function processProductChunk(
+  products: ProductRecord[],
+): Promise<AnalysisResult[]> {
+  // Phase 2: Check cache for all ASINs in chunk
  console.log(`\nChecking cache for ${products.length} products...`);
  const cached = new Map<string, EnrichedProduct>();
+  const excludedCachedAsins = new Set<string>();
  const uncachedProducts: ProductRecord[] = [];

  for (const p of products) {
    const hit = await getCache(p.asin);
    if (hit) {
-      console.log(`  [cache hit] ${p.asin}`);
-      cached.set(p.asin, hit);
+      if (hit.spApi.sellabilityStatus === "available") {
+        console.log(`  [cache hit] ${p.asin}`);
+        cached.set(p.asin, hit);
+      } else {
+        excludedCachedAsins.add(p.asin);
+        console.log(
+          `  [exclude cached] ${p.asin} — status=${hit.spApi.sellabilityStatus}`,
+        );
+      }
    } else {
      uncachedProducts.push(p);
    }
  }
-  console.log(`${cached.size} cached, ${uncachedProducts.length} to fetch`);
+  console.log(
+    `${cached.size} cached available, ${excludedCachedAsins.size} cached excluded, ${uncachedProducts.length} to fetch`,
+  );

-  // Phase 3: Sellability gate — check all uncached ASINs before anything else
+  // Phase 3: Sellability gate — check uncached ASINs before anything else
  const sellabilityMap = new Map<string, SellabilityInfo>();
-  const sellableProducts: ProductRecord[] = [];
-  const skippedProducts: ProductRecord[] = [];
+  const availableProducts: ProductRecord[] = [];
+  const unavailableProducts: ProductRecord[] = [];

  if (uncachedProducts.length > 0) {
    console.log(
@@ -81,50 +110,46 @@ async function main() {
      };
      sellabilityMap.set(p.asin, info);

-      // Keep: available, restricted (can request approval), unknown (proceed cautiously)
-      // Skip: not_available with canSell explicitly false
-      if (
-        info.sellabilityStatus === "not_available" &&
-        info.canSell === false
-      ) {
-        skippedProducts.push(p);
+      // Keep only ASINs that are explicitly available.
+      if (info.sellabilityStatus === "available") {
+        availableProducts.push(p);
        console.log(
-          `  [skip] ${p.asin} — ${info.sellabilityReason ?? "not available"}`,
+          `  [available] ${p.asin} — status=${info.sellabilityStatus}`,
        );
      } else {
-        sellableProducts.push(p);
+        unavailableProducts.push(p);
        console.log(
-          `  [sellable] ${p.asin} — status=${info.sellabilityStatus}`,
+          `  [exclude] ${p.asin} — status=${info.sellabilityStatus}, reason=${info.sellabilityReason ?? "n/a"}`,
        );
      }
    }

    console.log(
-      `\nSellability gate: ${sellableProducts.length} sellable, ${skippedProducts.length} skipped`,
+      `\nSellability gate: ${availableProducts.length} available, ${unavailableProducts.length} excluded`,
    );
  }

-  // Phase 4: Keepa batch fetch — only for sellable (uncached) ASINs
+  // Phase 4: Keepa batch fetch — only for available (uncached) ASINs
  let keepaResults = new Map<string, KeepaData>();
-  if (sellableProducts.length > 0) {
-    console.log(`\nFetching ${sellableProducts.length} ASINs from Keepa...`);
+  if (availableProducts.length > 0) {
+    console.log(`\nFetching ${availableProducts.length} ASINs from Keepa...`);
    try {
      keepaResults = await fetchKeepaDataBatch(
-        sellableProducts.map((p) => p.asin),
+        availableProducts.map((p) => p.asin),
      );
    } catch (err) {
      console.warn(`Keepa batch fetch failed: ${err}`);
    }
  }

-  // Phase 5: SP-API pricing + fees — only for sellable ASINs
+  // Phase 5: SP-API pricing + fees — only for available ASINs
  console.log(
-    `\nFetching pricing & fees for ${sellableProducts.length} ASINs...`,
+    `\nFetching pricing & fees for ${availableProducts.length} ASINs...`,
  );
-  const spApiResults = new Map<string, import("./types.ts").SpApiData>();
+  const spApiResults = new Map<string, SpApiData>();

  // Concurrency-limited pricing+fees fetches
-  const pricingQueue = [...sellableProducts];
+  const pricingQueue = [...availableProducts];
  let pricingDone = 0;

  async function fetchNextPricing(): Promise<void> {
@@ -140,16 +165,16 @@ async function main() {

      spApiResults.set(p.asin, spApi);
      pricingDone++;
-      if (pricingDone % 10 === 0 || pricingDone === sellableProducts.length) {
+      if (pricingDone % 10 === 0 || pricingDone === availableProducts.length) {
        console.log(
-          `  [pricing] ${pricingDone}/${sellableProducts.length} fetched`,
+          `  [pricing] ${pricingDone}/${availableProducts.length} fetched`,
        );
      }
    }
  }

  const pricingWorkers = Array.from(
-    { length: Math.min(5, sellableProducts.length || 1) },
+    { length: Math.min(5, availableProducts.length || 1) },
    () => fetchNextPricing(),
  );
  await Promise.all(pricingWorkers);
@@ -157,9 +182,13 @@ async function main() {
  // Phase 6: Build enriched products
  console.log(`\nEnriching products...`);
  const enriched: EnrichedProduct[] = [];
-  const autoSkipResults: AnalysisResult[] = [];
+  const availableAsins = new Set(availableProducts.map((ap) => ap.asin));

  for (const p of products) {
+    if (excludedCachedAsins.has(p.asin)) {
+      continue;
+    }
+
    // Cached products — already enriched
    const cachedProduct = cached.get(p.asin);
    if (cachedProduct) {
@@ -167,38 +196,12 @@ async function main() {
      continue;
    }

-    // Skipped products — not sellable, auto-SKIP
-    if (skippedProducts.some((sp) => sp.asin === p.asin)) {
-      const sellability = sellabilityMap.get(p.asin)!;
-      const product: EnrichedProduct = {
-        record: p,
-        keepa: null,
-        spApi: {
-          fbaFee: 0,
-          fbmFee: 0,
-          referralFeePercent: 15,
-          estimatedSalePrice: 0,
-          ...sellability,
-        },
-        fetchedAt: new Date().toISOString(),
-      };
-      autoSkipResults.push({
-        product,
-        verdict: {
-          asin: p.asin,
-          verdict: "SKIP",
-          confidence: 100,
-          reasoning:
-            `Not sellable: ${sellability.sellabilityReason ?? sellability.sellabilityStatus}`.slice(
-              0,
-              100,
-            ),
-        },
-      });
+    // Exclude products that are not explicitly available.
+    if (!availableAsins.has(p.asin)) {
      continue;
    }

-    // Sellable products — full enrichment
+    // Available products — full enrichment
    const keepa = keepaResults.get(p.asin) ?? null;
    const spApi = spApiResults.get(p.asin) ?? {
      fbaFee: 5.0,
@@ -229,7 +232,7 @@ async function main() {
    }
  }

-  // Phase 7: LLM analysis in batches — only for enriched (sellable + cached) products
+  // Phase 7: LLM analysis in batches — only for enriched available products
  console.log(
    `\nAnalyzing ${enriched.length} products via LLM (batch size: ${LLM_BATCH_SIZE})...\n`,
  );
@@ -274,16 +277,66 @@ async function main() {
    }
  }

-  // Merge: LLM-analyzed results + auto-skipped results
-  const allResults = [...results, ...autoSkipResults];
+  return results;
+}

-  printResults(allResults);
+async function main() {
+  const { inputFile, outputFile } = parseArgs();

-  if (outputFile) {
-    writeResultsCsv(allResults, outputFile);
+  console.log("Connecting to Redis...");
+  await connectCache();
+
+  try {
+    // Phase 1: Read input file
+    console.log(`\nReading ${inputFile}...`);
+    const products = readProducts(inputFile);
+
+    if (products.length === 0) {
+      console.error("No valid products found in input file.");
+      process.exit(1);
+    }
+
+    const productChunks = chunkArray(products, INPUT_BATCH_SIZE);
+    const hasMultipleChunks = productChunks.length > 1;
+    const shouldWriteChunkFiles = hasMultipleChunks;
+    const resolvedBaseOutputPath = resolveBaseOutputPath(inputFile, outputFile);
+    const allResults: AnalysisResult[] = [];
+
+    if (hasMultipleChunks) {
+      console.log(
+        `\nLarge input detected (${products.length} products). Processing in chunks of ${INPUT_BATCH_SIZE}.`,
+      );
+      console.log(
+        `Chunk outputs will be written as numbered files using base: ${resolvedBaseOutputPath}`,
+      );
+    }
+
+    for (let chunkIndex = 0; chunkIndex < productChunks.length; chunkIndex++) {
+      const chunk = productChunks[chunkIndex]!;
+      console.log(
+        `\n=== Input chunk ${chunkIndex + 1}/${productChunks.length} (${chunk.length} products) ===`,
+      );
+
+      const chunkResults = await processProductChunk(chunk);
+      allResults.push(...chunkResults);
+
+      if (shouldWriteChunkFiles) {
+        const chunkOutputPath = buildChunkOutputPath(
+          resolvedBaseOutputPath,
+          chunkIndex,
+        );
+        writeResultsCsv(chunkResults, chunkOutputPath);
+      }
+    }
+
+    printResults(allResults);
+
+    if (!hasMultipleChunks && outputFile) {
+      writeResultsCsv(allResults, outputFile);
+    }
+  } finally {
+    await disconnectCache();
  }
-
-  await disconnectCache();
 }

 main().catch((err) => {