fix(pdf): log non-PDF requests entering the PDF engine

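Add warning logs when the PDF engine receives content that isn't a PDF, capturing the URL, Content-Type, status code, and routing context (whether a PDF prefetch was present and whether the `pdf` feature flag is set). The warning is emitted in both the buffer path and the file path. This helps diagnose cases where non-PDF responses are routed to the PDF engine. Follow-up to #2915.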
2026-03-13 08:21:20 +08:00 · 2026-02-26 15:32:13 -08:00
parent 8747b62449
commit 625f363e7d
1 changed file with 17 additions and 0 deletions
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -71,6 +71,16 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
      const ct = file.response.headers.get("Content-Type");
      if (ct && !ct.includes("application/pdf")) {
        // if downloaded file wasn't a PDF
+        meta.logger.warn(
+          "Non-PDF content received in PDF engine (buffer path)",
+          {
+            url: meta.rewrittenUrl ?? meta.url,
+            contentType: ct,
+            statusCode: file.response.status,
+            pdfPrefetch: meta.pdfPrefetch !== undefined,
+            hasPdfFeature: meta.featureFlags.has("pdf"),
+          },
+        );
        if (meta.pdfPrefetch === undefined) {
          // for non-PDF URLs, this is expected, not anti-bot
          if (!meta.featureFlags.has("pdf")) {
@@ -116,6 +126,13 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
      const ct = r.headers.get("Content-Type");
      if (ct && !ct.includes("application/pdf")) {
        // if downloaded file wasn't a PDF
+        meta.logger.warn("Non-PDF content received in PDF engine (file path)", {
+          url: meta.rewrittenUrl ?? meta.url,
+          contentType: ct,
+          statusCode: r.status,
+          pdfPrefetch: meta.pdfPrefetch !== undefined,
+          hasPdfFeature: meta.featureFlags.has("pdf"),
+        });
        if (meta.pdfPrefetch === undefined) {
          // for non-PDF URLs, this is expected, not anti-bot
          if (!meta.featureFlags.has("pdf")) {