fix(pdf): log non-PDF requests entering the PDF engine

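Add warning logs, on both the buffer and file paths, when the PDF engine
receives content that isn't a PDF, capturing the URL, Content-Type, status
code, and context (whether a PDF prefetch is present and whether the "pdf"
feature flag is set). This helps diagnose cases where non-PDF responses are
routed to the PDF engine.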

Follow-up to #2915.
This commit is contained in:
Abimael Martell
2026-02-26 15:32:13 -08:00
parent 8747b62449
commit 625f363e7d

View File

@@ -71,6 +71,16 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
const ct = file.response.headers.get("Content-Type");
if (ct && !ct.includes("application/pdf")) {
// if downloaded file wasn't a PDF
meta.logger.warn(
"Non-PDF content received in PDF engine (buffer path)",
{
url: meta.rewrittenUrl ?? meta.url,
contentType: ct,
statusCode: file.response.status,
pdfPrefetch: meta.pdfPrefetch !== undefined,
hasPdfFeature: meta.featureFlags.has("pdf"),
},
);
if (meta.pdfPrefetch === undefined) {
// for non-PDF URLs, this is expected, not anti-bot
if (!meta.featureFlags.has("pdf")) {
@@ -116,6 +126,13 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
const ct = r.headers.get("Content-Type");
if (ct && !ct.includes("application/pdf")) {
// if downloaded file wasn't a PDF
meta.logger.warn("Non-PDF content received in PDF engine (file path)", {
url: meta.rewrittenUrl ?? meta.url,
contentType: ct,
statusCode: r.status,
pdfPrefetch: meta.pdfPrefetch !== undefined,
hasPdfFeature: meta.featureFlags.has("pdf"),
});
if (meta.pdfPrefetch === undefined) {
// for non-PDF URLs, this is expected, not anti-bot
if (!meta.featureFlags.has("pdf")) {