mirror of
https://github.com/firecrawl/firecrawl.git
synced 2026-03-13 08:21:20 +08:00
fix(pdf): log non-PDF requests entering the PDF engine
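Add warning logs when the PDF engine receives a response whose Content-Type is not `application/pdf`, in both the buffer path and the file path. Each log captures the URL, Content-Type, status code, and routing context (whether a PDF prefetch was present and whether the `pdf` feature flag is set). This helps diagnose cases where non-PDF responses are routed to the PDF engine. Follow-up to #2915.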
This commit is contained in:
@@ -71,6 +71,16 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
const ct = file.response.headers.get("Content-Type");
|
||||
if (ct && !ct.includes("application/pdf")) {
|
||||
// if downloaded file wasn't a PDF
|
||||
meta.logger.warn(
|
||||
"Non-PDF content received in PDF engine (buffer path)",
|
||||
{
|
||||
url: meta.rewrittenUrl ?? meta.url,
|
||||
contentType: ct,
|
||||
statusCode: file.response.status,
|
||||
pdfPrefetch: meta.pdfPrefetch !== undefined,
|
||||
hasPdfFeature: meta.featureFlags.has("pdf"),
|
||||
},
|
||||
);
|
||||
if (meta.pdfPrefetch === undefined) {
|
||||
// for non-PDF URLs, this is expected, not anti-bot
|
||||
if (!meta.featureFlags.has("pdf")) {
|
||||
@@ -116,6 +126,13 @@ export async function scrapePDF(meta: Meta): Promise<EngineScrapeResult> {
|
||||
const ct = r.headers.get("Content-Type");
|
||||
if (ct && !ct.includes("application/pdf")) {
|
||||
// if downloaded file wasn't a PDF
|
||||
meta.logger.warn("Non-PDF content received in PDF engine (file path)", {
|
||||
url: meta.rewrittenUrl ?? meta.url,
|
||||
contentType: ct,
|
||||
statusCode: r.status,
|
||||
pdfPrefetch: meta.pdfPrefetch !== undefined,
|
||||
hasPdfFeature: meta.featureFlags.has("pdf"),
|
||||
});
|
||||
if (meta.pdfPrefetch === undefined) {
|
||||
// for non-PDF URLs, this is expected, not anti-bot
|
||||
if (!meta.featureFlags.has("pdf")) {
|
||||
|
||||
Reference in New Issue
Block a user