From ccd8ef42d7d6c5a35640901368d9fee0a8266c28 Mon Sep 17 00:00:00 2001 From: "firecrawl-spring[bot]" <254786068+firecrawl-spring[bot]@users.noreply.github.com> Date: Thu, 12 Mar 2026 06:55:50 +0000 Subject: [PATCH] fix(scrape): fix screenshot format combined with actions failing Three issues caused /v2/scrape to fail when combining screenshot format with actions: 1. blockMedia not disabled for screenshot format: shouldAllowMedia only checked for branding/youtube, so media was blocked during chrome-cdp scrapes with screenshot format, causing fire-engine to fail or produce broken screenshots. 2. Waterfall timing undercount: fireEngineMaxReasonableTime only counted user-specified actions but not format-appended ones (screenshot action, branding script), potentially causing premature engine waterfall. 3. Quality check rejected screenshot-only scrapes: scrapeURLLoopIter deemed scrapes unsuccessful when HTML content was empty, even when screenshot data was successfully captured. This triggered NoEnginesLeftError for pages with minimal HTML. --- .../scraper/scrapeURL/engines/fire-engine/index.ts | 12 +++++++++++- apps/api/src/scraper/scrapeURL/index.ts | 13 ++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 09d1a967d..c4fd26b34 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -322,8 +322,11 @@ export async function scrapeURLWithFireEngineChromeCDP( 0, ); + const hasScreenshotFormat = + hasFormatOfType(meta.options.formats, "screenshot") !== undefined; const shouldAllowMedia = - hasFormatOfType(meta.options.formats, "branding") || + hasScreenshotFormat || + hasBranding || youtubePostprocessor.shouldRun( meta, new URL(meta.rewrittenUrl ?? meta.url), @@ -617,12 +620,19 @@ export function fireEngineMaxReasonableTime( } else if (engine === "playwright") { return (meta.options.waitFor ?? 0) + 30000; } else { + // Account for format-appended actions (screenshot, branding) that are added + // to the actions array sent to fire-engine but aren't in meta.options.actions + const hasScreenshotFormat = + hasFormatOfType(meta.options.formats, "screenshot") !== undefined; + const formatActionsTime = (hasScreenshotFormat ? 5000 : 0) + + (hasBranding ? 250 : 0); return ( effectiveWait + (meta.options.actions?.reduce( (a, x) => (x.type === "wait" ? (x.milliseconds ?? 2500) + a : 250 + a), 0, ) ?? 0) + + formatActionsTime + 30000 ); } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index fff2c14a0..a038b43e8 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -434,14 +434,21 @@ async function scrapeURLLoopIter( // NOTE: TODO: what to do when status code is bad is tough... // we cannot just rely on text because error messages can be brief and not hit the limit // should we just use all the fallbacks and pick the one with the longest text? - mogery - if (isLongEnough || !isGoodStatusCode) { + + // When screenshot format is requested and we got screenshot data back, + // consider the scrape successful even if HTML content is empty. + const hasScreenshotData = + hasFormatOfType(meta.options.formats, "screenshot") !== undefined && + !!engineResult.screenshot; + + if (isLongEnough || !isGoodStatusCode || hasScreenshotData) { meta.logger.info("Scrape via " + engine + " deemed successful.", { - factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, + factors: { isLongEnough, isGoodStatusCode, hasNoPageError, hasScreenshotData }, }); return engineResult; } else { meta.logger.warn("Scrape via " + engine + " deemed unsuccessful.", { - factors: { isLongEnough, isGoodStatusCode, hasNoPageError }, + factors: { isLongEnough, isGoodStatusCode, hasNoPageError, hasScreenshotData }, length: engineResult.html?.trim().length ?? 0, }); throw new EngineUnsuccessfulError(engine);