fix(scrape): fix screenshot format combined with actions failing

Three issues caused /v2/scrape to fail when combining screenshot format
with actions:

1. blockMedia was not disabled for the screenshot format: shouldAllowMedia
   only checked for branding/youtube, so media was blocked during chrome-cdp
   scrapes that requested the screenshot format, causing fire-engine to fail
   or produce broken screenshots.

2. Waterfall timing undercount: fireEngineMaxReasonableTime counted only
   user-specified actions, not the format-appended ones (screenshot action,
   branding script), potentially triggering the engine waterfall prematurely.

3. Quality check rejected screenshot-only scrapes: scrapeURLLoopIter
   deemed scrapes unsuccessful when HTML content was empty, even when
   screenshot data was successfully captured. This triggered
   NoEnginesLeftError for pages with minimal HTML.
This commit is contained in:
firecrawl-spring[bot]
2026-03-12 06:55:50 +00:00
parent 7f9b816e82
commit ccd8ef42d7
2 changed files with 21 additions and 4 deletions

View File

@@ -322,8 +322,11 @@ export async function scrapeURLWithFireEngineChromeCDP(
0,
);
const hasScreenshotFormat =
hasFormatOfType(meta.options.formats, "screenshot") !== undefined;
const shouldAllowMedia =
hasFormatOfType(meta.options.formats, "branding") ||
hasScreenshotFormat ||
hasBranding ||
youtubePostprocessor.shouldRun(
meta,
new URL(meta.rewrittenUrl ?? meta.url),
@@ -617,12 +620,19 @@ export function fireEngineMaxReasonableTime(
} else if (engine === "playwright") {
return (meta.options.waitFor ?? 0) + 30000;
} else {
// Account for format-appended actions (screenshot, branding) that are added
// to the actions array sent to fire-engine but aren't in meta.options.actions
const hasScreenshotFormat =
hasFormatOfType(meta.options.formats, "screenshot") !== undefined;
const formatActionsTime = (hasScreenshotFormat ? 5000 : 0) +
(hasBranding ? 250 : 0);
return (
effectiveWait +
(meta.options.actions?.reduce(
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 2500) + a : 250 + a),
0,
) ?? 0) +
formatActionsTime +
30000
);
}

View File

@@ -434,14 +434,21 @@ async function scrapeURLLoopIter(
// NOTE: TODO: what to do when status code is bad is tough...
// we cannot just rely on text because error messages can be brief and not hit the limit
// should we just use all the fallbacks and pick the one with the longest text? - mogery
if (isLongEnough || !isGoodStatusCode) {
// When screenshot format is requested and we got screenshot data back,
// consider the scrape successful even if HTML content is empty.
const hasScreenshotData =
hasFormatOfType(meta.options.formats, "screenshot") !== undefined &&
!!engineResult.screenshot;
if (isLongEnough || !isGoodStatusCode || hasScreenshotData) {
meta.logger.info("Scrape via " + engine + " deemed successful.", {
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
factors: { isLongEnough, isGoodStatusCode, hasNoPageError, hasScreenshotData },
});
return engineResult;
} else {
meta.logger.warn("Scrape via " + engine + " deemed unsuccessful.", {
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
factors: { isLongEnough, isGoodStatusCode, hasNoPageError, hasScreenshotData },
length: engineResult.html?.trim().length ?? 0,
});
throw new EngineUnsuccessfulError(engine);