mirror of
https://github.com/firecrawl/firecrawl.git
synced 2026-03-13 08:21:20 +08:00
fix(scrape): fix screenshot format combined with actions failing
Three issues caused /v2/scrape to fail when combining the screenshot format with actions:

1. blockMedia not disabled for screenshot format: shouldAllowMedia only checked for branding/youtube, so media was blocked during chrome-cdp scrapes with the screenshot format, causing fire-engine to fail or produce broken screenshots.

2. Waterfall timing undercount: fireEngineMaxReasonableTime only counted user-specified actions but not format-appended ones (screenshot action, branding script), potentially causing a premature engine waterfall.

3. Quality check rejected screenshot-only scrapes: scrapeURLLoopIter deemed scrapes unsuccessful when HTML content was empty, even when screenshot data was successfully captured. This triggered NoEnginesLeftError for pages with minimal HTML.
This commit is contained in:
@@ -322,8 +322,11 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
||||
0,
|
||||
);
|
||||
|
||||
const hasScreenshotFormat =
|
||||
hasFormatOfType(meta.options.formats, "screenshot") !== undefined;
|
||||
const shouldAllowMedia =
|
||||
hasFormatOfType(meta.options.formats, "branding") ||
|
||||
hasScreenshotFormat ||
|
||||
hasBranding ||
|
||||
youtubePostprocessor.shouldRun(
|
||||
meta,
|
||||
new URL(meta.rewrittenUrl ?? meta.url),
|
||||
@@ -617,12 +620,19 @@ export function fireEngineMaxReasonableTime(
|
||||
} else if (engine === "playwright") {
|
||||
return (meta.options.waitFor ?? 0) + 30000;
|
||||
} else {
|
||||
// Account for format-appended actions (screenshot, branding) that are added
|
||||
// to the actions array sent to fire-engine but aren't in meta.options.actions
|
||||
const hasScreenshotFormat =
|
||||
hasFormatOfType(meta.options.formats, "screenshot") !== undefined;
|
||||
const formatActionsTime = (hasScreenshotFormat ? 5000 : 0) +
|
||||
(hasBranding ? 250 : 0);
|
||||
return (
|
||||
effectiveWait +
|
||||
(meta.options.actions?.reduce(
|
||||
(a, x) => (x.type === "wait" ? (x.milliseconds ?? 2500) + a : 250 + a),
|
||||
0,
|
||||
) ?? 0) +
|
||||
formatActionsTime +
|
||||
30000
|
||||
);
|
||||
}
|
||||
|
||||
@@ -434,14 +434,21 @@ async function scrapeURLLoopIter(
|
||||
// NOTE: TODO: what to do when status code is bad is tough...
|
||||
// we cannot just rely on text because error messages can be brief and not hit the limit
|
||||
// should we just use all the fallbacks and pick the one with the longest text? - mogery
|
||||
if (isLongEnough || !isGoodStatusCode) {
|
||||
|
||||
// When screenshot format is requested and we got screenshot data back,
|
||||
// consider the scrape successful even if HTML content is empty.
|
||||
const hasScreenshotData =
|
||||
hasFormatOfType(meta.options.formats, "screenshot") !== undefined &&
|
||||
!!engineResult.screenshot;
|
||||
|
||||
if (isLongEnough || !isGoodStatusCode || hasScreenshotData) {
|
||||
meta.logger.info("Scrape via " + engine + " deemed successful.", {
|
||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError, hasScreenshotData },
|
||||
});
|
||||
return engineResult;
|
||||
} else {
|
||||
meta.logger.warn("Scrape via " + engine + " deemed unsuccessful.", {
|
||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError },
|
||||
factors: { isLongEnough, isGoodStatusCode, hasNoPageError, hasScreenshotData },
|
||||
length: engineResult.html?.trim().length ?? 0,
|
||||
});
|
||||
throw new EngineUnsuccessfulError(engine);
|
||||
|
||||
Reference in New Issue
Block a user