diff --git a/browser_use/agent/system_prompts/system_prompt.md b/browser_use/agent/system_prompts/system_prompt.md index 104121dc7..5be700861 100644 --- a/browser_use/agent/system_prompts/system_prompt.md +++ b/browser_use/agent/system_prompts/system_prompt.md @@ -77,6 +77,7 @@ Strictly follow these rules while using the browser and navigating the web: - You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible. - Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>. - Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool. +- When collecting a large set of items (products, venues, records, etc.) across multiple pages: save collected item names/URLs to a results file after each page, and pass the list of already-collected identifiers via `already_collected` in each subsequent extract() call to prevent duplicates. Before calling done, deduplicate your results file. - Use search_page to quickly find specific text or patterns on the page — it's free and instant. Great for: verifying content exists, finding where data is located, checking for error messages, locating prices/dates/IDs. - Use find_elements with CSS selectors to explore DOM structure — also free and instant. Great for: counting items (e.g. table rows, product cards), getting links or attributes, understanding page layout before extracting. - Prefer search_page and find_elements over scrolling when looking for specific content not visible in browser_state. 
diff --git a/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md b/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md index bf9bbdc74..0897a58ad 100644 --- a/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md +++ b/browser_use/agent/system_prompts/system_prompt_anthropic_flash.md @@ -36,6 +36,7 @@ Strictly follow these rules while using the browser and navigating the web: - You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible. - Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>. - Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool. +- When collecting a large set of items (products, venues, records, etc.) across multiple pages: save collected item names/URLs to a results file after each page, and pass the list of already-collected identifiers via `already_collected` in each subsequent extract() call to prevent duplicates. Before calling done, deduplicate your results file. - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. - If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step. - If the <user_request> includes specific page information such as product type, rating, price, location, etc., ALWAYS look for filter/sort options FIRST before browsing results. Apply all relevant filters before scrolling through results. This is critical for efficiency. 
diff --git a/browser_use/agent/system_prompts/system_prompt_no_thinking.md b/browser_use/agent/system_prompts/system_prompt_no_thinking.md index 71838e306..8b57b5f5a 100644 --- a/browser_use/agent/system_prompts/system_prompt_no_thinking.md +++ b/browser_use/agent/system_prompts/system_prompt_no_thinking.md @@ -70,6 +70,7 @@ Strictly follow these rules while using the browser and navigating the web: - You can call extract on specific pages to gather structured semantic information from the entire page, including parts not currently visible. - Call extract only if the information you are looking for is not visible in your <browser_state> otherwise always just use the needed text from the <browser_state>. - Calling the extract tool is expensive! DO NOT query the same page with the same extract query multiple times. Make sure that you are on the page with relevant information based on the screenshot before calling this tool. +- When collecting a large set of items (products, venues, records, etc.) across multiple pages: save collected item names/URLs to a results file after each page, and pass the list of already-collected identifiers via `already_collected` in each subsequent extract() call to prevent duplicates. Before calling done, deduplicate your results file. - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. - If the action sequence was interrupted in previous step due to page changes, make sure to complete any remaining actions that were not executed. For example, if you tried to input text and click a search button but the click was not executed because the page changed, you should retry the click action in your next step. - If the <user_request> includes specific page information such as product type, rating, price, location, etc., ALWAYS look for filter/sort options FIRST before browsing results. Apply all relevant filters before scrolling through results. 
diff --git a/browser_use/tools/service.py b/browser_use/tools/service.py index bab8aa221..5e0a40b30 100644 --- a/browser_use/tools/service.py +++ b/browser_use/tools/service.py @@ -964,7 +964,7 @@ class Tools(Generic[Context]): ) @self.registry.action( - """LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Use start_from_char if previous extraction was truncated to extract data further down the page.""", + """LLM extracts structured data from page markdown. Use when: on right page, know what to extract, haven't called before on same page+query. Can't get interactive elements. Set extract_links=True for URLs. Use start_from_char if previous extraction was truncated to extract data further down the page. When paginating across pages, pass already_collected with item identifiers (names/URLs) from prior pages to avoid duplicates.""", param_model=ExtractAction, ) async def extract( @@ -980,6 +980,9 @@ class Tools(Generic[Context]): extract_links = params['extract_links'] if isinstance(params, dict) else params.extract_links start_from_char = params['start_from_char'] if isinstance(params, dict) else params.start_from_char output_schema: dict | None = params.get('output_schema') if isinstance(params, dict) else params.output_schema + already_collected: list[str] = ( + params.get('already_collected', []) if isinstance(params, dict) else params.already_collected + ) # If the LLM didn't provide an output_schema, use the agent-injected extraction_schema if output_schema is None and extraction_schema is not None: @@ -1066,15 +1069,20 @@ You will be given a query, a JSON Schema, and the markdown of a webpage that has - Your response MUST conform to the provided JSON Schema exactly. - If a required field's value cannot be found on the page, use null (if the schema allows it) or an empty string / empty array as appropriate. 
- If the content was truncated, extract what is available from the visible portion. +- If items are provided, skip any items whose name/title/URL matches those listed — do not include duplicates. """.strip() schema_json = json.dumps(output_schema, indent=2) + already_collected_section = '' + if already_collected: + items_str = '\n'.join(f'- {item}' for item in already_collected[:100]) + already_collected_section = f'\n\n\nSkip items whose name/title/URL matches any of these already-collected identifiers:\n{items_str}\n' prompt = ( f'\n{query}\n\n\n' f'\n{schema_json}\n\n\n' f'\n{stats_summary}\n\n\n' - f'\n{content}\n' + f'\n{content}\n' + already_collected_section ) try: @@ -1138,6 +1146,7 @@ You will be given a query and the markdown of a webpage that has been filtered t - If the information relevant to the query is not available in the page, your response should mention that. - If the query asks for all items, products, etc., make sure to directly list all of them. - If the content was truncated and you need more information, note that the user can use start_from_char parameter to continue from where truncation occurred. +- If items are provided, exclude any results whose name/title/URL matches those already collected — do not include duplicates. 
@@ -1146,7 +1155,14 @@ You will be given a query and the markdown of a webpage that has been filtered t """.strip() - prompt = f'\n{query}\n\n\n\n{stats_summary}\n\n\n\n{content}\n' + already_collected_section = '' + if already_collected: + items_str = '\n'.join(f'- {item}' for item in already_collected[:100]) + already_collected_section = f'\n\n\nSkip items whose name/title/URL matches any of these already-collected identifiers:\n{items_str}\n' + prompt = ( + f'\n{query}\n\n\n\n{stats_summary}\n\n\n\n{content}\n' + + already_collected_section + ) try: response = await asyncio.wait_for( diff --git a/browser_use/tools/views.py b/browser_use/tools/views.py index 9aec8fc0f..a8102ecf9 100644 --- a/browser_use/tools/views.py +++ b/browser_use/tools/views.py @@ -17,6 +17,10 @@ class ExtractAction(BaseModel): default=None, description='Optional JSON Schema dict. When provided, extraction returns validated JSON matching this schema instead of free-text.', ) + already_collected: list[str] = Field( + default_factory=list, + description='Item identifiers (name, URL, or ID) already collected in prior extract calls on other pages. The extractor will skip items matching these to prevent duplicates. Use when paginating across multiple pages.', + ) class SearchPageAction(BaseModel):