Skip to content

Commit

Permalink
WebSurfer Documentation and Fixes (#4624)
Browse files Browse the repository at this point in the history
* fixes remake
* modified toml adding tests
* playwright tests
* tests websurfer
---------

Co-authored-by: Ryan Sweet <[email protected]>
  • Loading branch information
husseinmozannar and rysweet authored Dec 12, 2024
1 parent 4085ba5 commit b9d682c
Show file tree
Hide file tree
Showing 7 changed files with 1,125 additions and 287 deletions.
7 changes: 6 additions & 1 deletion python/packages/autogen-ext/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ web-surfer = [
"autogen-agentchat==0.4.0.dev11",
"playwright>=1.48.0",
"pillow>=11.0.0",
"markitdown>=0.0.1a2",
]
magentic-one = [
"autogen-agentchat==0.4.0.dev11",
Expand Down Expand Up @@ -77,7 +78,11 @@ testpaths = ["tests"]
include = "../../shared_tasks.toml"

[tool.poe.tasks]
test = "pytest -n auto"
test.sequence = [
"playwright install",
"pytest -n auto",
]
test.default_item_type = "cmd"
mypy = "mypy --config-file ../../pyproject.toml --exclude src/autogen_ext/runtimes/grpc/protos --exclude tests/protos src tests"

[tool.mypy]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from ._multimodal_web_surfer import MultimodalWebSurfer
from .playwright_controller import PlaywrightController

__all__ = ["MultimodalWebSurfer"]
__all__ = ["MultimodalWebSurfer", "PlaywrightController"]

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import asyncio
import base64
import io
import os
import random
from typing import Any, Callable, Dict, Optional, Tuple, Union, cast

# TODO: Fix unfollowed import
try:
from markitdown import MarkItDown # type: ignore
except ImportError:
MarkItDown = None
from playwright._impl._errors import Error as PlaywrightError
from playwright._impl._errors import TimeoutError
from playwright.async_api import Download, Page
Expand All @@ -17,24 +23,36 @@


class PlaywrightController:
"""
A helper class to allow Playwright to interact with web pages to perform actions such as clicking, filling, and scrolling.
Args:
downloads_folder (str | None): The folder to save downloads to. If None, downloads are not saved.
animate_actions (bool): Whether to animate the actions (create fake cursor to click).
viewport_width (int): The width of the viewport.
viewport_height (int): The height of the viewport.
_download_handler (Optional[Callable[[Download], None]]): A function to handle downloads.
to_resize_viewport (bool): Whether to resize the viewport
"""

def __init__(
self,
downloads_folder: str | None = None,
animate_actions: bool = False,
downloads_folder: Optional[str] = None,
viewport_width: int = 1440,
viewport_height: int = 900,
_download_handler: Optional[Callable[[Download], None]] = None,
to_resize_viewport: bool = True,
) -> None:
"""
A controller for Playwright to interact with web pages.
animate_actions: If True, actions will be animated.
downloads_folder: The folder to save downloads to.
viewport_width: The width of the viewport.
viewport_height: The height of the viewport.
_download_handler: A handler for downloads.
to_resize_viewport: If True, the viewport will be resized.
Initialize the PlaywrightController.
"""
assert isinstance(animate_actions, bool)
assert isinstance(viewport_width, int)
assert isinstance(viewport_height, int)
assert viewport_height > 0
assert viewport_width > 0

self.animate_actions = animate_actions
self.downloads_folder = downloads_folder
self.viewport_width = viewport_width
Expand All @@ -43,16 +61,33 @@ def __init__(
self.to_resize_viewport = to_resize_viewport
self._page_script: str = ""
self.last_cursor_position: Tuple[float, float] = (0.0, 0.0)
self._markdown_converter: Optional[Any] | None = None

# Read page_script
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
self._page_script = fh.read()

async def sleep(self, page: Page, duration: Union[int, float]) -> None:
    """
    Pause the execution for a specified duration.

    Args:
        page (Page): The Playwright page object.
        duration (Union[int, float]): The duration to sleep in seconds. It is
            multiplied by 1000 before being passed to ``page.wait_for_timeout``,
            which takes milliseconds.
    """
    assert page is not None
    # wait_for_timeout expects milliseconds; callers pass seconds.
    await page.wait_for_timeout(duration * 1000)

async def get_interactive_rects(self, page: Page) -> Dict[str, InteractiveRegion]:
"""
Retrieve interactive regions from the web page.
Args:
page (Page): The Playwright page object.
Returns:
Dict[str, InteractiveRegion]: A dictionary of interactive regions.
"""
assert page is not None
# Read the regions from the DOM
try:
Expand All @@ -71,6 +106,15 @@ async def get_interactive_rects(self, page: Page) -> Dict[str, InteractiveRegion
return typed_results

async def get_visual_viewport(self, page: Page) -> VisualViewport:
"""
Retrieve the visual viewport of the web page.
Args:
page (Page): The Playwright page object.
Returns:
VisualViewport: The visual viewport of the page.
"""
assert page is not None
try:
await page.evaluate(self._page_script)
Expand All @@ -79,6 +123,15 @@ async def get_visual_viewport(self, page: Page) -> VisualViewport:
return visualviewport_from_dict(await page.evaluate("MultimodalWebSurfer.getVisualViewport();"))

async def get_focused_rect_id(self, page: Page) -> str:
"""
Retrieve the ID of the currently focused element.
Args:
page (Page): The Playwright page object.
Returns:
str: The ID of the focused element.
"""
assert page is not None
try:
await page.evaluate(self._page_script)
Expand All @@ -88,6 +141,15 @@ async def get_focused_rect_id(self, page: Page) -> str:
return str(result)

async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
"""
Retrieve metadata from the web page.
Args:
page (Page): The Playwright page object.
Returns:
Dict[str, Any]: A dictionary of page metadata.
"""
assert page is not None
try:
await page.evaluate(self._page_script)
Expand All @@ -98,6 +160,12 @@ async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
return cast(Dict[str, Any], result)

async def on_new_page(self, page: Page) -> None:
"""
Handle actions to perform on a new page.
Args:
page (Page): The Playwright page object.
"""
assert page is not None
page.on("download", self._download_handler) # type: ignore
if self.to_resize_viewport and self.viewport_width and self.viewport_height:
Expand All @@ -107,10 +175,26 @@ async def on_new_page(self, page: Page) -> None:
await page.wait_for_load_state()

async def back(self, page: Page) -> None:
    """
    Go one step back in the page's navigation history.

    Args:
        page (Page): The Playwright page object to navigate.
    """
    assert page is not None
    await page.go_back()

async def visit_page(self, page: Page, url: str) -> Tuple[bool, bool]:
"""
Visit a specified URL.
Args:
page (Page): The Playwright page object.
url (str): The URL to visit.
Returns:
Tuple[bool, bool]: A tuple indicating whether to reset prior metadata hash and last download.
"""
assert page is not None
reset_prior_metadata_hash = False
reset_last_download = False
Expand Down Expand Up @@ -143,16 +227,38 @@ async def visit_page(self, page: Page, url: str) -> Tuple[bool, bool]:
return reset_prior_metadata_hash, reset_last_download

async def page_down(self, page: Page) -> None:
    """
    Scroll the window down by ``viewport_height - 50`` pixels.

    Args:
        page (Page): The Playwright page object.
    """
    assert page is not None
    # Scroll slightly less than a full viewport (50 px short).
    scroll_distance = self.viewport_height - 50
    script = f"window.scrollBy(0, {scroll_distance});"
    await page.evaluate(script)

async def page_up(self, page: Page) -> None:
    """
    Scroll the window up by ``viewport_height - 50`` pixels.

    Args:
        page (Page): The Playwright page object.
    """
    assert page is not None
    # Same distance as page_down, applied with a leading minus sign in the script.
    scroll_distance = self.viewport_height - 50
    script = f"window.scrollBy(0, -{scroll_distance});"
    await page.evaluate(script)

async def gradual_cursor_animation(
self, page: Page, start_x: float, start_y: float, end_x: float, end_y: float
) -> None:
"""
Animate the cursor movement gradually from start to end coordinates.
Args:
page (Page): The Playwright page object.
start_x (float): The starting x-coordinate.
start_y (float): The starting y-coordinate.
end_x (float): The ending x-coordinate.
end_y (float): The ending y-coordinate.
"""
# animation helper
steps = 20
for step in range(steps):
Expand All @@ -171,6 +277,13 @@ async def gradual_cursor_animation(
self.last_cursor_position = (end_x, end_y)

async def add_cursor_box(self, page: Page, identifier: str) -> None:
"""
Add a red cursor box around the element with the given identifier.
Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
"""
# animation helper
await page.evaluate(f"""
(function() {{
Expand Down Expand Up @@ -199,6 +312,13 @@ async def add_cursor_box(self, page: Page, identifier: str) -> None:
""")

async def remove_cursor_box(self, page: Page, identifier: str) -> None:
"""
Remove the red cursor box around the element with the given identifier.
Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
"""
# Remove the highlight and cursor
await page.evaluate(f"""
(function() {{
Expand All @@ -215,7 +335,14 @@ async def remove_cursor_box(self, page: Page, identifier: str) -> None:

async def click_id(self, page: Page, identifier: str) -> Page | None:
"""
Returns new page if a new page is opened, otherwise None.
Click the element with the given identifier.
Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
Returns:
Page | None: The new page if a new page is opened, otherwise None.
"""
new_page: Page | None = None
assert page is not None
Expand Down Expand Up @@ -266,7 +393,11 @@ async def click_id(self, page: Page, identifier: str) -> Page | None:

async def hover_id(self, page: Page, identifier: str) -> None:
"""
Hovers the mouse over the target with the given id.
Hover the mouse over the element with the given identifier.
Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
"""
assert page is not None
target = page.locator(f"[__elementId='{identifier}']")
Expand Down Expand Up @@ -296,7 +427,15 @@ async def hover_id(self, page: Page, identifier: str) -> None:
else:
await page.mouse.move(box["x"] + box["width"] / 2, box["y"] + box["height"] / 2)

async def fill_id(self, page: Page, identifier: str, value: str) -> None:
async def fill_id(self, page: Page, identifier: str, value: str, press_enter: bool = True) -> None:
"""
Fill the element with the given identifier with the specified value.
Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
value (str): The value to fill.
"""
assert page is not None
target = page.locator(f"[__elementId='{identifier}']")

Expand Down Expand Up @@ -332,12 +471,21 @@ async def fill_id(self, page: Page, identifier: str, value: str) -> None:
await target.fill(value)
except PlaywrightError:
await target.press_sequentially(value)
await target.press("Enter")
if press_enter:
await target.press("Enter")

if self.animate_actions:
await self.remove_cursor_box(page, identifier)

async def scroll_id(self, page: Page, identifier: str, direction: str) -> None:
"""
Scroll the element with the given identifier in the specified direction.
Args:
page (Page): The Playwright page object.
identifier (str): The element identifier.
direction (str): The direction to scroll ("up" or "down").
"""
assert page is not None
await page.evaluate(
f"""
Expand All @@ -355,11 +503,16 @@ async def scroll_id(self, page: Page, identifier: str, direction: str) -> None:
"""
)

async def get_webpage_text(self, page: Page, n_lines: int = 100) -> str:
async def get_webpage_text(self, page: Page, n_lines: int = 50) -> str:
"""
page: playwright page object
n_lines: number of lines to return from the page innertext
return: text in the first n_lines of the page
Retrieve the text content of the web page.
Args:
page (Page): The Playwright page object.
n_lines (int): The number of lines to return from the page inner text.
Returns:
str: The text content of the page.
"""
assert page is not None
try:
Expand All @@ -375,6 +528,22 @@ async def get_webpage_text(self, page: Page, n_lines: int = 100) -> str:
return ""

async def get_page_markdown(self, page: Page) -> str:
    """
    Retrieve the markdown content of the web page.

    When the optional ``markitdown`` package is importable, the page's full
    HTML (``document.documentElement.outerHTML``) is converted to markdown with
    a ``MarkItDown`` instance that is created on first use and cached on
    ``self._markdown_converter``. When the package is not available, this
    falls back to ``get_webpage_text`` limited to 200 lines.

    Args:
        page (Page): The Playwright page object.

    Returns:
        str: The markdown content of the page, or its plain inner text when
        no markdown converter is available.
    """
    assert page is not None

    # Create the converter lazily, then reuse it on every later call.
    converter = self._markdown_converter
    if converter is None and MarkItDown is not None:
        converter = MarkItDown()
        self._markdown_converter = converter

    if converter is None:
        # markitdown is not installed: return plain text instead.
        return await self.get_webpage_text(page, n_lines=200)

    html = await page.evaluate("document.documentElement.outerHTML;")
    res = converter.convert_stream(io.StringIO(html), file_extension=".html", url=page.url)  # type: ignore
    assert hasattr(res, "text_content") and isinstance(res.text_content, str)
    return res.text_content
Loading

0 comments on commit b9d682c

Please sign in to comment.