# Source listing metadata (file-viewer residue): 297 lines, 11 KiB, Python.
import asyncio
import json
import os
import zlib
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import httpx

from app.config import Settings
from app.models import RuntimeSettings
from app.tools.base import Tool


class BrowserUseTool(Tool):
    """Tool that hands multi-step browsing tasks to a browser-use agent.

    The agent attaches over CDP to a Chromium process that this tool launches
    in its own session with a profile directory under
    ``<workspace_root>/.wiseclaw``. This class never terminates that process;
    its PID and debugging port are written to files in the same directory so a
    later call can reattach instead of launching a second browser.
    """

    name = "browser_use"
    description = (
        "Use the browser-use agent for higher-level real browser tasks such as navigating sites, "
        "extracting lists, comparing items, and completing multi-step browsing workflows."
    )

    def __init__(self, workspace_root: Path, runtime: RuntimeSettings, settings: Settings, api_key: str) -> None:
        """Store configuration and derive the per-workspace debugging port.

        Args:
            workspace_root: Directory under which ``.wiseclaw`` state is kept.
            runtime: Runtime settings; read for the model provider, model names
                and the local base URL.
            settings: Application settings; read for ``zai_base_url``.
            api_key: API key used when the provider is ``"zai"``.
        """
        self.workspace_root = workspace_root.resolve()
        self.runtime = runtime
        self.settings = settings
        self.api_key = api_key
        # crc32 instead of hash(): str hashes are salted per interpreter
        # process, so hash() would yield a different port after every restart
        # and a still-running browser could not be found without its state file.
        workspace_digest = zlib.crc32(os.fsencode(self.workspace_root))
        self.debug_port = 9223 + (workspace_digest % 200)
        # Location of Chromium build 1194 inside the ms-playwright cache in the
        # user's macOS Library folder.
        self.chromium_path = (
            Path.home()
            / "Library"
            / "Caches"
            / "ms-playwright"
            / "chromium-1194"
            / "chrome-mac"
            / "Chromium.app"
            / "Contents"
            / "MacOS"
            / "Chromium"
        )

    def parameters_schema(self) -> dict[str, Any]:
        """Return the JSON schema describing the payload accepted by ``run``."""
        return {
            "type": "object",
            "properties": {
                "task": {
                    "type": "string",
                    "description": "The high-level browser task to complete.",
                },
                "start_url": {
                    "type": "string",
                    "description": "Optional URL to open first before the agent starts.",
                },
                "max_steps": {
                    "type": "integer",
                    "description": "Maximum browser-use steps before stopping. Defaults to 20.",
                },
                "keep_alive": {
                    "type": "boolean",
                    "description": "Keep the browser open after the run finishes.",
                },
                "allowed_domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Optional list of allowed domains for the run.",
                },
            },
            "required": ["task"],
            "additionalProperties": False,
        }

    async def run(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate ``payload``, run the agent and return a result dict.

        Args:
            payload: Mapping with the keys described by ``parameters_schema``.

        Returns:
            A dict that always contains ``tool`` and ``status``. On failure it
            also contains ``message``; otherwise it contains the summary fields
            produced by ``_run_agent``. Invalid input and exceptions raised by
            the agent run are reported as ``status == "error"`` rather than
            raised.
        """
        # ``or ""`` so an explicit null is treated as absent, not as "None".
        task = str(payload.get("task") or "").strip()
        if not task:
            return self._error_result("task is required.")

        start_url = str(payload.get("start_url") or "").strip()

        raw_max_steps = payload.get("max_steps")
        if raw_max_steps is None:
            max_steps = 20
        else:
            try:
                max_steps = int(raw_max_steps)
            except (TypeError, ValueError, OverflowError):
                return self._error_result("max_steps must be an integer.")
        if max_steps < 1:
            return self._error_result("max_steps must be at least 1.")

        keep_alive = bool(payload.get("keep_alive", False))
        allowed_domains = self._normalize_domains(payload.get("allowed_domains"))

        if start_url and not allowed_domains:
            # Default the allow-list to the start URL's host. ``hostname``
            # (unlike ``netloc``) excludes credentials and the port.
            try:
                host = urlparse(start_url).hostname or ""
            except ValueError:
                # urlparse rejects some malformed URLs, e.g. unbalanced IPv6
                # brackets; fall back to no restriction as for a host-less URL.
                host = ""
            if host:
                allowed_domains = [host]

        llm_error = self._provider_readiness_error()
        if llm_error is not None:
            return self._error_result(llm_error)

        try:
            result = await self._run_agent(
                task=self._compose_task(task, start_url),
                max_steps=max_steps,
                keep_alive=keep_alive,
                allowed_domains=allowed_domains,
            )
        except Exception as exc:
            # Tool boundary: report any failure to the caller as data. Some
            # exceptions stringify to "", so fall back to the class name.
            return self._error_result(str(exc) or type(exc).__name__)

        return {
            "tool": self.name,
            "status": "ok" if result["success"] else "error",
            **result,
        }

    def _error_result(self, message: str) -> dict[str, Any]:
        """Build the error-shaped result dict returned by ``run``."""
        return {"tool": self.name, "status": "error", "message": message}

    async def _run_agent(
        self,
        task: str,
        max_steps: int,
        keep_alive: bool,
        allowed_domains: list[str],
    ) -> dict[str, Any]:
        """Attach a browser-use agent to the persistent browser and run ``task``.

        Args:
            task: Full prompt for the agent (see ``_compose_task``).
            max_steps: Step limit passed to ``agent.run``.
            keep_alive: Currently unused; the ``Browser`` is always created with
                ``keep_alive=True``.
            allowed_domains: Domains passed to ``Browser``; an empty list is
                passed as ``None``.

        Returns:
            A dict with ``success``, ``final_result``, ``steps`` and the
            trailing slices of extracted content, errors, URLs and action names
            taken from the run history.
        """
        # Imported lazily so the module can be loaded without browser_use.
        from browser_use import Agent, Browser, ChatAnthropic, ChatOpenAI

        cdp_url = await self._ensure_persistent_browser()
        browser = Browser(
            cdp_url=cdp_url,
            is_local=True,
            keep_alive=True,
            allowed_domains=allowed_domains or None,
        )
        llm = self._build_llm(ChatAnthropic=ChatAnthropic, ChatOpenAI=ChatOpenAI)
        agent = Agent(
            task=task,
            llm=llm,
            browser=browser,
            use_vision=True,
            enable_planning=False,
            max_actions_per_step=3,
            display_files_in_done_text=False,
        )

        try:
            history = await agent.run(max_steps=max_steps)
            final_result = history.final_result() or ""
            extracted = history.extracted_content()
            errors = [error for error in history.errors() if error]
            urls = [url for url in history.urls() if url]
            return {
                "success": bool(history.is_successful()),
                "final_result": final_result,
                "extracted_content": extracted[-10:],
                "errors": errors[-5:],
                "urls": urls[-10:],
                "steps": history.number_of_steps(),
                "actions": history.action_names()[-20:],
            }
        finally:
            await agent.close()

    def _build_llm(self, ChatAnthropic: Any, ChatOpenAI: Any) -> Any:
        """Instantiate the chat model for the configured provider.

        The chat classes are passed in by the caller because ``browser_use`` is
        only imported inside ``_run_agent``. Provider ``"zai"`` uses
        ``ChatAnthropic``; any other provider uses ``ChatOpenAI`` pointed at
        ``<local_base_url>/v1``.
        """
        if self.runtime.model_provider == "zai":
            return ChatAnthropic(
                model=self.runtime.zai_model,
                api_key=self.api_key,
                base_url=self.settings.zai_base_url,
                timeout=180.0,
            )

        return ChatOpenAI(
            model=self.runtime.local_model,
            api_key="lm-studio",
            base_url=f"{self.runtime.local_base_url.rstrip('/')}/v1",
            timeout=180.0,
        )

    def _provider_readiness_error(self) -> str | None:
        """Return a message if the selected provider lacks configuration, else ``None``."""
        if self.runtime.model_provider == "zai" and not self.api_key.strip():
            return "Z.AI API key is not configured."
        if self.runtime.model_provider == "local" and not self.runtime.local_base_url.strip():
            return "Local model base URL is not configured."
        return None

    def _compose_task(self, task: str, start_url: str) -> str:
        """Prefix ``task`` with fixed safety instructions and the optional start URL.

        Returns the instructions, the start-URL line (only when ``start_url``
        is non-empty) and finally ``task``, joined with newlines.
        """
        instructions = [
            "Work in a real browser on macOS.",
            "If the task asks for list extraction, return concise structured text.",
            "If a captcha or login wall blocks progress, stop immediately and say that user action is required.",
            "Do not click third-party sign-in buttons such as Google, Apple, or GitHub OAuth buttons.",
            "Do not open or interact with login popups or OAuth consent windows.",
            "If authentication is required, leave the page open in the persistent browser and tell the user to complete login manually, then retry the task.",
            "Do not submit irreversible forms or purchases unless the user explicitly asked for it.",
        ]
        if start_url:
            instructions.append(f"Start at this URL first: {start_url}")
        instructions.append(task)
        return "\n".join(instructions)

    def _normalize_domains(self, value: object) -> list[str]:
        """Return the stripped, non-empty string forms of ``value``'s items.

        Anything that is not a list yields an empty list.
        """
        if not isinstance(value, list):
            return []
        return [domain for item in value if (domain := str(item).strip())]

    def _profile_root(self) -> Path:
        """Create (if needed) and return the Chromium user-data directory.

        Also creates the ``WiseClaw`` subdirectory that is passed to Chromium
        as ``--profile-directory``.
        """
        profile_root = self.workspace_root / ".wiseclaw" / "browser-use-profile"
        profile_root.mkdir(parents=True, exist_ok=True)
        (profile_root / "WiseClaw").mkdir(parents=True, exist_ok=True)
        return profile_root

    async def _ensure_persistent_browser(self) -> str:
        """Return a CDP WebSocket URL, launching Chromium if none is reachable.

        Reuses the recorded browser when its PID is alive and its port answers;
        otherwise launches a new one, waits for its endpoint and records it.

        Raises:
            RuntimeError: If a freshly launched browser exposes no CDP endpoint
                in time (raised by ``_wait_for_cdp_url``).
        """
        state = self._load_browser_state()
        if state and self._pid_is_running(state["pid"]):
            cdp_url = await self._fetch_cdp_url(state["port"])
            if cdp_url:
                return cdp_url

        await self._launch_persistent_browser()
        cdp_url = await self._wait_for_cdp_url()
        self._save_browser_state({"pid": self._read_pid_file(), "port": self.debug_port})
        return cdp_url

    async def _launch_persistent_browser(self) -> None:
        """Start Chromium with remote debugging enabled and record its PID.

        Uses ``self.chromium_path`` when that file exists and otherwise the
        bare command name ``Chromium``. The process is started in a new session
        with stdout and stderr discarded.
        """
        executable = str(self.chromium_path if self.chromium_path.exists() else "Chromium")
        profile_root = self._profile_root()
        args = [
            executable,
            f"--remote-debugging-port={self.debug_port}",
            f"--user-data-dir={profile_root}",
            "--profile-directory=WiseClaw",
            "--no-first-run",
            "--no-default-browser-check",
            "--start-maximized",
            "about:blank",
        ]
        process = await asyncio.create_subprocess_exec(
            *args,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL,
            start_new_session=True,
        )
        self._write_pid_file(process.pid)

    async def _wait_for_cdp_url(self) -> str:
        """Poll the debugging port until a CDP URL is available.

        Makes up to 40 attempts with a 0.5 s pause after each failed one.

        Raises:
            RuntimeError: If no attempt returns a URL.
        """
        for _ in range(40):
            cdp_url = await self._fetch_cdp_url(self.debug_port)
            if cdp_url:
                return cdp_url
            await asyncio.sleep(0.5)
        raise RuntimeError("Persistent Chromium browser did not expose a CDP endpoint in time.")

    async def _fetch_cdp_url(self, port: int) -> str:
        """Return ``webSocketDebuggerUrl`` from ``/json/version`` on ``port``.

        Returns ``""`` when the request fails, the response is not a JSON
        object, or the field is missing or empty, so callers can simply retry.
        """
        try:
            async with httpx.AsyncClient(timeout=2.0) as client:
                response = await client.get(f"http://127.0.0.1:{port}/json/version")
                response.raise_for_status()
            payload = response.json()
        except (httpx.HTTPError, ValueError):
            # ValueError covers a non-JSON body, e.g. when some other service
            # is listening on the port.
            return ""
        if not isinstance(payload, dict):
            return ""
        return str(payload.get("webSocketDebuggerUrl") or "")

    def _browser_state_path(self) -> Path:
        """Return the path of the JSON file recording the browser's PID and port."""
        return self.workspace_root / ".wiseclaw" / "browser-use-browser.json"

    def _browser_pid_path(self) -> Path:
        """Return the path of the file holding the launched browser's PID."""
        return self.workspace_root / ".wiseclaw" / "browser-use-browser.pid"

    def _load_browser_state(self) -> dict[str, int] | None:
        """Read and validate the recorded browser state.

        Returns:
            ``{"pid": int, "port": int}``, or ``None`` when the file is
            missing, unreadable, not a JSON object, lacks ``port`` or holds
            non-integer values. A missing ``pid`` is read as ``0``.
        """
        try:
            raw = json.loads(self._browser_state_path().read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError: missing or unreadable file. ValueError: invalid JSON or
            # undecodable bytes.
            return None
        if not isinstance(raw, dict):
            return None
        try:
            return {"pid": int(raw.get("pid", 0)), "port": int(raw["port"])}
        except (KeyError, TypeError, ValueError):
            return None

    def _save_browser_state(self, payload: dict[str, int]) -> None:
        """Write ``payload`` as JSON to the state file, creating its directory."""
        path = self._browser_state_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(payload), encoding="utf-8")

    def _write_pid_file(self, pid: int) -> None:
        """Write ``pid`` to the PID file, creating its directory."""
        path = self._browser_pid_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(str(pid), encoding="utf-8")

    def _read_pid_file(self) -> int:
        """Return the PID stored in the PID file, or ``0`` if it cannot be read or parsed."""
        try:
            return int(self._browser_pid_path().read_text(encoding="utf-8").strip())
        except (OSError, ValueError):
            return 0

    def _pid_is_running(self, pid: int) -> bool:
        """Return whether signal 0 can be delivered to process ``pid``.

        Non-positive PIDs are reported as not running without probing.
        """
        if pid <= 0:
            return False
        try:
            # Signal 0 performs the existence/permission check without
            # actually sending a signal.
            os.kill(pid, 0)
        except (OSError, OverflowError):
            # OverflowError: a PID too large for the C call, e.g. from a
            # corrupt state file.
            return False
        return True