# wiseclaw/backend/app/tools/browser_use.py

import asyncio
import json
import os
import zlib
from pathlib import Path
from typing import Any
from urllib.parse import urlparse

import httpx

from app.config import Settings
from app.models import RuntimeSettings
from app.tools.base import Tool


class BrowserUseTool(Tool):
    name = "browser_use"
    description = (
        "Use the browser-use agent for higher-level real browser tasks such as navigating sites, "
        "extracting lists, comparing items, and completing multi-step browsing workflows."
    )

    def __init__(self, workspace_root: Path, runtime: RuntimeSettings, settings: Settings, api_key: str) -> None:
        self.workspace_root = workspace_root.resolve()
        self.runtime = runtime
        self.settings = settings
        self.api_key = api_key
        self.debug_port = 9223 + (abs(hash(str(self.workspace_root))) % 200)
        self.chromium_path = (
            Path.home()
            / "Library"
            / "Caches"
            / "ms-playwright"
            / "chromium-1194"
            / "chrome-mac"
            / "Chromium.app"
            / "Contents"
            / "MacOS"
            / "Chromium"
        )

    def parameters_schema(self) -> dict[str, Any]:
        return {
            "type": "object",
            "properties": {
                "task": {
                    "type": "string",
                    "description": "The high-level browser task to complete.",
                },
                "start_url": {
                    "type": "string",
                    "description": "Optional URL to open first before the agent starts.",
                },
                "max_steps": {
                    "type": "integer",
                    "description": "Maximum browser-use steps before stopping. Defaults to 20.",
                },
                "keep_alive": {
                    "type": "boolean",
                    "description": "Keep the browser open after the run finishes.",
                },
                "allowed_domains": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Optional list of allowed domains for the run.",
                },
            },
            "required": ["task"],
            "additionalProperties": False,
        }

    async def run(self, payload: dict[str, Any]) -> dict[str, Any]:
        task = str(payload.get("task", "")).strip()
        if not task:
            return {"tool": self.name, "status": "error", "message": "task is required."}

        start_url = str(payload.get("start_url", "")).strip()
        max_steps = int(payload.get("max_steps", 20))
        keep_alive = bool(payload.get("keep_alive", False))
        allowed_domains = self._normalize_domains(payload.get("allowed_domains"))

        if start_url and not allowed_domains:
            host = urlparse(start_url).netloc
            if host:
                allowed_domains = [host]

        llm_error = self._provider_readiness_error()
        if llm_error is not None:
            return {"tool": self.name, "status": "error", "message": llm_error}

        try:
            result = await self._run_agent(
                task=self._compose_task(task, start_url),
                max_steps=max_steps,
                keep_alive=keep_alive,
                allowed_domains=allowed_domains,
            )
        except Exception as exc:
            return {
                "tool": self.name,
                "status": "error",
                "message": str(exc),
            }

        return {
            "tool": self.name,
            "status": "ok" if result["success"] else "error",
            **result,
        }

    async def _run_agent(
        self,
        task: str,
        max_steps: int,
        keep_alive: bool,
        allowed_domains: list[str],
    ) -> dict[str, Any]:
        """Run one browser-use agent session and summarise its history.

        Args:
            task: Full task text given to the agent.
            max_steps: Step limit passed to ``agent.run``.
            keep_alive: Accepted for interface compatibility; this method does
                not read it. The ``Browser`` is always created with
                ``keep_alive=True``.
            allowed_domains: Domains passed to ``Browser``; an empty list is
                passed as ``None``.

        Returns:
            A dict with ``success``, ``final_result``, ``steps`` and the tail
            of the run history: the last 10 extracted contents, last 5
            non-empty errors, last 10 non-empty URLs and last 20 action names.
        """
        # browser_use is imported at call time rather than at module import.
        from browser_use import Agent, Browser, ChatAnthropic, ChatOpenAI

        endpoint = await self._ensure_persistent_browser()
        domain_filter = allowed_domains or None
        session = Browser(
            cdp_url=endpoint,
            is_local=True,
            keep_alive=True,
            allowed_domains=domain_filter,
        )
        agent = Agent(
            task=task,
            llm=self._build_llm(ChatAnthropic=ChatAnthropic, ChatOpenAI=ChatOpenAI),
            browser=session,
            use_vision=True,
            enable_planning=False,
            max_actions_per_step=3,
            display_files_in_done_text=False,
        )

        try:
            history = await agent.run(max_steps=max_steps)

            summary: dict[str, Any] = {}
            answer = history.final_result() or ""
            contents = history.extracted_content()
            problems = [entry for entry in history.errors() if entry]
            visited = [address for address in history.urls() if address]

            summary["success"] = bool(history.is_successful())
            summary["final_result"] = answer
            summary["extracted_content"] = contents[-10:]
            summary["errors"] = problems[-5:]
            summary["urls"] = visited[-10:]
            summary["steps"] = history.number_of_steps()
            summary["actions"] = history.action_names()[-20:]
            return summary
        finally:
            # Always close the agent, whether the run succeeded or raised.
            await agent.close()

    def _build_llm(self, ChatAnthropic: Any, ChatOpenAI: Any) -> Any:
        if self.runtime.model_provider == "zai":
            return ChatAnthropic(
                model=self.runtime.zai_model,
                api_key=self.api_key,
                base_url=self.settings.zai_base_url,
                timeout=180.0,
            )

        return ChatOpenAI(
            model=self.runtime.local_model,
            api_key="lm-studio",
            base_url=f"{self.runtime.local_base_url.rstrip('/')}/v1",
            timeout=180.0,
        )

    def _provider_readiness_error(self) -> str | None:
        if self.runtime.model_provider == "zai" and not self.api_key.strip():
            return "Z.AI API key is not configured."
        if self.runtime.model_provider == "local" and not self.runtime.local_base_url.strip():
            return "Local model base URL is not configured."
        return None

    def _compose_task(self, task: str, start_url: str) -> str:
        instructions = [
            "Work in a real browser on macOS.",
            "If the task asks for list extraction, return concise structured text.",
            "If a captcha or login wall blocks progress, stop immediately and say that user action is required.",
            "Do not click third-party sign-in buttons such as Google, Apple, or GitHub OAuth buttons.",
            "Do not open or interact with login popups or OAuth consent windows.",
            "If authentication is required, leave the page open in the persistent browser and tell the user to complete login manually, then retry the task.",
            "Do not submit irreversible forms or purchases unless the user explicitly asked for it.",
        ]
        if start_url:
            instructions.append(f"Start at this URL first: {start_url}")
        instructions.append(task)
        return "\n".join(instructions)

    def _normalize_domains(self, value: object) -> list[str]:
        if not isinstance(value, list):
            return []
        return [str(item).strip() for item in value if str(item).strip()]

    def _profile_root(self) -> Path:
        profile_root = self.workspace_root / ".wiseclaw" / "browser-use-profile"
        profile_root.mkdir(parents=True, exist_ok=True)
        (profile_root / "WiseClaw").mkdir(parents=True, exist_ok=True)
        return profile_root

    async def _ensure_persistent_browser(self) -> str:
        """Return the CDP URL of the workspace's persistent Chromium.

        A browser recorded in the saved state is reused when its process is
        still alive and its debugging port answers. Otherwise a new browser is
        launched on ``self.debug_port`` and the state file is rewritten.

        Returns:
            The ``webSocketDebuggerUrl`` reported by the browser.

        Raises:
            RuntimeError: Propagated from ``_wait_for_cdp_url`` when a freshly
                launched browser does not expose a CDP endpoint in time.
        """
        state = self._load_browser_state()

        # A hand-edited or truncated state file must not wedge the tool:
        # treat anything malformed as "no saved browser" and relaunch.
        saved_pid, saved_port = 0, 0
        if isinstance(state, dict):
            try:
                saved_pid = int(state.get("pid", 0))
                saved_port = int(state["port"])
            except (KeyError, TypeError, ValueError):
                saved_pid, saved_port = 0, 0

        if saved_port > 0 and self._pid_is_running(saved_pid):
            cdp_url = await self._fetch_cdp_url(saved_port)
            if cdp_url:
                return cdp_url

        await self._launch_persistent_browser()
        cdp_url = await self._wait_for_cdp_url()
        self._save_browser_state({"pid": self._read_pid_file(), "port": self.debug_port})
        return cdp_url

    async def _launch_persistent_browser(self) -> None:
        """Start Chromium with remote debugging enabled and record its pid.

        Uses ``self.chromium_path`` if that file exists, otherwise the bare
        command ``Chromium``. The process is started in a new session with
        stdout and stderr discarded, and its pid is written to the pid file.
        """
        if self.chromium_path.exists():
            binary = str(self.chromium_path)
        else:
            binary = "Chromium"
        user_data_dir = self._profile_root()

        command = (
            binary,
            f"--remote-debugging-port={self.debug_port}",
            f"--user-data-dir={user_data_dir}",
            "--profile-directory=WiseClaw",
            "--no-first-run",
            "--no-default-browser-check",
            "--start-maximized",
            "about:blank",
        )
        discard = asyncio.subprocess.DEVNULL
        child = await asyncio.create_subprocess_exec(
            *command,
            stdout=discard,
            stderr=discard,
            start_new_session=True,
        )
        self._write_pid_file(child.pid)

    async def _wait_for_cdp_url(self) -> str:
        """Poll ``self.debug_port`` until the browser reports a CDP URL.

        Makes up to 40 attempts, sleeping half a second after each failure.

        Raises:
            RuntimeError: If no attempt returns a URL.
        """
        attempts_left = 40
        while attempts_left > 0:
            endpoint = await self._fetch_cdp_url(self.debug_port)
            if endpoint:
                return endpoint
            await asyncio.sleep(0.5)
            attempts_left -= 1
        raise RuntimeError("Persistent Chromium browser did not expose a CDP endpoint in time.")

    async def _fetch_cdp_url(self, port: int) -> str:
        """Ask the DevTools endpoint on *port* for its WebSocket debugger URL.

        Args:
            port: Local TCP port to query at ``/json/version``.

        Returns:
            The ``webSocketDebuggerUrl`` value, or ``""`` when the request
            fails, the response is not a JSON object, or the field is
            missing or empty.
        """
        try:
            async with httpx.AsyncClient(timeout=2.0) as client:
                response = await client.get(f"http://127.0.0.1:{port}/json/version")
                response.raise_for_status()
            # A different service may be listening on this port and answer
            # 200 with a non-JSON body; a decoding ValueError means "no CDP".
            payload = response.json()
        except (httpx.HTTPError, ValueError):
            return ""
        if not isinstance(payload, dict):
            return ""
        # ``or ""`` keeps a JSON null from becoming the truthy string "None".
        return str(payload.get("webSocketDebuggerUrl") or "")

    def _browser_state_path(self) -> Path:
        return self.workspace_root / ".wiseclaw" / "browser-use-browser.json"

    def _browser_pid_path(self) -> Path:
        return self.workspace_root / ".wiseclaw" / "browser-use-browser.pid"

    def _load_browser_state(self) -> dict[str, int] | None:
        path = self._browser_state_path()
        if not path.exists():
            return None
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except json.JSONDecodeError:
            return None

    def _save_browser_state(self, payload: dict[str, int]) -> None:
        path = self._browser_state_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(payload), encoding="utf-8")

    def _write_pid_file(self, pid: int) -> None:
        path = self._browser_pid_path()
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(str(pid), encoding="utf-8")

    def _read_pid_file(self) -> int:
        path = self._browser_pid_path()
        if not path.exists():
            return 0
        try:
            return int(path.read_text(encoding="utf-8").strip())
        except ValueError:
            return 0

    def _pid_is_running(self, pid: int) -> bool:
        if pid <= 0:
            return False
        try:
            os.kill(pid, 0)
        except OSError:
            return False
        return True