Files
wiseclaw/backend/app/tools/web_fetch.py

66 lines
2.0 KiB
Python

import html
import re
from typing import Any

import httpx

from app.tools.base import Tool
class WebFetchTool(Tool):
    """Tool that downloads a web page and returns its text with markup removed.

    The result of :meth:`run` is always a dict carrying ``tool``, ``status``
    and ``url``; failures add ``message`` while successes add
    ``content_type``, ``content`` and ``truncated``.
    """

    name = "web_fetch"
    description = "Fetch a webpage and return simplified content."

    # Timeout, in seconds, handed to the httpx client for each request.
    TIMEOUT_SECONDS = 15.0
    # Maximum number of characters of simplified text placed in the result.
    MAX_CONTENT_CHARS = 12000

    # Lower-case URL prefixes accepted by run(); the check lower-cases the
    # URL first because URL schemes are case-insensitive.
    _ALLOWED_SCHEMES = ("http://", "https://")

    # Spans whose *contents* must disappear entirely: comments, scripts and
    # styles. A single alternation is used so that whichever construct opens
    # first in the document wins, e.g. a "<script>" mentioned inside a
    # comment does not swallow the text that follows the comment.
    _NON_CONTENT_RE = re.compile(
        r"(?is)<!--.*?-->"
        r"|<script\b[^>]*>.*?</script\s*>"
        r"|<style\b[^>]*>.*?</style\s*>"
    )
    # Any remaining tag; [^>] already matches newlines, so no DOTALL needed.
    _TAG_RE = re.compile(r"<[^>]+>")
    _WHITESPACE_RE = re.compile(r"\s+")

    def parameters_schema(self) -> dict[str, Any]:
        """Return the JSON-Schema-style description of the accepted payload.

        The payload is an object with exactly one required string field,
        ``url``; additional properties are rejected.
        """
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The http or https URL to fetch.",
                }
            },
            "required": ["url"],
            "additionalProperties": False,
        }

    async def run(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Fetch ``payload["url"]`` and return its simplified text.

        Args:
            payload: Mapping expected to contain a ``url`` entry. A missing
                or ``None`` value is treated as an empty URL.

        Returns:
            On success, a dict with ``status == "ok"``, the response's
            ``content_type`` header, up to ``MAX_CONTENT_CHARS`` characters
            of simplified ``content`` and a ``truncated`` flag. On failure
            (disallowed scheme, malformed URL, transport error or non-2xx
            status), a dict with ``status == "error"`` and a ``message``.
        """
        # `or ""` keeps a None value from being stringified to "None".
        url = str(payload.get("url") or "").strip()
        if not url.lower().startswith(self._ALLOWED_SCHEMES):
            return self._error(url, "Only http and https URLs are allowed.")

        # NOTE(security): beyond the scheme check above, the target host is
        # not restricted, and redirects are followed. If the URL can come
        # from an untrusted source, loopback/private addresses are reachable
        # through this tool; add host filtering if that matters for the
        # deployment.
        try:
            async with httpx.AsyncClient(
                timeout=self.TIMEOUT_SECONDS,
                follow_redirects=True,
            ) as client:
                response = await client.get(url)
                response.raise_for_status()
        except (httpx.HTTPError, httpx.InvalidURL) as exc:
            # InvalidURL is not an HTTPError subclass, so it is listed
            # explicitly. Some httpx errors (notably timeouts) carry an empty
            # message; fall back to the class name so the caller always gets
            # something descriptive.
            return self._error(url, str(exc) or type(exc).__name__)

        text = self._simplify_content(response.text)
        limit = self.MAX_CONTENT_CHARS
        return {
            "tool": self.name,
            "status": "ok",
            "url": url,
            "content_type": response.headers.get("content-type", ""),
            "content": text[:limit],
            "truncated": len(text) > limit,
        }

    def _error(self, url: str, message: str) -> dict[str, Any]:
        """Build the error-result dict shared by every failure path."""
        return {
            "tool": self.name,
            "status": "error",
            "url": url,
            "message": message,
        }

    def _simplify_content(self, content: str) -> str:
        """Reduce an HTML document to whitespace-normalised plain text.

        Comments, ``<script>`` and ``<style>`` elements are removed together
        with their contents, every other tag is replaced by a space, character
        references are decoded, and runs of whitespace collapse to a single
        space.
        """
        text = self._NON_CONTENT_RE.sub(" ", content)
        text = self._TAG_RE.sub(" ", text)
        # Decode entities only after tags are gone so that an escaped
        # "&lt;b&gt;" survives as literal text instead of being stripped.
        # A decoded &nbsp; becomes U+00A0, which \s matches and collapses.
        text = html.unescape(text)
        return self._WHITESPACE_RE.sub(" ", text).strip()