66 lines
2.0 KiB
Python
66 lines
2.0 KiB
Python
import re
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
from app.tools.base import Tool
|
|
|
|
|
|
class WebFetchTool(Tool):
    """Fetch a web page over HTTP(S) and return its text with markup stripped.

    Every outcome that ``run`` handles is reported as a result dict carrying
    ``tool``, ``status`` (``"ok"`` or ``"error"``) and ``url``.
    """

    name = "web_fetch"
    description = "Fetch a webpage and return simplified content."

    # Upper bound on the number of characters returned in ``content``.
    _MAX_CONTENT_CHARS = 12000
    # Timeout, in seconds, handed to the httpx client.
    _REQUEST_TIMEOUT = 15.0
    # URL prefixes accepted by ``run`` (compared case-insensitively).
    _ALLOWED_PREFIXES = ("http://", "https://")

    def parameters_schema(self) -> dict[str, Any]:
        """Return the schema of the payload accepted by ``run``.

        The payload is an object with exactly one required string property,
        ``url``; additional properties are disallowed.
        """
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "The http or https URL to fetch.",
                }
            },
            "required": ["url"],
            "additionalProperties": False,
        }

    async def run(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Fetch ``payload["url"]`` and return its simplified text.

        Args:
            payload: Mapping with a ``url`` entry. The value is coerced to
                ``str`` and stripped of surrounding whitespace; a missing
                entry is treated as an empty string.

        Returns:
            On success, a dict with ``status == "ok"``, the response's
            ``content_type`` header (``""`` when absent), ``content`` cut to
            at most ``_MAX_CONTENT_CHARS`` characters, and a ``truncated``
            flag telling whether anything was cut off.
            On failure (disallowed scheme, malformed URL, transport error or
            an HTTP error status), a dict with ``status == "error"`` and a
            ``message`` describing the problem.
        """
        url = str(payload.get("url", "")).strip()

        # URI schemes are case-insensitive, so compare against a lowered copy
        # while keeping the caller's original spelling in the result.
        if not url.lower().startswith(self._ALLOWED_PREFIXES):
            return self._error_result(
                url, "Only http and https URLs are allowed."
            )

        # NOTE(review): only the scheme is validated and redirects are
        # followed, so hosts on internal networks are reachable through this
        # tool. Confirm that callers restrict targets if that matters.
        try:
            async with httpx.AsyncClient(
                timeout=self._REQUEST_TIMEOUT,
                follow_redirects=True,
            ) as client:
                response = await client.get(url)
                response.raise_for_status()
        except (httpx.HTTPError, httpx.InvalidURL) as exc:
            # InvalidURL does not derive from HTTPError; without catching it
            # a malformed URL would escape as an unhandled exception instead
            # of a structured error result.
            return self._error_result(url, str(exc))

        text = self._simplify_content(response.text)
        limit = self._MAX_CONTENT_CHARS
        return {
            "tool": self.name,
            "status": "ok",
            "url": url,
            "content_type": response.headers.get("content-type", ""),
            "content": text[:limit],
            "truncated": len(text) > limit,
        }

    def _error_result(self, url: str, message: str) -> dict[str, Any]:
        """Build the result dict returned for a failed fetch."""
        return {
            "tool": self.name,
            "status": "error",
            "url": url,
            "message": message,
        }

    def _simplify_content(self, content: str) -> str:
        """Reduce an HTML document to whitespace-normalised plain text.

        ``<script>`` and ``<style>`` elements are removed together with their
        contents, every remaining tag is replaced by a space, character
        references (``&amp;``, ``&nbsp;``, ``&#39;`` ...) are decoded, and
        runs of whitespace are collapsed to a single space.

        Args:
            content: Raw response body.

        Returns:
            The simplified text with no leading or trailing whitespace.
        """
        import html  # local import: the only stdlib name this class adds

        text = re.sub(r"(?is)<script.*?>.*?</script>", " ", content)
        text = re.sub(r"(?is)<style.*?>.*?</style>", " ", text)
        text = re.sub(r"(?s)<[^>]+>", " ", text)
        # Decode entities only after the tags are gone, so escaped markup
        # such as "&lt;b&gt;" survives as literal text instead of being
        # stripped as if it were a real tag.
        text = html.unescape(text)
        # "\s" also matches the no-break space produced by "&nbsp;".
        text = re.sub(r"\s+", " ", text)
        return text.strip()