refactor: migrate to src package layout and wscraper entry module
This commit is contained in:
22
pyproject.toml
Normal file
22
pyproject.toml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
[build-system]
|
||||||
|
requires = ["setuptools", "wheel"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "wscraper"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Multi-site scraper CLI"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"scrapling[fetchers]==0.4.1",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.scripts]
|
||||||
|
wscraper = "wscraper.cli:main"
|
||||||
|
|
||||||
|
[tool.setuptools]
|
||||||
|
package-dir = {"" = "src"}
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["src"]
|
||||||
@@ -1 +1 @@
|
|||||||
scrapling[fetchers]==0.4.1
|
-e .
|
||||||
|
|||||||
@@ -1,256 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import json
|
|
||||||
import random
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
from scrapling.fetchers import DynamicSession
|
|
||||||
|
|
||||||
# Marker text the site renders when the bookmarks listing is empty/exhausted;
# used as the pagination stop condition.
STOP_TEXT = "You have not bookmarked any torrents."
# Extracts the URL from a CSS `background-image: url(...)` declaration,
# tolerating optional single or double quotes around the URL.
BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)")
|
|
||||||
|
|
||||||
|
|
||||||
def _domain_matches(target_host: str, cookie_domain: str) -> bool:
|
|
||||||
cd = cookie_domain.lstrip(".").lower()
|
|
||||||
th = target_host.lower()
|
|
||||||
return th == cd or th.endswith("." + cd)
|
|
||||||
|
|
||||||
|
|
||||||
def parse_cookie_string(cookie_string: str, target_host: str) -> dict[str, str]:
    """
    Parse cookie input into a name -> value mapping.

    Supports:
    1) "key=value; key2=value2" cookie header style
    2) Netscape cookie file format (tab-separated 7 columns)

    Netscape rows whose domain does not cover *target_host* are dropped.
    """
    rows = cookie_string.splitlines()
    result: dict[str, str] = {}

    # Multi-line input containing tabs is treated as a Netscape cookie file.
    if len(rows) > 1 and any("\t" in row for row in rows):
        for row in rows:
            entry = row.strip()
            if not entry or entry.startswith("#"):
                continue  # blank line or comment
            fields = entry.split("\t")
            if len(fields) < 7:
                continue  # malformed row
            domain, _flag, _path, _secure, _expires, name, value = fields[:7]
            if name and _domain_matches(target_host, domain):
                result[name] = value
        return result

    # Otherwise treat the input as a "k=v; k2=v2" header string.
    for fragment in cookie_string.split(";"):
        item = fragment.strip()
        if not item or "=" not in item:
            continue
        name, _, value = item.partition("=")
        name = name.strip()
        if name:
            result[name] = value.strip()
    return result
|
|
||||||
|
|
||||||
|
|
||||||
def parse_cookies_for_playwright(
    cookie_string: str, target_host: str, base_url: str
) -> list[dict[str, Any]]:
    """
    Convert cookie input into Playwright-compatible cookie objects.

    Netscape-format rows become full cookie dicts (domain/path/secure/expires);
    header-style input becomes minimal {name, value, url} dicts scoped to
    *base_url*.
    """
    rows = cookie_string.splitlines()
    result: list[dict[str, Any]] = []

    # Multi-line input containing tabs is treated as a Netscape cookie file.
    if len(rows) > 1 and any("\t" in row for row in rows):
        for row in rows:
            entry = row.strip()
            if not entry or entry.startswith("#"):
                continue  # blank line or comment
            fields = entry.split("\t")
            if len(fields) < 7:
                continue  # malformed row
            domain, _flag, path, secure, expires, name, value = fields[:7]
            if not name or not _domain_matches(target_host, domain):
                continue

            cookie: dict[str, Any] = {
                "name": name,
                "value": value,
                "domain": domain.lstrip("."),
                "path": path or "/",
                "secure": secure.upper() == "TRUE",
            }
            # Only attach a purely-numeric, positive expiry timestamp.
            if expires.isdigit() and int(expires) > 0:
                cookie["expires"] = float(expires)
            result.append(cookie)
        return result

    # Header-style fallback: reuse the plain parser and scope cookies by URL.
    for name, value in parse_cookie_string(cookie_string, target_host).items():
        result.append({"name": name, "value": value, "url": base_url})
    return result
|
|
||||||
|
|
||||||
|
|
||||||
def extract_background_image(style: str) -> str | None:
|
|
||||||
if not style:
|
|
||||||
return None
|
|
||||||
match = BG_URL_RE.search(style)
|
|
||||||
if not match:
|
|
||||||
return None
|
|
||||||
value = match.group(1).strip()
|
|
||||||
return value or None
|
|
||||||
|
|
||||||
|
|
||||||
def extract_torrent_cards(response: Any, base_url: str) -> list[dict[str, Any]]:
    """Extract one record per torrent card on a bookmarks listing page.

    Each record carries: pageURL (made absolute against *base_url*), isVR
    (category equals "VR"), title, and backgroundImage (cover URL or None).
    """
    root = base_url.rstrip("/")
    records: list[dict[str, Any]] = []
    for card in response.css("div.torrent_grid div.torrent_grid__torrent"):
        href = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip()
        if href and not href.startswith("http"):
            href = f"{root}{href}"

        category = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip()
        # Prefer the full title from the attribute; fall back to visible text.
        raw_title = (
            card.css("h3.trim::attr(title)").get("")
            or card.css("h3.trim::text").get("")
            or ""
        )
        cover_style = (card.css("div.torrent__cover::attr(style)").get("") or "").strip()

        records.append(
            {
                "pageURL": href,
                "isVR": category == "VR",
                "title": raw_title.strip(),
                "backgroundImage": extract_background_image(cover_style),
            }
        )
    return records
|
|
||||||
|
|
||||||
|
|
||||||
def should_stop(response: Any) -> bool:
    """True when the page shows the empty-bookmarks message (end of pagination)."""
    charset = response.encoding or "utf-8"
    html = response.body.decode(charset, errors="ignore")
    return STOP_TEXT in html
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_page(session: Any, url: str, retries: int, backoff_base: float) -> Any:
    """Fetch *url* through the session with retry and jittered backoff.

    A response with HTTP 403/429 or any 5xx status counts as a failure, like a
    raised exception.  Between attempts the sleep grows exponentially from
    *backoff_base* plus a small random jitter.  After the final attempt fails,
    a RuntimeError chained to the last failure is raised.
    """
    failure: Exception | None = None
    for attempt in range(retries):
        try:
            response = session.fetch(
                url,
                timeout=45_000,
                load_dom=True,
                network_idle=False,
            )
        except Exception as err:  # noqa: BLE001
            failure = err
        else:
            code = response.status
            if code not in (403, 429) and code < 500:
                return response
            failure = RuntimeError(f"HTTP {code}")
        if attempt == retries - 1:
            break
        delay = backoff_base * (2**attempt) + random.uniform(0.0, 0.7)
        time.sleep(delay)
    raise RuntimeError(f"Request failed for {url}: {failure}") from failure
|
|
||||||
|
|
||||||
|
|
||||||
def build_bookmarks_url(base_url: str, page: int) -> str:
    """Build the bookmarks listing URL for the given page number.

    Page 1 has no page parameter; any other page gets an explicit page
    parameter plus the table anchor.
    """
    if page != 1:
        return f"{base_url}/bookmarks.php?page={page}&type=torrents#torrent_table"
    return f"{base_url}/bookmarks.php?type=torrents"
|
|
||||||
|
|
||||||
|
|
||||||
def run(args: argparse.Namespace) -> None:
    """Scrape all bookmark pages and write the collected records as JSON.

    Expects args attributes: base_url, cookie, cookie_file, output,
    delay_min, delay_max, retries, backoff_base, max_pages.

    Raises:
        ValueError: if no cookie is supplied or none matches the target host.
        RuntimeError: if a page request ultimately fails after retries.
    """
    # Fall back to the default host when base_url has no parseable hostname.
    target_host = urlparse(args.base_url).hostname or "www.happyfappy.net"

    cookie_value = args.cookie or ""
    if not cookie_value and args.cookie_file:
        cookie_value = Path(args.cookie_file).read_text(encoding="utf-8").strip()
    if not cookie_value:
        raise ValueError("Cookie is required. Use --cookie or --cookie-file.")

    # Parsed twice: once to validate that some cookie matches the host, and
    # once in Playwright object form for the browser session.
    cookies = parse_cookie_string(cookie_value, target_host=target_host)
    if not cookies:
        raise ValueError("No valid cookies parsed for target host. Check cookie content.")
    pw_cookies = parse_cookies_for_playwright(
        cookie_value, target_host=target_host, base_url=args.base_url.rstrip("/")
    )
    if not pw_cookies:
        raise ValueError("No Playwright-compatible cookies generated for target host.")

    all_records: list[dict[str, Any]] = []

    with DynamicSession(
        headless=True,
        disable_resources=True,
        cookies=pw_cookies,
        google_search=False,
        retries=1,
        retry_delay=1,
    ) as session:
        page = 1
        while page <= args.max_pages:
            if page > 1:
                # Randomized politeness delay between page requests.
                time.sleep(random.uniform(args.delay_min, args.delay_max))

            url = build_bookmarks_url(args.base_url.rstrip("/"), page)
            response = fetch_page(session, url, retries=args.retries, backoff_base=args.backoff_base)

            # The site renders a fixed message once bookmarks run out.
            if should_stop(response):
                break

            page_records = extract_torrent_cards(response, args.base_url)
            all_records.extend(page_records)
            print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
            page += 1

    output_path = Path(args.output).resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(all_records, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Saved {len(all_records)} records to {output_path}")
|
|
||||||
|
|
||||||
|
|
||||||
def make_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the bookmarks scraper."""
    p = argparse.ArgumentParser(
        description="Scrape HappyFappy torrent bookmarks using an authenticated cookie.",
    )
    p.add_argument("--base-url", default="https://www.happyfappy.net")
    p.add_argument("--cookie", help='Raw cookie string, e.g. "a=1; b=2"')
    p.add_argument("--cookie-file", help="Path to a text file containing raw cookie string")
    p.add_argument("--output", default="bookmarks.json")
    p.add_argument("--delay-min", type=float, default=1.8, help="Minimum delay between page requests")
    p.add_argument("--delay-max", type=float, default=3.2, help="Maximum delay between page requests")
    p.add_argument("--retries", type=int, default=3, help="Retries per page request")
    p.add_argument("--backoff-base", type=float, default=5.0, help="Backoff base seconds")
    p.add_argument("--max-pages", type=int, default=200, help="Safety cap for pagination loop")
    return p
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """CLI entry point: parse arguments, validate them, and run the scraper.

    Raises:
        ValueError: on invalid delay, retry, or backoff values.
    """
    parser = make_parser()
    args = parser.parse_args()
    if args.delay_min < 0 or args.delay_max < 0:
        raise ValueError("Delay values must be non-negative.")
    if args.delay_min > args.delay_max:
        raise ValueError("--delay-min cannot be greater than --delay-max.")
    if args.retries < 1:
        raise ValueError("--retries must be at least 1.")
    # Consistent with the torrent-download CLI: a negative backoff base would
    # make the retry sleep computation meaningless.
    if args.backoff_base < 0:
        raise ValueError("--backoff-base must be >= 0.")
    run(args)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
92
src/wscraper.egg-info/PKG-INFO
Normal file
92
src/wscraper.egg-info/PKG-INFO
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
Metadata-Version: 2.4
|
||||||
|
Name: wscraper
|
||||||
|
Version: 0.1.0
|
||||||
|
Summary: Multi-site scraper CLI
|
||||||
|
Requires-Python: >=3.12
|
||||||
|
Description-Content-Type: text/markdown
|
||||||
|
Requires-Dist: scrapling[fetchers]==0.4.1
|
||||||
|
|
||||||
|
# wscraper
|
||||||
|
|
||||||
|
HappyFappy için komutlar paketlenmiş `wscraper` CLI üzerinden çalışır. Proje çoklu site desteği için `src/` paket yapısına göre düzenlenmiştir.
|
||||||
|
|
||||||
|
## 1) Repo Clone
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone <REPO_URL>
|
||||||
|
cd <REPO_FOLDER>
|
||||||
|
```
|
||||||
|
|
||||||
|
## 2) Kurulum
|
||||||
|
|
||||||
|
### macOS / Linux
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3.12 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
python -m pip install -U pip
|
||||||
|
python -m pip install -e .
|
||||||
|
scrapling install
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows (PowerShell)
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
py -3.12 -m venv .venv
|
||||||
|
.venv\Scripts\Activate.ps1
|
||||||
|
python -m pip install -U pip
|
||||||
|
python -m pip install -e .
|
||||||
|
scrapling install
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows (CMD)
|
||||||
|
|
||||||
|
```bat
|
||||||
|
py -3.12 -m venv .venv
|
||||||
|
.venv\Scripts\activate.bat
|
||||||
|
python -m pip install -U pip
|
||||||
|
python -m pip install -e .
|
||||||
|
scrapling install
|
||||||
|
```
|
||||||
|
|
||||||
|
Not: Ortamı aktive ettikten sonra komutlar `wscraper ...` olarak kullanılabilir. İstersen `python -m wscraper ...` da kullanabilirsin.
|
||||||
|
|
||||||
|
## 3) HappyFappy Komutları
|
||||||
|
|
||||||
|
### Bookmarks Çekme
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wscraper happyfappy --action get-bookmarks -c cookies.txt -o bookmarks.json
|
||||||
|
```
|
||||||
|
|
||||||
|
### Torrent Dosyası İndirme
|
||||||
|
|
||||||
|
```bash
|
||||||
|
wscraper happyfappy --action download-torrent-files -u "https://www.happyfappy.net/torrents.php?id=110178" -c cookies.txt -o torrent
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4) Kısa Alias Kullanımı
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# site alias: hf
|
||||||
|
# action alias: gb (get-bookmarks), dtf (download-torrent-files)
|
||||||
|
wscraper hf -a gb -c cookies.txt -o bookmarks.json
|
||||||
|
wscraper hf -a dtf -u "https://www.happyfappy.net/torrents.php?id=110178" -c cookies.txt -o torrent
|
||||||
|
```
|
||||||
|
|
||||||
|
## 5) Proje Dizini
|
||||||
|
|
||||||
|
```text
|
||||||
|
.
|
||||||
|
├── pyproject.toml
|
||||||
|
├── requirements.txt
|
||||||
|
├── src/
|
||||||
|
│ └── wscraper/
|
||||||
|
│ ├── __init__.py
|
||||||
|
│ ├── __main__.py
|
||||||
|
│ ├── cli.py
|
||||||
|
│ └── sites/
|
||||||
|
│ ├── __init__.py
|
||||||
|
│ └── happyfappy.py
|
||||||
|
└── README.md
|
||||||
|
```
|
||||||
13
src/wscraper.egg-info/SOURCES.txt
Normal file
13
src/wscraper.egg-info/SOURCES.txt
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
README.md
|
||||||
|
pyproject.toml
|
||||||
|
src/wscraper/__init__.py
|
||||||
|
src/wscraper/__main__.py
|
||||||
|
src/wscraper/cli.py
|
||||||
|
src/wscraper.egg-info/PKG-INFO
|
||||||
|
src/wscraper.egg-info/SOURCES.txt
|
||||||
|
src/wscraper.egg-info/dependency_links.txt
|
||||||
|
src/wscraper.egg-info/entry_points.txt
|
||||||
|
src/wscraper.egg-info/requires.txt
|
||||||
|
src/wscraper.egg-info/top_level.txt
|
||||||
|
src/wscraper/sites/__init__.py
|
||||||
|
src/wscraper/sites/happyfappy.py
|
||||||
1
src/wscraper.egg-info/dependency_links.txt
Normal file
1
src/wscraper.egg-info/dependency_links.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
|
||||||
2
src/wscraper.egg-info/entry_points.txt
Normal file
2
src/wscraper.egg-info/entry_points.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[console_scripts]
|
||||||
|
wscraper = wscraper.cli:main
|
||||||
1
src/wscraper.egg-info/requires.txt
Normal file
1
src/wscraper.egg-info/requires.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
scrapling[fetchers]==0.4.1
|
||||||
1
src/wscraper.egg-info/top_level.txt
Normal file
1
src/wscraper.egg-info/top_level.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
wscraper
|
||||||
3
src/wscraper/__init__.py
Normal file
3
src/wscraper/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
__all__ = ["__version__"]
|
||||||
|
|
||||||
|
__version__ = "0.1.0"
|
||||||
5
src/wscraper/__main__.py
Normal file
5
src/wscraper/__main__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
from wscraper.cli import main
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -1,11 +1,8 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
from download_happyfappy_torrent import run as run_happyfappy_download
|
from wscraper.sites.happyfappy import run_download_torrent_files, run_get_bookmarks
|
||||||
from scrape_happyfappy_bookmarks import run as run_happyfappy_bookmarks
|
|
||||||
|
|
||||||
|
|
||||||
SITE_ALIASES = {
|
SITE_ALIASES = {
|
||||||
"happyfappy": "happyfappy",
|
"happyfappy": "happyfappy",
|
||||||
@@ -39,9 +36,7 @@ def normalize_action(value: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def build_parser() -> argparse.ArgumentParser:
|
def build_parser() -> argparse.ArgumentParser:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(description="wscraper: multi-site scraping entrypoint")
|
||||||
description="wscraper: multi-site scraping entrypoint",
|
|
||||||
)
|
|
||||||
parser.add_argument("site", help="Site key, e.g. happyfappy or hf")
|
parser.add_argument("site", help="Site key, e.g. happyfappy or hf")
|
||||||
parser.add_argument("-a", "--action", required=True, help="Action to run")
|
parser.add_argument("-a", "--action", required=True, help="Action to run")
|
||||||
|
|
||||||
@@ -58,7 +53,6 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
|
|
||||||
parser.add_argument("-r", "--retries", type=int, default=3)
|
parser.add_argument("-r", "--retries", type=int, default=3)
|
||||||
parser.add_argument("--backoff-base", type=float, default=5.0)
|
parser.add_argument("--backoff-base", type=float, default=5.0)
|
||||||
|
|
||||||
parser.add_argument("--delay-min", type=float, default=1.8)
|
parser.add_argument("--delay-min", type=float, default=1.8)
|
||||||
parser.add_argument("--delay-max", type=float, default=3.2)
|
parser.add_argument("--delay-max", type=float, default=3.2)
|
||||||
parser.add_argument("--max-pages", type=int, default=200)
|
parser.add_argument("--max-pages", type=int, default=200)
|
||||||
@@ -69,7 +63,8 @@ def run_happyfappy(args: argparse.Namespace, action: str) -> None:
|
|||||||
base_url = args.base_url or "https://www.happyfappy.net"
|
base_url = args.base_url or "https://www.happyfappy.net"
|
||||||
|
|
||||||
if action == "get-bookmarks":
|
if action == "get-bookmarks":
|
||||||
bookmarks_args = argparse.Namespace(
|
run_get_bookmarks(
|
||||||
|
argparse.Namespace(
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
cookie=args.cookie,
|
cookie=args.cookie,
|
||||||
cookie_file=args.cookie_file,
|
cookie_file=args.cookie_file,
|
||||||
@@ -80,13 +75,14 @@ def run_happyfappy(args: argparse.Namespace, action: str) -> None:
|
|||||||
backoff_base=args.backoff_base,
|
backoff_base=args.backoff_base,
|
||||||
max_pages=args.max_pages,
|
max_pages=args.max_pages,
|
||||||
)
|
)
|
||||||
run_happyfappy_bookmarks(bookmarks_args)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
if action == "download-torrent-files":
|
if action == "download-torrent-files":
|
||||||
if not args.url:
|
if not args.url:
|
||||||
raise ValueError("--url is required for action=download-torrent-files.")
|
raise ValueError("--url is required for action=download-torrent-files.")
|
||||||
download_args = argparse.Namespace(
|
run_download_torrent_files(
|
||||||
|
argparse.Namespace(
|
||||||
url=args.url,
|
url=args.url,
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
cookie=args.cookie,
|
cookie=args.cookie,
|
||||||
@@ -95,7 +91,7 @@ def run_happyfappy(args: argparse.Namespace, action: str) -> None:
|
|||||||
retries=args.retries,
|
retries=args.retries,
|
||||||
backoff_base=args.backoff_base,
|
backoff_base=args.backoff_base,
|
||||||
)
|
)
|
||||||
run_happyfappy_download(download_args)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
raise ValueError(f"Unsupported action for happyfappy: {action}")
|
raise ValueError(f"Unsupported action for happyfappy: {action}")
|
||||||
1
src/wscraper/sites/__init__.py
Normal file
1
src/wscraper/sites/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
__all__ = ["happyfappy"]
|
||||||
@@ -1,7 +1,9 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -9,6 +11,9 @@ from urllib.parse import urlparse
|
|||||||
|
|
||||||
from scrapling.fetchers import DynamicSession
|
from scrapling.fetchers import DynamicSession
|
||||||
|
|
||||||
|
STOP_TEXT = "You have not bookmarked any torrents."
|
||||||
|
BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)")
|
||||||
|
|
||||||
|
|
||||||
def _domain_matches(target_host: str, cookie_domain: str) -> bool:
|
def _domain_matches(target_host: str, cookie_domain: str) -> bool:
|
||||||
cd = cookie_domain.lstrip(".").lower()
|
cd = cookie_domain.lstrip(".").lower()
|
||||||
@@ -48,9 +53,7 @@ def parse_cookie_string(cookie_string: str, target_host: str) -> dict[str, str]:
|
|||||||
return cookies
|
return cookies
|
||||||
|
|
||||||
|
|
||||||
def parse_cookies_for_playwright(
|
def parse_cookies_for_playwright(cookie_string: str, target_host: str, base_url: str) -> list[dict[str, Any]]:
|
||||||
cookie_string: str, target_host: str, base_url: str
|
|
||||||
) -> list[dict[str, Any]]:
|
|
||||||
lines = cookie_string.splitlines()
|
lines = cookie_string.splitlines()
|
||||||
cookies: list[dict[str, Any]] = []
|
cookies: list[dict[str, Any]] = []
|
||||||
looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines)
|
looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines)
|
||||||
@@ -121,9 +124,103 @@ def fetch_dynamic_with_retry(session: Any, url: str, retries: int, backoff_base:
|
|||||||
raise RuntimeError(f"Request failed for {url}: {last_error}") from last_error
|
raise RuntimeError(f"Request failed for {url}: {last_error}") from last_error
|
||||||
|
|
||||||
|
|
||||||
def download_via_browser_with_retry(
|
# bookmarks
|
||||||
session: DynamicSession, detail_url: str, retries: int, backoff_base: float
|
|
||||||
) -> tuple[str, bytes]:
|
def extract_background_image(style: str) -> str | None:
|
||||||
|
if not style:
|
||||||
|
return None
|
||||||
|
match = BG_URL_RE.search(style)
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
value = match.group(1).strip()
|
||||||
|
return value or None
|
||||||
|
|
||||||
|
|
||||||
|
def extract_torrent_cards(response: Any, base_url: str) -> list[dict[str, Any]]:
|
||||||
|
records: list[dict[str, Any]] = []
|
||||||
|
cards = response.css("div.torrent_grid div.torrent_grid__torrent")
|
||||||
|
for card in cards:
|
||||||
|
page_url = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip()
|
||||||
|
if page_url and not page_url.startswith("http"):
|
||||||
|
page_url = f"{base_url.rstrip('/')}{page_url}"
|
||||||
|
category = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip()
|
||||||
|
title = (card.css("h3.trim::attr(title)").get("") or card.css("h3.trim::text").get("") or "").strip()
|
||||||
|
style = (card.css("div.torrent__cover::attr(style)").get("") or "").strip()
|
||||||
|
background_image = extract_background_image(style)
|
||||||
|
|
||||||
|
records.append(
|
||||||
|
{
|
||||||
|
"pageURL": page_url,
|
||||||
|
"isVR": category == "VR",
|
||||||
|
"title": title,
|
||||||
|
"backgroundImage": background_image,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return records
|
||||||
|
|
||||||
|
|
||||||
|
def should_stop(response: Any) -> bool:
|
||||||
|
body_text = response.body.decode(response.encoding or "utf-8", errors="ignore")
|
||||||
|
return STOP_TEXT in body_text
|
||||||
|
|
||||||
|
|
||||||
|
def build_bookmarks_url(base_url: str, page: int) -> str:
|
||||||
|
if page == 1:
|
||||||
|
return f"{base_url}/bookmarks.php?type=torrents"
|
||||||
|
return f"{base_url}/bookmarks.php?page={page}&type=torrents#torrent_table"
|
||||||
|
|
||||||
|
|
||||||
|
def run_get_bookmarks(args: argparse.Namespace) -> None:
|
||||||
|
target_host = urlparse(args.base_url).hostname or "www.happyfappy.net"
|
||||||
|
|
||||||
|
cookie_value = args.cookie or ""
|
||||||
|
if not cookie_value and args.cookie_file:
|
||||||
|
cookie_value = Path(args.cookie_file).read_text(encoding="utf-8").strip()
|
||||||
|
if not cookie_value:
|
||||||
|
raise ValueError("Cookie is required. Use --cookie or --cookie-file.")
|
||||||
|
|
||||||
|
cookies = parse_cookie_string(cookie_value, target_host=target_host)
|
||||||
|
if not cookies:
|
||||||
|
raise ValueError("No valid cookies parsed for target host. Check cookie content.")
|
||||||
|
pw_cookies = parse_cookies_for_playwright(cookie_value, target_host=target_host, base_url=args.base_url.rstrip("/"))
|
||||||
|
if not pw_cookies:
|
||||||
|
raise ValueError("No Playwright-compatible cookies generated for target host.")
|
||||||
|
|
||||||
|
all_records: list[dict[str, Any]] = []
|
||||||
|
|
||||||
|
with DynamicSession(
|
||||||
|
headless=True,
|
||||||
|
disable_resources=True,
|
||||||
|
cookies=pw_cookies,
|
||||||
|
google_search=False,
|
||||||
|
retries=1,
|
||||||
|
retry_delay=1,
|
||||||
|
) as session:
|
||||||
|
page = 1
|
||||||
|
while page <= args.max_pages:
|
||||||
|
if page > 1:
|
||||||
|
time.sleep(random.uniform(args.delay_min, args.delay_max))
|
||||||
|
|
||||||
|
url = build_bookmarks_url(args.base_url.rstrip("/"), page)
|
||||||
|
response = fetch_dynamic_with_retry(session, url, retries=args.retries, backoff_base=args.backoff_base)
|
||||||
|
|
||||||
|
if should_stop(response):
|
||||||
|
break
|
||||||
|
|
||||||
|
page_records = extract_torrent_cards(response, args.base_url)
|
||||||
|
all_records.extend(page_records)
|
||||||
|
print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
|
||||||
|
page += 1
|
||||||
|
|
||||||
|
output_path = Path(args.output).resolve()
|
||||||
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
output_path.write_text(json.dumps(all_records, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
|
print(f"Saved {len(all_records)} records to {output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
# torrent
|
||||||
|
|
||||||
|
def download_via_browser_with_retry(session: DynamicSession, detail_url: str, retries: int, backoff_base: float) -> tuple[str, bytes]:
|
||||||
last_error: Exception | None = None
|
last_error: Exception | None = None
|
||||||
for attempt in range(retries):
|
for attempt in range(retries):
|
||||||
page = session.context.new_page()
|
page = session.context.new_page()
|
||||||
@@ -184,7 +281,6 @@ def find_download_link(response: Any) -> str:
|
|||||||
if href:
|
if href:
|
||||||
return href
|
return href
|
||||||
|
|
||||||
# Fallback using text match if classes/attributes drift
|
|
||||||
href = (
|
href = (
|
||||||
response.xpath(
|
response.xpath(
|
||||||
"//a[contains(translate(normalize-space(string(.)),"
|
"//a[contains(translate(normalize-space(string(.)),"
|
||||||
@@ -206,7 +302,6 @@ def normalize_filename(filename: str, download_url: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def looks_like_torrent_bytes(data: bytes) -> bool:
|
def looks_like_torrent_bytes(data: bytes) -> bool:
|
||||||
# Basic bencode sanity check for torrent files
|
|
||||||
return bool(data) and data.startswith(b"d") and (b"4:info" in data[:4096])
|
return bool(data) and data.startswith(b"d") and (b"4:info" in data[:4096])
|
||||||
|
|
||||||
|
|
||||||
@@ -218,7 +313,7 @@ def validate_torrent_response(download_url: str, filename: str, data: bytes) ->
|
|||||||
raise RuntimeError("Downloaded file failed torrent bencode check.")
|
raise RuntimeError("Downloaded file failed torrent bencode check.")
|
||||||
|
|
||||||
|
|
||||||
def run(args: argparse.Namespace) -> None:
|
def run_download_torrent_files(args: argparse.Namespace) -> None:
|
||||||
base_url = args.base_url.rstrip("/")
|
base_url = args.base_url.rstrip("/")
|
||||||
target_host = urlparse(base_url).hostname or "www.happyfappy.net"
|
target_host = urlparse(base_url).hostname or "www.happyfappy.net"
|
||||||
|
|
||||||
@@ -246,47 +341,15 @@ def run(args: argparse.Namespace) -> None:
|
|||||||
retries=1,
|
retries=1,
|
||||||
retry_delay=1,
|
retry_delay=1,
|
||||||
) as session:
|
) as session:
|
||||||
detail_response = fetch_dynamic_with_retry(
|
detail_response = fetch_dynamic_with_retry(session, args.url, retries=args.retries, backoff_base=args.backoff_base)
|
||||||
session, args.url, retries=args.retries, backoff_base=args.backoff_base
|
|
||||||
)
|
|
||||||
href = find_download_link(detail_response)
|
href = find_download_link(detail_response)
|
||||||
if not href:
|
if not href:
|
||||||
raise RuntimeError("Download link not found on page.")
|
raise RuntimeError("Download link not found on page.")
|
||||||
|
|
||||||
download_url = absolute_url(base_url, href)
|
download_url = absolute_url(base_url, href)
|
||||||
suggested_filename, data = download_via_browser_with_retry(
|
suggested_filename, data = download_via_browser_with_retry(session, args.url, retries=args.retries, backoff_base=args.backoff_base)
|
||||||
session, args.url, retries=args.retries, backoff_base=args.backoff_base
|
|
||||||
)
|
|
||||||
filename = normalize_filename(suggested_filename, download_url)
|
filename = normalize_filename(suggested_filename, download_url)
|
||||||
validate_torrent_response(download_url, filename, data)
|
validate_torrent_response(download_url, filename, data)
|
||||||
output_path = output_dir / filename
|
output_path = output_dir / filename
|
||||||
output_path.write_bytes(data) # overwrite behavior by design
|
output_path.write_bytes(data)
|
||||||
print(f"Saved torrent to {output_path}")
|
print(f"Saved torrent to {output_path}")
|
||||||
|
|
||||||
|
|
||||||
def make_parser() -> argparse.ArgumentParser:
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
description="Download a torrent file from a single HappyFappy torrent detail page URL.",
|
|
||||||
)
|
|
||||||
parser.add_argument("--url", required=True, help="Torrent detail page URL")
|
|
||||||
parser.add_argument("--base-url", default="https://www.happyfappy.net")
|
|
||||||
parser.add_argument("--cookie", help='Raw cookie string, e.g. "a=1; b=2"')
|
|
||||||
parser.add_argument("--cookie-file", help="Path to cookie file")
|
|
||||||
parser.add_argument("--output-dir", default="torrent")
|
|
||||||
parser.add_argument("--retries", type=int, default=3)
|
|
||||||
parser.add_argument("--backoff-base", type=float, default=5.0)
|
|
||||||
return parser
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
|
||||||
parser = make_parser()
|
|
||||||
args = parser.parse_args()
|
|
||||||
if args.retries < 1:
|
|
||||||
raise ValueError("--retries must be at least 1.")
|
|
||||||
if args.backoff_base < 0:
|
|
||||||
raise ValueError("--backoff-base must be >= 0.")
|
|
||||||
run(args)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
Reference in New Issue
Block a user