From 1ef7118ba7ec87d0785cbc81c55c214bf14ea9cb Mon Sep 17 00:00:00 2001 From: wisecolt Date: Sat, 7 Mar 2026 01:40:18 +0300 Subject: [PATCH] refactor: migrate to src package layout and wscraper entry module --- pyproject.toml | 22 ++ requirements.txt | 2 +- scrape_happyfappy_bookmarks.py | 256 ------------------ src/wscraper.egg-info/PKG-INFO | 92 +++++++ src/wscraper.egg-info/SOURCES.txt | 13 + src/wscraper.egg-info/dependency_links.txt | 1 + src/wscraper.egg-info/entry_points.txt | 2 + src/wscraper.egg-info/requires.txt | 1 + src/wscraper.egg-info/top_level.txt | 1 + src/wscraper/__init__.py | 3 + src/wscraper/__main__.py | 5 + wscraper.py => src/wscraper/cli.py | 52 ++-- src/wscraper/sites/__init__.py | 1 + .../wscraper/sites/happyfappy.py | 153 ++++++++--- 14 files changed, 274 insertions(+), 330 deletions(-) create mode 100644 pyproject.toml delete mode 100644 scrape_happyfappy_bookmarks.py create mode 100644 src/wscraper.egg-info/PKG-INFO create mode 100644 src/wscraper.egg-info/SOURCES.txt create mode 100644 src/wscraper.egg-info/dependency_links.txt create mode 100644 src/wscraper.egg-info/entry_points.txt create mode 100644 src/wscraper.egg-info/requires.txt create mode 100644 src/wscraper.egg-info/top_level.txt create mode 100644 src/wscraper/__init__.py create mode 100644 src/wscraper/__main__.py rename wscraper.py => src/wscraper/cli.py (75%) create mode 100644 src/wscraper/sites/__init__.py rename download_happyfappy_torrent.py => src/wscraper/sites/happyfappy.py (68%) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1c31d88 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "wscraper" +version = "0.1.0" +description = "Multi-site scraper CLI" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "scrapling[fetchers]==0.4.1", +] + +[project.scripts] +wscraper = "wscraper.cli:main" + +[tool.setuptools] +package-dir = {"" = "src"} + +[tool.setuptools.packages.find] +where = ["src"] diff --git a/requirements.txt b/requirements.txt index 03260b2..d6e1198 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -scrapling[fetchers]==0.4.1 +-e . diff --git a/scrape_happyfappy_bookmarks.py b/scrape_happyfappy_bookmarks.py deleted file mode 100644 index b79e549..0000000 --- a/scrape_happyfappy_bookmarks.py +++ /dev/null @@ -1,256 +0,0 @@ -#!/usr/bin/env python3 -from __future__ import annotations - -import argparse -import json -import random -import re -import time -from pathlib import Path -from typing import Any -from urllib.parse import urlparse - -from scrapling.fetchers import DynamicSession - -STOP_TEXT = "You have not bookmarked any torrents." -BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)") - - -def _domain_matches(target_host: str, cookie_domain: str) -> bool: - cd = cookie_domain.lstrip(".").lower() - th = target_host.lower() - return th == cd or th.endswith("." 
+ cd) - - -def parse_cookie_string(cookie_string: str, target_host: str) -> dict[str, str]: - """ - Supports: - 1) "key=value; key2=value2" cookie header style - 2) Netscape cookie file format (tab-separated 7 columns) - """ - cookies: dict[str, str] = {} - lines = cookie_string.splitlines() - - looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines) - if looks_like_netscape: - for raw_line in lines: - line = raw_line.strip() - if not line or line.startswith("#"): - continue - parts = line.split("\t") - if len(parts) < 7: - continue - domain, _flag, _path, _secure, _expires, name, value = parts[:7] - if not _domain_matches(target_host, domain): - continue - if name: - cookies[name] = value - return cookies - - for chunk in cookie_string.split(";"): - piece = chunk.strip() - if not piece or "=" not in piece: - continue - key, value = piece.split("=", 1) - key = key.strip() - value = value.strip() - if key: - cookies[key] = value - return cookies - - -def parse_cookies_for_playwright( - cookie_string: str, target_host: str, base_url: str -) -> list[dict[str, Any]]: - """ - Converts cookie input into Playwright-compatible cookie objects. - """ - lines = cookie_string.splitlines() - cookies: list[dict[str, Any]] = [] - - looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines) - if looks_like_netscape: - for raw_line in lines: - line = raw_line.strip() - if not line or line.startswith("#"): - continue - parts = line.split("\t") - if len(parts) < 7: - continue - domain, _flag, path, secure, expires, name, value = parts[:7] - if not _domain_matches(target_host, domain): - continue - if not name: - continue - - cookie_obj: dict[str, Any] = { - "name": name, - "value": value, - "domain": domain.lstrip("."), - "path": path or "/", - "secure": (secure.upper() == "TRUE"), - } - if expires.isdigit(): - exp_num = int(expires) - if exp_num > 0: - cookie_obj["expires"] = float(exp_num) - cookies.append(cookie_obj) - return cookies - - kv = parse_cookie_string(cookie_string, target_host) - for name, value in kv.items(): - cookies.append({"name": name, "value": value, "url": base_url}) - return cookies - - -def extract_background_image(style: str) -> str | None: - if not style: - return None - match = BG_URL_RE.search(style) - if not match: - return None - value = match.group(1).strip() - return value or None - - -def extract_torrent_cards(response: Any, base_url: str) -> list[dict[str, Any]]: - records: list[dict[str, Any]] = [] - cards = response.css("div.torrent_grid div.torrent_grid__torrent") - for card in cards: - page_url = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip() - if page_url and not page_url.startswith("http"): - page_url = f"{base_url.rstrip('/')}{page_url}" - category = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip() - title = ( - card.css("h3.trim::attr(title)").get("") - or card.css("h3.trim::text").get("") - or "" - ).strip() - style = (card.css("div.torrent__cover::attr(style)").get("") or "").strip() - background_image = extract_background_image(style) - - records.append( - { - "pageURL": page_url, - "isVR": category == "VR", - "title": title, - "backgroundImage": background_image, - } - ) - return records - - -def should_stop(response: Any) -> bool: - body_text = response.body.decode(response.encoding or "utf-8", errors="ignore") - return STOP_TEXT in body_text - - -def fetch_page(session: Any, url: str, retries: int, backoff_base: float) -> Any: - last_error: Exception | None = None - 
for attempt in range(retries):
-        try:
-            response = session.fetch(
-                url,
-                timeout=45_000,
-                load_dom=True,
-                network_idle=False,
-            )
-            status = response.status
-            if status in (403, 429) or status >= 500:
-                raise RuntimeError(f"HTTP {status}")
-            return response
-        except Exception as err:  # noqa: BLE001
-            last_error = err
-            if attempt == retries - 1:
-                break
-            sleep_seconds = backoff_base * (2**attempt) + random.uniform(0.0, 0.7)
-            time.sleep(sleep_seconds)
-    raise RuntimeError(f"Request failed for {url}: {last_error}") from last_error
-
-
-def build_bookmarks_url(base_url: str, page: int) -> str:
-    if page == 1:
-        return f"{base_url}/bookmarks.php?type=torrents"
-    return f"{base_url}/bookmarks.php?page={page}&type=torrents#torrent_table"
-
-
-def run(args: argparse.Namespace) -> None:
-    target_host = urlparse(args.base_url).hostname or "www.happyfappy.net"
-
-    cookie_value = args.cookie or ""
-    if not cookie_value and args.cookie_file:
-        cookie_value = Path(args.cookie_file).read_text(encoding="utf-8").strip()
-    if not cookie_value:
-        raise ValueError("Cookie is required. Use --cookie or --cookie-file.")
-
-    cookies = parse_cookie_string(cookie_value, target_host=target_host)
-    if not cookies:
-        raise ValueError("No valid cookies parsed for target host. Check cookie content.")
-    pw_cookies = parse_cookies_for_playwright(
-        cookie_value, target_host=target_host, base_url=args.base_url.rstrip("/")
-    )
-    if not pw_cookies:
-        raise ValueError("No Playwright-compatible cookies generated for target host.")
-
-    all_records: list[dict[str, Any]] = []
-
-    with DynamicSession(
-        headless=True,
-        disable_resources=True,
-        cookies=pw_cookies,
-        google_search=False,
-        retries=1,
-        retry_delay=1,
-    ) as session:
-        page = 1
-        while page <= args.max_pages:
-            if page > 1:
-                time.sleep(random.uniform(args.delay_min, args.delay_max))
-
-            url = build_bookmarks_url(args.base_url.rstrip("/"), page)
-            response = fetch_page(session, url, retries=args.retries, backoff_base=args.backoff_base)
-
-            if should_stop(response):
-                break
-
-            page_records = extract_torrent_cards(response, args.base_url)
-            all_records.extend(page_records)
-            print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
-            page += 1
-
-    output_path = Path(args.output).resolve()
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    output_path.write_text(json.dumps(all_records, ensure_ascii=False, indent=2), encoding="utf-8")
-    print(f"Saved {len(all_records)} records to {output_path}")
-
-
-def make_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(
-        description="Scrape HappyFappy torrent bookmarks using an authenticated cookie.",
-    )
-    parser.add_argument("--base-url", default="https://www.happyfappy.net")
-    parser.add_argument("--cookie", help='Raw cookie string, e.g. "a=1; b=2"')
-    parser.add_argument("--cookie-file", help="Path to a text file containing raw cookie string")
-    parser.add_argument("--output", default="bookmarks.json")
-    parser.add_argument("--delay-min", type=float, default=1.8, help="Minimum delay between page requests")
-    parser.add_argument("--delay-max", type=float, default=3.2, help="Maximum delay between page requests")
-    parser.add_argument("--retries", type=int, default=3, help="Retries per page request")
-    parser.add_argument("--backoff-base", type=float, default=5.0, help="Backoff base seconds")
-    parser.add_argument("--max-pages", type=int, default=200, help="Safety cap for pagination loop")
-    return parser
-
-
-def main() -> None:
-    parser = make_parser()
-    args = parser.parse_args()
-    if args.delay_min < 0 or args.delay_max < 0:
-        raise ValueError("Delay values must be non-negative.")
-    if args.delay_min > args.delay_max:
-        raise ValueError("--delay-min cannot be greater than --delay-max.")
-    if args.retries < 1:
-        raise ValueError("--retries must be at least 1.")
-    run(args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/src/wscraper.egg-info/PKG-INFO b/src/wscraper.egg-info/PKG-INFO
new file mode 100644
index 0000000..ede08e2
--- /dev/null
+++ b/src/wscraper.egg-info/PKG-INFO
@@ -0,0 +1,92 @@
+Metadata-Version: 2.4
+Name: wscraper
+Version: 0.1.0
+Summary: Multi-site scraper CLI
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+Requires-Dist: scrapling[fetchers]==0.4.1
+
+# wscraper
+
+The HappyFappy commands run through the packaged `wscraper` CLI. The project uses a `src/` package layout so further sites can be added alongside it.
+
+## 1) Clone the Repo
+
+```bash
+git clone <repo-url>
+cd <repo-dir>
+```
+
+## 2) Installation
+
+### macOS / Linux
+
+```bash
+python3.12 -m venv .venv
+source .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -e .
+scrapling install
+```
+
+### Windows (PowerShell)
+
+```powershell
+py -3.12 -m venv .venv
+.venv\Scripts\Activate.ps1
+python -m pip install -U pip
+python -m pip install -e .
+scrapling install
+```
+
+### Windows (CMD)
+
+```bat
+py -3.12 -m venv .venv
+.venv\Scripts\activate.bat
+python -m pip install -U pip
+python -m pip install -e .
+scrapling install
+```
+
+Note: once the environment is activated, the commands are available as `wscraper ...`; `python -m wscraper ...` works as well.
+
+## 3) HappyFappy Commands
+
+### Fetch Bookmarks
+
+```bash
+wscraper happyfappy --action get-bookmarks -c cookies.txt -o bookmarks.json
+```
+
+### Download Torrent Files
+
+```bash
+wscraper happyfappy --action download-torrent-files -u "https://www.happyfappy.net/torrents.php?id=110178" -c cookies.txt -o torrent
+```
+
+## 4) Short Alias Usage
+
+```bash
+# site alias: hf
+# action aliases: gb (get-bookmarks), dtf (download-torrent-files)
+wscraper hf -a gb -c cookies.txt -o bookmarks.json
+wscraper hf -a dtf -u "https://www.happyfappy.net/torrents.php?id=110178" -c cookies.txt -o torrent
+```
+
+## 5) Project Layout
+
+```text
+.
+├── pyproject.toml
+├── requirements.txt
+├── src/
+│   └── wscraper/
+│       ├── __init__.py
+│       ├── __main__.py
+│       ├── cli.py
+│       └── sites/
+│           ├── __init__.py
+│           └── happyfappy.py
+└── README.md
+```
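+
+## 6) Cookie Input Format
+
+Both `--cookie` (raw string) and `-c/--cookie-file` accept either a `key=value; key2=value2` request-header string or a Netscape `cookies.txt` export (7 tab-separated columns per line). A minimal sketch of how `parse_cookie_string` handles the two styles; the cookie names and values below are made up:
+
+```python
+from wscraper.sites.happyfappy import parse_cookie_string
+
+# Header-style string, as copied from a browser's Cookie request header.
+print(parse_cookie_string("uid=123; pass=abc", target_host="www.happyfappy.net"))
+# {'uid': '123', 'pass': 'abc'}
+
+# Netscape export: a comment line, then domain/flag/path/secure/expires/name/value.
+netscape = "# Netscape HTTP Cookie File\n.happyfappy.net\tTRUE\t/\tTRUE\t0\tuid\t123"
+print(parse_cookie_string(netscape, target_host="www.happyfappy.net"))
+# {'uid': '123'}
+```
+
+Netscape entries whose cookie domain does not match the target host are skipped.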
happyfappy or hf") parser.add_argument("-a", "--action", required=True, help="Action to run") @@ -58,7 +53,6 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument("-r", "--retries", type=int, default=3) parser.add_argument("--backoff-base", type=float, default=5.0) - parser.add_argument("--delay-min", type=float, default=1.8) parser.add_argument("--delay-max", type=float, default=3.2) parser.add_argument("--max-pages", type=int, default=200) @@ -69,33 +63,35 @@ def run_happyfappy(args: argparse.Namespace, action: str) -> None: base_url = args.base_url or "https://www.happyfappy.net" if action == "get-bookmarks": - bookmarks_args = argparse.Namespace( - base_url=base_url, - cookie=args.cookie, - cookie_file=args.cookie_file, - output=args.output or "bookmarks.json", - delay_min=args.delay_min, - delay_max=args.delay_max, - retries=args.retries, - backoff_base=args.backoff_base, - max_pages=args.max_pages, + run_get_bookmarks( + argparse.Namespace( + base_url=base_url, + cookie=args.cookie, + cookie_file=args.cookie_file, + output=args.output or "bookmarks.json", + delay_min=args.delay_min, + delay_max=args.delay_max, + retries=args.retries, + backoff_base=args.backoff_base, + max_pages=args.max_pages, + ) ) - run_happyfappy_bookmarks(bookmarks_args) return if action == "download-torrent-files": if not args.url: raise ValueError("--url is required for action=download-torrent-files.") - download_args = argparse.Namespace( - url=args.url, - base_url=base_url, - cookie=args.cookie, - cookie_file=args.cookie_file, - output_dir=args.output or "torrent", - retries=args.retries, - backoff_base=args.backoff_base, + run_download_torrent_files( + argparse.Namespace( + url=args.url, + base_url=base_url, + cookie=args.cookie, + cookie_file=args.cookie_file, + output_dir=args.output or "torrent", + retries=args.retries, + backoff_base=args.backoff_base, + ) ) - run_happyfappy_download(download_args) return raise ValueError(f"Unsupported action for happyfappy: {action}") diff --git a/src/wscraper/sites/__init__.py b/src/wscraper/sites/__init__.py new file mode 100644 index 0000000..c677ec3 --- /dev/null +++ b/src/wscraper/sites/__init__.py @@ -0,0 +1 @@ +__all__ = ["happyfappy"] diff --git a/download_happyfappy_torrent.py b/src/wscraper/sites/happyfappy.py similarity index 68% rename from download_happyfappy_torrent.py rename to src/wscraper/sites/happyfappy.py index 684caba..4f4fa51 100644 --- a/download_happyfappy_torrent.py +++ b/src/wscraper/sites/happyfappy.py @@ -1,7 +1,9 @@ -#!/usr/bin/env python3 from __future__ import annotations import argparse +import json +import random +import re import time from pathlib import Path from typing import Any @@ -9,6 +11,9 @@ from urllib.parse import urlparse from scrapling.fetchers import DynamicSession +STOP_TEXT = "You have not bookmarked any torrents." 
+BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)") + def _domain_matches(target_host: str, cookie_domain: str) -> bool: cd = cookie_domain.lstrip(".").lower() @@ -48,9 +53,7 @@ def parse_cookie_string(cookie_string: str, target_host: str) -> dict[str, str]: return cookies -def parse_cookies_for_playwright( - cookie_string: str, target_host: str, base_url: str -) -> list[dict[str, Any]]: +def parse_cookies_for_playwright(cookie_string: str, target_host: str, base_url: str) -> list[dict[str, Any]]: lines = cookie_string.splitlines() cookies: list[dict[str, Any]] = [] looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines) @@ -121,9 +124,103 @@ def fetch_dynamic_with_retry(session: Any, url: str, retries: int, backoff_base: raise RuntimeError(f"Request failed for {url}: {last_error}") from last_error -def download_via_browser_with_retry( - session: DynamicSession, detail_url: str, retries: int, backoff_base: float -) -> tuple[str, bytes]: +# bookmarks + +def extract_background_image(style: str) -> str | None: + if not style: + return None + match = BG_URL_RE.search(style) + if not match: + return None + value = match.group(1).strip() + return value or None + + +def extract_torrent_cards(response: Any, base_url: str) -> list[dict[str, Any]]: + records: list[dict[str, Any]] = [] + cards = response.css("div.torrent_grid div.torrent_grid__torrent") + for card in cards: + page_url = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip() + if page_url and not page_url.startswith("http"): + page_url = f"{base_url.rstrip('/')}{page_url}" + category = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip() + title = (card.css("h3.trim::attr(title)").get("") or card.css("h3.trim::text").get("") or "").strip() + style = (card.css("div.torrent__cover::attr(style)").get("") or "").strip() + background_image = extract_background_image(style) + + records.append( + { + "pageURL": page_url, + "isVR": category == "VR", + "title": title, + "backgroundImage": background_image, + } + ) + return records + + +def should_stop(response: Any) -> bool: + body_text = response.body.decode(response.encoding or "utf-8", errors="ignore") + return STOP_TEXT in body_text + + +def build_bookmarks_url(base_url: str, page: int) -> str: + if page == 1: + return f"{base_url}/bookmarks.php?type=torrents" + return f"{base_url}/bookmarks.php?page={page}&type=torrents#torrent_table" + + +def run_get_bookmarks(args: argparse.Namespace) -> None: + target_host = urlparse(args.base_url).hostname or "www.happyfappy.net" + + cookie_value = args.cookie or "" + if not cookie_value and args.cookie_file: + cookie_value = Path(args.cookie_file).read_text(encoding="utf-8").strip() + if not cookie_value: + raise ValueError("Cookie is required. Use --cookie or --cookie-file.") + + cookies = parse_cookie_string(cookie_value, target_host=target_host) + if not cookies: + raise ValueError("No valid cookies parsed for target host. 
Check cookie content.")
+    pw_cookies = parse_cookies_for_playwright(cookie_value, target_host=target_host, base_url=args.base_url.rstrip("/"))
+    if not pw_cookies:
+        raise ValueError("No Playwright-compatible cookies generated for target host.")
+
+    all_records: list[dict[str, Any]] = []
+
+    with DynamicSession(
+        headless=True,
+        disable_resources=True,
+        cookies=pw_cookies,
+        google_search=False,
+        retries=1,
+        retry_delay=1,
+    ) as session:
+        page = 1
+        while page <= args.max_pages:
+            if page > 1:
+                time.sleep(random.uniform(args.delay_min, args.delay_max))
+
+            url = build_bookmarks_url(args.base_url.rstrip("/"), page)
+            response = fetch_dynamic_with_retry(session, url, retries=args.retries, backoff_base=args.backoff_base)
+
+            if should_stop(response):
+                break
+
+            page_records = extract_torrent_cards(response, args.base_url)
+            all_records.extend(page_records)
+            print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
+            page += 1
+
+    output_path = Path(args.output).resolve()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(all_records, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"Saved {len(all_records)} records to {output_path}")
+
+
+# torrent
+
+def download_via_browser_with_retry(session: DynamicSession, detail_url: str, retries: int, backoff_base: float) -> tuple[str, bytes]:
     last_error: Exception | None = None
     for attempt in range(retries):
         page = session.context.new_page()
@@ -184,7 +281,6 @@ def find_download_link(response: Any) -> str:
         if href:
             return href
 
     # Fallback using text match if classes/attributes drift
     href = (
         response.xpath(
             "//a[contains(translate(normalize-space(string(.)),"
@@ -206,7 +302,6 @@ def normalize_filename(filename: str, download_url: str) -> str:
 
 
 def looks_like_torrent_bytes(data: bytes) -> bool:
     # Basic bencode sanity check for torrent files
     return bool(data) and data.startswith(b"d") and (b"4:info" in data[:4096])
 
 
@@ -218,7 +313,7 @@
         raise RuntimeError("Downloaded file failed torrent bencode check.")
 
 
-def run(args: argparse.Namespace) -> None:
+def run_download_torrent_files(args: argparse.Namespace) -> None:
     base_url = args.base_url.rstrip("/")
     target_host = urlparse(base_url).hostname or "www.happyfappy.net"
@@ -246,47 +341,15 @@ def run(args: argparse.Namespace) -> None:
         retries=1,
         retry_delay=1,
     ) as session:
-        detail_response = fetch_dynamic_with_retry(
-            session, args.url, retries=args.retries, backoff_base=args.backoff_base
-        )
+        detail_response = fetch_dynamic_with_retry(session, args.url, retries=args.retries, backoff_base=args.backoff_base)
         href = find_download_link(detail_response)
         if not href:
             raise RuntimeError("Download link not found on page.")
         download_url = absolute_url(base_url, href)
 
-        suggested_filename, data = download_via_browser_with_retry(
-            session, args.url, retries=args.retries, backoff_base=args.backoff_base
-        )
+        suggested_filename, data = download_via_browser_with_retry(session, args.url, retries=args.retries, backoff_base=args.backoff_base)
     filename = normalize_filename(suggested_filename, download_url)
     validate_torrent_response(download_url, filename, data)
 
     output_path = output_dir / filename
     output_path.write_bytes(data)  # overwrite behavior by design
     print(f"Saved torrent to {output_path}")
-
-
-def make_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(
-        description="Download a torrent file from a
single HappyFappy torrent detail page URL.", - ) - parser.add_argument("--url", required=True, help="Torrent detail page URL") - parser.add_argument("--base-url", default="https://www.happyfappy.net") - parser.add_argument("--cookie", help='Raw cookie string, e.g. "a=1; b=2"') - parser.add_argument("--cookie-file", help="Path to cookie file") - parser.add_argument("--output-dir", default="torrent") - parser.add_argument("--retries", type=int, default=3) - parser.add_argument("--backoff-base", type=float, default=5.0) - return parser - - -def main() -> None: - parser = make_parser() - args = parser.parse_args() - if args.retries < 1: - raise ValueError("--retries must be at least 1.") - if args.backoff_base < 0: - raise ValueError("--backoff-base must be >= 0.") - run(args) - - -if __name__ == "__main__": - main()