feat: add browser-based torrent download and package-only Scrapling usage

2026-03-07 01:04:36 +03:00
parent 690733a224
commit bea3010839
3 changed files with 307 additions and 13 deletions
--- a/scrape_happyfappy_bookmarks.py
+++ b/scrape_happyfappy_bookmarks.py
@@ -5,21 +5,12 @@ import argparse
 import json
 import random
 import re
-import sys
 import time
 from pathlib import Path
 from typing import Any
 from urllib.parse import urlparse

-try:
-    from scrapling.fetchers import DynamicSession
-except ModuleNotFoundError:
-    local_repo = Path(__file__).resolve().parent / "Scrapling"
-    if local_repo.exists():
-        sys.path.insert(0, str(local_repo))
-        from scrapling.fetchers import DynamicSession
-    else:
-        raise
+from scrapling.fetchers import DynamicSession

 STOP_TEXT = "You have not bookmarked any torrents."
 BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)")
@@ -122,11 +113,13 @@ def extract_background_image(style: str) -> str | None:
    return value or None


-def extract_torrent_cards(response: Any) -> list[dict[str, Any]]:
+def extract_torrent_cards(response: Any, base_url: str) -> list[dict[str, Any]]:
    records: list[dict[str, Any]] = []
    cards = response.css("div.torrent_grid div.torrent_grid__torrent")
    for card in cards:
        page_url = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip()
+        if page_url and not page_url.startswith("http"):
+            page_url = f"{base_url.rstrip('/')}{page_url}"
        category = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip()
        title = (
            card.css("h3.trim::attr(title)").get("")
@@ -220,7 +213,7 @@ def run(args: argparse.Namespace) -> None:
            if should_stop(response):
                break

-            page_records = extract_torrent_cards(response)
+            page_records = extract_torrent_cards(response, args.base_url)
            all_records.extend(page_records)
            print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
            page += 1