From 690733a224de5e89ff826931e4fa7dc5bf469750 Mon Sep 17 00:00:00 2001
From: wisecolt
Date: Fri, 6 Mar 2026 21:11:07 +0300
Subject: [PATCH] first commit

---
 .gitignore                     |  11 ++
 README.md                      |  17 +++
 requirements.txt               |   1 +
 scrape_happyfappy_bookmarks.py | 263 +++++++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 requirements.txt
 create mode 100644 scrape_happyfappy_bookmarks.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..eb91146
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+.venv/
+__pycache__/
+*.pyc
+
+# Local runtime/output files
+cookies.txt
+bookmarks.json
+debug_html/
+
+# Local clone used during development; package install should be used instead
+Scrapling/
diff --git a/README.md b/README.md
index e69de29..74d01a8 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,17 @@
+# HappyFappy Bookmarks Scraper
+
+## Setup
+
+```bash
+python3.12 -m venv .venv
+source .venv/bin/activate
+python -m pip install -U pip
+python -m pip install -r requirements.txt
+scrapling install
+```
+
+## Run
+
+```bash
+python scrape_happyfappy_bookmarks.py --cookie-file cookies.txt --output bookmarks.json
+```
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..03260b2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+scrapling[fetchers]==0.4.1
diff --git a/scrape_happyfappy_bookmarks.py b/scrape_happyfappy_bookmarks.py
new file mode 100644
index 0000000..5ac9f06
--- /dev/null
+++ b/scrape_happyfappy_bookmarks.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import random
+import re
+import sys
+import time
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse
+
+try:
+    from scrapling.fetchers import DynamicSession
+except ModuleNotFoundError:
+    local_repo = Path(__file__).resolve().parent / "Scrapling"
+    if local_repo.exists():
+        sys.path.insert(0, str(local_repo))
+        from scrapling.fetchers import DynamicSession
+    else:
+        raise
+
+STOP_TEXT = "You have not bookmarked any torrents."
+BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)")
+
+
+def _domain_matches(target_host: str, cookie_domain: str) -> bool:
+    cd = cookie_domain.lstrip(".").lower()
+    th = target_host.lower()
+    return th == cd or th.endswith("." + cd)
+
+
+def parse_cookie_string(cookie_string: str, target_host: str) -> dict[str, str]:
+    """
+    Supports:
+    1) "key=value; key2=value2" cookie header style
+    2) Netscape cookie file format (tab-separated 7 columns)
+    """
+    cookies: dict[str, str] = {}
+    lines = cookie_string.splitlines()
+
+    looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines)
+    if looks_like_netscape:
+        for raw_line in lines:
+            line = raw_line.strip()
+            if not line or line.startswith("#"):
+                continue
+            parts = line.split("\t")
+            if len(parts) < 7:
+                continue
+            domain, _flag, _path, _secure, _expires, name, value = parts[:7]
+            if not _domain_matches(target_host, domain):
+                continue
+            if name:
+                cookies[name] = value
+        return cookies
+
+    for chunk in cookie_string.split(";"):
+        piece = chunk.strip()
+        if not piece or "=" not in piece:
+            continue
+        key, value = piece.split("=", 1)
+        key = key.strip()
+        value = value.strip()
+        if key:
+            cookies[key] = value
+    return cookies
+
+
+def parse_cookies_for_playwright(
+    cookie_string: str, target_host: str, base_url: str
+) -> list[dict[str, Any]]:
+    """
+    Converts cookie input into Playwright-compatible cookie objects.
+    """
+    lines = cookie_string.splitlines()
+    cookies: list[dict[str, Any]] = []
+
+    looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines)
+    if looks_like_netscape:
+        for raw_line in lines:
+            line = raw_line.strip()
+            if not line or line.startswith("#"):
+                continue
+            parts = line.split("\t")
+            if len(parts) < 7:
+                continue
+            domain, _flag, path, secure, expires, name, value = parts[:7]
+            if not _domain_matches(target_host, domain):
+                continue
+            if not name:
+                continue
+
+            cookie_obj: dict[str, Any] = {
+                "name": name,
+                "value": value,
+                "domain": domain.lstrip("."),
+                "path": path or "/",
+                "secure": (secure.upper() == "TRUE"),
+            }
+            if expires.isdigit():
+                exp_num = int(expires)
+                if exp_num > 0:
+                    cookie_obj["expires"] = float(exp_num)
+            cookies.append(cookie_obj)
+        return cookies
+
+    kv = parse_cookie_string(cookie_string, target_host)
+    for name, value in kv.items():
+        cookies.append({"name": name, "value": value, "url": base_url})
+    return cookies
+
+
+def extract_background_image(style: str) -> str | None:
+    if not style:
+        return None
+    match = BG_URL_RE.search(style)
+    if not match:
+        return None
+    value = match.group(1).strip()
+    return value or None
+
+
+def extract_torrent_cards(response: Any) -> list[dict[str, Any]]:
+    records: list[dict[str, Any]] = []
+    cards = response.css("div.torrent_grid div.torrent_grid__torrent")
+    for card in cards:
+        page_url = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip()
+        category = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip()
+        title = (
+            card.css("h3.trim::attr(title)").get("")
+            or card.css("h3.trim::text").get("")
+            or ""
+        ).strip()
+        style = (card.css("div.torrent__cover::attr(style)").get("") or "").strip()
+        background_image = extract_background_image(style)
+
+        records.append(
+            {
+                "pageURL": page_url,
+                "isVR": category == "VR",
+                "title": title,
+                "backgroundImage": background_image,
+            }
+        )
+    return records
+
+
+def should_stop(response: Any) -> bool:
+    body_text = response.body.decode(response.encoding or "utf-8", errors="ignore")
+    return STOP_TEXT in body_text
+
+
+def fetch_page(session: Any, url: str, retries: int, backoff_base: float) -> Any:
+    last_error: Exception | None = None
+    for attempt in range(retries):
+        try:
+            response = session.fetch(
+                url,
+                timeout=45_000,
+                load_dom=True,
+                network_idle=False,
+            )
+            status = response.status
+            if status in (403, 429) or status >= 500:
+                raise RuntimeError(f"HTTP {status}")
+            return response
+        except Exception as err:  # noqa: BLE001
+            last_error = err
+            if attempt == retries - 1:
+                break
+            sleep_seconds = backoff_base * (2**attempt) + random.uniform(0.0, 0.7)
+            time.sleep(sleep_seconds)
+    raise RuntimeError(f"Request failed for {url}: {last_error}") from last_error
+
+
+def build_bookmarks_url(base_url: str, page: int) -> str:
+    if page == 1:
+        return f"{base_url}/bookmarks.php?type=torrents"
+    return f"{base_url}/bookmarks.php?page={page}&type=torrents#torrent_table"
+
+
+def run(args: argparse.Namespace) -> None:
+    target_host = urlparse(args.base_url).hostname or "www.happyfappy.net"
+
+    cookie_value = args.cookie or ""
+    if not cookie_value and args.cookie_file:
+        cookie_value = Path(args.cookie_file).read_text(encoding="utf-8").strip()
+    if not cookie_value:
+        raise ValueError("Cookie is required. Use --cookie or --cookie-file.")
+
+    cookies = parse_cookie_string(cookie_value, target_host=target_host)
+    if not cookies:
+        raise ValueError("No valid cookies parsed for target host. Check cookie content.")
+    pw_cookies = parse_cookies_for_playwright(
+        cookie_value, target_host=target_host, base_url=args.base_url.rstrip("/")
+    )
+    if not pw_cookies:
+        raise ValueError("No Playwright-compatible cookies generated for target host.")
+
+    all_records: list[dict[str, Any]] = []
+
+    with DynamicSession(
+        headless=True,
+        disable_resources=True,
+        cookies=pw_cookies,
+        google_search=False,
+        retries=1,
+        retry_delay=1,
+    ) as session:
+        page = 1
+        while page <= args.max_pages:
+            if page > 1:
+                time.sleep(random.uniform(args.delay_min, args.delay_max))
+
+            url = build_bookmarks_url(args.base_url.rstrip("/"), page)
+            response = fetch_page(session, url, retries=args.retries, backoff_base=args.backoff_base)
+
+            if should_stop(response):
+                break
+
+            page_records = extract_torrent_cards(response)
+            all_records.extend(page_records)
+            print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
+            page += 1
+
+    output_path = Path(args.output).resolve()
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(json.dumps(all_records, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"Saved {len(all_records)} records to {output_path}")
+
+
+def make_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        description="Scrape HappyFappy torrent bookmarks using an authenticated cookie.",
+    )
+    parser.add_argument("--base-url", default="https://www.happyfappy.net")
+    parser.add_argument("--cookie", help='Raw cookie string, e.g. "a=1; b=2"')
+    parser.add_argument("--cookie-file", help="Path to a text file containing raw cookie string")
+    parser.add_argument("--output", default="bookmarks.json")
+    parser.add_argument("--delay-min", type=float, default=1.8, help="Minimum delay between page requests")
+    parser.add_argument("--delay-max", type=float, default=3.2, help="Maximum delay between page requests")
+    parser.add_argument("--retries", type=int, default=3, help="Retries per page request")
+    parser.add_argument("--backoff-base", type=float, default=5.0, help="Backoff base seconds")
+    parser.add_argument("--max-pages", type=int, default=200, help="Safety cap for pagination loop")
+    return parser
+
+
+def main() -> None:
+    parser = make_parser()
+    args = parser.parse_args()
+    if args.delay_min < 0 or args.delay_max < 0:
+        raise ValueError("Delay values must be non-negative.")
+    if args.delay_min > args.delay_max:
+        raise ValueError("--delay-min cannot be greater than --delay-max.")
+    if args.retries < 1:
+        raise ValueError("--retries must be at least 1.")
+    run(args)
+
+
+if __name__ == "__main__":
+    main()