refactor: migrate to src package layout and wscraper entry module

This commit is contained in:
2026-03-07 01:40:18 +03:00
parent b224df5847
commit 1ef7118ba7
14 changed files with 274 additions and 330 deletions

22
pyproject.toml Normal file
View File

@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "wscraper"
version = "0.1.0"
description = "Multi-site scraper CLI"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"scrapling[fetchers]==0.4.1",
]
[project.scripts]
wscraper = "wscraper.cli:main"
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

View File

@@ -1 +1 @@
scrapling[fetchers]==0.4.1 -e .

View File

@@ -1,256 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import random
import re
import time
from pathlib import Path
from typing import Any
from urllib.parse import urlparse
from scrapling.fetchers import DynamicSession
# Marker text the site renders on an empty bookmarks page; pagination stops when seen.
STOP_TEXT = "You have not bookmarked any torrents."
# Captures the URL inside a CSS background-image declaration: url(...), url('...'), url("...").
BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)")
def _domain_matches(target_host: str, cookie_domain: str) -> bool:
cd = cookie_domain.lstrip(".").lower()
th = target_host.lower()
return th == cd or th.endswith("." + cd)
def parse_cookie_string(cookie_string: str, target_host: str) -> dict[str, str]:
    """
    Parse cookie text into a name -> value mapping.

    Supports:
    1) "key=value; key2=value2" cookie header style
    2) Netscape cookie file format (tab-separated 7 columns)

    Netscape rows whose domain does not cover *target_host* are skipped.
    """
    result: dict[str, str] = {}
    lines = cookie_string.splitlines()
    # Heuristic: multiple lines with tabs means a Netscape cookies.txt dump.
    is_netscape = len(lines) > 1 and any("\t" in line for line in lines)
    if is_netscape:
        for raw in lines:
            entry = raw.strip()
            if not entry or entry.startswith("#"):
                continue
            fields = entry.split("\t")
            if len(fields) < 7:
                continue
            domain, _flag, _path, _secure, _expires, name, value = fields[:7]
            if not _domain_matches(target_host, domain):
                continue
            if name:
                result[name] = value
        return result
    # Header style: "a=1; b=2" — split on ';' and take the first '=' as separator.
    for chunk in cookie_string.split(";"):
        item = chunk.strip()
        if "=" not in item:
            continue
        name, _, value = item.partition("=")
        name = name.strip()
        if name:
            result[name] = value.strip()
    return result
def parse_cookies_for_playwright(
    cookie_string: str, target_host: str, base_url: str
) -> list[dict[str, Any]]:
    """
    Convert cookie text (header style or Netscape file) into
    Playwright-compatible cookie objects.
    """
    lines = cookie_string.splitlines()
    result: list[dict[str, Any]] = []
    # Same detection heuristic as parse_cookie_string.
    is_netscape = len(lines) > 1 and any("\t" in line for line in lines)
    if not is_netscape:
        # Header-style cookies carry no domain/path info; scope them by URL instead.
        for name, value in parse_cookie_string(cookie_string, target_host).items():
            result.append({"name": name, "value": value, "url": base_url})
        return result
    for raw in lines:
        entry = raw.strip()
        if not entry or entry.startswith("#"):
            continue
        fields = entry.split("\t")
        if len(fields) < 7:
            continue
        domain, _flag, path, secure, expires, name, value = fields[:7]
        if not _domain_matches(target_host, domain):
            continue
        if not name:
            continue
        cookie: dict[str, Any] = {
            "name": name,
            "value": value,
            "domain": domain.lstrip("."),
            "path": path or "/",
            "secure": (secure.upper() == "TRUE"),
        }
        # Only positive integer expirations are kept; anything else means session cookie.
        if expires.isdigit() and int(expires) > 0:
            cookie["expires"] = float(expires)
        result.append(cookie)
    return result
def extract_background_image(style: str) -> str | None:
    """Return the URL from a CSS background-image declaration in *style*, or None."""
    if not style:
        return None
    found = BG_URL_RE.search(style)
    if found is None:
        return None
    url = found.group(1).strip()
    return url if url else None
def extract_torrent_cards(response: Any, base_url: str) -> list[dict[str, Any]]:
    """
    Extract one record per torrent card on a bookmarks listing page.

    Each record has keys: pageURL, isVR, title, backgroundImage.
    """
    root = base_url.rstrip("/")
    results: list[dict[str, Any]] = []
    for card in response.css("div.torrent_grid div.torrent_grid__torrent"):
        href = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip()
        # Relative detail links are resolved against the base URL.
        if href and not href.startswith("http"):
            href = f"{root}{href}"
        cat = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip()
        # Prefer the full title attribute; fall back to the visible (possibly trimmed) text.
        name = (
            card.css("h3.trim::attr(title)").get("")
            or card.css("h3.trim::text").get("")
            or ""
        ).strip()
        cover_style = (card.css("div.torrent__cover::attr(style)").get("") or "").strip()
        results.append(
            {
                "pageURL": href,
                "isVR": cat == "VR",
                "title": name,
                "backgroundImage": extract_background_image(cover_style),
            }
        )
    return results
def should_stop(response: Any) -> bool:
    """Return True when the response body contains the empty-bookmarks marker (STOP_TEXT)."""
    decoded = response.body.decode(response.encoding or "utf-8", errors="ignore")
    return STOP_TEXT in decoded
def fetch_page(session: Any, url: str, retries: int, backoff_base: float) -> Any:
    """
    Fetch *url* through the session, retrying on exceptions and on
    403/429/5xx statuses with exponential backoff plus random jitter.

    Raises RuntimeError (chained to the last failure) after the final attempt.
    """
    failure: Exception | None = None
    for attempt in range(retries):
        try:
            result = session.fetch(
                url,
                timeout=45_000,
                load_dom=True,
                network_idle=False,
            )
            code = result.status
            # Treat rate-limit/forbidden and server errors as retryable failures.
            if code in (403, 429) or code >= 500:
                raise RuntimeError(f"HTTP {code}")
            return result
        except Exception as err:  # noqa: BLE001
            failure = err
            if attempt + 1 < retries:
                # Exponential backoff with jitter to avoid hammering the site.
                time.sleep(backoff_base * (2**attempt) + random.uniform(0.0, 0.7))
    raise RuntimeError(f"Request failed for {url}: {failure}") from failure
def build_bookmarks_url(base_url: str, page: int) -> str:
    """Return the bookmarks listing URL; page 1 carries no page parameter."""
    first_page = f"{base_url}/bookmarks.php?type=torrents"
    later_page = f"{base_url}/bookmarks.php?page={page}&type=torrents#torrent_table"
    return first_page if page == 1 else later_page
def run(args: argparse.Namespace) -> None:
    """
    Scrape all bookmarked torrents and save them to a JSON file.

    Reads the cookie from --cookie or --cookie-file, opens an authenticated
    browser session, pages through the bookmarks listing until the site's
    empty-bookmarks marker appears (or --max-pages is reached), and writes
    the collected records to --output.

    Raises ValueError when no usable cookie is provided or parsed.
    """
    target_host = urlparse(args.base_url).hostname or "www.happyfappy.net"
    cookie_value = args.cookie or ""
    if not cookie_value and args.cookie_file:
        cookie_value = Path(args.cookie_file).read_text(encoding="utf-8").strip()
    if not cookie_value:
        raise ValueError("Cookie is required. Use --cookie or --cookie-file.")
    # Parse twice: once as a plain dict to validate the cookie covers the target
    # host, once as Playwright cookie objects for the browser session.
    cookies = parse_cookie_string(cookie_value, target_host=target_host)
    if not cookies:
        raise ValueError("No valid cookies parsed for target host. Check cookie content.")
    pw_cookies = parse_cookies_for_playwright(
        cookie_value, target_host=target_host, base_url=args.base_url.rstrip("/")
    )
    if not pw_cookies:
        raise ValueError("No Playwright-compatible cookies generated for target host.")
    all_records: list[dict[str, Any]] = []
    with DynamicSession(
        headless=True,
        disable_resources=True,  # scrapling option; presumably skips heavy assets — confirm against scrapling docs
        cookies=pw_cookies,
        google_search=False,
        retries=1,  # per-request retrying is done by fetch_page, not the session
        retry_delay=1,
    ) as session:
        page = 1
        while page <= args.max_pages:
            if page > 1:
                # Randomized politeness delay between page requests.
                time.sleep(random.uniform(args.delay_min, args.delay_max))
            url = build_bookmarks_url(args.base_url.rstrip("/"), page)
            response = fetch_page(session, url, retries=args.retries, backoff_base=args.backoff_base)
            if should_stop(response):
                break
            page_records = extract_torrent_cards(response, args.base_url)
            all_records.extend(page_records)
            print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
            page += 1
    output_path = Path(args.output).resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(all_records, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Saved {len(all_records)} records to {output_path}")
def make_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser for the bookmarks scraper."""
    parser = argparse.ArgumentParser(
        description="Scrape HappyFappy torrent bookmarks using an authenticated cookie.",
    )
    option_table = (
        ("--base-url", {"default": "https://www.happyfappy.net"}),
        ("--cookie", {"help": 'Raw cookie string, e.g. "a=1; b=2"'}),
        ("--cookie-file", {"help": "Path to a text file containing raw cookie string"}),
        ("--output", {"default": "bookmarks.json"}),
        ("--delay-min", {"type": float, "default": 1.8, "help": "Minimum delay between page requests"}),
        ("--delay-max", {"type": float, "default": 3.2, "help": "Maximum delay between page requests"}),
        ("--retries", {"type": int, "default": 3, "help": "Retries per page request"}),
        ("--backoff-base", {"type": float, "default": 5.0, "help": "Backoff base seconds"}),
        ("--max-pages", {"type": int, "default": 200, "help": "Safety cap for pagination loop"}),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser
def main() -> None:
    """
    CLI entry point: parse arguments, validate them, and run the scraper.

    Raises ValueError on invalid delay, retry, or backoff values.
    """
    parser = make_parser()
    args = parser.parse_args()
    if args.delay_min < 0 or args.delay_max < 0:
        raise ValueError("Delay values must be non-negative.")
    if args.delay_min > args.delay_max:
        raise ValueError("--delay-min cannot be greater than --delay-max.")
    if args.retries < 1:
        raise ValueError("--retries must be at least 1.")
    # Match the companion torrent-download script's validation; a negative
    # backoff would make time.sleep() raise mid-run inside fetch_page.
    if args.backoff_base < 0:
        raise ValueError("--backoff-base must be >= 0.")
    run(args)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,92 @@
Metadata-Version: 2.4
Name: wscraper
Version: 0.1.0
Summary: Multi-site scraper CLI
Requires-Python: >=3.12
Description-Content-Type: text/markdown
Requires-Dist: scrapling[fetchers]==0.4.1
# wscraper
HappyFappy için komutlar paketlenmiş `wscraper` CLI üzerinden çalışır. Proje çoklu site desteği için `src/` paket yapısına göre düzenlenmiştir.
## 1) Repo Clone
```bash
git clone <REPO_URL>
cd <REPO_FOLDER>
```
## 2) Kurulum
### macOS / Linux
```bash
python3.12 -m venv .venv
source .venv/bin/activate
python -m pip install -U pip
python -m pip install -e .
scrapling install
```
### Windows (PowerShell)
```powershell
py -3.12 -m venv .venv
.venv\Scripts\Activate.ps1
python -m pip install -U pip
python -m pip install -e .
scrapling install
```
### Windows (CMD)
```bat
py -3.12 -m venv .venv
.venv\Scripts\activate.bat
python -m pip install -U pip
python -m pip install -e .
scrapling install
```
Not: Ortamı aktive ettikten sonra komutlar `wscraper ...` olarak kullanılabilir. İstersen `python -m wscraper ...` da kullanabilirsin.
## 3) HappyFappy Komutları
### Bookmarks Çekme
```bash
wscraper happyfappy --action get-bookmarks -c cookies.txt -o bookmarks.json
```
### Torrent Dosyası İndirme
```bash
wscraper happyfappy --action download-torrent-files -u "https://www.happyfappy.net/torrents.php?id=110178" -c cookies.txt -o torrent
```
## 4) Kısa Alias Kullanımı
```bash
# site alias: hf
# action alias: gb (get-bookmarks), dtf (download-torrent-files)
wscraper hf -a gb -c cookies.txt -o bookmarks.json
wscraper hf -a dtf -u "https://www.happyfappy.net/torrents.php?id=110178" -c cookies.txt -o torrent
```
## 5) Proje Dizini
```text
.
├── pyproject.toml
├── requirements.txt
├── src/
│ └── wscraper/
│ ├── __init__.py
│ ├── __main__.py
│ ├── cli.py
│ └── sites/
│ ├── __init__.py
│ └── happyfappy.py
└── README.md
```

View File

@@ -0,0 +1,13 @@
README.md
pyproject.toml
src/wscraper/__init__.py
src/wscraper/__main__.py
src/wscraper/cli.py
src/wscraper.egg-info/PKG-INFO
src/wscraper.egg-info/SOURCES.txt
src/wscraper.egg-info/dependency_links.txt
src/wscraper.egg-info/entry_points.txt
src/wscraper.egg-info/requires.txt
src/wscraper.egg-info/top_level.txt
src/wscraper/sites/__init__.py
src/wscraper/sites/happyfappy.py

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,2 @@
[console_scripts]
wscraper = wscraper.cli:main

View File

@@ -0,0 +1 @@
scrapling[fetchers]==0.4.1

View File

@@ -0,0 +1 @@
wscraper

3
src/wscraper/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
__all__ = ["__version__"]
__version__ = "0.1.0"

5
src/wscraper/__main__.py Normal file
View File

@@ -0,0 +1,5 @@
from wscraper.cli import main
if __name__ == "__main__":
main()

View File

@@ -1,11 +1,8 @@
#!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import argparse import argparse
from download_happyfappy_torrent import run as run_happyfappy_download from wscraper.sites.happyfappy import run_download_torrent_files, run_get_bookmarks
from scrape_happyfappy_bookmarks import run as run_happyfappy_bookmarks
SITE_ALIASES = { SITE_ALIASES = {
"happyfappy": "happyfappy", "happyfappy": "happyfappy",
@@ -39,9 +36,7 @@ def normalize_action(value: str) -> str:
def build_parser() -> argparse.ArgumentParser: def build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(description="wscraper: multi-site scraping entrypoint")
description="wscraper: multi-site scraping entrypoint",
)
parser.add_argument("site", help="Site key, e.g. happyfappy or hf") parser.add_argument("site", help="Site key, e.g. happyfappy or hf")
parser.add_argument("-a", "--action", required=True, help="Action to run") parser.add_argument("-a", "--action", required=True, help="Action to run")
@@ -58,7 +53,6 @@ def build_parser() -> argparse.ArgumentParser:
parser.add_argument("-r", "--retries", type=int, default=3) parser.add_argument("-r", "--retries", type=int, default=3)
parser.add_argument("--backoff-base", type=float, default=5.0) parser.add_argument("--backoff-base", type=float, default=5.0)
parser.add_argument("--delay-min", type=float, default=1.8) parser.add_argument("--delay-min", type=float, default=1.8)
parser.add_argument("--delay-max", type=float, default=3.2) parser.add_argument("--delay-max", type=float, default=3.2)
parser.add_argument("--max-pages", type=int, default=200) parser.add_argument("--max-pages", type=int, default=200)
@@ -69,33 +63,35 @@ def run_happyfappy(args: argparse.Namespace, action: str) -> None:
base_url = args.base_url or "https://www.happyfappy.net" base_url = args.base_url or "https://www.happyfappy.net"
if action == "get-bookmarks": if action == "get-bookmarks":
bookmarks_args = argparse.Namespace( run_get_bookmarks(
base_url=base_url, argparse.Namespace(
cookie=args.cookie, base_url=base_url,
cookie_file=args.cookie_file, cookie=args.cookie,
output=args.output or "bookmarks.json", cookie_file=args.cookie_file,
delay_min=args.delay_min, output=args.output or "bookmarks.json",
delay_max=args.delay_max, delay_min=args.delay_min,
retries=args.retries, delay_max=args.delay_max,
backoff_base=args.backoff_base, retries=args.retries,
max_pages=args.max_pages, backoff_base=args.backoff_base,
max_pages=args.max_pages,
)
) )
run_happyfappy_bookmarks(bookmarks_args)
return return
if action == "download-torrent-files": if action == "download-torrent-files":
if not args.url: if not args.url:
raise ValueError("--url is required for action=download-torrent-files.") raise ValueError("--url is required for action=download-torrent-files.")
download_args = argparse.Namespace( run_download_torrent_files(
url=args.url, argparse.Namespace(
base_url=base_url, url=args.url,
cookie=args.cookie, base_url=base_url,
cookie_file=args.cookie_file, cookie=args.cookie,
output_dir=args.output or "torrent", cookie_file=args.cookie_file,
retries=args.retries, output_dir=args.output or "torrent",
backoff_base=args.backoff_base, retries=args.retries,
backoff_base=args.backoff_base,
)
) )
run_happyfappy_download(download_args)
return return
raise ValueError(f"Unsupported action for happyfappy: {action}") raise ValueError(f"Unsupported action for happyfappy: {action}")

View File

@@ -0,0 +1 @@
__all__ = ["happyfappy"]

View File

@@ -1,7 +1,9 @@
#!/usr/bin/env python3
from __future__ import annotations from __future__ import annotations
import argparse import argparse
import json
import random
import re
import time import time
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -9,6 +11,9 @@ from urllib.parse import urlparse
from scrapling.fetchers import DynamicSession from scrapling.fetchers import DynamicSession
STOP_TEXT = "You have not bookmarked any torrents."
BG_URL_RE = re.compile(r"url\((?:'|\")?(.*?)(?:'|\")?\)")
def _domain_matches(target_host: str, cookie_domain: str) -> bool: def _domain_matches(target_host: str, cookie_domain: str) -> bool:
cd = cookie_domain.lstrip(".").lower() cd = cookie_domain.lstrip(".").lower()
@@ -48,9 +53,7 @@ def parse_cookie_string(cookie_string: str, target_host: str) -> dict[str, str]:
return cookies return cookies
def parse_cookies_for_playwright( def parse_cookies_for_playwright(cookie_string: str, target_host: str, base_url: str) -> list[dict[str, Any]]:
cookie_string: str, target_host: str, base_url: str
) -> list[dict[str, Any]]:
lines = cookie_string.splitlines() lines = cookie_string.splitlines()
cookies: list[dict[str, Any]] = [] cookies: list[dict[str, Any]] = []
looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines) looks_like_netscape = len(lines) > 1 and any("\t" in line for line in lines)
@@ -121,9 +124,103 @@ def fetch_dynamic_with_retry(session: Any, url: str, retries: int, backoff_base:
raise RuntimeError(f"Request failed for {url}: {last_error}") from last_error raise RuntimeError(f"Request failed for {url}: {last_error}") from last_error
def download_via_browser_with_retry( # bookmarks
session: DynamicSession, detail_url: str, retries: int, backoff_base: float
) -> tuple[str, bytes]: def extract_background_image(style: str) -> str | None:
if not style:
return None
match = BG_URL_RE.search(style)
if not match:
return None
value = match.group(1).strip()
return value or None
def extract_torrent_cards(response: Any, base_url: str) -> list[dict[str, Any]]:
records: list[dict[str, Any]] = []
cards = response.css("div.torrent_grid div.torrent_grid__torrent")
for card in cards:
page_url = (card.css('a[href^="/torrents.php?id="]::attr(href)').get("") or "").strip()
if page_url and not page_url.startswith("http"):
page_url = f"{base_url.rstrip('/')}{page_url}"
category = (card.css("span.torrent_grid__torrent__cat::text").get("") or "").strip()
title = (card.css("h3.trim::attr(title)").get("") or card.css("h3.trim::text").get("") or "").strip()
style = (card.css("div.torrent__cover::attr(style)").get("") or "").strip()
background_image = extract_background_image(style)
records.append(
{
"pageURL": page_url,
"isVR": category == "VR",
"title": title,
"backgroundImage": background_image,
}
)
return records
def should_stop(response: Any) -> bool:
body_text = response.body.decode(response.encoding or "utf-8", errors="ignore")
return STOP_TEXT in body_text
def build_bookmarks_url(base_url: str, page: int) -> str:
if page == 1:
return f"{base_url}/bookmarks.php?type=torrents"
return f"{base_url}/bookmarks.php?page={page}&type=torrents#torrent_table"
def run_get_bookmarks(args: argparse.Namespace) -> None:
target_host = urlparse(args.base_url).hostname or "www.happyfappy.net"
cookie_value = args.cookie or ""
if not cookie_value and args.cookie_file:
cookie_value = Path(args.cookie_file).read_text(encoding="utf-8").strip()
if not cookie_value:
raise ValueError("Cookie is required. Use --cookie or --cookie-file.")
cookies = parse_cookie_string(cookie_value, target_host=target_host)
if not cookies:
raise ValueError("No valid cookies parsed for target host. Check cookie content.")
pw_cookies = parse_cookies_for_playwright(cookie_value, target_host=target_host, base_url=args.base_url.rstrip("/"))
if not pw_cookies:
raise ValueError("No Playwright-compatible cookies generated for target host.")
all_records: list[dict[str, Any]] = []
with DynamicSession(
headless=True,
disable_resources=True,
cookies=pw_cookies,
google_search=False,
retries=1,
retry_delay=1,
) as session:
page = 1
while page <= args.max_pages:
if page > 1:
time.sleep(random.uniform(args.delay_min, args.delay_max))
url = build_bookmarks_url(args.base_url.rstrip("/"), page)
response = fetch_dynamic_with_retry(session, url, retries=args.retries, backoff_base=args.backoff_base)
if should_stop(response):
break
page_records = extract_torrent_cards(response, args.base_url)
all_records.extend(page_records)
print(f"[page={page}] extracted={len(page_records)} total={len(all_records)}")
page += 1
output_path = Path(args.output).resolve()
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(json.dumps(all_records, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"Saved {len(all_records)} records to {output_path}")
# torrent
def download_via_browser_with_retry(session: DynamicSession, detail_url: str, retries: int, backoff_base: float) -> tuple[str, bytes]:
last_error: Exception | None = None last_error: Exception | None = None
for attempt in range(retries): for attempt in range(retries):
page = session.context.new_page() page = session.context.new_page()
@@ -184,7 +281,6 @@ def find_download_link(response: Any) -> str:
if href: if href:
return href return href
# Fallback using text match if classes/attributes drift
href = ( href = (
response.xpath( response.xpath(
"//a[contains(translate(normalize-space(string(.))," "//a[contains(translate(normalize-space(string(.)),"
@@ -206,7 +302,6 @@ def normalize_filename(filename: str, download_url: str) -> str:
def looks_like_torrent_bytes(data: bytes) -> bool: def looks_like_torrent_bytes(data: bytes) -> bool:
# Basic bencode sanity check for torrent files
return bool(data) and data.startswith(b"d") and (b"4:info" in data[:4096]) return bool(data) and data.startswith(b"d") and (b"4:info" in data[:4096])
@@ -218,7 +313,7 @@ def validate_torrent_response(download_url: str, filename: str, data: bytes) ->
raise RuntimeError("Downloaded file failed torrent bencode check.") raise RuntimeError("Downloaded file failed torrent bencode check.")
def run(args: argparse.Namespace) -> None: def run_download_torrent_files(args: argparse.Namespace) -> None:
base_url = args.base_url.rstrip("/") base_url = args.base_url.rstrip("/")
target_host = urlparse(base_url).hostname or "www.happyfappy.net" target_host = urlparse(base_url).hostname or "www.happyfappy.net"
@@ -246,47 +341,15 @@ def run(args: argparse.Namespace) -> None:
retries=1, retries=1,
retry_delay=1, retry_delay=1,
) as session: ) as session:
detail_response = fetch_dynamic_with_retry( detail_response = fetch_dynamic_with_retry(session, args.url, retries=args.retries, backoff_base=args.backoff_base)
session, args.url, retries=args.retries, backoff_base=args.backoff_base
)
href = find_download_link(detail_response) href = find_download_link(detail_response)
if not href: if not href:
raise RuntimeError("Download link not found on page.") raise RuntimeError("Download link not found on page.")
download_url = absolute_url(base_url, href) download_url = absolute_url(base_url, href)
suggested_filename, data = download_via_browser_with_retry( suggested_filename, data = download_via_browser_with_retry(session, args.url, retries=args.retries, backoff_base=args.backoff_base)
session, args.url, retries=args.retries, backoff_base=args.backoff_base
)
filename = normalize_filename(suggested_filename, download_url) filename = normalize_filename(suggested_filename, download_url)
validate_torrent_response(download_url, filename, data) validate_torrent_response(download_url, filename, data)
output_path = output_dir / filename output_path = output_dir / filename
output_path.write_bytes(data) # overwrite behavior by design output_path.write_bytes(data)
print(f"Saved torrent to {output_path}") print(f"Saved torrent to {output_path}")
def make_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Download a torrent file from a single HappyFappy torrent detail page URL.",
)
parser.add_argument("--url", required=True, help="Torrent detail page URL")
parser.add_argument("--base-url", default="https://www.happyfappy.net")
parser.add_argument("--cookie", help='Raw cookie string, e.g. "a=1; b=2"')
parser.add_argument("--cookie-file", help="Path to cookie file")
parser.add_argument("--output-dir", default="torrent")
parser.add_argument("--retries", type=int, default=3)
parser.add_argument("--backoff-base", type=float, default=5.0)
return parser
def main() -> None:
parser = make_parser()
args = parser.parse_args()
if args.retries < 1:
raise ValueError("--retries must be at least 1.")
if args.backoff_base < 0:
raise ValueError("--backoff-base must be >= 0.")
run(args)
if __name__ == "__main__":
main()