first commit

2025-11-23 14:25:09 +03:00
commit 46d75b64d5
18 changed files with 4749 additions and 0 deletions
--- a/src/headless.js
+++ b/src/headless.js
@@ -0,0 +1,41 @@
+// Viewport size (in pixels) handed to the Playwright browser context below.
+const DEFAULT_VIEWPORT = { width: 1280, height: 720 };
+
+/**
+ * Load a page in Chromium via Playwright and return its rendered HTML.
+ *
+ * Playwright is an optional dependency: it is imported lazily, and a failed import is
+ * reported with an installation hint (the underlying error is attached as `cause`).
+ *
+ * @param {string} url - Page to open.
+ * @param {{ timeoutMs?: number, userAgent?: string, headless?: boolean }} [options]
+ *   `timeoutMs` is passed as the timeout of both the navigation and the network-idle wait,
+ *   `userAgent` is set on the browser context, and the browser runs headless unless
+ *   `headless` is exactly `false`.
+ * @returns {Promise<string>} The HTML returned by `page.content()`.
+ * @throws {Error} If Playwright cannot be imported, Chromium is unavailable, or the
+ *   browser/navigation calls reject.
+ */
+export async function fetchPageContentWithPlaywright(url, options = {}) {
+  let playwright;
+  try {
+    playwright = await import('playwright');
+  } catch (err) {
+    throw new Error(
+      'Playwright is not installed. Install the optional dependency "playwright" to enable headless scraping.',
+      { cause: err }
+    );
+  }
+
+  const { chromium } = playwright;
+  if (!chromium) {
+    throw new Error('Playwright chromium browser is unavailable.');
+  }
+
+  const { timeoutMs, userAgent, headless } = options;
+  const browser = await chromium.launch({ headless: headless !== false });
+
+  // Everything after the launch runs inside try/finally so the browser is closed even
+  // when creating the context or the page fails, not only when navigation fails.
+  try {
+    const context = await browser.newContext({
+      userAgent,
+      viewport: DEFAULT_VIEWPORT
+    });
+    const page = await context.newPage();
+
+    await page.goto(url, {
+      waitUntil: 'domcontentloaded',
+      timeout: timeoutMs
+    });
+    // Best effort: a page that never reaches network idle still yields whatever HTML
+    // has been rendered so far, so a rejection of this wait is deliberately ignored.
+    await page.waitForLoadState('networkidle', { timeout: timeoutMs }).catch(() => {});
+    return await page.content();
+  } finally {
+    await browser.close();
+  }
+}
--- a/src/index.js
+++ b/src/index.js
@@ -0,0 +1,198 @@
+import './polyfill.js';
+import { parseNetflixHtml } from './parser.js';
+import { fetchPageContentWithPlaywright } from './headless.js';
+
+// Timeout in milliseconds applied when the caller does not pass `options.timeoutMs`.
+const DEFAULT_TIMEOUT_MS = 15000;
+
+// 🎯 Logging helpers
+/**
+ * Write a success line to the console, prefixed with a check mark.
+ * @param {string} message - Text to print after the prefix.
+ */
+function logPass(message) {
+  const line = `✅ ${message}`;
+  console.log(line);
+}
+
+/**
+ * Write a failure line to the console via `console.error` as `❌ <message>: <reason>`.
+ * @param {string} message - Context describing what failed.
+ * @param {unknown} error - The thrown value. Its `message` is used when present;
+ *   any other value is converted with `String()`.
+ */
+function logError(message, error) {
+  // Anything can be thrown in JavaScript. Falling back to String() keeps the logger from
+  // crashing on null/undefined and from printing "undefined" for a thrown string.
+  const reason = error?.message ?? String(error);
+  console.error(`❌ ${message}: ${reason}`);
+}
+
+/**
+ * Print a value to the console as JSON indented with two spaces.
+ * @param {unknown} result - Value to serialize.
+ */
+function logResult(result) {
+  const serialized = JSON.stringify(result, null, 2);
+  console.log(serialized);
+}
+
+// 📋 URL normalization
+/**
+ * Validate a Netflix title URL and reduce it to its canonical form.
+ *
+ * The numeric ID is taken from the path segment that follows `title`; locale prefixes,
+ * query strings and any trailing text in that segment are dropped.
+ *
+ * @param {string} inputUrl - URL supplied by the caller.
+ * @returns {string} `https://www.netflix.com/title/<id>`.
+ * @throws {Error} If the URL is missing, unparsable, not on netflix.com, or has no title ID.
+ */
+function normalizeNetflixUrl(inputUrl) {
+  if (!inputUrl) {
+    throw new Error('Netflix URL\'i gereklidir.');
+  }
+
+  let parsed;
+  try {
+    parsed = new URL(inputUrl);
+  } catch (err) {
+    throw new Error('Geçersiz URL sağlandı.', { cause: err });
+  }
+
+  // Accept netflix.com itself or one of its sub-domains only. A substring test would
+  // also let through hosts such as "netflix.evil.example" or "notnetflix.org".
+  const host = parsed.hostname.toLowerCase().replace(/\.$/, '');
+  if (host !== 'netflix.com' && !host.endsWith('.netflix.com')) {
+    throw new Error('URL netflix.com adresini göstermelidir.');
+  }
+
+  const segments = parsed.pathname.split('/').filter(Boolean);
+  const titleIndex = segments.indexOf('title');
+  const idSegment = titleIndex >= 0 ? segments[titleIndex + 1] : undefined;
+  const idMatch = idSegment ? idSegment.match(/^(\d+)/) : null;
+
+  if (!idMatch) {
+    throw new Error('URL\'de Netflix başlık ID\'si bulunamadı.');
+  }
+
+  return `https://www.netflix.com/title/${idMatch[1]}`;
+}
+// User-Agent string used when the caller does not pass `options.userAgent`.
+const DEFAULT_USER_AGENT =
+  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
+
+/**
+ * Make sure the fetch-related globals exist before any request is made.
+ *
+ * Some Node versions may not define File/Blob yet. A minimal File stub is installed
+ * first when it is missing, and any of fetch, Headers, Request, Response, FormData,
+ * Blob or File that is still undefined is then filled in from undici.
+ * @returns {Promise<void>}
+ */
+async function ensureFetchGlobals() {
+  // Per the original author, some undici versions expect a global File while being
+  // imported, so the stub has to be in place before undici is loaded.
+  if (typeof globalThis.File === 'undefined') {
+    const { Blob } = await import('node:buffer');
+    globalThis.Blob ??= Blob;
+
+    // Bare-bones File: a Blob carrying a name and a last-modified timestamp.
+    class PolyfillFile extends Blob {
+      constructor(parts, name, options = {}) {
+        super(parts, options);
+        this.name = String(name);
+        this.lastModified = options.lastModified ?? Date.now();
+      }
+    }
+    globalThis.File = PolyfillFile;
+  }
+
+  const requiredGlobals = ['fetch', 'Headers', 'Request', 'Response', 'FormData', 'Blob', 'File'];
+  const hasEverything = requiredGlobals.every(
+    (globalName) => typeof globalThis[globalName] !== 'undefined'
+  );
+  if (hasEverything) return;
+
+  // Only the globals that are still nullish are taken from undici; existing ones are kept.
+  const undici = await import('undici');
+  for (const globalName of requiredGlobals) {
+    globalThis[globalName] ??= undici[globalName];
+  }
+}
+
+/**
+ * Download a page's HTML with the global fetch API.
+ *
+ * The request is aborted once `timeoutMs` elapses; the timer is always cleared on exit.
+ * @param {string} url - Address to request.
+ * @param {string} userAgent - Value sent in the User-Agent header.
+ * @param {number} timeoutMs - Milliseconds before the request is aborted.
+ * @returns {Promise<string>} The response body as text.
+ * @throws {Error} On a non-OK status (404 has its own message), on timeout, or on any
+ *   other fetch failure, which is rethrown unchanged.
+ */
+async function fetchStaticHtml(url, userAgent, timeoutMs) {
+  const abortController = new AbortController();
+  const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs);
+  const requestInit = {
+    headers: {
+      'User-Agent': userAgent,
+      Accept: 'text/html,application/xhtml+xml'
+    },
+    signal: abortController.signal
+  };
+
+  try {
+    const response = await globalThis.fetch(url, requestInit);
+    if (response.ok) {
+      return await response.text();
+    }
+
+    const failureMessage =
+      response.status === 404
+        ? 'Netflix title not found (404).'
+        : `Request failed with status ${response.status}.`;
+    throw new Error(failureMessage);
+  } catch (failure) {
+    // An abort is how the timeout above surfaces; report it as a timeout.
+    throw failure.name === 'AbortError'
+      ? new Error('Request timed out while reaching Netflix.')
+      : failure;
+  } finally {
+    clearTimeout(timeoutHandle);
+  }
+}
+
+/**
+ * Tell whether the parsed metadata is incomplete enough to warrant a headless retry.
+ * @param {{ name?: string, year?: string | number }} meta - Result of the static parse.
+ * @returns {boolean} `true` when `name` or `year` is missing or falsy (or `meta` is nullish).
+ */
+function needsHeadless(meta) {
+  const hasName = Boolean(meta?.name);
+  const hasYear = Boolean(meta?.year);
+  return !(hasName && hasYear);
+}
+
+/**
+ * Scrape metadata for a Netflix title.
+ *
+ * Steps: ensure the fetch globals exist, normalize the URL, download and parse the static
+ * HTML. When name or year is missing and `options.headless` is not `false`, the page is
+ * loaded again through Playwright and every non-null field of that second parse is laid
+ * over the first one. Progress and the final result are logged; failures are logged and
+ * rethrown.
+ *
+ * @param {string} inputUrl
+ * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
+ * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
+ * @throws {Error} Whatever a step above throws, or when no title name could be parsed.
+ */
+export async function scraperNetflix(inputUrl, options = {}) {
+  try {
+    await ensureFetchGlobals();
+
+    const normalizedUrl = normalizeNetflixUrl(inputUrl);
+    // The normalized URL always ends in the numeric title ID.
+    const id = normalizedUrl.split('/').pop();
+    const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+    const userAgent = options.userAgent || DEFAULT_USER_AGENT;
+
+    logPass(`Netflix URL normalize edildi: ${normalizedUrl}`);
+
+    const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs);
+    logPass('HTML içeriği başarıyla çekildi');
+
+    let meta = parseNetflixHtml(staticHtml);
+    const shouldUseHeadless = needsHeadless(meta) && options.headless !== false;
+
+    if (!shouldUseHeadless) {
+      logPass('Statik scraping yeterli');
+    } else {
+      logPass('Headless mode aktifleştiriliyor');
+      const renderedHtml = await fetchPageContentWithPlaywright(normalizedUrl, {
+        timeoutMs,
+        userAgent,
+        headless: options.headless !== false
+      });
+
+      // Overlay only the fields the rendered page actually produced, so a value found
+      // in the static HTML is never replaced by null/undefined.
+      const merged = { ...meta };
+      for (const [field, value] of Object.entries(parseNetflixHtml(renderedHtml))) {
+        if (value !== undefined && value !== null) {
+          merged[field] = value;
+        }
+      }
+      meta = merged;
+      logPass('Headless scraping tamamlandı');
+    }
+
+    if (!meta.name) {
+      throw new Error('Netflix sayfa meta verisi parse edilemedi.');
+    }
+
+    const { name, year } = meta;
+    const finalResult = {
+      url: normalizedUrl,
+      id: id || '',
+      name,
+      year,
+      seasons: meta.seasons ?? null
+    };
+
+    logResult(finalResult);
+    return finalResult;
+  } catch (error) {
+    logError('Netflix scraping başarısız', error);
+    throw error;
+  }
+}
--- a/src/parser.js
+++ b/src/parser.js
@@ -0,0 +1,162 @@
+import { load } from 'cheerio';
+
+// Matches a "| Netflix" separator (case-insensitive) together with everything after it.
+const NETFLIX_SUFFIX_REGEX = /\s*\|\s*Netflix.*$/i;
+
+// Turkish UI text patterns that Netflix adds to titles.
+// Order matters: cleanTitle applies the patterns one after another, so the "Sezon N …"
+// patterns must come before the bare suffixes. Otherwise the bare suffix is stripped
+// first and "… Sezon N" is left behind in the title.
+const TURKISH_UI_PATTERNS = [
+  /\s+Sezon\s+\d+.*izlemeye devam$/i,  // "Sezon X izlemeye devam" → remove whole thing
+  /\s+Sezon\s+\d+.*başla$/i,           // "Sezon X başla" → remove whole thing
+  /\s+izlemenizi bekliyor$/i,          // "waiting for you to watch"
+  /\s+izleyin$/i,                      // "watch"
+  /\s+devam et$/i,                     // "continue"
+  /\s+başla$/i,                        // "start"
+  /\s+izlemeye devam$/i,               // "continue watching"
+];
+
+// English UI text patterns that may appear around titles.
+// Order matters: cleanTitle applies the patterns one after another, so the
+// "Season N …" pattern must come before the bare suffixes. Otherwise the bare suffix is
+// stripped first and "… Season N" is left behind in the title.
+const UNIVERSAL_UI_PATTERNS = [
+  /^(?:Watch Now|Watch)\s+/i,                    // "Watch" prefix at beginning
+  /\s+Season\s+\d+.*(?:Continue|Resume|Play|Start)$/i, // Remove season + UI text together
+  /\s+(?:Watch Now|Continue|Resume|Play|Start)$/i,
+  /\s+(?:Continue Watching|Resume Watching)$/i,
+];
+
+// JSON-LD properties probed, in this order, when looking for a release year.
+const YEAR_FIELDS = ['datePublished', 'startDate', 'uploadDate', 'copyrightYear', 'releasedEvent', 'releaseYear', 'dateCreated'];
+// JSON-LD `@type` values for which a season count is looked up.
+const SEASON_TYPES = ['TVSeries', 'TVShow', 'Series'];
+
+/**
+ * Extract a usable year value from various JSON-LD fields.
+ *
+ * Numbers are returned as-is, strings yield their first run of four digits, and arrays
+ * and objects are searched depth-first for the first nested value that yields a year.
+ * @param {unknown} value
+ * @returns {string | number | undefined} The year, or `undefined` when none is found.
+ */
+function extractYear(value) {
+  if (!value) return undefined;
+  if (typeof value === 'number') return value;
+  if (typeof value === 'string') {
+    const match = value.match(/(\d{4})/);
+    return match ? match[1] : undefined;
+  }
+  if (typeof value !== 'object') return undefined;
+
+  // Visit each container exactly once. Letting an array fall through to the generic
+  // object walk would scan its elements a second time at every nesting level.
+  const children = Array.isArray(value) ? value : Object.values(value);
+  for (const child of children) {
+    const year = extractYear(child);
+    if (year) return year;
+  }
+  return undefined;
+}
+
+/**
+ * Strip the Netflix suffix and localized UI phrases from a page title.
+ * Example: "The Witcher izlemenizi bekliyor | Netflix" → "The Witcher"
+ * @param {string | undefined | null} title
+ * @returns {string | undefined} The trimmed title, or `undefined` when nothing is left.
+ */
+function cleanTitle(title) {
+  if (!title) return undefined;
+
+  // The suffix goes first so the end-anchored UI patterns can see the phrase before it.
+  const withoutSuffix = title.replace(NETFLIX_SUFFIX_REGEX, '');
+
+  // Turkish patterns are applied before the English ones, each in list order.
+  const uiPatterns = [...TURKISH_UI_PATTERNS, ...UNIVERSAL_UI_PATTERNS];
+  const stripped = uiPatterns.reduce((text, pattern) => text.replace(pattern, ''), withoutSuffix);
+
+  return stripped.trim() || undefined;
+}
+
+/**
+ * Collect name, year and season info from a parsed JSON-LD value.
+ *
+ * Accepts a single node or an array of nodes. Nodes listed under a top-level `@graph`
+ * array are inspected as well, and `@type` may be a string or an array of strings.
+ * The first node that supplies a field wins.
+ * @param {any} obj - Result of `JSON.parse` on a JSON-LD script block.
+ * @returns {{ name?: string, year?: string | number, seasons?: string }}
+ */
+function parseJsonLdObject(obj) {
+  const roots = Array.isArray(obj) ? obj : [obj];
+  // JSON-LD allows the actual nodes to be wrapped in "@graph"; look at the wrapper
+  // first, then at each node it contains.
+  const payload = roots.flatMap((root) =>
+    root && typeof root === 'object' && Array.isArray(root['@graph'])
+      ? [root, ...root['@graph']]
+      : [root]
+  );
+  const result = {};
+
+  for (const entry of payload) {
+    if (!entry || typeof entry !== 'object') continue;
+
+    if (!result.name && typeof entry.name === 'string') {
+      result.name = cleanTitle(entry.name);
+    }
+
+    if (!result.year) {
+      for (const field of YEAR_FIELDS) {
+        if (entry[field]) {
+          const extracted = extractYear(entry[field]);
+          if (extracted) {
+            result.year = extracted;
+            break;
+          }
+        }
+      }
+    }
+
+    // "@type" can legally be a single string or a list of types.
+    const types = Array.isArray(entry['@type']) ? entry['@type'] : [entry['@type']];
+    const isSeries = types.some((type) => typeof type === 'string' && SEASON_TYPES.includes(type));
+    if (isSeries) {
+      const seasonCount =
+        typeof entry.numberOfSeasons === 'number'
+          ? entry.numberOfSeasons
+          : Array.isArray(entry.containsSeason)
+            ? entry.containsSeason.length
+            : undefined;
+
+      if (seasonCount && !result.seasons) {
+        result.seasons = `${seasonCount} Sezon`;
+      } else if (!result.seasons && entry.seasons && typeof entry.seasons.length === 'number') {
+        // Fallback for payloads that expose a `seasons` list instead of a count.
+        result.seasons = `${entry.seasons.length} Sezon`;
+      }
+    }
+  }
+
+  return result;
+}
+
+/**
+ * Extract title, year and season info from page HTML.
+ *
+ * The name comes from the first non-empty source among the `og:title` meta tag, the
+ * `title` meta tag and the `<title>` element; JSON-LD script blocks then fill in
+ * whatever is still missing. Blocks that are not valid JSON are skipped.
+ * @param {string} html
+ * @returns {{ name?: string, year?: string | number, seasons?: string | null }}
+ *   An empty object when `html` is falsy.
+ */
+export function parseNetflixHtml(html) {
+  if (!html) return {};
+
+  const $ = load(html);
+
+  // Title sources in priority order, read lazily so later ones are only queried
+  // when the earlier ones produced nothing usable.
+  const titleSources = [
+    () => $('meta[property="og:title"]').attr('content'),
+    () => $('meta[name="title"]').attr('content'),
+    () => $('title').first().text()
+  ];
+
+  let name;
+  for (const readTitle of titleSources) {
+    name = cleanTitle(readTitle());
+    if (name) break;
+  }
+
+  let year;
+  let seasons = null;
+
+  $('script[type="application/ld+json"]').each((_, scriptEl) => {
+    const jsonText = $(scriptEl).contents().text();
+    if (!jsonText) return;
+
+    let info;
+    try {
+      info = parseJsonLdObject(JSON.parse(jsonText));
+    } catch {
+      // Ignore malformed JSON-LD blocks.
+      return;
+    }
+
+    // Earlier findings are kept; a block only contributes fields still missing.
+    name = name || info.name || name;
+    year = year || info.year || year;
+    seasons = seasons || info.seasons || seasons;
+  });
+
+  return { name, year, seasons };
+}
--- a/src/polyfill.js
+++ b/src/polyfill.js
@@ -0,0 +1,22 @@
+/**
+ * Minimal File/Blob polyfill for Node.js undici compatibility
+ * Only provides what's needed for fetch functionality
+ */
+
+import { Blob } from 'node:buffer';
+
+// Minimal stand-in for the File class: a Blob that also carries a name and a
+// last-modified timestamp.
+class PolyfillFile extends Blob {
+  /**
+   * @param {Array} parts - Blob parts forwarded to the Blob constructor.
+   * @param {unknown} name - File name; converted with `String()`.
+   * @param {{ lastModified?: number }} [options] - Also forwarded to the Blob
+   *   constructor; `lastModified` falls back to the current time when nullish.
+   */
+  constructor(parts, name, options = {}) {
+    super(parts, options);
+    const fileName = String(name);
+    const modifiedAt = options.lastModified ?? Date.now();
+    this.name = fileName;
+    this.lastModified = modifiedAt;
+  }
+}
+
+// Re-export the polyfill class under the name `File`, together with node:buffer's Blob,
+// so other modules can import them explicitly.
+export { PolyfillFile as File, Blob };
+
+// Install the globals as an import side effect (per the author, this is what undici
+// needs). An existing truthy global is kept; the polyfill only fills a gap.
+globalThis.File = globalThis.File || PolyfillFile;
+globalThis.Blob = globalThis.Blob || Blob;