first commit

This commit is contained in:
2025-11-23 14:25:09 +03:00
commit 46d75b64d5
18 changed files with 4749 additions and 0 deletions

41
src/headless.js Normal file
View File

@@ -0,0 +1,41 @@
// Viewport applied to every browser context created below.
const DEFAULT_VIEWPORT = { width: 1280, height: 720 };
/**
 * Load a Netflix title page with Playwright and return the HTML.
 * Playwright is optional; when missing we surface a friendly message.
 *
 * @param {string} url - Page to open.
 * @param {{ timeoutMs?: number, userAgent?: string, headless?: boolean }} [options]
 *   `timeoutMs` is forwarded as the navigation and load-state timeout,
 *   `userAgent` is applied to the browser context, and the browser runs
 *   headless unless `headless` is exactly `false`.
 * @returns {Promise<string>} The page HTML as reported by `page.content()`.
 * @throws {Error} When Playwright cannot be imported (the original import
 *   failure is attached as `cause`), when it exposes no `chromium`, or when
 *   launching or navigating fails.
 */
export async function fetchPageContentWithPlaywright(url, options = {}) {
  let playwright;
  try {
    playwright = await import('playwright');
  } catch (err) {
    throw new Error(
      'Playwright is not installed. Install the optional dependency "playwright" to enable headless scraping.',
      { cause: err }
    );
  }

  const { chromium } = playwright;
  if (!chromium) {
    throw new Error('Playwright chromium browser is unavailable.');
  }

  // Tolerate a missing or null options bag instead of crashing on property access.
  const { timeoutMs, userAgent, headless } = options ?? {};

  const browser = await chromium.launch({ headless: headless !== false });
  try {
    // Context and page creation live inside the try so the browser is closed
    // even when one of them fails.
    const context = await browser.newContext({
      userAgent,
      viewport: DEFAULT_VIEWPORT
    });
    const page = await context.newPage();

    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: timeoutMs
    });
    // Best effort: if the network never goes idle we still return whatever has rendered.
    await page.waitForLoadState('networkidle', { timeout: timeoutMs }).catch(() => {});
    return await page.content();
  } finally {
    await browser.close();
  }
}

198
src/index.js Normal file
View File

@@ -0,0 +1,198 @@
import './polyfill.js';
import { parseNetflixHtml } from './parser.js';
import { fetchPageContentWithPlaywright } from './headless.js';
// Timeout in milliseconds that scraperNetflix uses when options.timeoutMs is null or undefined.
const DEFAULT_TIMEOUT_MS = 15000;
// 🎯 Logging helpers
/**
 * Write a progress message to stdout.
 * @param {unknown} message - Value to print; it is interpolated into a string first.
 */
function logPass(message) {
  const line = `${message}`;
  console.log(line);
}
/**
 * Write a failure message to stderr as "<message>: <detail>".
 *
 * @param {string} message - Context describing what failed.
 * @param {unknown} error - The thrown value. Its `message` property is used
 *   when present; any other value (string, null, undefined, ...) is
 *   stringified so that logging never throws on its own.
 */
function logError(message, error) {
  // Anything can be thrown in JavaScript, so do not assume an Error instance.
  const detail = error?.message ?? String(error);
  console.error(`${message}: ${detail}`);
}
/**
 * Print a result object to stdout as JSON indented by two spaces.
 * @param {unknown} result - Value to serialise.
 */
function logResult(result) {
  const pretty = JSON.stringify(result, null, 2);
  console.log(pretty);
}
// 📋 URL normalization
/**
 * Validate a Netflix title URL and reduce it to its canonical form
 * `https://www.netflix.com/title/<id>`.
 *
 * The id is the leading run of digits in the path segment that follows
 * `title`; locale prefixes, query strings and fragments are discarded.
 *
 * @param {string} inputUrl - URL supplied by the caller.
 * @returns {string} Canonical title URL.
 * @throws {Error} When the input is empty, is not a parseable URL, is not on
 *   netflix.com (or one of its subdomains), or contains no numeric title id.
 */
function normalizeNetflixUrl(inputUrl) {
  if (!inputUrl) {
    throw new Error('Netflix URL\'i gereklidir.');
  }

  let parsed;
  try {
    parsed = new URL(inputUrl);
  } catch (err) {
    throw new Error('Geçersiz URL sağlandı.', { cause: err });
  }

  // Compare against the registrable domain instead of a substring, so hosts
  // like "notnetflix.example.org" or "netflix.com.evil.test" are rejected.
  // A single trailing dot (fully-qualified form) is tolerated.
  const host = parsed.hostname.replace(/\.$/, '');
  const isNetflixHost = host === 'netflix.com' || host.endsWith('.netflix.com');
  if (!isNetflixHost) {
    throw new Error('URL netflix.com adresini göstermelidir.');
  }

  const segments = parsed.pathname.split('/').filter(Boolean);
  const titleIndex = segments.indexOf('title');
  const idSegment = titleIndex >= 0 ? segments[titleIndex + 1] : undefined;
  const idMatch = idSegment ? idSegment.match(/^(\d+)/) : null;
  if (!idMatch) {
    throw new Error('URL\'de Netflix başlık ID\'si bulunamadı.');
  }

  return `https://www.netflix.com/title/${idMatch[1]}`;
}
// Desktop Chrome User-Agent string that scraperNetflix uses when options.userAgent is missing or empty.
const DEFAULT_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
/**
 * Make sure the fetch-related globals exist before any request is made.
 *
 * When `globalThis.File` is undefined, a small Blob-based stand-in is
 * installed first. After that, if any of fetch / Headers / Request /
 * Response / FormData / Blob / File is still missing, `undici` is imported
 * and only the missing globals are filled in from it.
 *
 * @returns {Promise<void>}
 */
async function ensureFetchGlobals() {
  // Some undici versions expect a global File while being imported, so a stub
  // is put in place before undici is ever loaded.
  if (typeof globalThis.File === 'undefined') {
    const bufferModule = await import('node:buffer');
    const NodeBlob = bufferModule.Blob;
    globalThis.Blob ??= NodeBlob;

    // Minimal File: a Blob that also carries a name and a lastModified stamp.
    class PolyfillFile extends NodeBlob {
      constructor(parts, name, options = {}) {
        super(parts, options);
        this.name = String(name);
        this.lastModified = options.lastModified ?? Date.now();
      }
    }
    globalThis.File = PolyfillFile;
  }

  const requiredGlobals = ['fetch', 'Headers', 'Request', 'Response', 'FormData', 'Blob', 'File'];
  const everythingPresent = requiredGlobals.every(
    (key) => typeof globalThis[key] !== 'undefined'
  );
  if (everythingPresent) return;

  const undici = await import('undici');
  for (const key of requiredGlobals) {
    // Existing globals are never overwritten.
    globalThis[key] ??= undici[key];
  }
}
/**
 * Download a page's HTML with the global fetch API.
 *
 * The request is aborted after `timeoutMs` milliseconds. The timer is always
 * cleared, whether the request succeeds or fails.
 *
 * @param {string} url - Address to request.
 * @param {string} userAgent - Value sent as the User-Agent header.
 * @param {number} timeoutMs - Milliseconds before the request is aborted.
 * @returns {Promise<string>} The response body as text.
 * @throws {Error} On a non-OK status (with a dedicated message for 404), on
 *   timeout/abort, or whatever error fetch itself rejects with.
 */
async function fetchStaticHtml(url, userAgent, timeoutMs) {
  const abortController = new AbortController();
  const timeoutHandle = setTimeout(() => abortController.abort(), timeoutMs);
  const requestInit = {
    headers: {
      'User-Agent': userAgent,
      Accept: 'text/html,application/xhtml+xml'
    },
    signal: abortController.signal
  };

  try {
    const response = await globalThis.fetch(url, requestInit);
    if (response.ok) {
      return await response.text();
    }
    const reason =
      response.status === 404
        ? 'Netflix title not found (404).'
        : `Request failed with status ${response.status}.`;
    throw new Error(reason);
  } catch (failure) {
    // Only aborts are rewritten; every other failure propagates untouched.
    if (failure.name !== 'AbortError') {
      throw failure;
    }
    throw new Error('Request timed out while reaching Netflix.');
  } finally {
    clearTimeout(timeoutHandle);
  }
}
/**
 * Tell whether the parsed metadata is incomplete enough to warrant a
 * headless-browser retry: true unless both `name` and `year` are truthy.
 * @param {{ name?: string, year?: string | number }} meta
 * @returns {boolean}
 */
function needsHeadless(meta) {
  const hasNameAndYear = Boolean(meta?.name) && Boolean(meta?.year);
  return !hasNameAndYear;
}
/**
 * Scrape metadata for a Netflix title.
 *
 * Steps: ensure the fetch globals exist, normalise the URL, download and
 * parse the static HTML, and, when name or year is still missing and
 * `options.headless` is not `false`, render the page with Playwright and
 * overlay every non-null field from that second parse. Progress, the final
 * result and any failure are logged; failures are rethrown unchanged.
 *
 * @param {string} inputUrl - Netflix title URL in any accepted form.
 * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
 * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
 * @throws {Error} When normalisation, fetching or parsing fails, or when no
 *   title name could be extracted.
 */
export async function scraperNetflix(inputUrl, options = {}) {
  try {
    await ensureFetchGlobals();

    const normalizedUrl = normalizeNetflixUrl(inputUrl);
    const id = normalizedUrl.split('/').pop();
    const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
    const userAgent = options.userAgent || DEFAULT_USER_AGENT;
    logPass(`Netflix URL normalize edildi: ${normalizedUrl}`);

    const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs);
    logPass("HTML içeriği başarıyla çekildi");

    let meta = parseNetflixHtml(staticHtml);
    const useHeadless = needsHeadless(meta) && options.headless !== false;

    if (!useHeadless) {
      logPass("Statik scraping yeterli");
    } else {
      logPass("Headless mode aktifleştiriliyor");
      const headlessHtml = await fetchPageContentWithPlaywright(normalizedUrl, {
        timeoutMs,
        userAgent,
        headless: options.headless !== false
      });
      const enriched = parseNetflixHtml(headlessHtml);
      // Only fields the rendered page actually produced may replace static ones.
      const presentFields = Object.entries(enriched).filter(([, value]) => value != null);
      meta = { ...meta, ...Object.fromEntries(presentFields) };
      logPass("Headless scraping tamamlandı");
    }

    if (!meta.name) {
      throw new Error('Netflix sayfa meta verisi parse edilemedi.');
    }

    const finalResult = {
      url: normalizedUrl,
      id: id || '',
      name: meta.name,
      year: meta.year,
      seasons: meta.seasons ?? null
    };
    logResult(finalResult);
    return finalResult;
  } catch (error) {
    logError('Netflix scraping başarısız', error);
    throw error;
  }
}

162
src/parser.js Normal file
View File

@@ -0,0 +1,162 @@
import { load } from 'cheerio';
// Matches the trailing " | Netflix…" branding in page titles.
const NETFLIX_SUFFIX_REGEX = /\s*\|\s*Netflix.*$/i;
// Turkish UI text patterns that Netflix adds to titles.
// Order matters: the patterns are applied one after another, so the combined
// "Sezon N … <action>" forms must run BEFORE the bare action suffixes.
// Otherwise the bare suffix is stripped first and the season text is left
// behind ("Dark Sezon 2 izlemeye devam" → "Dark Sezon 2").
const TURKISH_UI_PATTERNS = [
  /\s+Sezon\s+\d+.*izlemeye devam$/i, // "Sezon X izlemeye devam" → remove whole thing
  /\s+Sezon\s+\d+.*başla$/i, // "Sezon X başla" → remove whole thing
  /\s+izlemenizi bekliyor$/i, // "waiting for you to watch"
  /\s+izleyin$/i, // "watch"
  /\s+devam et$/i, // "continue"
  /\s+başla$/i, // "start"
  /\s+izlemeye devam$/i, // "continue watching"
];
// Other language UI patterns that might appear.
// Same ordering rule as above: season + action first, then the two-word
// suffixes, then the single-word suffixes.
const UNIVERSAL_UI_PATTERNS = [
  /^(?:Watch Now|Watch)\s+/i, // "Watch" prefix at beginning
  /\s+Season\s+\d+.*(?:Continue|Resume|Play|Start)$/i, // Remove season + UI text together
  /\s+(?:Continue Watching|Resume Watching)$/i,
  /\s+(?:Watch Now|Continue|Resume|Play|Start)$/i,
];
// JSON-LD property names that parseJsonLdObject probes, in this order, when looking for a year.
const YEAR_FIELDS = ['datePublished', 'startDate', 'uploadDate', 'copyrightYear', 'releasedEvent', 'releaseYear', 'dateCreated'];
// JSON-LD "@type" values for which parseJsonLdObject looks up a season count.
const SEASON_TYPES = ['TVSeries', 'TVShow', 'Series'];
/**
 * Extract a usable year value from various JSON-LD fields.
 *
 * - falsy input          → undefined
 * - number               → returned as-is
 * - string               → the first run of four digits, as a string
 * - array / plain object → the first year found in its values, searched
 *                          depth-first in property order
 *
 * Containers are walked exactly once. Letting arrays fall through from a
 * dedicated array loop into the generic object loop would traverse every
 * array twice, which becomes exponential for nested arrays.
 *
 * @param {unknown} value
 * @returns {string | number | undefined}
 */
function extractYear(value) {
  if (!value) return undefined;
  if (typeof value === 'number') return value;
  if (typeof value === 'string') {
    const match = value.match(/(\d{4})/);
    return match ? match[1] : undefined;
  }
  if (typeof value !== 'object') return undefined;

  // Object.values covers arrays and plain objects alike, in the same order.
  for (const nested of Object.values(value)) {
    const year = extractYear(nested);
    if (year) return year;
  }
  return undefined;
}
/**
 * Reduce a raw page title to the bare show/movie name.
 *
 * The " | Netflix…" suffix is removed first, then every Turkish UI pattern,
 * then every universal UI pattern, each applied once in list order. The
 * result is trimmed.
 * Example: "The Witcher izlemenizi bekliyor | Netflix" → "The Witcher".
 *
 * @param {string | undefined | null} title
 * @returns {string | undefined} The cleaned title, or undefined when the
 *   input is empty or nothing is left after cleaning.
 */
function cleanTitle(title) {
  if (!title) return undefined;

  const withoutSuffix = title.replace(NETFLIX_SUFFIX_REGEX, '');
  const uiPatterns = [...TURKISH_UI_PATTERNS, ...UNIVERSAL_UI_PATTERNS];
  const stripped = uiPatterns.reduce(
    (text, pattern) => text.replace(pattern, ''),
    withoutSuffix
  );

  const result = stripped.trim();
  return result === '' ? undefined : result;
}
/**
 * Collect name, year and season information from a parsed JSON-LD payload.
 *
 * The payload may be a single node or an array of nodes; non-object entries
 * are skipped. For each field the first node that yields a value wins:
 * - name:    a string `name`, passed through cleanTitle
 * - year:    the first YEAR_FIELDS property from which extractYear gets a value
 * - seasons: only for nodes whose `@type` is a string listed in SEASON_TYPES;
 *            taken from a numeric `numberOfSeasons`, else the length of a
 *            `containsSeason` array, else `seasons.length`, and formatted as
 *            "<n> Sezon"
 *
 * @param {any} obj - Parsed JSON-LD value.
 * @returns {{ name?: string, year?: string | number, seasons?: string }}
 */
function parseJsonLdObject(obj) {
  const meta = {};
  const nodes = Array.isArray(obj) ? obj : [obj];

  nodes.forEach((node) => {
    if (node === null || typeof node !== 'object') return;

    if (typeof node.name === 'string' && !meta.name) {
      meta.name = cleanTitle(node.name);
    }

    if (!meta.year) {
      for (const field of YEAR_FIELDS) {
        const candidate = node[field] ? extractYear(node[field]) : undefined;
        if (candidate) {
          meta.year = candidate;
          break;
        }
      }
    }

    // Season data is only read from series-like nodes, and only once.
    if (meta.seasons) return;
    const nodeType = node['@type'];
    if (typeof nodeType !== 'string' || !SEASON_TYPES.includes(nodeType)) return;

    let seasonCount;
    if (typeof node.numberOfSeasons === 'number') {
      seasonCount = node.numberOfSeasons;
    } else if (Array.isArray(node.containsSeason)) {
      seasonCount = node.containsSeason.length;
    }

    if (seasonCount) {
      meta.seasons = `${seasonCount} Sezon`;
    } else if (node.seasons && typeof node.seasons.length === 'number') {
      meta.seasons = `${node.seasons.length} Sezon`;
    }
  });

  return meta;
}
/**
 * Extract title metadata from Netflix HTML without executing any scripts.
 *
 * The name comes from the first of og:title, meta[name="title"] and <title>
 * that survives cleanTitle. Every `application/ld+json` script is then parsed
 * in document order; each one may supply a name, year or season label that is
 * still missing. Blocks whose JSON cannot be parsed are ignored.
 *
 * @param {string} html
 * @returns {{ name?: string, year?: string | number, seasons?: string | null }}
 *   An empty object when `html` is empty; otherwise `seasons` is null when
 *   no season information was found.
 */
export function parseNetflixHtml(html) {
  if (!html) return {};
  const $ = load(html);

  // Title sources in priority order; each is read lazily so later ones are
  // only consulted when the earlier ones yield nothing.
  const titleSources = [
    () => $('meta[property="og:title"]').attr('content'),
    () => $('meta[name="title"]').attr('content'),
    () => $('title').first().text()
  ];
  let name;
  for (const readTitle of titleSources) {
    name = cleanTitle(readTitle());
    if (name) break;
  }

  let year;
  let seasons = null;
  for (const el of $('script[type="application/ld+json"]').toArray()) {
    const raw = $(el).contents().text();
    if (!raw) continue;

    let info;
    try {
      info = parseJsonLdObject(JSON.parse(raw));
    } catch {
      // Ignore malformed JSON-LD blocks.
      continue;
    }

    if (!name && info.name) name = info.name;
    if (!year && info.year) year = info.year;
    if (!seasons && info.seasons) seasons = info.seasons;
  }

  return { name, year, seasons };
}

22
src/polyfill.js Normal file
View File

@@ -0,0 +1,22 @@
/**
* Minimal File/Blob polyfill for Node.js undici compatibility
* Only provides what's needed for fetch functionality
*/
import { Blob } from 'node:buffer';
// Bare-bones File for undici: a Blob that additionally exposes `name` and `lastModified`.
class PolyfillFile extends Blob {
  /**
   * @param {Array} parts - Blob parts, forwarded to the Blob constructor.
   * @param {unknown} name - File name; stored as a string.
   * @param {{ lastModified?: number }} [options] - Forwarded to Blob as well;
   *   `lastModified` falls back to the current time when null or undefined.
   */
  constructor(parts, name, options = {}) {
    super(parts, options);
    const { lastModified } = options;
    this.name = String(name);
    this.lastModified = lastModified ?? Date.now();
  }
}

// Re-exported so project code can import them explicitly.
export { PolyfillFile as File, Blob };

// Register the globals undici looks for (the critical part); existing
// implementations are left in place.
globalThis.File ||= PolyfillFile;
globalThis.Blob ||= Blob;