first commit
This commit is contained in:
41
src/headless.js
Normal file
41
src/headless.js
Normal file
@@ -0,0 +1,41 @@
|
||||
/** Viewport applied to every browser context created by this module. */
const DEFAULT_VIEWPORT = { width: 1280, height: 720 };

/**
 * Load a page with Playwright's Chromium and return its serialized HTML.
 *
 * Playwright is an optional dependency: it is imported lazily and a
 * descriptive error is thrown when the import fails.
 *
 * @param {string} url - Page to open.
 * @param {{ timeoutMs?: number, userAgent?: string, headless?: boolean }} [options]
 *   `timeoutMs` is forwarded to the navigation and the network-idle wait,
 *   `userAgent` to the browser context; the browser runs headless unless
 *   `headless` is exactly `false`.
 * @returns {Promise<string>} HTML of the page after navigation.
 * @throws {Error} When Playwright cannot be imported (the import failure is
 *   attached as `cause`), when Chromium is unavailable, or when launching or
 *   navigating fails.
 */
export async function fetchPageContentWithPlaywright(url, options = {}) {
  let playwright;
  try {
    playwright = await import('playwright');
  } catch (err) {
    throw new Error(
      'Playwright is not installed. Install the optional dependency "playwright" to enable headless scraping.',
      { cause: err }
    );
  }

  const { chromium } = playwright;
  if (!chromium) {
    throw new Error('Playwright chromium browser is unavailable.');
  }

  const { timeoutMs, userAgent, headless } = options;
  const browser = await chromium.launch({ headless: headless !== false });

  // Everything after launch lives inside the try so the browser is closed
  // even when creating the context or the page fails.
  try {
    const context = await browser.newContext({
      userAgent,
      viewport: DEFAULT_VIEWPORT
    });
    const page = await context.newPage();

    await page.goto(url, {
      waitUntil: 'domcontentloaded',
      timeout: timeoutMs
    });

    // Best effort: a page that never becomes network-idle should still yield
    // whatever HTML has been rendered so far, so this failure is ignored.
    await page.waitForLoadState('networkidle', { timeout: timeoutMs }).catch(() => {});

    return await page.content();
  } finally {
    await browser.close();
  }
}
|
||||
198
src/index.js
Normal file
198
src/index.js
Normal file
@@ -0,0 +1,198 @@
|
||||
import './polyfill.js';
|
||||
import { parseNetflixHtml } from './parser.js';
|
||||
import { fetchPageContentWithPlaywright } from './headless.js';
|
||||
|
||||
// Timeout in milliseconds used by scraperNetflix when options.timeoutMs is not provided.
const DEFAULT_TIMEOUT_MS = 15000;
|
||||
|
||||
// Logging helpers

/**
 * Print a success line to stdout, prefixed with a check mark.
 * @param {string} message - Text describing the step that succeeded.
 */
function logPass(message) {
  const line = `✅ ${message}`;
  console.log(line);
}
|
||||
|
||||
/**
 * Print a failure line to stderr in the form "❌ <message>: <detail>".
 *
 * The detail is the error's `message` when it has one; otherwise the thrown
 * value itself is stringified, so non-Error rejections (strings, undefined,
 * null) are reported instead of crashing the logger.
 *
 * @param {string} message - Context describing what failed.
 * @param {unknown} error - The caught value.
 */
function logError(message, error) {
  const detail = error?.message ?? String(error);
  console.error(`❌ ${message}: ${detail}`);
}
|
||||
|
||||
/**
 * Print a value to stdout as JSON indented by two spaces.
 * @param {unknown} result - Value to serialize and print.
 */
function logResult(result) {
  const serialized = JSON.stringify(result, null, 2);
  console.log(serialized);
}
|
||||
|
||||
/**
 * Validate a Netflix title URL and reduce it to its canonical form.
 *
 * The host must be `netflix.com` or one of its subdomains, and the path must
 * contain a `title` segment followed by a segment that starts with digits.
 * Locale prefixes, slugs after the digits and query strings are dropped.
 *
 * @param {string} inputUrl - URL supplied by the caller.
 * @returns {string} `https://www.netflix.com/title/<id>`.
 * @throws {Error} When the URL is missing, unparsable, not on netflix.com,
 *   or carries no numeric title id.
 */
function normalizeNetflixUrl(inputUrl) {
  if (!inputUrl) {
    throw new Error('Netflix URL\'i gereklidir.');
  }

  let parsed;
  try {
    parsed = new URL(inputUrl);
  } catch {
    throw new Error('Geçersiz URL sağlandı.');
  }

  // Compare against the registrable domain instead of a substring test, so
  // hosts like "notnetflix.com" or "netflix.evil.example" are rejected.
  // A trailing dot (fully-qualified form) is tolerated.
  const host = parsed.hostname.toLowerCase().replace(/\.$/, '');
  const isNetflixHost = host === 'netflix.com' || host.endsWith('.netflix.com');
  if (!isNetflixHost) {
    throw new Error('URL netflix.com adresini göstermelidir.');
  }

  const segments = parsed.pathname.split('/').filter(Boolean);
  const titleIndex = segments.indexOf('title');
  const idSegment = titleIndex >= 0 ? segments[titleIndex + 1] : undefined;
  const idMatch = idSegment ? idSegment.match(/^(\d+)/) : null;

  if (!idMatch) {
    throw new Error('URL\'de Netflix başlık ID\'si bulunamadı.');
  }

  return `https://www.netflix.com/title/${idMatch[1]}`;
}
|
||||
// User-Agent header value used when the caller does not supply options.userAgent.
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
|
||||
|
||||
/**
 * Make sure the fetch-related globals exist before any request is made.
 *
 * Some Node versions do not define File/Blob yet. A minimal File class is
 * installed first (undici expects a global File while it is being imported),
 * and any globals that are still missing are then filled in from undici.
 *
 * @returns {Promise<void>}
 */
async function ensureFetchGlobals() {
  if (typeof globalThis.File === 'undefined') {
    const { Blob: NodeBlob } = await import('node:buffer');
    globalThis.Blob ??= NodeBlob;

    // Minimal File implementation; only satisfies undici's import-time
    // expectation that a global File exists.
    globalThis.File = class PolyfillFile extends NodeBlob {
      constructor(parts, name, options = {}) {
        super(parts, options);
        this.name = String(name);
        this.lastModified = options.lastModified ?? Date.now();
      }
    };
  }

  const requiredGlobals = ['fetch', 'Headers', 'Request', 'Response', 'FormData', 'Blob', 'File'];
  const allPresent = requiredGlobals.every((key) => typeof globalThis[key] !== 'undefined');
  if (allPresent) return;

  // Only reached when something is missing: borrow the implementations from
  // undici without overwriting globals that already exist.
  const undici = await import('undici');
  for (const key of requiredGlobals) {
    globalThis[key] ??= undici[key];
  }
}
|
||||
|
||||
/**
 * Fetch a page's HTML with the global fetch API, aborting after a timeout.
 *
 * The timer stays armed until the body has been read, so a stalled download
 * is aborted as well.
 *
 * @param {string} url - Address to request.
 * @param {string} userAgent - Value sent in the User-Agent header.
 * @param {number} timeoutMs - Milliseconds to wait before aborting.
 * @returns {Promise<string>} Response body as text.
 * @throws {Error} On a non-2xx status (404 gets a dedicated message), on
 *   timeout (the underlying AbortError is attached as `cause`), or on any
 *   other fetch failure, which is rethrown unchanged.
 */
async function fetchStaticHtml(url, userAgent, timeoutMs) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const res = await globalThis.fetch(url, {
      headers: {
        'User-Agent': userAgent,
        Accept: 'text/html,application/xhtml+xml'
      },
      signal: controller.signal
    });

    if (!res.ok) {
      if (res.status === 404) {
        throw new Error('Netflix title not found (404).');
      }
      throw new Error(`Request failed with status ${res.status}.`);
    }

    return await res.text();
  } catch (err) {
    // Optional chaining keeps a nullish rejection value from masking the
    // real failure with a TypeError.
    if (err?.name === 'AbortError') {
      throw new Error('Request timed out while reaching Netflix.', { cause: err });
    }
    throw err;
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Report whether the parsed metadata is incomplete enough to warrant the
 * headless fallback: true when either the name or the year is missing.
 *
 * @param {{ name?: string, year?: string | number }} meta
 * @returns {boolean}
 */
function needsHeadless(meta) {
  const hasName = Boolean(meta?.name);
  const hasYear = Boolean(meta?.year);
  return !(hasName && hasYear);
}
|
||||
|
||||
/**
 * Scrape the metadata of a Netflix title.
 *
 * The URL is normalized, the page is fetched statically and parsed; when the
 * name or year is missing and headless mode is not disabled, the page is
 * loaded again through Playwright and the non-null fields from that pass
 * override the static ones. Progress, the final result and any failure are
 * logged; failures are rethrown.
 *
 * @param {string} inputUrl
 * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
 * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
 */
export async function scraperNetflix(inputUrl, options = {}) {
  try {
    await ensureFetchGlobals();

    const normalizedUrl = normalizeNetflixUrl(inputUrl);
    // The id is whatever follows the last slash of the normalized URL.
    const id = normalizedUrl.slice(normalizedUrl.lastIndexOf('/') + 1);
    const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
    const userAgent = options.userAgent || DEFAULT_USER_AGENT;
    const headlessAllowed = options.headless !== false;

    logPass(`Netflix URL normalize edildi: ${normalizedUrl}`);

    const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs);
    logPass("HTML içeriği başarıyla çekildi");

    let meta = parseNetflixHtml(staticHtml);

    if (needsHeadless(meta) && headlessAllowed) {
      logPass("Headless mode aktifleştiriliyor");

      const headlessHtml = await fetchPageContentWithPlaywright(normalizedUrl, {
        timeoutMs,
        userAgent,
        headless: headlessAllowed
      });

      // Only fields the headless pass actually produced may override the
      // values obtained from the static pass.
      const enriched = parseNetflixHtml(headlessHtml);
      const usableEntries = Object.entries(enriched).filter(([, value]) => value != null);
      meta = { ...meta, ...Object.fromEntries(usableEntries) };

      logPass("Headless scraping tamamlandı");
    } else {
      logPass("Statik scraping yeterli");
    }

    if (!meta.name) {
      throw new Error('Netflix sayfa meta verisi parse edilemedi.');
    }

    const { name, year } = meta;
    const finalResult = {
      url: normalizedUrl,
      id: id || '',
      name,
      year,
      seasons: meta.seasons ?? null
    };

    logResult(finalResult);
    return finalResult;
  } catch (error) {
    logError('Netflix scraping başarısız', error);
    throw error;
  }
}
|
||||
162
src/parser.js
Normal file
162
src/parser.js
Normal file
@@ -0,0 +1,162 @@
|
||||
import { load } from 'cheerio';
|
||||
|
||||
// Matches the " | Netflix ..." suffix that ends page titles.
const NETFLIX_SUFFIX_REGEX = /\s*\|\s*Netflix.*$/i;

// Turkish UI text patterns that Netflix adds to titles.
// Patterns are applied in array order, so the season-specific ones must come
// before the generic suffixes: otherwise the generic pattern strips the
// trailing UI text first and leaves a stray "Sezon X" behind.
const TURKISH_UI_PATTERNS = [
  /\s+Sezon\s+\d+.*izlemeye devam$/i, // "Sezon X izlemeye devam" → remove whole thing
  /\s+Sezon\s+\d+.*başla$/i, // "Sezon X başla" → remove whole thing
  /\s+izlemenizi bekliyor$/i, // "waiting for you to watch"
  /\s+izleyin$/i, // "watch"
  /\s+devam et$/i, // "continue"
  /\s+başla$/i, // "start"
  /\s+izlemeye devam$/i, // "continue watching"
];

// Other language UI patterns that might appear.
// Same ordering rule: the season + UI text pattern runs before the generic
// suffix patterns that would otherwise consume its trailing keyword.
const UNIVERSAL_UI_PATTERNS = [
  /^(?:Watch Now|Watch)\s+/i, // "Watch" prefix at beginning
  /\s+Season\s+\d+.*(?:Continue|Resume|Play|Start)$/i, // Remove season + UI text together
  /\s+(?:Watch Now|Continue|Resume|Play|Start)$/i,
  /\s+(?:Continue Watching|Resume Watching)$/i,
];

// JSON-LD properties inspected, in this order, when looking for a year.
const YEAR_FIELDS = ['datePublished', 'startDate', 'uploadDate', 'copyrightYear', 'releasedEvent', 'releaseYear', 'dateCreated'];

// JSON-LD @type values for which season information is extracted.
const SEASON_TYPES = ['TVSeries', 'TVShow', 'Series'];
|
||||
|
||||
/**
 * Pull a year out of a JSON-LD value of arbitrary shape.
 *
 * Numbers are returned unchanged, strings yield their first run of four
 * digits, and arrays/objects are searched depth-first in property order.
 *
 * @param {unknown} value
 * @returns {string | number | undefined} The first year found, if any.
 */
function extractYear(value) {
  if (!value) return undefined;

  switch (typeof value) {
    case 'number':
      return value;
    case 'string':
      return /(\d{4})/.exec(value)?.[1];
    case 'object': {
      // Arrays and plain objects are walked the same way: own values in order.
      for (const nested of Object.values(value)) {
        const found = extractYear(nested);
        if (found) return found;
      }
      return undefined;
    }
    default:
      return undefined;
  }
}
|
||||
|
||||
/**
 * Strip the Netflix suffix and UI call-to-action text from a page title.
 * Example: "The Witcher izlemenizi bekliyor | Netflix" → "The Witcher".
 *
 * The suffix is removed first, then the Turkish patterns, then the universal
 * ones, each in array order.
 *
 * @param {string | undefined | null} title
 * @returns {string | undefined} The cleaned title, or undefined when the
 *   input is empty or nothing is left after cleaning.
 */
function cleanTitle(title) {
  if (!title) return undefined;

  const pipeline = [NETFLIX_SUFFIX_REGEX, ...TURKISH_UI_PATTERNS, ...UNIVERSAL_UI_PATTERNS];
  const stripped = pipeline
    .reduce((text, pattern) => text.replace(pattern, ''), title)
    .trim();

  return stripped === '' ? undefined : stripped;
}
|
||||
|
||||
/**
 * Collect name, year and season count from a parsed JSON-LD payload.
 *
 * The payload may be a single node or an array of nodes; non-object entries
 * are skipped. For each field the first node that provides a value wins.
 *
 * @param {any} obj - Result of JSON.parse on a JSON-LD script block.
 * @returns {{ name?: string, year?: string | number, seasons?: string }}
 */
function parseJsonLdObject(obj) {
  const result = {};
  const candidates = Array.isArray(obj) ? obj : [obj];

  for (const entry of candidates) {
    if (entry === null || typeof entry !== 'object') continue;

    if (typeof entry.name === 'string' && !result.name) {
      result.name = cleanTitle(entry.name);
    }

    if (!result.year) {
      // Fields are tried in YEAR_FIELDS order; the first usable one wins.
      for (const field of YEAR_FIELDS) {
        const year = entry[field] ? extractYear(entry[field]) : undefined;
        if (year) {
          result.year = year;
          break;
        }
      }
    }

    // Season data is only read from series-like nodes, and only once.
    const type = entry['@type'];
    const isSeries = typeof type === 'string' && SEASON_TYPES.includes(type);
    if (!isSeries || result.seasons) continue;

    let seasonCount;
    if (typeof entry.numberOfSeasons === 'number') {
      seasonCount = entry.numberOfSeasons;
    } else if (Array.isArray(entry.containsSeason)) {
      seasonCount = entry.containsSeason.length;
    }

    if (seasonCount) {
      result.seasons = `${seasonCount} Sezon`;
    } else if (entry.seasons && typeof entry.seasons.length === 'number') {
      result.seasons = `${entry.seasons.length} Sezon`;
    }
  }

  return result;
}
|
||||
|
||||
/**
 * Extract title metadata from Netflix HTML without executing any scripts.
 *
 * The name is taken from the first non-empty source among og:title, the
 * "title" meta tag and the <title> element; JSON-LD blocks then fill in
 * whichever of name, year and seasons is still missing.
 *
 * @param {string} html
 * @returns {{ name?: string, year?: string | number, seasons?: string | null }}
 *   An empty object when no HTML is given.
 */
export function parseNetflixHtml(html) {
  if (!html) return {};

  const $ = load(html);

  // Title sources in priority order; evaluated lazily so later lookups only
  // run when the earlier ones produced nothing.
  const titleSources = [
    () => $('meta[property="og:title"]').attr('content'),
    () => $('meta[name="title"]').attr('content'),
    () => $('title').first().text(),
  ];

  let name;
  for (const readTitle of titleSources) {
    name = cleanTitle(readTitle());
    if (name) break;
  }

  let year;
  let seasons = null;

  $('script[type="application/ld+json"]').each((_, el) => {
    const raw = $(el).contents().text();
    if (!raw) return;

    let info;
    try {
      info = parseJsonLdObject(JSON.parse(raw));
    } catch {
      // Ignore malformed JSON-LD blocks.
      return;
    }

    if (!name && info.name) name = info.name;
    if (!year && info.year) year = info.year;
    if (!seasons && info.seasons) seasons = info.seasons;
  });

  return { name, year, seasons };
}
|
||||
22
src/polyfill.js
Normal file
22
src/polyfill.js
Normal file
@@ -0,0 +1,22 @@
|
||||
/**
|
||||
* Minimal File/Blob polyfill for Node.js undici compatibility
|
||||
* Only provides what's needed for fetch functionality
|
||||
*/
|
||||
|
||||
import { Blob } from 'node:buffer';
|
||||
|
||||
/**
 * Small File stand-in built on node:buffer's Blob, for undici compatibility.
 * It adds only the `name` and `lastModified` properties on top of Blob.
 */
class PolyfillFile extends Blob {
  /**
   * @param {Array} parts - Blob parts, passed to the Blob constructor.
   * @param {*} name - File name; coerced to a string.
   * @param {object} [options] - Passed to Blob; `lastModified` falls back to
   *   the current time when absent.
   */
  constructor(parts, name, options = {}) {
    super(parts, options);
    Object.assign(this, {
      name: String(name),
      lastModified: options.lastModified ?? Date.now(),
    });
  }
}

// Export for use in our code
export { PolyfillFile as File, Blob };

// Install the globals undici looks for, keeping any implementation that is
// already present (this is the critical part).
globalThis.File = globalThis.File || PolyfillFile;
globalThis.Blob = globalThis.Blob || Blob;
|
||||
Reference in New Issue
Block a user