From fefa6627e9252e1876f1d3b1a78183f33b7ab79c Mon Sep 17 00:00:00 2001 From: sbilketay Date: Sun, 23 Nov 2025 16:09:39 +0300 Subject: [PATCH] =?UTF-8?q?amazon=20prime=20scrap=20=C3=B6zelli=C4=9Fi=20e?= =?UTF-8?q?klendi?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 73 ++- doc/API.md | 207 ++++++++- src/index.js | 113 ++++- src/parser.js | 522 +++++++++++++++++++++- tests/{scrape.test.js => netflix.test.js} | 42 ++ tests/prime.test.js | 59 +++ 6 files changed, 988 insertions(+), 28 deletions(-) rename tests/{scrape.test.js => netflix.test.js} (56%) create mode 100644 tests/prime.test.js diff --git a/README.md b/README.md index 1bae3ad..e6d9f25 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # metascraper -Netflix URL'lerinden film ve dizi meta verilerini (başlık, yıl, sezon bilgisi) çıkaran modern Node.js kütüphanesi. +Netflix ve Amazon Prime Video URL'lerinden film ve dizi meta verilerini (başlık, yıl, sezon bilgisi, thumbnail görseli) çıkaran modern Node.js kütüphanesi. 
## 🚀 Kurulum @@ -22,7 +22,10 @@ console.log(movie); // "id": "82123114", // "name": "ONE SHOT with Ed Sheeran", // "year": "2025", -// "seasons": null +// "seasons": null, +// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg", +// "info": "Ed Sheeran, matematiğin mucizevi gücünü ve müziğin birleştirici gücünü sergileyen benzersiz bir performansla sahneye çıkıyor.", +// "genre": "Belgesel" // } ``` @@ -38,22 +41,48 @@ console.log(series); // "id": "80189685", // "name": "The Witcher", // "year": "2025", -// "seasons": "4 Sezon" +// "seasons": "4 Sezon", +// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg", +// "info": "Mutasyona uğramış bir canavar avcısı olan Rivyalı Geralt, insanların çoğunlukla yaratıklardan daha uğursuz olduğu, karmaşa içindeki bir dünyada kaderine doğru yol alıyor.", +// "genre": "Aksiyon" +// } +``` + +### Amazon Prime Video Meta Verisi + +```javascript +import { scraperPrime } from 'metascraper'; + +const movie = await scraperPrime('https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie'); +console.log(movie); +// { +// "url": "https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL", +// "id": "0NHIN3TGAI9L7VZ45RS52RHUPL", +// "name": "Little Women", +// "year": "2020", +// "seasons": null, +// "thumbnail": "https://m.media-amazon.com/images/S/pv-target-images/c1b08ebea5ba29c47145c623e7d1c586290221ec12fa93850029e581f54049c4.jpg", +// "info": "In the years after the Civil War, Jo March lives in New York and makes her living as a writer, while her sister Amy studies painting in Paris.", +// "genre": "Dram" // } ``` ### URL Normalizasyonu -URL normalizasyonu artık `scraperNetflix` fonksiyonu içinde otomatik olarak 
yapılır. +URL normalizasyonu artık `scraperNetflix` ve `scraperPrime` fonksiyonları içinde otomatik olarak yapılır. ## ✨ Özellikler -- ✅ **Film ve Dizi Destekli** - Her tür Netflix içeriği için meta veri +- ✅ **Çoklu Platform Desteği** - Netflix ve Amazon Prime Video scraping +- ✅ **Film ve Dizi Destekli** - Her tür içerik için meta veri +- ✅ **Thumbnail Görselleri** - Otomatik afiş/poster görselleri çıkarır +- ✅ **Film/Dizi Açıklamaları** - İçerik özet bilgilerini (info) çıkarır +- ✅ **Tür Bilgisi** - Türkçe ve İngilizce tür normalizasyonu - ✅ **Türkçe Arayüz Temizleme** - "izlemenizi bekliyor" gibi metinleri temizler -- ✅ **JSON-LD Tabanlı** - Netflix'in yapısal verilerini kullanır +- ✅ **JSON-LD Tabanlı** - Platformların yapısal verilerini kullanır - ✅ **Hızlı ve Güvenilir** - Statik HTML scraping + Playwright fallback - ✅ **Node.js 18+ Uyumlu** - Modern JavaScript özellikleri -- ✅ **Türkiye Odaklı** - Netflix Türkiye URL'leri optimize edilmiş +- ✅ **Türkiye Odaklı** - Netflix Türkiye ve Prime Video URL'leri optimize edilmiş ## 🔧 API @@ -75,7 +104,35 @@ Netflix URL'sinden meta veri çeker. URL normalizasyonu otomatik olarak yapılı id: string, // Netflix ID name: string, // İçerik adı year: string | number, // Yılı - seasons: string | null // Sezon bilgisi (diziler için) + seasons: string | null, // Sezon bilgisi (diziler için) + thumbnail: string | null, // Afiş/poster görseli URL'si + info: string | null, // Film/dizi açıklaması + genre: string | null // Tür bilgisi (Türkçe) +} +``` + +### `scraperPrime(url, options)` + +Amazon Prime Video URL'sinden meta veri çeker. URL normalizasyonu otomatik olarak yapılır. 
+ +**Parametreler:** +- `url` (string): Amazon Prime Video URL'i +- `options` (object, isteğe bağlı): + - `headless` (boolean): Headless mode (default: true) + - `timeoutMs` (number): Timeout süresi (default: 15000) + - `userAgent` (string): Özel User-Agent + +**Dönen Veri:** +```typescript +{ + url: string, // Temizlenmiş URL + id: string, // Prime Video ID + name: string, // İçerik adı + year: string | number, // Yılı + seasons: string | null, // Sezon bilgisi (diziler için, filmler için null) + thumbnail: string | null, // Afiş/poster görseli URL'si + info: string | null, // Film/dizi açıklaması + genre: string | null // Tür bilgisi (Türkçe normalize edilmiş) } ``` diff --git a/doc/API.md b/doc/API.md index 02c2b2a..fd4ab6b 100644 --- a/doc/API.md +++ b/doc/API.md @@ -6,6 +6,10 @@ Netflix metadata extraction function with automatic fallback and Turkish localization. +### `scraperPrime(inputUrl, options?)` + +Amazon Prime Video metadata extraction function with automatic fallback and Turkish localization. 
+ #### Parameters | Parameter | Type | Required | Default | Description | @@ -30,6 +34,9 @@ Promise<{ name: string; // Clean title (Turkish UI removed) year: string \| number \| undefined; // Release year seasons: string \| null; // Season info for TV series + thumbnail: string \| null; // Poster/thumbnail image URL + info: string \| null; // Content description/summary + genre: string \| null; // Genre (Turkish normalized) }> ``` @@ -46,7 +53,10 @@ console.log(result); // "id": "82123114", // "name": "ONE SHOT with Ed Sheeran", // "year": "2025", -// "seasons": null +// "seasons": null, +// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg", +// "info": "Ed Sheeran, matematiğin mucizevi gücünü ve müziğin birleştirici gücünü sergileyen benzersiz bir performansla sahneye çıkıyor.", +// "genre": "Belgesel" // } ``` @@ -78,6 +88,88 @@ try { } ``` +### `scraperPrime(inputUrl, options?)` + +Amazon Prime Video metadata extraction function with automatic fallback and Turkish localization. 
+ +#### Parameters + +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `inputUrl` | `string` | ✅ | - | Amazon Prime Video URL (any format) | +| `options` | `object` | ❌ | `{}` | Configuration options | + +#### Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `headless` | `boolean` | `true` | Enable Playwright fallback for missing data | +| `timeoutMs` | `number` | `15000` | Request timeout in milliseconds | +| `userAgent` | `string` | Chrome 118 User-Agent | Custom User-Agent string | + +#### Returns + +```typescript +Promise<{ + url: string; // Normalized Prime Video URL + id: string; // Prime Video content ID + name: string; // Clean title (Amazon UI removed) + year: string | number | undefined; // Release year + seasons: string | null; // Season info for TV series (null for movies) + thumbnail: string | null; // Poster/thumbnail image URL + info: string | null; // Content description/summary + genre: string | null; // Genre (Turkish normalized) +}> +``` + +#### Examples + +**Basic Usage** +```javascript +import { scraperPrime } from 'metascraper'; + +const result = await scraperPrime('https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie'); +console.log(result); +// { +// "url": "https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL", +// "id": "0NHIN3TGAI9L7VZ45RS52RHUPL", +// "name": "Little Women", +// "year": "2020", +// "seasons": null, +// "thumbnail": "https://m.media-amazon.com/images/S/pv-target-images/c1b08ebea5ba29c47145c623e7d1c586290221ec12fa93850029e581f54049c4.jpg", +// "info": "In the years after the Civil War, Jo March lives in New York and makes her living as a writer...", +// "genre": "Dram" +// } +``` + +**Advanced Configuration** +```javascript +import { scraperPrime } from 'metascraper'; + +const result = await scraperPrime( + 
'https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL', + { + headless: false, // Disable browser fallback + timeoutMs: 30000, // 30 second timeout + userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + } +); +``` + +**Error Handling** +```javascript +import { scraperPrime } from 'metascraper'; + +try { + const result = await scraperPrime('https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL'); + console.log('Success:', result); +} catch (error) { + console.error('Scraping failed:', error.message); + // Turkish error messages for Turkish users + // "Amazon Prime scraping başarısız: Amazon Prime URL'i gereklidir." +} +``` + ## 🧩 Internal APIs ### `parseNetflixHtml(html)` - Parser API @@ -97,6 +189,9 @@ Parse Netflix HTML content to extract metadata without network requests. name?: string; // Clean title year?: string \| number; // Release year seasons?: string \| null; // Season information + thumbnail?: string \| null; // Thumbnail image URL + info?: string \| null; // Content description + genre?: string \| null; // Genre information } ``` @@ -114,7 +209,10 @@ console.log(metadata); // { // "name": "The Witcher", // "year": "2025", -// "seasons": "4 Sezon" +// "seasons": "4 Sezon", +// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg", +// "info": "Mutasyona uğramış bir canavar avcısı olan Rivyalı Geralt, insanların çoğunlukla yaratıklardan daha uğursuz olduğu, karmaşa içindeki bir dünyada kaderine doğru yol alıyor.", +// "genre": "Aksiyon" // } ``` @@ -165,6 +263,50 @@ try { } ``` +### `parsePrimeHtml(html)` - Prime Video Parser API + +Parse Amazon Prime Video HTML content to extract metadata without network requests. 
+ +#### Parameters + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `html` | `string` | ✅ | Raw HTML content from Prime Video page | + +#### Returns + +```typescript +{ + name?: string; // Clean title + year?: string | number; // Release year + seasons?: string | null; // Season information + thumbnail?: string | null; // Thumbnail image URL + info?: string | null; // Content description + genre?: string | null; // Genre information +} +``` + +#### Examples + +```javascript +import { parsePrimeHtml } from 'metascraper/parser'; + +// With cached HTML +const fs = await import('node:fs'); +const html = fs.readFileSync('prime-page.html', 'utf8'); +const metadata = parsePrimeHtml(html); + +console.log(metadata); +// { +// "name": "Little Women", +// "year": "2020", +// "seasons": null, +// "thumbnail": "https://m.media-amazon.com/images/S/pv-target-images/...", +// "info": "In the years after the Civil War, Jo March lives in New York...", +// "genre": "Dram" +// } +``` + ## 🔧 URL Processing ### Supported URL Formats @@ -199,6 +341,36 @@ The function validates URLs with these rules: 'https://www.netflix.com/title/abc' // Non-numeric ID ``` +### Amazon Prime Video URL Formats + +The `scraperPrime` function automatically normalizes various Prime Video URL formats: + +| Input Format | Normalized Output | Notes | +|--------------|-------------------|-------| +| `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | Standard format | +| `https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie` | `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | Turkish locale with tracking | +| `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL?ref_=atv_dp` | `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | With parameters | + +### Prime Video URL Validation + +The function validates URLs with these 
rules: + +1. **Format**: Must be a valid URL +2. **Domain**: Must contain `primevideo.com` +3. **Path**: Must contain `detail/` followed by content ID +4. **ID Extraction**: Uses path parsing to extract content ID + +```javascript +// These will work: +'https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL' +'https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie' + +// These will fail: +'https://google.com' // Wrong domain +'https://www.primevideo.com/browse' // No content ID +'not-a-url' // Invalid format +``` + ## 🌍 Localization Features ### Turkish UI Text Removal @@ -263,6 +435,37 @@ If JSON-LD is unavailable, falls back to HTML meta tags: The Witcher izlemenizi bekliyor | Netflix ``` +### Thumbnail Image Extraction + +The parser automatically extracts poster/thumbnail images from Netflix meta tags: + +```javascript +// Thumbnail selectors in priority order: +const THUMBNAIL_SELECTORS = [ + 'meta[property="og:image"]', // Open Graph image (primary) + 'meta[name="twitter:image"]', // Twitter card image + 'meta[property="og:image:secure_url"]', // Secure image URL + 'link[rel="image_src"]', // Image source link + 'meta[itemprop="image"]' // Schema.org image +]; +``` + +**Example Netflix HTML:** +```html +<meta property="og:image" content="https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg"> +``` + +**URL Validation:** +- Only Netflix CDN domains are accepted (nflxso.net, nflximg.net, etc.) +- Image file extensions are verified (.jpg, .jpeg, .png, .webp) +- Query parameters are cleaned for stability + +**Fallback Strategy:** +1. Try Open Graph image first (most reliable) +2. Fall back to Twitter card image +3. Try other meta tags if needed +4. 
Return null if no valid thumbnail found + ### Season Detection For TV series, extracts season information: diff --git a/src/index.js b/src/index.js index 607e5d1..a65b799 100644 --- a/src/index.js +++ b/src/index.js @@ -1,15 +1,21 @@ import './polyfill.js'; -import { parseNetflixHtml } from './parser.js'; +import { parseNetflixHtml, parsePrimeHtml } from './parser.js'; import { fetchPageContentWithPlaywright } from './headless.js'; const DEFAULT_TIMEOUT_MS = 15000; // 🎯 LOG SİSTEMİ -function logPass(message) { +function logPass(message, data) { console.log(`✅ ${message}`); + if (data) { + console.log(JSON.stringify(data, null, 2)); + } } function logError(message, error) { + if (process.env.NODE_ENV === 'test') { + return; + } console.error(`❌ ${message}: ${error.message}`); } @@ -46,6 +52,35 @@ function normalizeNetflixUrl(inputUrl) { const id = idMatch[1]; return `https://www.netflix.com/title/${id}`; } + +// 📋 AMAZON PRIME URL NORMALİZASYON FONKSİYONU +function normalizePrimeUrl(inputUrl) { + if (!inputUrl) { + throw new Error('Amazon Prime URL\'i gereklidir.'); + } + + let parsed; + try { + parsed = new URL(inputUrl); + } catch (err) { + throw new Error('Geçersiz URL sağlandı.'); + } + + if (!parsed.hostname.includes('primevideo.com')) { + throw new Error('URL primevideo.com adresini göstermelidir.'); + } + + const segments = parsed.pathname.split('/').filter(Boolean); + const detailIndex = segments.indexOf('detail'); + + if (detailIndex >= 0 && segments[detailIndex + 1]) { + const id = segments[detailIndex + 1]; + return `https://www.primevideo.com/detail/${id}`; + } + + throw new Error('URL\'de Amazon Prime içerik ID\'si bulunamadı.'); +} + const DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'; @@ -139,7 +174,7 @@ function needsHeadless(meta) { * Netflix meta verilerini scrape eder. 
* @param {string} inputUrl * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options] - * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>} + * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }>} */ export async function scraperNetflix(inputUrl, options = {}) { try { @@ -150,15 +185,11 @@ export async function scraperNetflix(inputUrl, options = {}) { const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS; const userAgent = options.userAgent || DEFAULT_USER_AGENT; - logPass(`Netflix URL normalize edildi: ${normalizedUrl}`); - const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs); - logPass("HTML içeriği başarıyla çekildi"); let meta = parseNetflixHtml(staticHtml); if (needsHeadless(meta) && options.headless !== false) { - logPass("Headless mode aktifleştiriliyor"); const headlessHtml = await fetchPageContentWithPlaywright(normalizedUrl, { timeoutMs, userAgent, @@ -172,9 +203,6 @@ export async function scraperNetflix(inputUrl, options = {}) { Object.entries(enriched).filter(([_, value]) => value !== undefined && value !== null) ) }; - logPass("Headless scraping tamamlandı"); - } else { - logPass("Statik scraping yeterli"); } if (!meta.name) { @@ -186,13 +214,74 @@ export async function scraperNetflix(inputUrl, options = {}) { id: id || '', name: meta.name, year: meta.year, - seasons: meta.seasons ?? null + seasons: meta.seasons ?? null, + thumbnail: meta.thumbnail ?? null, + info: meta.info ?? null, + genre: meta.genre ?? null }; - logResult(finalResult); + logPass('Netflix scraping tamamlandı', finalResult); return finalResult; } catch (error) { logError('Netflix scraping başarısız', error); throw error; } } + +/** + * Amazon Prime meta verilerini scrape eder. 
+ * @param {string} inputUrl + * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options] + * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null, thumbnail: string | null, info: string | null, genre: string | null }>} + */ +export async function scraperPrime(inputUrl, options = {}) { + try { + await ensureFetchGlobals(); + + const normalizedUrl = normalizePrimeUrl(inputUrl); + const id = normalizedUrl.split('/').pop(); + const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS; + const userAgent = options.userAgent || DEFAULT_USER_AGENT; + + const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs); + + let meta = parsePrimeHtml(staticHtml); + + if (needsHeadless(meta) && options.headless !== false) { + const headlessHtml = await fetchPageContentWithPlaywright(normalizedUrl, { + timeoutMs, + userAgent, + headless: options.headless !== false + }); + + const enriched = parsePrimeHtml(headlessHtml); + meta = { + ...meta, + ...Object.fromEntries( + Object.entries(enriched).filter(([_, value]) => value !== undefined && value !== null) + ) + }; + } + + if (!meta.name) { + throw new Error('Amazon Prime sayfa meta verisi parse edilemedi.'); + } + + const finalResult = { + url: normalizedUrl, + id: id || '', + name: meta.name, + year: meta.year, + seasons: meta.seasons ?? null, + thumbnail: meta.thumbnail ?? null, + info: meta.info ?? null, + genre: meta.genre ?? 
null + }; + + logPass('Amazon Prime scraping tamamlandı', finalResult); + return finalResult; + } catch (error) { + logError('Amazon Prime scraping başarısız', error); + throw error; + } +} diff --git a/src/parser.js b/src/parser.js index 684f785..f0b759b 100644 --- a/src/parser.js +++ b/src/parser.js @@ -24,6 +24,26 @@ const UNIVERSAL_UI_PATTERNS = [ const YEAR_FIELDS = ['datePublished', 'startDate', 'uploadDate', 'copyrightYear', 'releasedEvent', 'releaseYear', 'dateCreated']; const SEASON_TYPES = ['TVSeries', 'TVShow', 'Series']; +/** + * Netflix thumbnail image patterns for extraction + */ +const THUMBNAIL_SELECTORS = [ + 'meta[property="og:image"]', // Open Graph image (primary) + 'meta[name="twitter:image"]', // Twitter card image + 'meta[property="og:image:secure_url"]', // Secure image URL + 'link[rel="image_src"]', // Image source link + 'meta[itemprop="image"]' // Schema.org image +]; + +/** + * Netflix description/meta description patterns for extraction + */ +const DESCRIPTION_SELECTORS = [ + 'meta[name="description"]', // Standard meta description (primary) + 'meta[property="og:description"]', // Open Graph description + 'meta[itemprop="description"]' // Schema.org description +]; + /** * Extract a usable year value from various JSON-LD fields. * @param {unknown} value @@ -79,6 +99,141 @@ function cleanTitle(title) { return trimmed || undefined; } +/** + * Netflix thumbnail image extraction from HTML meta tags. + * Extracts thumbnail URLs from various meta tags in priority order. 
+ * @param {string} html - Raw HTML content + * @returns {string | undefined} Thumbnail URL or undefined if not found + */ +function extractThumbnail(html) { + if (!html) return undefined; + + const $ = load(html); + + // Try different meta tag selectors in priority order + for (const selector of THUMBNAIL_SELECTORS) { + const imageUrl = $(selector).attr('content'); + if (imageUrl && isValidThumbnailUrl(imageUrl)) { + return normalizeThumbnailUrl(imageUrl); + } + } + + return undefined; +} + +/** + * Check if URL is a valid Netflix thumbnail URL. + * @param {string} url - URL to validate + * @returns {boolean} True if valid thumbnail URL + */ +function isValidThumbnailUrl(url) { + if (!url || typeof url !== 'string') return false; + + // Check for Netflix CDN domains + const netflixDomains = [ + 'nflxso.net', + 'assets.nflxext.com', + 'netflix.com', + 'occ-0-', + 'nflximg.net' + ]; + + const hasNetflixDomain = netflixDomains.some(domain => url.includes(domain)); + const hasImageExtension = /\.(jpg|jpeg|png|webp)(\?.*)?$/i.test(url); + + return hasNetflixDomain && hasImageExtension; +} + +/** + * Normalize thumbnail URL by removing query parameters if needed. + * @param {string} url - Original thumbnail URL + * @returns {string} Normalized URL + */ +function normalizeThumbnailUrl(url) { + if (!url) return url; + + try { + const urlObj = new URL(url); + // Remove certain query parameters that might cause issues + const paramsToRemove = ['r', 't', 'e', 'v']; + const searchParams = new URLSearchParams(urlObj.search); + + paramsToRemove.forEach(param => searchParams.delete(param)); + + // Reconstruct URL without removed parameters + const cleanUrl = urlObj.origin + urlObj.pathname + (searchParams.toString() ? '?' + searchParams.toString() : ''); + return cleanUrl; + } catch { + // If URL parsing fails, return original + return url; + } +} + +/** + * Netflix description/info extraction from HTML meta tags. 
+ * Extracts description information from various meta tags in priority order. + * @param {string} html - Raw HTML content + * @returns {string | undefined} Description info or undefined if not found + */ +function extractInfo(html) { + if (!html) return undefined; + + const $ = load(html); + + // Try different meta tag selectors in priority order + for (const selector of DESCRIPTION_SELECTORS) { + const description = $(selector).attr('content'); + if (description && description.trim()) { + // Clean up description - remove Netflix-specific suffixes + const cleaned = description.trim() + .replace(/\s*\|\s*Netflix.*$/i, '') // Remove Netflix suffix + .replace(/\s+Fragmanları izleyin ve daha fazla bilgi edinin\.$/, ''); // Remove trailing call-to-action + return cleaned || undefined; + } + } + + return undefined; +} + +/** + * Normalize and clean genre information. + * Maps Netflix genre names to Turkish equivalents and cleans them up. + * @param {string | null | undefined} genre - Raw genre from JSON-LD + * @returns {string | null} Normalized Turkish genre or null + */ +function normalizeGenre(genre) { + if (!genre || typeof genre !== 'string') return null; + + const genreMapping = { + 'Aksiyon': 'Aksiyon', + 'Action': 'Aksiyon', + 'Macera': 'Macera', + 'Adventure': 'Macera', + 'Bilim Kurgu': 'Bilim Kurgu', + 'Science Fiction': 'Bilim Kurgu', + 'Fantastik': 'Fantastik', + 'Fantasy': 'Fantastik', + 'Dram': 'Dram', + 'Drama': 'Dram', + 'Komedi': 'Komedi', + 'Comedy': 'Komedi', + 'Korku': 'Korku', + 'Horror': 'Korku', + 'Gerilim': 'Gerilim', + 'Thriller': 'Gerilim', + 'Gizem': 'Gizem', + 'Mystery': 'Gizem', + 'Romantik': 'Romantik', + 'Romance': 'Romantik' + }; + + // Clean up genre name + const cleanedGenre = genre.trim(); + + // Return mapped genre or original if no mapping exists + return genreMapping[cleanedGenre] || cleanedGenre || null; +} + /** * Parse JSON-LD objects for metadata. 
* @param {any} obj @@ -121,6 +276,19 @@ function parseJsonLdObject(obj) { result.seasons = `${entry.seasons.length} Sezon`; } } + + // Extract info/description from JSON-LD + if (!result.info && typeof entry.description === 'string') { + const cleanedInfo = entry.description.trim() + .replace(/\s*\|\s*Netflix.*$/i, '') + .replace(/\s+Fragmanları izleyin ve daha fazla bilgi edinin\.$/, ''); + result.info = cleanedInfo || undefined; + } + + // Extract genre from JSON-LD + if (!result.genre && typeof entry.genre === 'string') { + result.genre = normalizeGenre(entry.genre); + } } return result; @@ -129,7 +297,7 @@ function parseJsonLdObject(obj) { /** * Parse Netflix HTML to extract metadata without executing scripts. * @param {string} html - * @returns {{ name?: string, year?: string | number, seasons?: string | null }} + * @returns {{ name?: string, year?: string | number, seasons?: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }} */ export function parseNetflixHtml(html) { if (!html) return {}; @@ -143,20 +311,362 @@ export function parseNetflixHtml(html) { let year; let seasons = null; + let thumbnail = null; + let info = null; + let genre = null; + + // Extract thumbnail from meta tags + thumbnail = extractThumbnail(html); + + // Extract info from meta tags (fallback if JSON-LD doesn't have it) + info = extractInfo(html); $('script[type="application/ld+json"]').each((_, el) => { const raw = $(el).contents().text(); if (!raw) return; try { const parsed = JSON.parse(raw); - const info = parseJsonLdObject(parsed); - if (!name && info.name) name = info.name; - if (!year && info.year) year = info.year; - if (!seasons && info.seasons) seasons = info.seasons; + const jsonLdInfo = parseJsonLdObject(parsed); + if (!name && jsonLdInfo.name) name = jsonLdInfo.name; + if (!year && jsonLdInfo.year) year = jsonLdInfo.year; + if (!seasons && jsonLdInfo.seasons) seasons = jsonLdInfo.seasons; + // Also check JSON-LD for image information + 
if (!thumbnail && jsonLdInfo.image) { + thumbnail = typeof jsonLdInfo.image === 'string' ? jsonLdInfo.image : jsonLdInfo.image.url; + } + // Extract info and genre from JSON-LD if available + if (!info && jsonLdInfo.info) info = jsonLdInfo.info; + if (!genre && jsonLdInfo.genre) genre = jsonLdInfo.genre; } catch { // Ignore malformed JSON-LD blocks. } }); - return { name, year, seasons }; + return { name, year, seasons, thumbnail, info, genre }; +} + +/** + * Amazon Prime specific constants and functions + */ + +// Amazon Prime selectors for metadata extraction +const PRIME_TITLE_SELECTORS = [ + 'meta[property="og:title"]', + 'meta[name="title"]', + 'title', + '[data-testid="title"]', + '.dv-node-dp-title', + 'h1' +]; + +const PRIME_THUMBNAIL_SELECTORS = [ + 'meta[property="og:image"]', + 'meta[name="twitter:image"]', + 'meta[property="og:image:secure_url"]', + '[data-testid="hero-image"] img', + '.dv-node-dp-hero-image img', + 'img[alt*="poster"]' +]; + +const PRIME_DESCRIPTION_SELECTORS = [ + 'meta[name="description"]', + 'meta[property="og:description"]', + 'meta[itemprop="description"]', + '[data-testid="synopsis"]', + '.dv-node-dp-synopsis', + '.synopsis' +]; + +const PRIME_YEAR_SELECTORS = [ + 'meta[itemprop="dateCreated"]', + 'meta[property="video:release_date"]', + '[data-testid="release-year"]', + '.release-year', + '[class*="year"]' +]; + +const PRIME_GENRE_SELECTORS = [ + 'meta[itemprop="genre"]', + '[data-testid="genres"]', + '.genres', + '[class*="genre"]' +]; + +/** + * Extract title from Amazon Prime page + */ +function extractPrimeTitle($, html) { + // Try meta tags first + for (const selector of PRIME_TITLE_SELECTORS) { + const title = $(selector).attr('content') || $(selector).text(); + if (title && title.trim()) { + return cleanPrimeTitle(title.trim()); + } + } + + // Try to extract from embedded JSON data + const jsonMatch = html.match(/"title":"([^"]+)"/); + if (jsonMatch && jsonMatch[1]) { + return cleanPrimeTitle(jsonMatch[1]); + } + + return 
undefined; +} + +/** + * Extract year from Amazon Prime page + */ +function extractPrimeYear($, html) { + // Try structured data first + for (const selector of PRIME_YEAR_SELECTORS) { + const yearText = $(selector).attr('content') || $(selector).text(); + if (yearText) { + const yearMatch = yearText.match(/(\d{4})/); + if (yearMatch) return yearMatch[1]; + } + } + + // Try to extract from embedded JSON data + const jsonMatch = html.match(/"releaseYear"\s*:\s*"(\d{4})"/); + if (jsonMatch) return jsonMatch[1]; + + // Try to find year in title + const title = extractPrimeTitle($, html); + if (title) { + const yearMatch = title.match(/(\d{4})/); + if (yearMatch) return yearMatch[1]; + } + + return undefined; +} + +/** + * Extract thumbnail from Amazon Prime page + */ +function extractPrimeThumbnail($, html) { + for (const selector of PRIME_THUMBNAIL_SELECTORS) { + const imageUrl = $(selector).attr('content') || $(selector).attr('src'); + if (imageUrl && isValidPrimeThumbnail(imageUrl)) { + return imageUrl; + } + } + + // Try to extract from embedded JSON data + const jsonMatch = html.match(/"heroImageUrl":"([^"]+)"/); + if (jsonMatch && jsonMatch[1]) { + return jsonMatch[1].replace(/\\u002F/g, '/'); + } + + return undefined; +} + +/** + * Extract info/description from Amazon Prime page + */ +function extractPrimeInfo($, html) { + for (const selector of PRIME_DESCRIPTION_SELECTORS) { + const description = $(selector).attr('content') || $(selector).text(); + if (description && description.trim()) { + return cleanPrimeDescription(description.trim()); + } + } + + // Try to extract from embedded JSON data + const jsonMatch = html.match(/"synopsis":"([^"]+)"/); + if (jsonMatch && jsonMatch[1]) { + return cleanPrimeDescription(jsonMatch[1].replace(/\\u002F/g, '/').replace(/\\"/g, '"')); + } + + return undefined; +} + +/** + * Extract genres from Amazon Prime page + */ +function extractPrimeGenre($, html) { + for (const selector of PRIME_GENRE_SELECTORS) { + const genreText = 
$(selector).attr('content') || $(selector).text(); + if (genreText && genreText.trim()) { + return normalizePrimeGenre(genreText.trim()); + } + } + + // Try to extract from embedded JSON data + const jsonMatch = html.match(/"genres":\["([^"]+)"\]/); + if (jsonMatch && jsonMatch[1]) { + return normalizePrimeGenre(jsonMatch[1]); + } + + return undefined; +} + +/** + * Extract seasons information from Amazon Prime page + */ +function extractPrimeSeasons($, html) { + // Try to find the highest season number from all season matches + const allSeasonMatches = html.match(/\d+\s*\.?\s*Sezon/gi); + if (allSeasonMatches) { + const seasons = allSeasonMatches.map(match => parseInt(match.match(/\d+/)[0])); + const maxSeason = Math.max(...seasons); + if (maxSeason > 0) { + return `${maxSeason} Season`; + } + } + + // Look for series indicators in a more specific way + const seriesIndicators = [ + /\b(Season|Sezon)\s*\d+/i, + /\bepisode\s*\d+/i, + /\bbölüm\s*\d+/i, + /"type":\s*["']\s*(TV\s*Series|Dizi)/i, + /\b(TV\s*Series|Dizi)\s*$/i + ]; + + const hasSeriesIndicator = seriesIndicators.some(pattern => pattern.test(html)); + if (hasSeriesIndicator) { + return '1 Season'; // Default for series without clear season count + } + + // Look for movie indicators + const movieIndicators = [ + /\b(film|movie)\s*$/i, + /"type":\s*["']\s*(Movie|Film)/i + ]; + + const hasMovieIndicator = movieIndicators.some(pattern => pattern.test(html)); + if (hasMovieIndicator) { + return null; // It's explicitly a movie + } + + // If we can't determine, look at page structure + // Prime Video typically shows season information prominently for series + if (html.includes('Sezon') && html.includes('Bölüm')) { + return '1 Season'; + } + + return null; // Default to movie +} + +/** + * Clean Amazon Prime title text + */ +function cleanPrimeTitle(title) { + if (!title) return undefined; + + let cleaned = title; + + // Remove Amazon Prime suffixes + cleaned = cleaned.replace(/\s*\|\s*Prime\s*Video.*$/i, ''); + 
cleaned = cleaned.replace(/\s*\|\s*Amazon.*$/i, ''); + + // Remove common UI text + cleaned = cleaned.replace(/\s+(izle|watch|play|oynat)$/i, ''); + + return cleaned.trim() || undefined; +} + +/** + * Clean Amazon Prime description text + */ +function cleanPrimeDescription(description) { + if (!description) return undefined; + + let cleaned = description; + + // Remove Amazon/Prime Video suffixes + cleaned = cleaned.replace(/\s*\|\s*Prime\s*Video.*$/i, ''); + cleaned = cleaned.replace(/\s*\|\s*Amazon.*$/i, ''); + + // Remove common call-to-action text + cleaned = cleaned.replace(/\s+(Daha fazla bilgi için tıklayın|Click for more info).*$/i, ''); + + return cleaned.trim() || undefined; +} + +/** + * Check if URL is a valid Amazon Prime thumbnail + */ +function isValidPrimeThumbnail(url) { + if (!url || typeof url !== 'string') return false; + + const primeDomains = [ + 'm.media-amazon.com', + 'images-na.ssl-images-amazon.com', + 'media-amazon.com', + 'primevideo.com' + ]; + + return primeDomains.some(domain => url.includes(domain)) && + /\.(jpg|jpeg|png|webp)(\?.*)?$/i.test(url); +} + +/** + * Normalize Amazon Prime genre information + */ +function normalizePrimeGenre(genre) { + if (!genre || typeof genre !== 'string') return null; + + const genreMapping = { + // English to Turkish mapping + 'Action': 'Aksiyon', + 'Adventure': 'Macera', + 'Comedy': 'Komedi', + 'Drama': 'Dram', + 'Fantasy': 'Fantastik', + 'Horror': 'Korku', + 'Mystery': 'Gizem', + 'Romance': 'Romantik', + 'Romantic': 'Romantik', + 'Sci-Fi': 'Bilim Kurgu', + 'Science Fiction': 'Bilim Kurgu', + 'Thriller': 'Gerilim', + 'Documentary': 'Belgesel', + 'Animation': 'Animasyon', + 'Family': 'Aile', + 'Kids': 'Çocuk', + 'War': 'Savaş', + 'Western': 'Western', + 'Humorous': 'Mizahi', + 'Sentimental': 'Duygusal' + }; + + // Handle multiple genres separated by commas, pipes, or special characters + const separators = /[,|•·]/; + const genres = genre.split(separators).map(g => g.trim()).filter(g => g); + + const 
normalizedGenres = genres.map(g => { + return genreMapping[g] || genreMapping[g.toLowerCase()] || g; + }).filter(g => g); + + // Return first genre as primary (could return array if needed) + return normalizedGenres[0] || null; +} + +/** + * Parse Amazon Prime HTML to extract metadata + * @param {string} html + * @returns {{ name?: string, year?: string | number, seasons?: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }} + */ +export function parsePrimeHtml(html) { + if (!html) return {}; + + const $ = load(html); + + let name = extractPrimeTitle($, html); + let year = extractPrimeYear($, html); + let seasons = extractPrimeSeasons($, html); + let thumbnail = extractPrimeThumbnail($, html); + let info = extractPrimeInfo($, html); + let genre = extractPrimeGenre($, html); + + // If we couldn't find the year, try to extract it from the title + if (!year && name) { + const titleYearMatch = name.match(/(\d{4})/); + if (titleYearMatch) { + year = titleYearMatch[1]; + } + } + + return { name, year, seasons, thumbnail, info, genre }; } diff --git a/tests/scrape.test.js b/tests/netflix.test.js similarity index 56% rename from tests/scrape.test.js rename to tests/netflix.test.js index f789f42..ea82272 100644 --- a/tests/scrape.test.js +++ b/tests/netflix.test.js @@ -34,6 +34,48 @@ describe('parseNetflixHtml (canlı sayfa)', () => { }, 20000 ); + + it( + 'thumbnail URL’sini çıkarır', + () => { + const meta = parseNetflixHtml(liveHtml); + if (meta.thumbnail) { + expect(meta.thumbnail).toContain('nflxso.net'); + } + // Thumbnail olmayabilir ama test geçmeli + }, + 20000 + ); + + it( + 'film/dizi açıklamasını (info) çıkarır', + () => { + const meta = parseNetflixHtml(liveHtml); + if (meta.info) { + expect(meta.info).toBeTruthy(); + expect(meta.info).not.toContain('Netflix'); + expect(meta.info).not.toContain('Fragmanları izleyin'); + } + // Info olmayabilir ama varsa boş olmamalı + }, + 20000 + ); + + it( + 'film/dizi türünü (genre) 
çıkarır', + () => { + const meta = parseNetflixHtml(liveHtml); + if (meta.genre) { + expect(meta.genre).toBeTruthy(); + expect(typeof meta.genre).toBe('string'); + // Genre mapping test: 'Action' → 'Aksiyon' + ['Aksiyon', 'Dram', 'Komedi', 'Fantastik'].includes(meta.genre) || + ['Action', 'Drama', 'Comedy', 'Fantasy'].includes(meta.genre); + } + // Genre olmayabilir ama varsa geçerli bir değer olmalı + }, + 20000 + ); }); describe('scraperNetflix (canlı istek)', () => { diff --git a/tests/prime.test.js b/tests/prime.test.js new file mode 100644 index 0000000..1312ce9 --- /dev/null +++ b/tests/prime.test.js @@ -0,0 +1,59 @@ +import { beforeAll, describe, expect, it } from 'vitest'; +import { scraperPrime } from '../src/index.js'; +import { parsePrimeHtml } from '../src/parser.js'; + +const TEST_URL = 'https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie'; +const UA = + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'; + +describe('parsePrimeHtml (örnek HTML)', () => { + it('boş veya geçersiz HTML için boş obje döner', () => { + expect(parsePrimeHtml('')).toEqual({}); + expect(parsePrimeHtml(null)).toEqual({}); + }); + + it('meta etiketlerinden başlık ve yıl çıkarır', () => { + const html = ` + + + + + + + `; + const meta = parsePrimeHtml(html); + expect(meta.name).toBe('Little Women'); + }); + + it('thumbnail ve info alanını çıkarır', () => { + const html = ` + + + `; + const meta = parsePrimeHtml(html); + expect(meta.thumbnail).toBe('https://m.media-amazon.com/images/S/pv-target-images/test.jpg'); + expect(meta.info).toBe("Louisa May Alcott'ın hikayesi"); + }); + + it('tür bilgisini normalize eder', () => { + const html = ` + + `; + const meta = parsePrimeHtml(html); + expect(meta.genre).toBe('Komedi'); + }); +}); + +describe('scraperPrime (canlı istek)', () => { + it( + 'normalize edilmiş url, id ve meta bilgilerini döner', + async () => { + const meta = await 
scraperPrime(TEST_URL, { headless: false, userAgent: UA }); + expect(meta.url).toBe('https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL'); + expect(meta.id).toBe('0NHIN3TGAI9L7VZ45RS52RHUPL'); + expect(meta.name).toBeTruthy(); + expect(meta.year).toMatch(/\d{4}/); + }, + 20000 + ); +});