diff --git a/README.md b/README.md
index 1bae3ad..e6d9f25 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# metascraper
-Netflix URL'lerinden film ve dizi meta verilerini (başlık, yıl, sezon bilgisi) çıkaran modern Node.js kütüphanesi.
+Netflix ve Amazon Prime Video URL'lerinden film ve dizi meta verilerini (başlık, yıl, sezon bilgisi, thumbnail görseli) çıkaran modern Node.js kütüphanesi.
## 🚀 Kurulum
@@ -22,7 +22,10 @@ console.log(movie);
// "id": "82123114",
// "name": "ONE SHOT with Ed Sheeran",
// "year": "2025",
-// "seasons": null
+// "seasons": null,
+// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg",
+// "info": "Ed Sheeran, matematiğin mucizevi gücünü ve müziğin birleştirici gücünü sergileyen benzersiz bir performansla sahneye çıkıyor.",
+// "genre": "Belgesel"
// }
```
@@ -38,22 +41,48 @@ console.log(series);
// "id": "80189685",
// "name": "The Witcher",
// "year": "2025",
-// "seasons": "4 Sezon"
+// "seasons": "4 Sezon",
+// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg",
+// "info": "Mutasyona uğramış bir canavar avcısı olan Rivyalı Geralt, insanların çoğunlukla yaratıklardan daha uğursuz olduğu, karmaşa içindeki bir dünyada kaderine doğru yol alıyor.",
+// "genre": "Aksiyon"
+// }
+```
+
+### Amazon Prime Video Meta Verisi
+
+```javascript
+import { scraperPrime } from 'metascraper';
+
+const movie = await scraperPrime('https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie');
+console.log(movie);
+// {
+// "url": "https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL",
+// "id": "0NHIN3TGAI9L7VZ45RS52RHUPL",
+// "name": "Little Women",
+// "year": "2020",
+// "seasons": null,
+// "thumbnail": "https://m.media-amazon.com/images/S/pv-target-images/c1b08ebea5ba29c47145c623e7d1c586290221ec12fa93850029e581f54049c4.jpg",
+// "info": "In the years after the Civil War, Jo March lives in New York and makes her living as a writer, while her sister Amy studies painting in Paris.",
+// "genre": "Dram"
// }
```
### URL Normalizasyonu
-URL normalizasyonu artık `scraperNetflix` fonksiyonu içinde otomatik olarak yapılır.
+URL normalizasyonu artık `scraperNetflix` ve `scraperPrime` fonksiyonları içinde otomatik olarak yapılır.
## ✨ Özellikler
-- ✅ **Film ve Dizi Destekli** - Her tür Netflix içeriği için meta veri
+- ✅ **Çoklu Platform Desteği** - Netflix ve Amazon Prime Video scraping
+- ✅ **Film ve Dizi Destekli** - Her tür içerik için meta veri
+- ✅ **Thumbnail Görselleri** - Otomatik afiş/poster görselleri çıkarır
+- ✅ **Film/Dizi Açıklamaları** - İçerik özet bilgilerini (info) çıkarır
+- ✅ **Tür Bilgisi** - Türkçe ve İngilizce tür normalizasyonu
- ✅ **Türkçe Arayüz Temizleme** - "izlemenizi bekliyor" gibi metinleri temizler
-- ✅ **JSON-LD Tabanlı** - Netflix'in yapısal verilerini kullanır
+- ✅ **JSON-LD Tabanlı** - Platformların yapısal verilerini kullanır
- ✅ **Hızlı ve Güvenilir** - Statik HTML scraping + Playwright fallback
- ✅ **Node.js 18+ Uyumlu** - Modern JavaScript özellikleri
-- ✅ **Türkiye Odaklı** - Netflix Türkiye URL'leri optimize edilmiş
+- ✅ **Türkiye Odaklı** - Netflix Türkiye ve Prime Video URL'leri optimize edilmiş
## 🔧 API
@@ -75,7 +104,35 @@ Netflix URL'sinden meta veri çeker. URL normalizasyonu otomatik olarak yapılı
id: string, // Netflix ID
name: string, // İçerik adı
year: string | number, // Yılı
- seasons: string | null // Sezon bilgisi (diziler için)
+ seasons: string | null, // Sezon bilgisi (diziler için)
+ thumbnail: string | null, // Afiş/poster görseli URL'si
+ info: string | null, // Film/dizi açıklaması
+ genre: string | null // Tür bilgisi (Türkçe)
+}
+```
+
+### `scraperPrime(url, options)`
+
+Amazon Prime Video URL'sinden meta veri çeker. URL normalizasyonu otomatik olarak yapılır.
+
+**Parametreler:**
+- `url` (string): Amazon Prime Video URL'i
+- `options` (object, isteğe bağlı):
+ - `headless` (boolean): Eksik veri olduğunda Playwright fallback'ini etkinleştirir (default: true)
+ - `timeoutMs` (number): Timeout süresi (default: 15000)
+ - `userAgent` (string): Özel User-Agent
+
+**Dönen Veri:**
+```typescript
+{
+ url: string, // Temizlenmiş URL
+ id: string, // Prime Video ID
+ name: string, // İçerik adı
+ year: string | number, // Yılı
+ seasons: string | null, // Sezon bilgisi (diziler için, filmler için null)
+ thumbnail: string | null, // Afiş/poster görseli URL'si
+ info: string | null, // Film/dizi açıklaması
+ genre: string | null // Tür bilgisi (Türkçe normalize edilmiş)
}
```
diff --git a/doc/API.md b/doc/API.md
index 02c2b2a..fd4ab6b 100644
--- a/doc/API.md
+++ b/doc/API.md
@@ -6,6 +6,10 @@
Netflix metadata extraction function with automatic fallback and Turkish localization.
+> **See also:** `scraperPrime(inputUrl, options?)` is the Amazon Prime Video
+> counterpart of this function. It takes the same options and returns the same
+> fields, and is documented in its own section further below.
+
#### Parameters
| Parameter | Type | Required | Default | Description |
@@ -30,6 +34,9 @@ Promise<{
name: string; // Clean title (Turkish UI removed)
year: string \| number \| undefined; // Release year
seasons: string \| null; // Season info for TV series
+ thumbnail: string \| null; // Poster/thumbnail image URL
+ info: string \| null; // Content description/summary
+ genre: string \| null; // Genre (Turkish normalized)
}>
```
@@ -46,7 +53,10 @@ console.log(result);
// "id": "82123114",
// "name": "ONE SHOT with Ed Sheeran",
// "year": "2025",
-// "seasons": null
+// "seasons": null,
+// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg",
+// "info": "Ed Sheeran, matematiğin mucizevi gücünü ve müziğin birleştirici gücünü sergileyen benzersiz bir performansla sahneye çıkıyor.",
+// "genre": "Belgesel"
// }
```
@@ -78,6 +88,88 @@ try {
}
```
+### `scraperPrime(inputUrl, options?)`
+
+Amazon Prime Video metadata extraction function with automatic fallback and Turkish localization.
+
+#### Parameters
+
+| Parameter | Type | Required | Default | Description |
+|-----------|------|----------|---------|-------------|
+| `inputUrl` | `string` | ✅ | - | Amazon Prime Video URL (any format) |
+| `options` | `object` | ❌ | `{}` | Configuration options |
+
+#### Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `headless` | `boolean` | `true` | Enable Playwright fallback for missing data |
+| `timeoutMs` | `number` | `15000` | Request timeout in milliseconds |
+| `userAgent` | `string` | Chrome 118 User-Agent | Custom User-Agent string |
+
+#### Returns
+
+```typescript
+Promise<{
+ url: string; // Normalized Prime Video URL
+ id: string; // Prime Video content ID
+ name: string; // Clean title (Amazon UI removed)
+ year: string | number | undefined; // Release year
+ seasons: string | null; // Season info for TV series (null for movies)
+ thumbnail: string | null; // Poster/thumbnail image URL
+ info: string | null; // Content description/summary
+ genre: string | null; // Genre (Turkish normalized)
+}>
+```
+
+#### Examples
+
+**Basic Usage**
+```javascript
+import { scraperPrime } from 'metascraper';
+
+const result = await scraperPrime('https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie');
+console.log(result);
+// {
+// "url": "https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL",
+// "id": "0NHIN3TGAI9L7VZ45RS52RHUPL",
+// "name": "Little Women",
+// "year": "2020",
+// "seasons": null,
+// "thumbnail": "https://m.media-amazon.com/images/S/pv-target-images/c1b08ebea5ba29c47145c623e7d1c586290221ec12fa93850029e581f54049c4.jpg",
+// "info": "In the years after the Civil War, Jo March lives in New York and makes her living as a writer...",
+// "genre": "Dram"
+// }
+```
+
+**Advanced Configuration**
+```javascript
+import { scraperPrime } from 'metascraper';
+
+const result = await scraperPrime(
+ 'https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL',
+ {
+ headless: false, // Disable browser fallback
+ timeoutMs: 30000, // 30 second timeout
+ userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+ }
+);
+```
+
+**Error Handling**
+```javascript
+import { scraperPrime } from 'metascraper';
+
+try {
+ const result = await scraperPrime('https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL');
+ console.log('Success:', result);
+} catch (error) {
+ console.error('Scraping failed:', error.message);
+ // error.message is in Turkish, e.g. when the URL argument is empty:
+ // "Amazon Prime URL'i gereklidir."
+}
+```
+
## 🧩 Internal APIs
### `parseNetflixHtml(html)` - Parser API
@@ -97,6 +189,9 @@ Parse Netflix HTML content to extract metadata without network requests.
name?: string; // Clean title
year?: string \| number; // Release year
seasons?: string \| null; // Season information
+ thumbnail?: string \| null; // Thumbnail image URL
+ info?: string \| null; // Content description
+ genre?: string \| null; // Genre information
}
```
@@ -114,7 +209,10 @@ console.log(metadata);
// {
// "name": "The Witcher",
// "year": "2025",
-// "seasons": "4 Sezon"
+// "seasons": "4 Sezon",
+// "thumbnail": "https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg",
+// "info": "Mutasyona uğramış bir canavar avcısı olan Rivyalı Geralt, insanların çoğunlukla yaratıklardan daha uğursuz olduğu, karmaşa içindeki bir dünyada kaderine doğru yol alıyor.",
+// "genre": "Aksiyon"
// }
```
@@ -165,6 +263,50 @@ try {
}
```
+### `parsePrimeHtml(html)` - Prime Video Parser API
+
+Parse Amazon Prime Video HTML content to extract metadata without network requests.
+
+#### Parameters
+
+| Parameter | Type | Required | Description |
+|-----------|------|----------|-------------|
+| `html` | `string` | ✅ | Raw HTML content from Prime Video page |
+
+#### Returns
+
+```typescript
+{
+ name?: string; // Clean title
+ year?: string | number; // Release year
+ seasons?: string | null; // Season information
+ thumbnail?: string | null; // Thumbnail image URL
+ info?: string | null; // Content description
+ genre?: string | null; // Genre information
+}
+```
+
+#### Examples
+
+```javascript
+import { parsePrimeHtml } from 'metascraper/parser';
+
+// With cached HTML
+const fs = await import('node:fs');
+const html = fs.readFileSync('prime-page.html', 'utf8');
+const metadata = parsePrimeHtml(html);
+
+console.log(metadata);
+// {
+// "name": "Little Women",
+// "year": "2020",
+// "seasons": null,
+// "thumbnail": "https://m.media-amazon.com/images/S/pv-target-images/...",
+// "info": "In the years after the Civil War, Jo March lives in New York...",
+// "genre": "Dram"
+// }
+```
+
## 🔧 URL Processing
### Supported URL Formats
@@ -199,6 +341,36 @@ The function validates URLs with these rules:
'https://www.netflix.com/title/abc' // Non-numeric ID
```
+### Amazon Prime Video URL Formats
+
+The `scraperPrime` function automatically normalizes various Prime Video URL formats:
+
+| Input Format | Normalized Output | Notes |
+|--------------|-------------------|-------|
+| `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | Standard format |
+| `https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie` | `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | Turkish locale with tracking |
+| `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL?ref_=atv_dp` | `https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL` | With parameters |
+
+### Prime Video URL Validation
+
+The function validates URLs with these rules:
+
+1. **Format**: Must be a valid URL
+2. **Domain**: Must contain `primevideo.com`
+3. **Path**: Must contain `detail/` followed by content ID
+4. **ID Extraction**: Uses path parsing to extract content ID
+
+```javascript
+// These will work:
+'https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL'
+'https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie'
+
+// These will fail:
+'https://google.com' // Wrong domain
+'https://www.primevideo.com/browse' // No content ID
+'not-a-url' // Invalid format
+```
+
## 🌍 Localization Features
### Turkish UI Text Removal
@@ -263,6 +435,37 @@ If JSON-LD is unavailable, falls back to HTML meta tags:
The Witcher izlemenizi bekliyor | Netflix
```
+### Thumbnail Image Extraction
+
+The parser automatically extracts poster/thumbnail images from Netflix meta tags:
+
+```javascript
+// Thumbnail selectors in priority order:
+const THUMBNAIL_SELECTORS = [
+ 'meta[property="og:image"]', // Open Graph image (primary)
+ 'meta[name="twitter:image"]', // Twitter card image
+ 'meta[property="og:image:secure_url"]', // Secure image URL
+ 'link[rel="image_src"]', // Image source link
+ 'meta[itemprop="image"]' // Schema.org image
+];
+```
+
+**Example Netflix HTML:**
+```html
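+<meta property="og:image" content="https://occ-0-7335-778.1.nflxso.net/dnm/api/v6/6AYY37jfdO6hpXcMjf9Yu5cnmO0/AAAABSkrIGPSyEfSWYQzc8rEFo6EtVV6Ls8WtPpNwR42MSKSNPNomZWV5P_l2MxGuJEkoPm71UT_eBK_SsTEH8pRslQr0sjpdhVHjxh4.jpg">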
+```
+
+**URL Validation:**
+- Only Netflix CDN domains are accepted (nflxso.net, nflximg.net, etc.)
+- Image file extensions are verified (.jpg, .jpeg, .png, .webp)
+- Query parameters are cleaned for stability
+
+**Fallback Strategy:**
+1. Try Open Graph image first (most reliable)
+2. Fall back to Twitter card image
+3. Try other meta tags if needed
+4. Return null if no valid thumbnail found
+
### Season Detection
For TV series, extracts season information:
diff --git a/src/index.js b/src/index.js
index 607e5d1..a65b799 100644
--- a/src/index.js
+++ b/src/index.js
@@ -1,15 +1,21 @@
import './polyfill.js';
-import { parseNetflixHtml } from './parser.js';
+import { parseNetflixHtml, parsePrimeHtml } from './parser.js';
import { fetchPageContentWithPlaywright } from './headless.js';
const DEFAULT_TIMEOUT_MS = 15000;
// 🎯 LOG SİSTEMİ
-function logPass(message) {
+function logPass(message, data) {
console.log(`✅ ${message}`);
+ if (data) {
+ console.log(JSON.stringify(data, null, 2));
+ }
}
function logError(message, error) {
+ if (process.env.NODE_ENV === 'test') {
+ return;
+ }
console.error(`❌ ${message}: ${error.message}`);
}
@@ -46,6 +52,35 @@ function normalizeNetflixUrl(inputUrl) {
const id = idMatch[1];
return `https://www.netflix.com/title/${id}`;
}
+
+/**
+ * Normalize a Prime Video detail URL to https://www.primevideo.com/detail/<id>.
+ * @param {string} inputUrl - Prime Video URL; locale prefix, ref suffix and query are dropped.
+ * @returns {string} Canonical detail URL.
+ * @throws {Error} If the URL is missing, malformed, not on primevideo.com, or has no content ID.
+ */
+function normalizePrimeUrl(inputUrl) {
+  if (!inputUrl) throw new Error('Amazon Prime URL\'i gereklidir.');
+  let parsed;
+  try {
+    parsed = new URL(inputUrl);
+  } catch (err) {
+    throw new Error('Geçersiz URL sağlandı.', { cause: err });
+  }
+  // Compare the host exactly (or as a subdomain); a substring test would also
+  // accept look-alike hosts such as "primevideo.com.evil.example".
+  const host = parsed.hostname;
+  if (host !== 'primevideo.com' && !host.endsWith('.primevideo.com')) {
+    throw new Error('URL primevideo.com adresini göstermelidir.');
+  }
+
+  const segments = parsed.pathname.split('/').filter(Boolean);
+  const detailIndex = segments.indexOf('detail');
+  const id = detailIndex >= 0 ? segments[detailIndex + 1] : undefined;
+  if (!id) throw new Error('URL\'de Amazon Prime içerik ID\'si bulunamadı.');
+  return `https://www.primevideo.com/detail/${id}`;
+}
+
const DEFAULT_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
@@ -139,7 +174,7 @@ function needsHeadless(meta) {
* Netflix meta verilerini scrape eder.
* @param {string} inputUrl
* @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
- * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
+ * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }>}
*/
export async function scraperNetflix(inputUrl, options = {}) {
try {
@@ -150,15 +185,11 @@ export async function scraperNetflix(inputUrl, options = {}) {
const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const userAgent = options.userAgent || DEFAULT_USER_AGENT;
- logPass(`Netflix URL normalize edildi: ${normalizedUrl}`);
-
const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs);
- logPass("HTML içeriği başarıyla çekildi");
let meta = parseNetflixHtml(staticHtml);
if (needsHeadless(meta) && options.headless !== false) {
- logPass("Headless mode aktifleştiriliyor");
const headlessHtml = await fetchPageContentWithPlaywright(normalizedUrl, {
timeoutMs,
userAgent,
@@ -172,9 +203,6 @@ export async function scraperNetflix(inputUrl, options = {}) {
Object.entries(enriched).filter(([_, value]) => value !== undefined && value !== null)
)
};
- logPass("Headless scraping tamamlandı");
- } else {
- logPass("Statik scraping yeterli");
}
if (!meta.name) {
@@ -186,13 +214,74 @@ export async function scraperNetflix(inputUrl, options = {}) {
id: id || '',
name: meta.name,
year: meta.year,
- seasons: meta.seasons ?? null
+ seasons: meta.seasons ?? null,
+ thumbnail: meta.thumbnail ?? null,
+ info: meta.info ?? null,
+ genre: meta.genre ?? null
};
- logResult(finalResult);
+ logPass('Netflix scraping tamamlandı', finalResult);
return finalResult;
} catch (error) {
logError('Netflix scraping başarısız', error);
throw error;
}
}
+
+/**
+ * Scrape metadata for a single Amazon Prime Video title.
+ * The statically fetched HTML is parsed first; a Playwright-rendered copy is
+ * parsed on top only when needsHeadless(meta) is truthy and `options.headless` is not false.
+ * @param {string} inputUrl - Prime Video URL, normalized via normalizePrimeUrl().
+ * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
+ * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null, thumbnail: string | null, info: string | null, genre: string | null }>}
+ * @throws {Error} Any failure is passed to logError() and rethrown; also thrown when no title is parsed.
+ */
+export async function scraperPrime(inputUrl, options = {}) {
+  try {
+    await ensureFetchGlobals();
+
+    const url = normalizePrimeUrl(inputUrl);
+    const id = url.split('/').pop();
+    const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+    const userAgent = options.userAgent || DEFAULT_USER_AGENT;
+
+    // Static fetch first; the Playwright path below runs only if it is needed.
+    let meta = parsePrimeHtml(await fetchStaticHtml(url, userAgent, timeoutMs));
+
+    if (needsHeadless(meta) && options.headless !== false) {
+      const renderedHtml = await fetchPageContentWithPlaywright(url, {
+        timeoutMs,
+        userAgent,
+        headless: options.headless !== false
+      });
+
+      // Overlay only the fields the rendered page actually produced.
+      const found = Object.entries(parsePrimeHtml(renderedHtml))
+        .filter(([, value]) => value !== undefined && value !== null);
+      meta = { ...meta, ...Object.fromEntries(found) };
+    }
+
+    if (!meta.name) {
+      throw new Error('Amazon Prime sayfa meta verisi parse edilemedi.');
+    }
+
+    const { name, year, seasons, thumbnail, info, genre } = meta;
+    const finalResult = {
+      url,
+      id: id || '',
+      name,
+      year,
+      seasons: seasons ?? null,
+      thumbnail: thumbnail ?? null,
+      info: info ?? null,
+      genre: genre ?? null
+    };
+
+    logPass('Amazon Prime scraping tamamlandı', finalResult);
+    return finalResult;
+  } catch (error) {
+    logError('Amazon Prime scraping başarısız', error);
+    throw error;
+  }
+}
diff --git a/src/parser.js b/src/parser.js
index 684f785..f0b759b 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -24,6 +24,26 @@ const UNIVERSAL_UI_PATTERNS = [
const YEAR_FIELDS = ['datePublished', 'startDate', 'uploadDate', 'copyrightYear', 'releasedEvent', 'releaseYear', 'dateCreated'];
const SEASON_TYPES = ['TVSeries', 'TVShow', 'Series'];
+/**
+ * CSS selectors that extractThumbnail() tries, in priority order, for the Netflix thumbnail.
+ */
+const THUMBNAIL_SELECTORS = [
+ 'meta[property="og:image"]', // Open Graph image (primary)
+ 'meta[name="twitter:image"]', // Twitter card image
+ 'meta[property="og:image:secure_url"]', // Secure image URL
+ 'link[rel="image_src"]', // Image source link
+ 'meta[itemprop="image"]' // Schema.org image
+];
+
+/**
+ * CSS selectors that extractInfo() tries, in priority order, for the Netflix description.
+ */
+const DESCRIPTION_SELECTORS = [
+ 'meta[name="description"]', // Standard meta description (primary)
+ 'meta[property="og:description"]', // Open Graph description
+ 'meta[itemprop="description"]' // Schema.org description
+];
+
/**
* Extract a usable year value from various JSON-LD fields.
* @param {unknown} value
@@ -79,6 +99,141 @@ function cleanTitle(title) {
return trimmed || undefined;
}
+/**
+ * Extract the Netflix thumbnail URL from the page's meta/link tags.
+ * Walks THUMBNAIL_SELECTORS in priority order and returns the first valid hit.
+ * @param {string} html - Raw HTML content
+ * @returns {string | undefined} Normalized thumbnail URL, or undefined if none is valid
+ */
+function extractThumbnail(html) {
+  if (!html) return undefined;
+  const $ = load(html);
+
+  for (const selector of THUMBNAIL_SELECTORS) {
+    const node = $(selector);
+    // <meta> tags carry the URL in `content`, but <link rel="image_src"> uses
+    // `href`, so reading only `content` left that selector unreachable.
+    const imageUrl = node.attr('content') || node.attr('href');
+    if (imageUrl && isValidThumbnailUrl(imageUrl)) {
+      return normalizeThumbnailUrl(imageUrl);
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Check whether a URL points at an image hosted on a Netflix domain.
+ * @param {string} url - Absolute (or protocol-relative) URL to validate
+ * @returns {boolean} True when the host is a Netflix domain and the URL ends in an image extension
+ */
+function isValidThumbnailUrl(url) {
+  if (!url || typeof url !== 'string') return false;
+
+  let hostname;
+  try {
+    ({ hostname } = new URL(url.startsWith('//') ? `https:${url}` : url));
+  } catch {
+    return false; // not an absolute URL, so it cannot be a CDN link
+  }
+
+  // Compare against the parsed host: a substring test on the whole URL also
+  // passes look-alikes such as https://evil.example/netflix.com/x.jpg.
+  // The documented occ-0-* image hosts are subdomains of nflxso.net, so they stay accepted.
+  const netflixDomains = ['nflxso.net', 'nflxext.com', 'netflix.com', 'nflximg.net'];
+  const onNetflixHost = netflixDomains.some(d => hostname === d || hostname.endsWith(`.${d}`));
+  return onNetflixHost && /\.(jpg|jpeg|png|webp)(\?.*)?$/i.test(url);
+}
+
+/**
+ * Strip the `r`, `t`, `e` and `v` query parameters from a thumbnail URL.
+ * The result is rebuilt from origin + path + remaining query, so any
+ * fragment is dropped as well; unparseable input is returned untouched.
+ * @param {string} url - Original thumbnail URL
+ * @returns {string} Normalized URL
+ */
+function normalizeThumbnailUrl(url) {
+  if (!url) return url;
+
+  let parsed;
+  try {
+    parsed = new URL(url);
+  } catch {
+    return url;
+  }
+
+  const query = new URLSearchParams(parsed.search);
+  for (const key of ['r', 't', 'e', 'v']) {
+    query.delete(key);
+  }
+  const remaining = query.toString();
+  return `${parsed.origin}${parsed.pathname}${remaining ? `?${remaining}` : ''}`;
+}
+
+/**
+ * Read the content description from the page's meta tags.
+ * The first DESCRIPTION_SELECTORS entry with non-blank content wins; its text
+ * is stripped of the "| Netflix…" suffix and the Turkish trailer call-to-action.
+ * @param {string} html - Raw HTML content
+ * @returns {string | undefined} Cleaned description, or undefined if nothing usable
+ */
+function extractInfo(html) {
+  if (!html) return undefined;
+
+  const $ = load(html);
+  const suffixPattern = /\s*\|\s*Netflix.*$/i;
+  const ctaPattern = /\s+Fragmanları izleyin ve daha fazla bilgi edinin\.$/;
+
+  for (const selector of DESCRIPTION_SELECTORS) {
+    const raw = $(selector).attr('content');
+    const trimmed = raw ? raw.trim() : '';
+    if (!trimmed) continue;
+
+    // The first non-blank tag decides the result, even if cleaning empties it.
+    const cleaned = trimmed.replace(suffixPattern, '').replace(ctaPattern, '');
+    return cleaned || undefined;
+  }
+  return undefined;
+}
+
+/**
+ * Normalize a raw genre label to its Turkish name.
+ * Known Turkish/English labels are mapped; unknown labels are returned trimmed.
+ * @param {string | null | undefined} genre - Raw genre from JSON-LD
+ * @returns {string | null} Turkish genre, the trimmed input, or null when blank/non-string
+ */
+function normalizeGenre(genre) {
+  if (!genre || typeof genre !== 'string') return null;
+
+  const genreMapping = {
+    'Aksiyon': 'Aksiyon',
+    'Action': 'Aksiyon',
+    'Macera': 'Macera',
+    'Adventure': 'Macera',
+    'Bilim Kurgu': 'Bilim Kurgu',
+    'Science Fiction': 'Bilim Kurgu',
+    'Fantastik': 'Fantastik',
+    'Fantasy': 'Fantastik',
+    'Dram': 'Dram',
+    'Drama': 'Dram',
+    'Komedi': 'Komedi',
+    'Comedy': 'Komedi',
+    'Korku': 'Korku',
+    'Horror': 'Korku',
+    'Gerilim': 'Gerilim',
+    'Thriller': 'Gerilim',
+    'Gizem': 'Gizem',
+    'Mystery': 'Gizem',
+    'Romantik': 'Romantik',
+    'Romance': 'Romantik'
+  };
+
+  const cleanedGenre = genre.trim();
+  // Own-property check: a bare genreMapping[key] lookup would return inherited
+  // members (e.g. a function for "constructor") instead of a string.
+  if (Object.hasOwn(genreMapping, cleanedGenre)) return genreMapping[cleanedGenre];
+  return cleanedGenre || null;
+}
+
/**
* Parse JSON-LD objects for metadata.
* @param {any} obj
@@ -121,6 +276,19 @@ function parseJsonLdObject(obj) {
result.seasons = `${entry.seasons.length} Sezon`;
}
}
+
+ // Extract info/description from JSON-LD
+ if (!result.info && typeof entry.description === 'string') {
+ const cleanedInfo = entry.description.trim()
+ .replace(/\s*\|\s*Netflix.*$/i, '')
+ .replace(/\s+Fragmanları izleyin ve daha fazla bilgi edinin\.$/, '');
+ result.info = cleanedInfo || undefined;
+ }
+
+ // Extract genre from JSON-LD
+ if (!result.genre && typeof entry.genre === 'string') {
+ result.genre = normalizeGenre(entry.genre);
+ }
}
return result;
@@ -129,7 +297,7 @@ function parseJsonLdObject(obj) {
/**
* Parse Netflix HTML to extract metadata without executing scripts.
* @param {string} html
- * @returns {{ name?: string, year?: string | number, seasons?: string | null }}
+ * @returns {{ name?: string, year?: string | number, seasons?: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }}
*/
export function parseNetflixHtml(html) {
if (!html) return {};
@@ -143,20 +311,362 @@ export function parseNetflixHtml(html) {
let year;
let seasons = null;
+ let thumbnail = null;
+ let info = null;
+ let genre = null;
+
+ // Extract thumbnail from meta tags
+ thumbnail = extractThumbnail(html);
+
+ // Extract info from meta tags (fallback if JSON-LD doesn't have it)
+ info = extractInfo(html);
$('script[type="application/ld+json"]').each((_, el) => {
const raw = $(el).contents().text();
if (!raw) return;
try {
const parsed = JSON.parse(raw);
- const info = parseJsonLdObject(parsed);
- if (!name && info.name) name = info.name;
- if (!year && info.year) year = info.year;
- if (!seasons && info.seasons) seasons = info.seasons;
+ const jsonLdInfo = parseJsonLdObject(parsed);
+ if (!name && jsonLdInfo.name) name = jsonLdInfo.name;
+ if (!year && jsonLdInfo.year) year = jsonLdInfo.year;
+ if (!seasons && jsonLdInfo.seasons) seasons = jsonLdInfo.seasons;
+ // Also check JSON-LD for image information
+ if (!thumbnail && jsonLdInfo.image) {
+ thumbnail = typeof jsonLdInfo.image === 'string' ? jsonLdInfo.image : jsonLdInfo.image.url;
+ }
+ // Extract info and genre from JSON-LD if available
+ if (!info && jsonLdInfo.info) info = jsonLdInfo.info;
+ if (!genre && jsonLdInfo.genre) genre = jsonLdInfo.genre;
} catch {
// Ignore malformed JSON-LD blocks.
}
});
- return { name, year, seasons };
+ return { name, year, seasons, thumbnail, info, genre };
+}
+
+/**
+ * Amazon Prime Video parsing: selector lists and the helpers that use them.
+ */
+
+// Each list is walked in order by its extractPrime* helper; the first usable match wins.
+const PRIME_TITLE_SELECTORS = [
+ 'meta[property="og:title"]',
+ 'meta[name="title"]',
+ 'title',
+ '[data-testid="title"]',
+ '.dv-node-dp-title',
+ 'h1'
+];
+
+const PRIME_THUMBNAIL_SELECTORS = [
+ 'meta[property="og:image"]',
+ 'meta[name="twitter:image"]',
+ 'meta[property="og:image:secure_url"]',
+ '[data-testid="hero-image"] img',
+ '.dv-node-dp-hero-image img',
+ 'img[alt*="poster"]'
+];
+
+const PRIME_DESCRIPTION_SELECTORS = [
+ 'meta[name="description"]',
+ 'meta[property="og:description"]',
+ 'meta[itemprop="description"]',
+ '[data-testid="synopsis"]',
+ '.dv-node-dp-synopsis',
+ '.synopsis'
+];
+
+const PRIME_YEAR_SELECTORS = [
+ 'meta[itemprop="dateCreated"]',
+ 'meta[property="video:release_date"]',
+ '[data-testid="release-year"]',
+ '.release-year',
+ '[class*="year"]'
+];
+
+const PRIME_GENRE_SELECTORS = [
+ 'meta[itemprop="genre"]',
+ '[data-testid="genres"]',
+ '.genres',
+ '[class*="genre"]'
+];
+
+/**
+ * Find the page title: first non-blank PRIME_TITLE_SELECTORS hit (`content`
+ * attribute, else element text), then a `"title":"…"` pair in the raw HTML.
+ * @param {Function} $ - Query function of the loaded document
+ * @param {string} html - Raw HTML of the same page
+ * @returns {string | undefined} Title after cleanPrimeTitle(), if any
+ */
+function extractPrimeTitle($, html) {
+  for (const selector of PRIME_TITLE_SELECTORS) {
+    const node = $(selector);
+    const candidate = (node.attr('content') || node.text() || '').trim();
+    if (candidate) return cleanPrimeTitle(candidate);
+  }
+  // No tag matched: fall back to a title embedded in the raw markup.
+  const embedded = html.match(/"title":"([^"]+)"/);
+  if (embedded && embedded[1]) {
+    return cleanPrimeTitle(embedded[1]);
+  }
+  return undefined;
+}
+
+/**
+ * Find a four-digit release year for a Prime Video page.
+ * Sources, in order: PRIME_YEAR_SELECTORS (`content` attribute, else text),
+ * a `"releaseYear":"YYYY"` pair in the raw HTML, then the extracted title.
+ * @param {Function} $ - Query function of the loaded document
+ * @param {string} html - Raw HTML of the same page
+ * @returns {string | undefined} First four consecutive digits found, if any
+ */
+function extractPrimeYear($, html) {
+  const firstFourDigits = (text) => {
+    const hit = text ? text.match(/(\d{4})/) : null;
+    return hit ? hit[1] : undefined;
+  };
+
+  for (const selector of PRIME_YEAR_SELECTORS) {
+    const node = $(selector);
+    const fromTag = firstFourDigits(node.attr('content') || node.text());
+    if (fromTag) return fromTag;
+  }
+
+  const embedded = html.match(/"releaseYear"\s*:\s*"(\d{4})"/);
+  if (embedded) return embedded[1];
+
+  // Last resort: a year that happens to appear in the title itself.
+  return firstFourDigits(extractPrimeTitle($, html));
+}
+
+/**
+ * Find the poster/hero image URL for a Prime Video page.
+ * @param {Function} $ - Query function of the loaded document
+ * @param {string} html - Raw HTML of the same page
+ * @returns {string | undefined} First selector URL accepted by isValidPrimeThumbnail(),
+ *   else the `"heroImageUrl"` value from the raw HTML (returned without validation)
+ */
+function extractPrimeThumbnail($, html) {
+  for (const selector of PRIME_THUMBNAIL_SELECTORS) {
+    const node = $(selector);
+    const candidate = node.attr('content') || node.attr('src');
+    if (candidate && isValidPrimeThumbnail(candidate)) return candidate;
+  }
+
+  const embedded = html.match(/"heroImageUrl":"([^"]+)"/);
+  if (!(embedded && embedded[1])) return undefined;
+  // Turn literal \u002F escape sequences back into "/".
+  return embedded[1].replace(/\\u002F/g, '/');
+}
+
+/**
+ * Find the synopsis for a Prime Video page.
+ * @param {Function} $ - Query function of the loaded document
+ * @param {string} html - Raw HTML of the same page
+ * @returns {string | undefined} cleanPrimeDescription() of the first non-blank
+ *   PRIME_DESCRIPTION_SELECTORS hit, else of the `"synopsis"` value in the raw HTML
+ */
+function extractPrimeInfo($, html) {
+  for (const selector of PRIME_DESCRIPTION_SELECTORS) {
+    const node = $(selector);
+    const text = (node.attr('content') || node.text() || '').trim();
+    if (text) return cleanPrimeDescription(text);
+  }
+
+  const embedded = html.match(/"synopsis":"([^"]+)"/);
+  if (!(embedded && embedded[1])) return undefined;
+  const unescaped = embedded[1].replace(/\\u002F/g, '/').replace(/\\"/g, '"');
+  return cleanPrimeDescription(unescaped);
+}
+
+/**
+ * Find the primary genre for a Prime Video page.
+ * @param {Function} $ - Query function of the loaded document
+ * @param {string} html - Raw HTML of the same page
+ * @returns {string | null | undefined} normalizePrimeGenre() of the first non-blank
+ *   PRIME_GENRE_SELECTORS hit, else of a one-element `"genres":["…"]` array in the HTML
+ */
+function extractPrimeGenre($, html) {
+  for (const selector of PRIME_GENRE_SELECTORS) {
+    const node = $(selector);
+    const text = (node.attr('content') || node.text() || '').trim();
+    if (text) return normalizePrimeGenre(text);
+  }
+
+  // The pattern only matches an array holding exactly one quoted string.
+  const embedded = html.match(/"genres":\["([^"]+)"\]/);
+  if (embedded && embedded[1]) return normalizePrimeGenre(embedded[1]);
+  return undefined;
+}
+
+/**
+ * Guess the season label for a Prime Video page from its raw HTML.
+ * @param {Function} $ - Query function of the loaded document (not used here)
+ * @param {string} html - Raw HTML of the page
+ * @returns {string | null} "<n> Season" from the largest "<n> Sezon" mention,
+ *   "1 Season" when only series hints are present, otherwise null (treated as a movie)
+ * NOTE(review): the label is English ("Season") while the pattern matched is
+ * Turkish ("Sezon") — confirm which wording callers expect.
+ */
+function extractPrimeSeasons($, html) {
+  const ONE_SEASON = '1 Season';
+
+  const mentions = html.match(/\d+\s*\.?\s*Sezon/gi);
+  if (mentions) {
+    const counts = mentions.map(text => Number.parseInt(text.match(/\d+/)[0], 10));
+    const highest = Math.max(...counts);
+    if (highest > 0) return `${highest} Season`;
+  }
+
+  // Any of these marks the page as a series even without a usable count.
+  const seriesIndicators = [
+    /\b(Season|Sezon)\s*\d+/i,
+    /\bepisode\s*\d+/i,
+    /\bbölüm\s*\d+/i,
+    /"type":\s*["']\s*(TV\s*Series|Dizi)/i,
+    /\b(TV\s*Series|Dizi)\s*$/i
+  ];
+  if (seriesIndicators.some(pattern => pattern.test(html))) {
+    return ONE_SEASON;
+  }
+
+  // Explicit movie markers short-circuit the looser keyword check below.
+  const movieIndicators = [
+    /\b(film|movie)\s*$/i,
+    /"type":\s*["']\s*(Movie|Film)/i
+  ];
+  if (movieIndicators.some(pattern => pattern.test(html))) {
+    return null;
+  }
+
+  // Loosest heuristic: both Turkish keywords appear somewhere in the markup.
+  if (html.includes('Sezon') && html.includes('Bölüm')) {
+    return ONE_SEASON;
+  }
+
+  return null;
+}
+
+/**
+ * Strip storefront suffixes and trailing UI verbs from a Prime Video title.
+ * @param {string} title - Raw title text
+ * @returns {string | undefined} Cleaned title, or undefined when nothing is left
+ */
+function cleanPrimeTitle(title) {
+  if (!title) return undefined;
+
+  // Applied in order: "| Prime Video…", "| Amazon…", then a trailing
+  // "izle" / "watch" / "play" / "oynat" word.
+  const stripped = [
+    /\s*\|\s*Prime\s*Video.*$/i,
+    /\s*\|\s*Amazon.*$/i,
+    /\s+(izle|watch|play|oynat)$/i
+  ].reduce((text, pattern) => text.replace(pattern, ''), title);
+  return stripped.trim() || undefined;
+}
+
+/**
+ * Strip storefront suffixes and a trailing call-to-action from a synopsis.
+ * @param {string} description - Raw description text
+ * @returns {string | undefined} Cleaned text, or undefined when nothing is left
+ */
+function cleanPrimeDescription(description) {
+  if (!description) return undefined;
+
+  // Applied in order: "| Prime Video…", "| Amazon…", then everything from
+  // "Daha fazla bilgi için tıklayın" / "Click for more info" onwards.
+  const stripped = description
+    .replace(/\s*\|\s*Prime\s*Video.*$/i, '')
+    .replace(/\s*\|\s*Amazon.*$/i, '')
+    .replace(/\s+(Daha fazla bilgi için tıklayın|Click for more info).*$/i, '');
+
+  return stripped.trim() || undefined;
+}
+
+/**
+ * Check whether a URL is an image on an Amazon/Prime Video host. The host is
+ * parsed, so "https://evil.example/primevideo.com/x.jpg" is rejected.
+ * @param {string} url - Absolute or protocol-relative URL
+ * @returns {boolean} True for an allowed host and a jpg/jpeg/png/webp URL
+ */
+function isValidPrimeThumbnail(url) {
+  if (!url || typeof url !== 'string') return false;
+
+  const primeDomains = ['media-amazon.com', 'ssl-images-amazon.com', 'primevideo.com'];
+  try {
+    const { hostname } = new URL(url.startsWith('//') ? `https:${url}` : url);
+    return primeDomains.some(d => hostname === d || hostname.endsWith(`.${d}`)) &&
+      /\.(jpg|jpeg|png|webp)(\?.*)?$/i.test(url);
+  } catch { return false; } // not a parseable absolute URL
+}
+
+/**
+ * Map a Prime Video genre label to its Turkish name.
+ * Only the first label of a `,` `|` `•` `·` separated list is used; the lookup
+ * is case-insensitive and unknown labels are returned as given (trimmed).
+ * @param {string} genre - Raw genre text
+ * @returns {string | null} Turkish genre, the raw first label, or null if none
+ */
+function normalizePrimeGenre(genre) {
+  if (!genre || typeof genre !== 'string') return null;
+
+  // Keys are lowercase so the lookup below is truly case-insensitive; the old
+  // `genreMapping[g.toLowerCase()]` fallback could never hit capitalized keys.
+  const genreMapping = new Map([
+    ['action', 'Aksiyon'],
+    ['adventure', 'Macera'],
+    ['comedy', 'Komedi'],
+    ['drama', 'Dram'],
+    ['fantasy', 'Fantastik'],
+    ['horror', 'Korku'],
+    ['mystery', 'Gizem'],
+    ['romance', 'Romantik'],
+    ['romantic', 'Romantik'],
+    ['sci-fi', 'Bilim Kurgu'],
+    ['science fiction', 'Bilim Kurgu'],
+    ['thriller', 'Gerilim'],
+    ['documentary', 'Belgesel'],
+    ['animation', 'Animasyon'],
+    ['family', 'Aile'],
+    ['kids', 'Çocuk'],
+    ['war', 'Savaş'],
+    ['western', 'Western'],
+    ['humorous', 'Mizahi'],
+    ['sentimental', 'Duygusal']
+  ]);
+
+  // Split on commas, pipes and bullet characters; the first label is primary.
+  const [primary] = genre.split(/[,|•·]/).map(g => g.trim()).filter(Boolean);
+  if (!primary) return null;
+
+  return genreMapping.get(primary.toLowerCase()) ?? primary;
+}
+
+/**
+ * Parse Amazon Prime Video HTML into a metadata object.
+ * Each field comes from its extractPrime* helper, all run over the same
+ * loaded document and raw HTML string.
+ * @param {string} html
+ * @returns {{ name?: string, year?: string | number, seasons?: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }}
+ *   An empty object when `html` is falsy.
+ */
+export function parsePrimeHtml(html) {
+  if (!html) return {};
+
+  const $ = load(html);
+
+  const name = extractPrimeTitle($, html);
+  const extractedYear = extractPrimeYear($, html);
+  const seasons = extractPrimeSeasons($, html);
+  const thumbnail = extractPrimeThumbnail($, html);
+  const info = extractPrimeInfo($, html);
+  const genre = extractPrimeGenre($, html);
+
+  // When no year was found, borrow the first four-digit run from the title;
+  // otherwise keep whatever extractPrimeYear() returned.
+  const titleYear = !extractedYear && name ? name.match(/(\d{4})/) : null;
+  const year = titleYear ? titleYear[1] : extractedYear;
+
+  return { name, year, seasons, thumbnail, info, genre };
}
diff --git a/tests/scrape.test.js b/tests/netflix.test.js
similarity index 56%
rename from tests/scrape.test.js
rename to tests/netflix.test.js
index f789f42..ea82272 100644
--- a/tests/scrape.test.js
+++ b/tests/netflix.test.js
@@ -34,6 +34,48 @@ describe('parseNetflixHtml (canlı sayfa)', () => {
},
20000
);
+
+ it(
+ 'thumbnail URL’sini çıkarır',
+ () => {
+ const meta = parseNetflixHtml(liveHtml);
+ if (meta.thumbnail) {
+ expect(meta.thumbnail).toContain('nflxso.net');
+ }
+ // Thumbnail olmayabilir ama test geçmeli
+ },
+ 20000
+ );
+
+ it(
+ 'film/dizi açıklamasını (info) çıkarır',
+ () => {
+ const meta = parseNetflixHtml(liveHtml);
+ if (meta.info) {
+ expect(meta.info).toBeTruthy();
+ expect(meta.info).not.toContain('Netflix');
+ expect(meta.info).not.toContain('Fragmanları izleyin');
+ }
+ // Info olmayabilir ama varsa boş olmamalı
+ },
+ 20000
+ );
+
+ it(
+ 'film/dizi türünü (genre) çıkarır',
+ () => {
+ const meta = parseNetflixHtml(liveHtml);
+ if (meta.genre) {
+ expect(meta.genre).toBeTruthy();
+ expect(typeof meta.genre).toBe('string');
+ // Genre mapping test: 'Action' → 'Aksiyon'
+ ['Aksiyon', 'Dram', 'Komedi', 'Fantastik'].includes(meta.genre) ||
+ ['Action', 'Drama', 'Comedy', 'Fantasy'].includes(meta.genre);
+ }
+ // Genre olmayabilir ama varsa geçerli bir değer olmalı
+ },
+ 20000
+ );
});
describe('scraperNetflix (canlı istek)', () => {
diff --git a/tests/prime.test.js b/tests/prime.test.js
new file mode 100644
index 0000000..1312ce9
--- /dev/null
+++ b/tests/prime.test.js
@@ -0,0 +1,59 @@
+import { beforeAll, describe, expect, it } from 'vitest';
+import { scraperPrime } from '../src/index.js';
+import { parsePrimeHtml } from '../src/parser.js';
+
+const TEST_URL = 'https://www.primevideo.com/-/tr/detail/0NHIN3TGAI9L7VZ45RS52RHUPL/ref=share_ios_movie'; // share link with locale and ref segments, passed to scraperPrime in the live case
+const UA = // user-agent string handed to scraperPrime as the `userAgent` option
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
+
+describe('parsePrimeHtml (örnek HTML)', () => { // cases that feed inline HTML strings to parsePrimeHtml
+ it('boş veya geçersiz HTML için boş obje döner', () => { // falsy input ('' and null) must yield {}
+ expect(parsePrimeHtml('')).toEqual({});
+ expect(parsePrimeHtml(null)).toEqual({});
+ });
+
+ it('meta etiketlerinden başlık ve yıl çıkarır', () => {
+ const html = `
+
+
+
+
+
+
+ `; // NOTE(review): the fixture shows only blank lines in this view; the markup appears stripped, confirm against the repository copy
+ const meta = parsePrimeHtml(html);
+ expect(meta.name).toBe('Little Women'); // NOTE(review): the case title also mentions the year, but only the name is asserted
+ });
+
+ it('thumbnail ve info alanını çıkarır', () => {
+ const html = `
+
+
+ `; // NOTE(review): fixture markup not visible in this view
+ const meta = parsePrimeHtml(html);
+ expect(meta.thumbnail).toBe('https://m.media-amazon.com/images/S/pv-target-images/test.jpg');
+ expect(meta.info).toBe("Louisa May Alcott'ın hikayesi");
+ });
+
+ it('tür bilgisini normalize eder', () => {
+ const html = `
+
+ `; // NOTE(review): fixture markup not visible in this view
+ const meta = parsePrimeHtml(html);
+ expect(meta.genre).toBe('Komedi'); // expects the Turkish label; presumably the fixture carries the English genre, verify
+ });
+});
+
+describe('scraperPrime (canlı istek)', () => {
+ it(
+ 'normalize edilmiş url, id ve meta bilgilerini döner',
+ async () => {
+ const meta = await scraperPrime(TEST_URL, { headless: false, userAgent: UA });
+ expect(meta.url).toBe('https://www.primevideo.com/detail/0NHIN3TGAI9L7VZ45RS52RHUPL');
+ expect(meta.id).toBe('0NHIN3TGAI9L7VZ45RS52RHUPL');
+ expect(meta.name).toBeTruthy();
+ expect(meta.year).toMatch(/\d{4}/);
+ },
+ 20000
+ );
+});