amazon prime scrap özelliği eklendi

This commit is contained in:
2025-11-23 16:09:39 +03:00
parent 46d75b64d5
commit fefa6627e9
6 changed files with 988 additions and 28 deletions

View File

@@ -1,15 +1,21 @@
import './polyfill.js';
import { parseNetflixHtml } from './parser.js';
import { parseNetflixHtml, parsePrimeHtml } from './parser.js';
import { fetchPageContentWithPlaywright } from './headless.js';
const DEFAULT_TIMEOUT_MS = 15000;
// 🎯 LOG SİSTEMİ
function logPass(message) {
function logPass(message, data) {
console.log(`${message}`);
if (data) {
console.log(JSON.stringify(data, null, 2));
}
}
/**
 * Writes a failure line to stderr in the form `<message>: <detail>`.
 * Output is suppressed entirely while NODE_ENV is "test".
 *
 * @param {string} message - Context describing what failed.
 * @param {unknown} error - The thrown value; usually an Error, but JavaScript
 *   allows any value (including undefined) to be thrown.
 */
function logError(message, error) {
  if (process.env.NODE_ENV === 'test') {
    return;
  }

  // A logger must never throw from inside a catch block: reading `.message`
  // off null/undefined would replace the real failure with a TypeError.
  const detail = error?.message ?? String(error);
  console.error(`${message}: ${detail}`);
}
@@ -46,6 +52,35 @@ function normalizeNetflixUrl(inputUrl) {
const id = idMatch[1];
return `https://www.netflix.com/title/${id}`;
}
/**
 * Normalizes an Amazon Prime Video URL to its canonical detail-page form,
 * `https://www.primevideo.com/detail/<id>`.
 *
 * The id is the path segment that directly follows the first `detail`
 * segment, so locale/region prefixes and trailing `ref=` segments are dropped.
 *
 * @param {string} inputUrl - URL supplied by the caller.
 * @returns {string} Canonical detail URL.
 * @throws {Error} When the URL is missing, unparsable, not hosted on
 *   primevideo.com (or one of its subdomains), or has no content id.
 */
function normalizePrimeUrl(inputUrl) {
  if (!inputUrl) {
    throw new Error('Amazon Prime URL\'i gereklidir.');
  }

  let parsed;
  try {
    parsed = new URL(inputUrl);
  } catch (err) {
    throw new Error('Geçersiz URL sağlandı.');
  }

  // Compare the host exactly (or as a parent domain). A substring test would
  // also accept look-alikes such as "primevideo.com.evil.example".
  const host = parsed.hostname;
  const isPrimeHost = host === 'primevideo.com' || host.endsWith('.primevideo.com');
  if (!isPrimeHost) {
    throw new Error('URL primevideo.com adresini göstermelidir.');
  }

  const segments = parsed.pathname.split('/').filter(Boolean);
  const detailIndex = segments.indexOf('detail');
  const id = detailIndex >= 0 ? segments[detailIndex + 1] : undefined;
  if (id) {
    return `https://www.primevideo.com/detail/${id}`;
  }

  throw new Error('URL\'de Amazon Prime içerik ID\'si bulunamadı.');
}
const DEFAULT_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36';
@@ -139,7 +174,7 @@ function needsHeadless(meta) {
* Netflix meta verilerini scrape eder.
* @param {string} inputUrl
* @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
* @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
* @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }>}
*/
export async function scraperNetflix(inputUrl, options = {}) {
try {
@@ -150,15 +185,11 @@ export async function scraperNetflix(inputUrl, options = {}) {
const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const userAgent = options.userAgent || DEFAULT_USER_AGENT;
logPass(`Netflix URL normalize edildi: ${normalizedUrl}`);
const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs);
logPass("HTML içeriği başarıyla çekildi");
let meta = parseNetflixHtml(staticHtml);
if (needsHeadless(meta) && options.headless !== false) {
logPass("Headless mode aktifleştiriliyor");
const headlessHtml = await fetchPageContentWithPlaywright(normalizedUrl, {
timeoutMs,
userAgent,
@@ -172,9 +203,6 @@ export async function scraperNetflix(inputUrl, options = {}) {
Object.entries(enriched).filter(([_, value]) => value !== undefined && value !== null)
)
};
logPass("Headless scraping tamamlandı");
} else {
logPass("Statik scraping yeterli");
}
if (!meta.name) {
@@ -186,13 +214,74 @@ export async function scraperNetflix(inputUrl, options = {}) {
id: id || '',
name: meta.name,
year: meta.year,
seasons: meta.seasons ?? null
seasons: meta.seasons ?? null,
thumbnail: meta.thumbnail ?? null,
info: meta.info ?? null,
genre: meta.genre ?? null
};
logResult(finalResult);
logPass('Netflix scraping tamamlandı', finalResult);
return finalResult;
} catch (error) {
logError('Netflix scraping başarısız', error);
throw error;
}
}
/**
 * Scrapes metadata for a single Amazon Prime Video title.
 *
 * The page is first fetched statically and parsed. When needsHeadless()
 * reports the result as insufficient and the caller has not passed
 * `headless: false`, the page is fetched again through Playwright and every
 * non-null field from that second parse overrides the static one.
 *
 * @param {string} inputUrl
 * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
 * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null, thumbnail: string | null, info: string | null, genre: string | null }>}
 * @throws {Error} Any failure is logged through logError() and rethrown unchanged.
 */
export async function scraperPrime(inputUrl, options = {}) {
  try {
    await ensureFetchGlobals();

    const normalizedUrl = normalizePrimeUrl(inputUrl);
    const id = normalizedUrl.slice(normalizedUrl.lastIndexOf('/') + 1);
    const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
    const userAgent = options.userAgent || DEFAULT_USER_AGENT;
    const headlessAllowed = options.headless !== false;

    const staticHtml = await fetchStaticHtml(normalizedUrl, userAgent, timeoutMs);
    let meta = parsePrimeHtml(staticHtml);

    if (needsHeadless(meta) && headlessAllowed) {
      const renderedHtml = await fetchPageContentWithPlaywright(normalizedUrl, {
        timeoutMs,
        userAgent,
        headless: headlessAllowed
      });

      // Only fields the rendered page actually produced may override static ones.
      const renderedFields = Object.entries(parsePrimeHtml(renderedHtml)).filter(
        ([, value]) => value != null
      );
      meta = { ...meta, ...Object.fromEntries(renderedFields) };
    }

    if (!meta.name) {
      throw new Error('Amazon Prime sayfa meta verisi parse edilemedi.');
    }

    const { name, year, seasons, thumbnail, info, genre } = meta;
    const finalResult = {
      url: normalizedUrl,
      id: id || '',
      name,
      year,
      seasons: seasons ?? null,
      thumbnail: thumbnail ?? null,
      info: info ?? null,
      genre: genre ?? null
    };

    logPass('Amazon Prime scraping tamamlandı', finalResult);
    return finalResult;
  } catch (error) {
    logError('Amazon Prime scraping başarısız', error);
    throw error;
  }
}

View File

@@ -24,6 +24,26 @@ const UNIVERSAL_UI_PATTERNS = [
const YEAR_FIELDS = ['datePublished', 'startDate', 'uploadDate', 'copyrightYear', 'releasedEvent', 'releaseYear', 'dateCreated'];
const SEASON_TYPES = ['TVSeries', 'TVShow', 'Series'];
/**
* CSS selectors probed for a title's thumbnail image.
* extractThumbnail() walks the list in array order and returns the first
* value that passes isValidThumbnailUrl(), so earlier entries take priority.
*/
const THUMBNAIL_SELECTORS = [
'meta[property="og:image"]', // Open Graph image (highest priority)
'meta[name="twitter:image"]', // Twitter card image
'meta[property="og:image:secure_url"]', // Open Graph secure (HTTPS) image URL
'link[rel="image_src"]', // Legacy image_src link element
'meta[itemprop="image"]' // Schema.org microdata image
];
/**
* CSS selectors probed for a title's description text.
* extractInfo() walks the list in array order and uses the first entry whose
* `content` attribute is non-blank.
*/
const DESCRIPTION_SELECTORS = [
'meta[name="description"]', // Standard meta description (highest priority)
'meta[property="og:description"]', // Open Graph description
'meta[itemprop="description"]' // Schema.org microdata description
];
/**
* Extract a usable year value from various JSON-LD fields.
* @param {unknown} value
@@ -79,6 +99,141 @@ function cleanTitle(title) {
return trimmed || undefined;
}
/**
 * Extracts a thumbnail URL from the page's head metadata.
 *
 * Selectors in THUMBNAIL_SELECTORS are tried in priority order; the first
 * value accepted by isValidThumbnailUrl() is returned after being passed
 * through normalizeThumbnailUrl().
 *
 * @param {string} html - Raw HTML content.
 * @returns {string | undefined} Thumbnail URL, or undefined if none is found.
 */
function extractThumbnail(html) {
  if (!html) return undefined;

  const $ = load(html);

  for (const selector of THUMBNAIL_SELECTORS) {
    const node = $(selector).first();
    // <meta> tags carry the URL in `content`; <link rel="image_src"> carries
    // it in `href`, so reading `content` alone can never match that selector.
    const imageUrl = node.attr('content') || node.attr('href');
    if (imageUrl && isValidThumbnailUrl(imageUrl)) {
      return normalizeThumbnailUrl(imageUrl);
    }
  }

  return undefined;
}
/**
 * Checks whether a URL points at an image hosted on a known Netflix domain.
 *
 * The host is taken from the parsed URL rather than searched for anywhere in
 * the string, so a path such as `/netflix.com/x.jpg` on an unrelated host is
 * rejected. Protocol-relative URLs (`//host/...`) are accepted.
 *
 * @param {string} url - URL to validate.
 * @returns {boolean} True when the host is a Netflix domain and the path ends
 *   in a supported image extension.
 */
function isValidThumbnailUrl(url) {
  if (!url || typeof url !== 'string') return false;

  let parsed;
  try {
    parsed = new URL(url.startsWith('//') ? `https:${url}` : url);
  } catch {
    return false;
  }

  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return false;

  const host = parsed.hostname;
  const netflixDomains = ['nflxso.net', 'assets.nflxext.com', 'netflix.com', 'nflximg.net'];
  const hasNetflixDomain =
    // "occ-0-" was a substring marker in the original list; keep it as a host prefix.
    host.startsWith('occ-0-') ||
    netflixDomains.some((domain) => host === domain || host.endsWith(`.${domain}`));

  // Test the extension on the path only, so query strings and fragments are ignored.
  const hasImageExtension = /\.(jpg|jpeg|png|webp)$/i.test(parsed.pathname);

  return hasNetflixDomain && hasImageExtension;
}
/**
 * Rebuilds a thumbnail URL without the `r`, `t`, `e` and `v` query parameters.
 *
 * The result is origin + path + the remaining query (re-serialized by
 * URLSearchParams); any fragment is dropped. Falsy or unparsable input is
 * returned unchanged.
 *
 * @param {string} url - Original thumbnail URL.
 * @returns {string} Normalized URL.
 */
function normalizeThumbnailUrl(url) {
  if (!url) {
    return url;
  }

  let target;
  try {
    target = new URL(url);
  } catch {
    // Not an absolute URL: hand the input back as-is.
    return url;
  }

  const query = new URLSearchParams(target.search);
  for (const key of ['r', 't', 'e', 'v']) {
    query.delete(key);
  }

  const base = target.origin + target.pathname;
  const remaining = query.toString();
  return remaining ? `${base}?${remaining}` : base;
}
/**
 * Reads the title's description from the page's meta tags.
 *
 * The first selector in DESCRIPTION_SELECTORS whose `content` attribute is
 * non-blank wins. Its text is trimmed, then a trailing "| Netflix…" suffix
 * and the Turkish trailer call-to-action sentence are stripped.
 *
 * @param {string} html - Raw HTML content.
 * @returns {string | undefined} Cleaned description, or undefined when no
 *   selector yields text or the cleaned text is empty.
 */
function extractInfo(html) {
  if (!html) {
    return undefined;
  }

  const $ = load(html);

  const rawDescription = DESCRIPTION_SELECTORS
    .map((selector) => $(selector).attr('content'))
    .find((content) => content && content.trim());

  if (!rawDescription) {
    return undefined;
  }

  const withoutBrand = rawDescription.trim().replace(/\s*\|\s*Netflix.*$/i, '');
  const withoutCallToAction = withoutBrand.replace(
    /\s+Fragmanları izleyin ve daha fazla bilgi edinin\.$/,
    ''
  );

  return withoutCallToAction || undefined;
}
/**
 * Normalizes a genre name to its Turkish label.
 *
 * Known English and Turkish names are mapped to the Turkish form; any other
 * non-empty value is returned trimmed but otherwise unchanged.
 *
 * @param {string | null | undefined} genre - Raw genre from JSON-LD.
 * @returns {string | null} Normalized genre, or null for missing/blank input.
 */
function normalizeGenre(genre) {
  if (!genre || typeof genre !== 'string') return null;

  // A Map is used instead of a plain object so that inputs such as
  // "constructor" or "toString" cannot resolve to inherited Object.prototype
  // members (which would return a function instead of a string).
  const genreMapping = new Map([
    ['Aksiyon', 'Aksiyon'],
    ['Action', 'Aksiyon'],
    ['Macera', 'Macera'],
    ['Adventure', 'Macera'],
    ['Bilim Kurgu', 'Bilim Kurgu'],
    ['Science Fiction', 'Bilim Kurgu'],
    ['Fantastik', 'Fantastik'],
    ['Fantasy', 'Fantastik'],
    ['Dram', 'Dram'],
    ['Drama', 'Dram'],
    ['Komedi', 'Komedi'],
    ['Comedy', 'Komedi'],
    ['Korku', 'Korku'],
    ['Horror', 'Korku'],
    ['Gerilim', 'Gerilim'],
    ['Thriller', 'Gerilim'],
    ['Gizem', 'Gizem'],
    ['Mystery', 'Gizem'],
    ['Romantik', 'Romantik'],
    ['Romance', 'Romantik']
  ]);

  const cleanedGenre = genre.trim();
  if (!cleanedGenre) return null;

  // Unknown genres pass through untranslated.
  return genreMapping.get(cleanedGenre) ?? cleanedGenre;
}
/**
* Parse JSON-LD objects for metadata.
* @param {any} obj
@@ -121,6 +276,19 @@ function parseJsonLdObject(obj) {
result.seasons = `${entry.seasons.length} Sezon`;
}
}
// Extract info/description from JSON-LD
if (!result.info && typeof entry.description === 'string') {
const cleanedInfo = entry.description.trim()
.replace(/\s*\|\s*Netflix.*$/i, '')
.replace(/\s+Fragmanları izleyin ve daha fazla bilgi edinin\.$/, '');
result.info = cleanedInfo || undefined;
}
// Extract genre from JSON-LD
if (!result.genre && typeof entry.genre === 'string') {
result.genre = normalizeGenre(entry.genre);
}
}
return result;
@@ -129,7 +297,7 @@ function parseJsonLdObject(obj) {
/**
* Parse Netflix HTML to extract metadata without executing scripts.
* @param {string} html
* @returns {{ name?: string, year?: string | number, seasons?: string | null }}
* @returns {{ name?: string, year?: string | number, seasons?: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }}
*/
export function parseNetflixHtml(html) {
if (!html) return {};
@@ -143,20 +311,362 @@ export function parseNetflixHtml(html) {
let year;
let seasons = null;
let thumbnail = null;
let info = null;
let genre = null;
// Extract thumbnail from meta tags
thumbnail = extractThumbnail(html);
// Extract info from meta tags (fallback if JSON-LD doesn't have it)
info = extractInfo(html);
$('script[type="application/ld+json"]').each((_, el) => {
const raw = $(el).contents().text();
if (!raw) return;
try {
const parsed = JSON.parse(raw);
const info = parseJsonLdObject(parsed);
if (!name && info.name) name = info.name;
if (!year && info.year) year = info.year;
if (!seasons && info.seasons) seasons = info.seasons;
const jsonLdInfo = parseJsonLdObject(parsed);
if (!name && jsonLdInfo.name) name = jsonLdInfo.name;
if (!year && jsonLdInfo.year) year = jsonLdInfo.year;
if (!seasons && jsonLdInfo.seasons) seasons = jsonLdInfo.seasons;
// Also check JSON-LD for image information
if (!thumbnail && jsonLdInfo.image) {
thumbnail = typeof jsonLdInfo.image === 'string' ? jsonLdInfo.image : jsonLdInfo.image.url;
}
// Extract info and genre from JSON-LD if available
if (!info && jsonLdInfo.info) info = jsonLdInfo.info;
if (!genre && jsonLdInfo.genre) genre = jsonLdInfo.genre;
} catch {
// Ignore malformed JSON-LD blocks.
}
});
return { name, year, seasons };
return { name, year, seasons, thumbnail, info, genre };
}
/**
* Amazon Prime specific constants and functions.
*
* Each list below is walked in array order by its extractPrime* function and
* the first usable value wins, so entries are ordered from most to least
* preferred.
*/
// Title candidates: extractPrimeTitle() reads the `content` attribute and
// falls back to the element's text.
const PRIME_TITLE_SELECTORS = [
'meta[property="og:title"]',
'meta[name="title"]',
'title',
'[data-testid="title"]',
'.dv-node-dp-title',
'h1'
];
// Thumbnail candidates: extractPrimeThumbnail() reads `content`, then `src`,
// and only accepts values that pass isValidPrimeThumbnail().
const PRIME_THUMBNAIL_SELECTORS = [
'meta[property="og:image"]',
'meta[name="twitter:image"]',
'meta[property="og:image:secure_url"]',
'[data-testid="hero-image"] img',
'.dv-node-dp-hero-image img',
'img[alt*="poster"]'
];
// Description candidates: extractPrimeInfo() reads `content`, then element text.
const PRIME_DESCRIPTION_SELECTORS = [
'meta[name="description"]',
'meta[property="og:description"]',
'meta[itemprop="description"]',
'[data-testid="synopsis"]',
'.dv-node-dp-synopsis',
'.synopsis'
];
// Release-year candidates: extractPrimeYear() takes the first four-digit run
// found in `content` or the element text.
const PRIME_YEAR_SELECTORS = [
'meta[itemprop="dateCreated"]',
'meta[property="video:release_date"]',
'[data-testid="release-year"]',
'.release-year',
'[class*="year"]'
];
// Genre candidates: extractPrimeGenre() reads `content`, then element text,
// and passes the value to normalizePrimeGenre().
const PRIME_GENRE_SELECTORS = [
'meta[itemprop="genre"]',
'[data-testid="genres"]',
'.genres',
'[class*="genre"]'
];
/**
 * Extracts the title of an Amazon Prime page.
 *
 * Selectors in PRIME_TITLE_SELECTORS are tried in order, reading the
 * `content` attribute first and the element text second. If none yields a
 * usable title, the first `"title":"…"` pair embedded in the raw HTML is used.
 *
 * @param {Function} $ - Cheerio root loaded with `html`.
 * @param {string} html - Raw HTML content.
 * @returns {string | undefined} Cleaned title, or undefined if none is found.
 */
function extractPrimeTitle($, html) {
  for (const selector of PRIME_TITLE_SELECTORS) {
    // Restrict to the first match: `.text()` on a multi-element selection
    // concatenates every match (e.g. the document <title> plus inline SVG
    // <title> elements, or several <h1> headings).
    const node = $(selector).first();
    const title = node.attr('content') || node.text();
    if (title && title.trim()) {
      const cleaned = cleanPrimeTitle(title.trim());
      // A candidate that cleans down to nothing should not end the search.
      if (cleaned) return cleaned;
    }
  }

  // Fall back to data embedded in inline JSON.
  const jsonMatch = html.match(/"title":"([^"]+)"/);
  if (jsonMatch && jsonMatch[1]) {
    return cleanPrimeTitle(jsonMatch[1]);
  }

  return undefined;
}
/**
 * Extracts a four-digit release year from an Amazon Prime page.
 *
 * Sources are consulted in this order, returning on the first hit:
 *   1. each selector in PRIME_YEAR_SELECTORS (`content` attribute, else text);
 *   2. a quoted `"releaseYear"` value embedded in the raw HTML;
 *   3. the first four-digit run inside the extracted title.
 *
 * @param {Function} $ - Cheerio root loaded with `html`.
 * @param {string} html - Raw HTML content.
 * @returns {string | undefined} Year as a string, or undefined if none is found.
 */
function extractPrimeYear($, html) {
  const fourDigits = /(\d{4})/;

  for (const selector of PRIME_YEAR_SELECTORS) {
    const matched = $(selector);
    const candidate = matched.attr('content') || matched.text();
    const hit = candidate ? candidate.match(fourDigits) : null;
    if (hit) {
      return hit[1];
    }
  }

  const embedded = html.match(/"releaseYear"\s*:\s*"(\d{4})"/);
  if (embedded) {
    return embedded[1];
  }

  const title = extractPrimeTitle($, html);
  const fromTitle = title ? title.match(fourDigits) : null;
  return fromTitle ? fromTitle[1] : undefined;
}
/**
 * Extracts a thumbnail URL from an Amazon Prime page.
 *
 * Each selector in PRIME_THUMBNAIL_SELECTORS is read via `content`, then
 * `src`; the first value accepted by isValidPrimeThumbnail() is returned
 * as-is. Otherwise the first embedded `"heroImageUrl"` value is returned with
 * `\u002F` escapes turned back into slashes.
 *
 * @param {Function} $ - Cheerio root loaded with `html`.
 * @param {string} html - Raw HTML content.
 * @returns {string | undefined} Thumbnail URL, or undefined if none is found.
 */
function extractPrimeThumbnail($, html) {
  for (const selector of PRIME_THUMBNAIL_SELECTORS) {
    const node = $(selector);
    const candidate = node.attr('content') || node.attr('src');
    if (!candidate || !isValidPrimeThumbnail(candidate)) {
      continue;
    }
    return candidate;
  }

  const embedded = html.match(/"heroImageUrl":"([^"]+)"/);
  if (!embedded || !embedded[1]) {
    return undefined;
  }

  return embedded[1].replace(/\\u002F/g, '/');
}
/**
 * Extracts the synopsis/description of an Amazon Prime page.
 *
 * Selectors in PRIME_DESCRIPTION_SELECTORS are tried in order (`content`
 * attribute, else element text). If none yields text, the first embedded
 * `"synopsis":"…"` JSON string in the raw HTML is decoded and used.
 *
 * @param {Function} $ - Cheerio root loaded with `html`.
 * @param {string} html - Raw HTML content.
 * @returns {string | undefined} Cleaned description, or undefined if none is found.
 */
function extractPrimeInfo($, html) {
  for (const selector of PRIME_DESCRIPTION_SELECTORS) {
    // First match only: `.text()` on several matches would glue them together.
    const node = $(selector).first();
    const description = node.attr('content') || node.text();
    if (description && description.trim()) {
      return cleanPrimeDescription(description.trim());
    }
  }

  // The capture must step over backslash escapes; a plain `[^"]+` stops at
  // the first `\"` and silently truncates the synopsis.
  const jsonMatch = html.match(/"synopsis":"((?:[^"\\]|\\.)+)"/);
  if (jsonMatch && jsonMatch[1]) {
    let decoded;
    try {
      // The capture is the body of a JSON string literal, so JSON.parse
      // resolves every escape (\", \\, \n, \uXXXX) correctly.
      decoded = JSON.parse(`"${jsonMatch[1]}"`);
    } catch {
      // Not strictly valid JSON (e.g. raw control characters): best-effort unescape.
      decoded = jsonMatch[1].replace(/\\u002F/g, '/').replace(/\\"/g, '"');
    }
    return cleanPrimeDescription(decoded);
  }

  return undefined;
}
/**
 * Extracts the primary genre of an Amazon Prime page.
 *
 * Selectors in PRIME_GENRE_SELECTORS are tried in order (`content` attribute,
 * else element text). If none yields text, the first entry of an embedded
 * `"genres":[…]` JSON array in the raw HTML is used. The value is passed
 * through normalizePrimeGenre().
 *
 * @param {Function} $ - Cheerio root loaded with `html`.
 * @param {string} html - Raw HTML content.
 * @returns {string | null | undefined} Normalized genre; undefined when no
 *   source matched, or whatever normalizePrimeGenre() returns for the match.
 */
function extractPrimeGenre($, html) {
  for (const selector of PRIME_GENRE_SELECTORS) {
    // First match only: concatenating the text of several genre elements
    // ("DramaAction") yields a value no separator split can recover.
    const node = $(selector).first();
    const genreText = node.attr('content') || node.text();
    if (genreText && genreText.trim()) {
      return normalizePrimeGenre(genreText.trim());
    }
  }

  // Capture the first array element without requiring the array to close
  // right after it, so multi-genre arrays such as ["Drama","Action"] match too.
  const jsonMatch = html.match(/"genres":\s*\[\s*"([^"]+)"/);
  if (jsonMatch && jsonMatch[1]) {
    return normalizePrimeGenre(jsonMatch[1]);
  }

  return undefined;
}
/**
 * Derives a season label from the raw HTML of an Amazon Prime page.
 *
 * Decision order:
 *   1. the highest number found in "<n> Sezon" / "<n>. Sezon" occurrences,
 *      if greater than zero, gives "<n> Season";
 *   2. any series indicator (season/episode numbering, a series "type" value)
 *      gives "1 Season";
 *   3. any movie indicator gives null;
 *   4. the page containing both "Sezon" and "Bölüm" gives "1 Season";
 *   5. otherwise null (treated as a movie).
 *
 * @param {Function} $ - Cheerio root; not used by this function.
 * @param {string} html - Raw HTML content.
 * @returns {string | null} Season label, or null when the title looks like a movie.
 */
function extractPrimeSeasons($, html) {
  const numberedSeasons = [...html.matchAll(/(\d+)\s*\.?\s*Sezon/gi)];
  if (numberedSeasons.length > 0) {
    const highest = numberedSeasons.reduce(
      (top, found) => Math.max(top, Number.parseInt(found[1], 10)),
      -Infinity
    );
    if (highest > 0) {
      return `${highest} Season`;
    }
  }

  const seriesPatterns = [
    /\b(Season|Sezon)\s*\d+/i,
    /\bepisode\s*\d+/i,
    /\bbölüm\s*\d+/i,
    /"type":\s*["']\s*(TV\s*Series|Dizi)/i,
    /\b(TV\s*Series|Dizi)\s*$/i
  ];
  for (const pattern of seriesPatterns) {
    if (pattern.test(html)) {
      // A series without a readable season count defaults to one season.
      return '1 Season';
    }
  }

  const moviePatterns = [
    /\b(film|movie)\s*$/i,
    /"type":\s*["']\s*(Movie|Film)/i
  ];
  for (const pattern of moviePatterns) {
    if (pattern.test(html)) {
      return null;
    }
  }

  // Last resort: both season and episode wording somewhere on the page.
  const mentionsSeasonAndEpisode = html.includes('Sezon') && html.includes('Bölüm');
  return mentionsSeasonAndEpisode ? '1 Season' : null;
}
/**
 * Strips branding and UI residue from an Amazon Prime title.
 *
 * Removed, in order: a trailing "| Prime Video…" part, a trailing
 * "| Amazon…" part, and a final "izle" / "watch" / "play" / "oynat" word.
 * The result is trimmed last.
 *
 * @param {string} title - Raw title text.
 * @returns {string | undefined} Cleaned title, or undefined when nothing is left.
 */
function cleanPrimeTitle(title) {
  if (!title) {
    return undefined;
  }

  const stripped = title
    .replace(/\s*\|\s*Prime\s*Video.*$/i, '')
    .replace(/\s*\|\s*Amazon.*$/i, '')
    .replace(/\s+(izle|watch|play|oynat)$/i, '')
    .trim();

  return stripped === '' ? undefined : stripped;
}
/**
 * Strips branding and call-to-action residue from an Amazon Prime description.
 *
 * Removed, in order: a trailing "| Prime Video…" part, a trailing
 * "| Amazon…" part, and everything from a "click for more info" phrase
 * (Turkish or English) onwards. The result is trimmed last.
 *
 * @param {string} description - Raw description text.
 * @returns {string | undefined} Cleaned description, or undefined when nothing is left.
 */
function cleanPrimeDescription(description) {
  if (!description) {
    return undefined;
  }

  const residuePatterns = [
    /\s*\|\s*Prime\s*Video.*$/i,
    /\s*\|\s*Amazon.*$/i,
    /\s+(Daha fazla bilgi için tıklayın|Click for more info).*$/i
  ];

  const stripped = residuePatterns
    .reduce((text, pattern) => text.replace(pattern, ''), description)
    .trim();

  return stripped === '' ? undefined : stripped;
}
/**
 * Checks whether a URL points at an image hosted on a known Amazon domain.
 *
 * The host is taken from the parsed URL rather than searched for anywhere in
 * the string, so a path such as `/primevideo.com/x.jpg` on an unrelated host
 * is rejected. Protocol-relative URLs (`//host/...`) are accepted.
 *
 * @param {string} url - URL to validate.
 * @returns {boolean} True when the host is an accepted Amazon domain (or a
 *   subdomain of one) and the path ends in a supported image extension.
 */
function isValidPrimeThumbnail(url) {
  if (!url || typeof url !== 'string') return false;

  let parsed;
  try {
    parsed = new URL(url.startsWith('//') ? `https:${url}` : url);
  } catch {
    return false;
  }

  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return false;

  const primeDomains = [
    'm.media-amazon.com',
    'images-na.ssl-images-amazon.com',
    'media-amazon.com',
    'primevideo.com'
  ];
  const host = parsed.hostname;
  const hasPrimeDomain = primeDomains.some(
    (domain) => host === domain || host.endsWith(`.${domain}`)
  );

  // Test the extension on the path only, so query strings and fragments are ignored.
  return hasPrimeDomain && /\.(jpg|jpeg|png|webp)$/i.test(parsed.pathname);
}
/**
 * Normalizes Amazon Prime genre text to a single Turkish genre label.
 *
 * The input may list several genres separated by `,`, `|`, `•` or `·`; only
 * the first non-empty entry is used. Known English names are translated
 * case-insensitively; anything else is returned as written.
 *
 * @param {string | null | undefined} genre - Raw genre text.
 * @returns {string | null} Primary genre, or null for missing/blank input.
 */
function normalizePrimeGenre(genre) {
  if (!genre || typeof genre !== 'string') return null;

  // Keys are lower-case so the lookup is genuinely case-insensitive (a
  // lower-cased probe against Title-Case keys can never match). A Map also
  // keeps inputs like "constructor" from resolving to Object.prototype members.
  const genreMapping = new Map([
    ['action', 'Aksiyon'],
    ['adventure', 'Macera'],
    ['comedy', 'Komedi'],
    ['drama', 'Dram'],
    ['fantasy', 'Fantastik'],
    ['horror', 'Korku'],
    ['mystery', 'Gizem'],
    ['romance', 'Romantik'],
    ['romantic', 'Romantik'],
    ['sci-fi', 'Bilim Kurgu'],
    ['science fiction', 'Bilim Kurgu'],
    ['thriller', 'Gerilim'],
    ['documentary', 'Belgesel'],
    ['animation', 'Animasyon'],
    ['family', 'Aile'],
    ['kids', 'Çocuk'],
    ['war', 'Savaş'],
    ['western', 'Western'],
    ['humorous', 'Mizahi'],
    ['sentimental', 'Duygusal']
  ]);

  // Split on the supported separators and keep the first real entry.
  const [primary] = genre
    .split(/[,|•·]/)
    .map((part) => part.trim())
    .filter(Boolean);

  if (!primary) return null;

  return genreMapping.get(primary.toLowerCase()) ?? primary;
}
/**
 * Parses Amazon Prime HTML into a flat metadata record.
 *
 * Each field is produced by its extractPrime* helper. When no year is found
 * but a title is, the first four-digit run in the title is used as the year.
 *
 * @param {string} html
 * @returns {{ name?: string, year?: string | number, seasons?: string | null, thumbnail?: string | null, info?: string | null, genre?: string | null }}
 *   An empty object when `html` is falsy.
 */
export function parsePrimeHtml(html) {
  if (!html) {
    return {};
  }

  const $ = load(html);

  const name = extractPrimeTitle($, html);
  let year = extractPrimeYear($, html);
  const seasons = extractPrimeSeasons($, html);
  const thumbnail = extractPrimeThumbnail($, html);
  const info = extractPrimeInfo($, html);
  const genre = extractPrimeGenre($, html);

  // Last-chance year: a four-digit run inside the title itself.
  if (!year && name) {
    const yearInTitle = name.match(/(\d{4})/);
    year = yearInTitle ? yearInTitle[1] : year;
  }

  return { name, year, seasons, thumbnail, info, genre };
}