From c0e62e778cceed44154f28dcc5e852a8d86e5a7d Mon Sep 17 00:00:00 2001 From: szbk Date: Sun, 1 Mar 2026 01:08:25 +0300 Subject: [PATCH] feat(api): support Prime Video scraping and provider-aware metadata --- src/config/socket.ts | 2 +- src/middleware/validation.middleware.ts | 24 +- src/routes/api.routes.ts | 24 ++ src/services/admin.service.ts | 71 +++- src/services/cache.service.ts | 32 +- src/services/content.service.ts | 3 + src/services/job.service.ts | 26 +- src/services/metrics.service.ts | 4 +- src/services/scraper.service.ts | 441 ++++++++++++++++++------ src/types/index.ts | 5 +- src/utils/contentUrl.ts | 49 +++ 11 files changed, 515 insertions(+), 166 deletions(-) create mode 100644 src/utils/contentUrl.ts diff --git a/src/config/socket.ts b/src/config/socket.ts index 359c6a1..d5b5a34 100644 --- a/src/config/socket.ts +++ b/src/config/socket.ts @@ -33,7 +33,7 @@ export interface MetricsRealtimeEvent { sourceCounts: { cache: number; database: number; - netflix: number; + scraper: number; }; occurredAt: string; } diff --git a/src/middleware/validation.middleware.ts b/src/middleware/validation.middleware.ts index a49562d..f10d596 100644 --- a/src/middleware/validation.middleware.ts +++ b/src/middleware/validation.middleware.ts @@ -1,27 +1,19 @@ import { Request, Response, NextFunction } from 'express'; import { z } from 'zod'; import type { ApiResponse, GetInfoRequest } from '../types/index.js'; +import { isSupportedContentUrl } from '../utils/contentUrl.js'; /** * Validation schema for /api/getinfo endpoint */ const getInfoSchema = z.object({ - url: z.string().url('Invalid URL format').refine((url) => { - // Validate Netflix URL - try { - const parsedUrl = new URL(url); - const validHosts = [ - 'www.netflix.com', - 'netflix.com', - 'www.netflix.com.tr', - 'netflix.com.tr', - ]; - const hasTitlePath = /\/title\/\d+/.test(url); - return validHosts.includes(parsedUrl.hostname) && hasTitlePath; - } catch { - return false; - } - }, 'URL must be a valid Netflix title URL (e.g., https://www.netflix.com/tr/title/81616256)'), + url: z + .string() + .url('Invalid URL format') + .refine( + (url) => isSupportedContentUrl(url), + 'URL must be Netflix /title/... or PrimeVideo /detail/...' + ), }); /** diff --git a/src/routes/api.routes.ts b/src/routes/api.routes.ts index 32b603d..2912d20 100644 --- a/src/routes/api.routes.ts +++ b/src/routes/api.routes.ts @@ -286,6 +286,30 @@ router.post( } ); +/** + * POST /api/admin/content/purge + * Delete all content rows from DB (with related entities). + */ +router.post( + '/admin/content/purge', + adminOnlyMiddleware, + async (_req: Request, res: Response>) => { + try { + const result = await AdminService.purgeAllContent(); + res.json({ success: true, data: result }); + } catch (error) { + res.status(500).json({ + success: false, + error: { + code: 'ADMIN_CONTENT_PURGE_ERROR', + message: + error instanceof Error ? error.message : 'Failed to purge content', + }, + }); + } + } +); + /** * POST /api/getinfo/async * Create async job for content scraping diff --git a/src/services/admin.service.ts b/src/services/admin.service.ts index 09feb20..7ae37fe 100644 --- a/src/services/admin.service.ts +++ b/src/services/admin.service.ts @@ -6,21 +6,29 @@ import { MetricsService } from './metrics.service.js'; import { CacheService } from './cache.service.js'; import { ContentService } from './content.service.js'; import type { AdminActionResponse, AdminOverviewResponse } from '../types/index.js'; +import { parseSupportedContentUrl } from '../utils/contentUrl.js'; -const CACHE_PREFIX = 'netflix:content:'; +const CACHE_PREFIX = 'content:'; const MAX_CACHE_KEYS_FOR_ANALYSIS = 1000; function formatCacheKeyLabel(key: string): string { return key.replace(CACHE_PREFIX, ''); } -function extractTitleIdFromCacheKey(key: string): string | null { +function extractProviderIdFromCacheKey(key: string): { provider: string; id: string } | null { const normalized = formatCacheKeyLabel(key); - return /^\d+$/.test(normalized) ? normalized : null; + const match = normalized.match(/^(netflix|primevideo):([A-Za-z0-9]+)$/); + if (!match) return null; + const provider = match[1]; + const id = match[2]; + if (!provider || !id) return null; + return { provider, id }; } -function extractTitleIdFromUrl(url: string): string | null { - return url.match(/\/title\/(\d+)/)?.[1] ?? null; +function extractProviderIdFromUrl(url: string): { provider: string; id: string } | null { + const parsed = parseSupportedContentUrl(url); + if (!parsed) return null; + return { provider: parsed.provider, id: parsed.id }; } function parseRedisInfoValue(info: string, key: string): number | null { @@ -144,16 +152,25 @@ export class AdminService { min30Plus: 0, }; - const cacheTitleIds = Array.from( - new Set(cacheKeys.map((key) => extractTitleIdFromCacheKey(key)).filter((id): id is string => Boolean(id))) + const cacheProviderIds = Array.from( + new Set( + cacheKeys + .map((key) => extractProviderIdFromCacheKey(key)) + .filter((item): item is { provider: string; id: string } => Boolean(item)) + .map((item) => `${item.provider}:${item.id}`) + ) ); - const relatedContent = cacheTitleIds.length + const relatedContent = cacheProviderIds.length ? await prisma.content.findMany({ where: { - OR: cacheTitleIds.map((id) => ({ - url: { contains: `/title/${id}` }, - })), + OR: cacheProviderIds.map((providerId) => { + const [provider, id] = providerId.split(':'); + if (provider === 'primevideo') { + return { url: { contains: `/detail/${id}` } }; + } + return { url: { contains: `/title/${id}` } }; + }), }, select: { url: true, @@ -164,9 +181,12 @@ export class AdminService { const titleMap = new Map(); for (const item of relatedContent) { - const id = extractTitleIdFromUrl(item.url); - if (id && !titleMap.has(id)) { - titleMap.set(id, item.title); + const parsed = extractProviderIdFromUrl(item.url); + if (parsed) { + const key = `${parsed.provider}:${parsed.id}`; + if (!titleMap.has(key)) { + titleMap.set(key, item.title); + } } } @@ -196,7 +216,7 @@ export class AdminService { if (ttlValue > 0) { const formattedKey = formatCacheKeyLabel(cacheKeys[i] || ''); - const titleId = extractTitleIdFromCacheKey(cacheKeys[i] || ''); + const providerId = extractProviderIdFromCacheKey(cacheKeys[i] || ''); const rawValue = valueResults?.[i]?.[1]; let cachedAt: number | null = null; if (typeof rawValue === 'string') { @@ -209,7 +229,9 @@ export class AdminService { } expiringSoon.push({ key: formattedKey, - mediaTitle: titleId ? titleMap.get(titleId) ?? null : null, + mediaTitle: providerId + ? titleMap.get(`${providerId.provider}:${providerId.id}`) ?? null + : null, cachedAt, ttlSeconds: ttlValue, }); @@ -450,6 +472,23 @@ export class AdminService { details: `Stale content refresh queued for items older than ${days} days`, }; } + + static async purgeAllContent(): Promise { + const totalContent = await prisma.content.count(); + + await prisma.$transaction([ + prisma.content.deleteMany({}), + prisma.genre.deleteMany({}), + ]); + + await CacheService.clearAll(); + + return { + queued: totalContent, + skipped: 0, + details: 'Tum icerik verileri veritabanindan silindi', + }; + } } export default AdminService; diff --git a/src/services/cache.service.ts b/src/services/cache.service.ts index 31d1e16..0e0aee3 100644 --- a/src/services/cache.service.ts +++ b/src/services/cache.service.ts @@ -3,19 +3,35 @@ import { env } from '../config/env.js'; import { emitCacheEvent } from '../config/socket.js'; import logger from '../utils/logger.js'; import type { GetInfoResponse, CacheEntry } from '../types/index.js'; +import { parseSupportedContentUrl } from '../utils/contentUrl.js'; /** - * Cache key prefix for Netflix content + * Cache key prefix for scraped content */ -const CACHE_PREFIX = 'netflix:content:'; +const CACHE_PREFIX = 'content:'; /** * Generate cache key from URL */ function getCacheKey(url: string): string { - // Use URL hash or title ID as key - const titleId = url.match(/\/title\/(\d+)/)?.[1] || url; - return `${CACHE_PREFIX}${titleId}`; + const parsed = parseSupportedContentUrl(url); + + if (parsed) { + return `${CACHE_PREFIX}${parsed.provider}:${parsed.id}`; + } + + return `${CACHE_PREFIX}url:${encodeURIComponent(url)}`; +} + +function normalizeCachedResponse(url: string, data: GetInfoResponse): GetInfoResponse { + if (data.provider === 'netflix' || data.provider === 'primevideo') { + return data; + } + + return { + ...data, + provider: parseSupportedContentUrl(url)?.provider ?? 'netflix', + }; } /** @@ -39,7 +55,7 @@ export class CacheService { logger.debug('Cache hit', { url }); const entry: CacheEntry = JSON.parse(cached); - return entry.data; + return normalizeCachedResponse(url, entry.data); } catch (error) { logger.error('Cache get error', { url, @@ -57,7 +73,7 @@ export class CacheService { const ttl = env.REDIS_TTL_SECONDS; const entry: CacheEntry = { - data, + data: normalizeCachedResponse(url, data), cachedAt: Date.now(), ttl, }; @@ -137,7 +153,7 @@ export class CacheService { } /** - * Clear all Netflix content cache + * Clear all scraped content cache */ static async clearAll(): Promise { try { diff --git a/src/services/content.service.ts b/src/services/content.service.ts index 79155d1..95e65e4 100644 --- a/src/services/content.service.ts +++ b/src/services/content.service.ts @@ -1,6 +1,7 @@ import prisma from '../config/database.js'; import { emitContentEvent } from '../config/socket.js'; import type { ContentData, ScraperResult, GetInfoResponse } from '../types/index.js'; +import { parseSupportedContentUrl } from '../utils/contentUrl.js'; /** * Content Service for database operations @@ -242,7 +243,9 @@ export class ContentService { * Convert ContentData to API response format */ static toApiResponse(data: ContentData): GetInfoResponse { + const provider = parseSupportedContentUrl(data.url)?.provider ?? 'netflix'; return { + provider, title: data.title, year: data.year, plot: data.plot, diff --git a/src/services/job.service.ts b/src/services/job.service.ts index cff15a3..a70238a 100644 --- a/src/services/job.service.ts +++ b/src/services/job.service.ts @@ -1,4 +1,5 @@ import { v4 as uuidv4 } from 'uuid'; +import type { Prisma } from '@prisma/client'; import prisma from '../config/database.js'; import { CacheService } from './cache.service.js'; import { ContentService } from './content.service.js'; @@ -60,7 +61,7 @@ export class JobService { status?: JobStatus; progress?: number; step?: string; - result?: unknown; + result?: Prisma.InputJsonValue; error?: string; } ): Promise { @@ -73,7 +74,7 @@ export class JobService { } /** - * Process a scrape job (hybrid: cache -> db -> netflix) + * Process a scrape job (hybrid: cache -> db -> scraper) */ static async process(jobId: string): Promise { const job = await this.getById(jobId); @@ -117,11 +118,14 @@ export class JobService { return; } - // Update progress - await this.update(jobId, { progress: 50, step: 'scraping_netflix' }); - emitJobProgress(jobId, 50, 'processing', 'Scraping Netflix'); + const provider = ScraperService.detectProvider(job.url); + const providerLabel = provider === 'primevideo' ? 'Prime Video' : 'Netflix'; - // Step 3: Scrape from Netflix + // Update progress + await this.update(jobId, { progress: 50, step: `scraping_${provider ?? 'source'}` }); + emitJobProgress(jobId, 50, 'processing', `Scraping ${providerLabel}`); + + // Step 3: Scrape from source URL const scraperResult = await ScraperService.scrape(job.url); // Update progress @@ -136,7 +140,7 @@ export class JobService { await CacheService.set(job.url, responseData); // Complete the job - await this.completeJob(jobId, responseData, 'netflix'); + await this.completeJob(jobId, responseData, 'scraper'); } catch (error) { const apiError: ApiError = { code: 'SCRAPE_ERROR', @@ -168,7 +172,7 @@ export class JobService { status: 'completed', progress: 100, step: 'completed', - result: data, + result: data as unknown as Prisma.InputJsonValue, }); emitJobCompleted(jobId, data, source); @@ -201,7 +205,7 @@ export class JobService { return { data: responseData, source: 'database' }; } - // Step 3: Scrape from Netflix + // Step 3: Scrape from source URL const scraperResult = await ScraperService.scrape(url); // Step 4: Save to database @@ -210,9 +214,9 @@ export class JobService { // Step 5: Cache the result await CacheService.set(url, responseData); - await MetricsService.incrementSource('netflix'); + await MetricsService.incrementSource('scraper'); - return { data: responseData, source: 'netflix' }; + return { data: responseData, source: 'scraper' }; } /** diff --git a/src/services/metrics.service.ts b/src/services/metrics.service.ts index aad8635..e9e5cb9 100644 --- a/src/services/metrics.service.ts +++ b/src/services/metrics.service.ts @@ -59,7 +59,7 @@ export class MetricsService { bySource: { cache: number; database: number; - netflix: number; + scraper: number; }; }> { const [counters, sources] = await Promise.all([ @@ -73,7 +73,7 @@ export class MetricsService { bySource: { cache: toInt(sources.cache), database: toInt(sources.database), - netflix: toInt(sources.netflix), + scraper: toInt(sources.scraper), }, }; } diff --git a/src/services/scraper.service.ts b/src/services/scraper.service.ts index 0a7216d..c4a1099 100644 --- a/src/services/scraper.service.ts +++ b/src/services/scraper.service.ts @@ -1,6 +1,10 @@ import * as cheerio from 'cheerio'; import type { ScraperResult, ContentType } from '../types/index.js'; import logger from '../utils/logger.js'; +import { + parseSupportedContentUrl, + type SupportedProvider, +} from '../utils/contentUrl.js'; /** * Age rating patterns to detect and exclude from genres @@ -14,43 +18,55 @@ const AGE_RATING_PATTERN = /^[\u2066-\u2069\u202A-\u202E\u200E-\u200F]*(\d+\+|PG * Matches patterns like "3 Sezon", "2 Seasons", "1. Sezon", etc. */ const SEASON_PATTERN = /(\d+)\.?\s*(sezon|season|sezonlar|seasons)/i; +const EPISODE_PATTERN = /(\d+)\.?\s*(bölüm|bolum|bölümler|bolumler|episode|episodes)/i; +const EPISODE_TOKEN_PATTERN = /\b(bölüm|bolum|bölümler|bolumler|episode|episodes)\b/i; /** - * Netflix HTML Scraper Service + * Scraper Service (Netflix + Prime Video) * Uses Cheerio for parsing HTML content */ export class ScraperService { + /** + * Detect content provider from URL + */ + static detectProvider(url: string): SupportedProvider | null { + return parseSupportedContentUrl(url)?.provider ?? null; + } + + /** + * Validate if URL is a supported content URL + */ + static isSupportedUrl(url: string): boolean { + return Boolean(parseSupportedContentUrl(url)); + } + /** * Validate if URL is a valid Netflix URL */ static isValidNetflixUrl(url: string): boolean { - try { - const parsedUrl = new URL(url); - const validHosts = [ - 'www.netflix.com', - 'netflix.com', - 'www.netflix.com.tr', - 'netflix.com.tr', - ]; - return validHosts.includes(parsedUrl.hostname); - } catch { - return false; - } + return parseSupportedContentUrl(url)?.provider === 'netflix'; + } + + /** + * Validate if URL is a valid Prime Video URL + */ + static isValidPrimeVideoUrl(url: string): boolean { + return parseSupportedContentUrl(url)?.provider === 'primevideo'; } /** * Extract Netflix title ID from URL */ static extractTitleId(url: string): string | null { - const match = url.match(/\/title\/(\d+)/); - return match ? match[1] : null; + const parsed = parseSupportedContentUrl(url); + return parsed?.provider === 'netflix' ? parsed.id : null; } /** - * Fetch HTML content from Netflix URL + * Fetch HTML content from URL */ - private static async fetchHtml(url: string): Promise { - logger.info('Fetching Netflix page', { url }); + private static async fetchHtml(url: string, provider: SupportedProvider): Promise { + logger.info('Fetching content page', { provider, url }); const response = await fetch(url, { headers: { @@ -63,7 +79,7 @@ export class ScraperService { }); if (!response.ok) { - throw new Error(`Failed to fetch Netflix page: ${response.status}`); + throw new Error(`Failed to fetch ${provider} page: ${response.status}`); } return response.text(); @@ -73,22 +89,46 @@ export class ScraperService { * Parse HTML and extract content data */ static async scrape(url: string): Promise { - if (!this.isValidNetflixUrl(url)) { - throw new Error('Invalid Netflix URL'); + const parsed = parseSupportedContentUrl(url); + + if (!parsed) { + throw new Error( + 'Invalid content URL. Use Netflix /title/... or PrimeVideo /detail/...' + ); } - const html = await this.fetchHtml(url); + const html = await this.fetchHtml(url, parsed.provider); const $ = cheerio.load(html); - const title = this.extractTitle($); - const year = this.extractYear($); - const plot = this.extractPlot($); - const ageRating = this.extractAgeRating($); - const { genres, type, currentSeason } = this.extractGenresTypeAndSeason($); - const cast = this.extractCast($); - const backdropUrl = this.extractBackdrop($); + const result = + parsed.provider === 'netflix' + ? this.scrapeNetflix($) + : this.scrapePrimeVideo($, parsed.id); - const result: ScraperResult = { + logger.info('Scraping completed', { + provider: parsed.provider, + url, + title: result.title, + year: result.year, + ageRating: result.ageRating, + type: result.type, + genresCount: result.genres.length, + castCount: result.cast.length, + }); + + return result; + } + + private static scrapeNetflix($: cheerio.CheerioAPI): ScraperResult { + const title = this.extractNetflixTitle($); + const year = this.extractNetflixYear($); + const plot = this.extractNetflixPlot($); + const ageRating = this.extractNetflixAgeRating($); + const { genres, type, currentSeason } = this.extractNetflixGenresTypeAndSeason($); + const cast = this.extractNetflixCast($); + const backdropUrl = this.extractNetflixBackdrop($); + + return { title, year, plot, @@ -99,24 +139,71 @@ export class ScraperService { backdropUrl, currentSeason, }; + } - logger.info('Scraping completed', { - url, + private static scrapePrimeVideo($: cheerio.CheerioAPI, detailId: string): ScraperResult { + const title = this.extractPrimeTitle($, detailId); + const year = this.extractPrimeYear($); + const { type, currentSeason } = this.extractPrimeTypeAndSeason($); + const plot = this.extractPrimePlot($); + const cast = this.extractPrimeCast($); + const genres = this.extractPrimeGenres($); + const backdropUrl = this.extractPrimeBackdrop($); + const ageRating = this.extractPrimeAgeRating($); + + return { title, year, + plot, ageRating, type, - genresCount: genres.length, - castCount: cast.length, - }); + genres, + cast, + backdropUrl, + currentSeason, + }; + } - return result; + private static parseYear(text: string): number | null { + const yearMatch = text.match(/(19|20)\d{2}/); + if (!yearMatch) return null; + + const year = Number.parseInt(yearMatch[0], 10); + if (Number.isNaN(year)) return null; + if (year < 1900 || year > new Date().getFullYear() + 5) return null; + return year; + } + + private static cleanText(text: string): string { + return text.replace(/\s+/g, ' ').trim(); + } + + private static normalizePrimeTitleCandidate(text: string): string { + return this.cleanText(text) + .replace(/^[İIiı]zle:\s*/i, '') + .replace(/^canl[ıi]\s+izleyin:\s*/i, '') + .replace(/^watch\s+now:\s*/i, '') + .replace(/^prime\s+video:\s*/i, '') + .replace(/\s*(sezon|season)\s+\d+(?=\s*[-–—]\s*prime\s+video$)/i, '') + .replace(/\s*[-–—]\s*prime\s+video$/i, '') + .replace(/\s*\|\s*prime\s*video$/i, '') + .replace(/\s+(sezon|season)\s+\d+\s*$/i, '') + .trim(); + } + + private static uniqueTextList(items: string[]): string[] { + const unique = new Set(); + for (const item of items) { + const normalized = this.cleanText(item); + if (normalized) unique.add(normalized); + } + return Array.from(unique); } /** - * Extract title from HTML + * Netflix extractors */ - private static extractTitle($: cheerio.CheerioAPI): string { + private static extractNetflixTitle($: cheerio.CheerioAPI): string { let title = $('h2.default-ltr-iqcdef-cache-tnklrp').first().text().trim(); if (!title) { @@ -131,24 +218,12 @@ export class ScraperService { return title || 'Unknown Title'; } - /** - * Extract year from HTML (first li element) - */ - private static extractYear($: cheerio.CheerioAPI): number | null { + private static extractNetflixYear($: cheerio.CheerioAPI): number | null { const yearText = $('li.default-ltr-iqcdef-cache-6prs41').first().text().trim(); - const year = parseInt(yearText, 10); - - if (!isNaN(year) && year >= 1900 && year <= new Date().getFullYear() + 5) { - return year; - } - - return null; + return this.parseYear(yearText); } - /** - * Extract plot/description from HTML - */ - private static extractPlot($: cheerio.CheerioAPI): string | null { + private static extractNetflixPlot($: cheerio.CheerioAPI): string | null { const plot = $('span.default-ltr-iqcdef-cache-6ukeej').first().text().trim(); if (!plot) { @@ -159,91 +234,70 @@ export class ScraperService { return plot || null; } - /** - * Extract age rating from HTML (e.g., "18+", "16+") - * Searches all li elements (except first which is year) - */ - private static extractAgeRating($: cheerio.CheerioAPI): string | null { - let ageRating: string | null = null; - const foundTexts: string[] = []; - - $('li.default-ltr-iqcdef-cache-6prs41').each((index, element) => { - if (index === 0) return; // Skip year + private static extractNetflixAgeRating($: cheerio.CheerioAPI): string | null { + const items = $('li.default-ltr-iqcdef-cache-6prs41').toArray(); + for (let i = 1; i < items.length; i += 1) { + const element = items[i]; + if (!element) continue; const text = $(element).text().trim(); - foundTexts.push(text); - - // Clean Unicode characters first - const cleanText = text.replace(/[\u2066-\u2069\u202A-\u202E\u200E-\u200F]/g, '').trim(); + const cleanText = text + .replace(/[\u2066-\u2069\u202A-\u202E\u200E-\u200F]/g, '') + .trim(); if (cleanText && AGE_RATING_PATTERN.test(cleanText)) { - ageRating = cleanText; - return false; // Break loop + return cleanText; } - }); - - // Debug logging - if (!ageRating && foundTexts.length > 0) { - logger.debug('Age rating not found in elements', { - foundTexts, - pattern: AGE_RATING_PATTERN.source, - }); } - return ageRating; + return null; } - /** - * Extract genres from HTML (skip year, age rating, and season info) - * Also detects content type (movie/tvshow) based on season presence - * Extracts current season number from season text - */ - private static extractGenresTypeAndSeason($: cheerio.CheerioAPI): { genres: string[]; type: ContentType; currentSeason: number | null } { + private static extractNetflixGenresTypeAndSeason( + $: cheerio.CheerioAPI + ): { genres: string[]; type: ContentType; currentSeason: number | null } { const genres: string[] = []; let type: ContentType = 'movie'; let currentSeason: number | null = null; - const foundTexts: string[] = []; $('li.default-ltr-iqcdef-cache-6prs41').each((index, element) => { - if (index === 0) return; // Skip year + if (index === 0) return; const text = $(element).text().trim(); - const cleanText = text.replace(/[\u2066\u2069\u202A\u202B\u202C\u202D\u202E\u200E\u200F]/g, '').trim(); - foundTexts.push(cleanText); + const cleanText = text + .replace(/[\u2066\u2069\u202A\u202B\u202C\u202D\u202E\u200E\u200F]/g, '') + .trim(); - // Check for season pattern - indicates TV show const seasonMatch = cleanText.match(SEASON_PATTERN); if (cleanText && seasonMatch) { type = 'tvshow'; - // Extract season number from the text - const seasonNum = parseInt(seasonMatch[1], 10); - if (!isNaN(seasonNum)) { + const seasonValue = seasonMatch[1]; + const seasonNum = seasonValue ? Number.parseInt(seasonValue, 10) : Number.NaN; + if (Number.isFinite(seasonNum)) { currentSeason = seasonNum; } - return; // Skip adding to genres + return; + } + + const episodeMatch = cleanText.match(EPISODE_PATTERN); + const hasEpisodeToken = EPISODE_TOKEN_PATTERN.test(cleanText); + if (cleanText && (episodeMatch || hasEpisodeToken)) { + type = 'tvshow'; + if (currentSeason == null) { + currentSeason = 1; + } + return; } - // Skip age rating - only add actual genres if (cleanText && !AGE_RATING_PATTERN.test(cleanText)) { genres.push(cleanText); } }); - // Debug logging - logger.debug('extractGenresTypeAndSeason completed', { - foundTexts, - genres, - type, - currentSeason, - }); - return { genres, type, currentSeason }; } - /** - * Extract cast members from HTML - */ - private static extractCast($: cheerio.CheerioAPI): string[] { + private static extractNetflixCast($: cheerio.CheerioAPI): string[] { const castText = $('span.default-ltr-iqcdef-cache-m0886o').first().text().trim(); if (!castText) { @@ -256,10 +310,7 @@ export class ScraperService { .filter((name) => name.length > 0); } - /** - * Extract backdrop image URL from HTML - */ - private static extractBackdrop($: cheerio.CheerioAPI): string | null { + private static extractNetflixBackdrop($: cheerio.CheerioAPI): string | null { const backdropDiv = $('div.default-ltr-iqcdef-cache-1wezh7a').first(); const img = backdropDiv.find('img').first(); @@ -279,6 +330,176 @@ export class ScraperService { return null; } + + /** + * Prime Video extractors + */ + private static extractPrimeTitle($: cheerio.CheerioAPI, detailId: string): string { + const primaryTitle = this.normalizePrimeTitleCandidate( + $('h1[data-automation-id="title"]').first().text() || '' + ); + const detailLinkSelector = `a[href*="/detail/${detailId}"]`; + const imageLinkAriaTitle = this.normalizePrimeTitleCandidate( + $(`a[data-testid="image-link"][aria-label][href*="/detail/${detailId}"]`).first().attr('aria-label') || + $(`${detailLinkSelector}[aria-label]`).first().attr('aria-label') || + '' + ); + const imageLinkTextTitle = this.normalizePrimeTitleCandidate( + $(`a[data-testid="image-link"][href*="/detail/${detailId}"]`).first().text() || + $(detailLinkSelector).first().text() || + '' + ); + const metaOgTitle = this.normalizePrimeTitleCandidate( + $('meta[property="og:title"]').attr('content') || '' + ); + const metaNameTitle = this.normalizePrimeTitleCandidate( + $('meta[name="title"]').attr('content') || '' + ); + const pageTitle = this.normalizePrimeTitleCandidate( + $('title').first().text() || '' + ); + const canonicalHref = $('link[rel="canonical"]').attr('href') || ''; + let canonicalTitle = ''; + if (canonicalHref) { + try { + const canonicalUrl = new URL(canonicalHref, 'https://www.primevideo.com'); + const canonicalMatch = canonicalUrl.pathname.match(/\/detail\/([^/]+)\/([A-Za-z0-9]+)/i); + if (canonicalMatch && canonicalMatch[2] === detailId) { + canonicalTitle = this.normalizePrimeTitleCandidate( + decodeURIComponent(canonicalMatch[1] || '') + ); + } + } catch { + // best effort + } + } + + const title = + primaryTitle || + imageLinkAriaTitle || + imageLinkTextTitle || + metaOgTitle || + metaNameTitle || + pageTitle || + canonicalTitle; + + return title || 'Unknown Title'; + } + + private static extractPrimeYear($: cheerio.CheerioAPI): number | null { + const releaseBadge = $('span[data-automation-id="release-year-badge"]').first(); + return ( + this.parseYear(this.cleanText(releaseBadge.text())) || + this.parseYear(this.cleanText(releaseBadge.attr('aria-label') || '')) + ); + } + + private static extractPrimeTypeAndSeason( + $: cheerio.CheerioAPI + ): { type: ContentType; currentSeason: number | null } { + const seasonNodeText = this.cleanText( + $('div.dv-node-dp-seasons, [data-testid="dp-season-selector"]').text() + ); + const hasSeasonMarker = /\b(sezon|season)\b/i.test(seasonNodeText); + + const seasonLabel = + $('input#av-droplist-av-atf-season-selector').attr('aria-label') || + $('label[for="av-droplist-av-atf-season-selector"] ._36qUej').first().text() || + ''; + + const seasonMatch = this.cleanText(seasonLabel).match( + /(?:sezon|season)\s*(\d+)|(\d+)\.?\s*(?:sezon|season)/i + ); + + const currentSeasonRaw = seasonMatch ? seasonMatch[1] || seasonMatch[2] : null; + const currentSeason = currentSeasonRaw + ? Number.parseInt(currentSeasonRaw, 10) + : null; + + return { + type: hasSeasonMarker ? 'tvshow' : 'movie', + currentSeason: Number.isNaN(currentSeason as number) ? null : currentSeason, + }; + } + + private static extractPrimeCast($: cheerio.CheerioAPI): string[] { + const cast = $('dd.skJCpF a._1NNx6V') + .map((_, el) => $(el).text()) + .get(); + + return this.uniqueTextList(cast); + } + + private static extractPrimeGenres($: cheerio.CheerioAPI): string[] { + const genres = $( + 'div[data-testid="dv-node-dp-genres"] [data-testid="genre-texts"], div[data-testid="dv-node-dp-genres"] [data-testid="mood-texts"]' + ) + .map((_, el) => $(el).text()) + .get(); + + return this.uniqueTextList(genres); + } + + private static extractPrimePlot($: cheerio.CheerioAPI): string | null { + const plot = this.cleanText( + $('span.fbl-expandable-text span._1H6ABQ').first().text() || + $('meta[property="og:description"]').attr('content') || + '' + ); + + return plot || null; + } + + private static extractPrimeAgeRating($: cheerio.CheerioAPI): string | null { + const ageRating = this.cleanText( + $('span[data-automation-id="age-rating-badge"]').first().text() || + $('[data-testid="age-rating-badge"]').first().text() || + '' + ); + + return ageRating || null; + } + + private static extractPrimeBackdrop($: cheerio.CheerioAPI): string | null { + const webpSrcSet = + $('div.Kc5eKF picture source[type="image/webp"]').first().attr('srcset') || + $('picture source[type="image/webp"]').first().attr('srcset') || + ''; + + if (webpSrcSet) { + const sources = webpSrcSet + .split(',') + .map((item) => item.trim()) + .map((item) => { + const match = item.match(/^(\S+)\s+(\d+)w$/); + if (!match) return null; + const url = match[1]; + const widthRaw = match[2]; + if (!url || !widthRaw) return null; + return { + url, + width: Number.parseInt(widthRaw, 10), + }; + }) + .filter((item): item is { url: string; width: number } => Boolean(item)); + + if (sources.length > 0) { + const exact1080 = sources.find((item) => item.width === 1080); + if (exact1080) return exact1080.url; + + const nextLargest = sources + .filter((item) => item.width > 1080) + .sort((a, b) => a.width - b.width)[0]; + if (nextLargest) return nextLargest.url; + + const largest = sources.sort((a, b) => b.width - a.width)[0]; + if (largest) return largest.url; + } + } + + const fallback = $('img[data-testid="base-image"]').first().attr('src'); + return fallback || null; + } } export default ScraperService; diff --git a/src/types/index.ts b/src/types/index.ts index 68ee890..5d3da80 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -57,6 +57,7 @@ export interface GetInfoRequest { } export interface GetInfoResponse { + provider: 'netflix' | 'primevideo'; title: string; year: number | null; plot: string | null; @@ -134,7 +135,7 @@ export interface AdminOverviewResponse { sourceCounts: { cache: number; database: number; - netflix: number; + scraper: number; }; }; } @@ -155,7 +156,7 @@ export interface CacheEntry { ttl: number; } -export type DataSource = 'cache' | 'database' | 'netflix'; +export type DataSource = 'cache' | 'database' | 'scraper'; // ============================================ // Socket Event Types diff --git a/src/utils/contentUrl.ts b/src/utils/contentUrl.ts new file mode 100644 index 0000000..86693d6 --- /dev/null +++ b/src/utils/contentUrl.ts @@ -0,0 +1,49 @@ +export type SupportedProvider = 'netflix' | 'primevideo'; + +const NETFLIX_HOSTS = new Set([ + 'www.netflix.com', + 'netflix.com', + 'www.netflix.com.tr', + 'netflix.com.tr', +]); + +const PRIME_HOSTS = new Set([ + 'www.primevideo.com', + 'primevideo.com', +]); + +export interface ParsedContentUrl { + provider: SupportedProvider; + id: string; +} + +export function parseSupportedContentUrl(rawUrl: string): ParsedContentUrl | null { + try { + const parsedUrl = new URL(rawUrl); + const hostname = parsedUrl.hostname.toLowerCase(); + + if (NETFLIX_HOSTS.has(hostname)) { + const titleIdMatch = parsedUrl.pathname.match(/\/title\/(\d+)/); + if (!titleIdMatch) return null; + const id = titleIdMatch[1]; + if (!id) return null; + return { provider: 'netflix', id }; + } + + if (PRIME_HOSTS.has(hostname)) { + const detailIdMatch = parsedUrl.pathname.match(/\/detail\/([A-Za-z0-9]+)/); + if (!detailIdMatch) return null; + const id = detailIdMatch[1]; + if (!id) return null; + return { provider: 'primevideo', id }; + } + + return null; + } catch { + return null; + } +} + +export function isSupportedContentUrl(rawUrl: string): boolean { + return Boolean(parseSupportedContentUrl(rawUrl)); +}