first commit
This commit is contained in:
446
doc/API.md
Normal file
446
doc/API.md
Normal file
@@ -0,0 +1,446 @@
|
||||
# MetaScraper API Reference
|
||||
|
||||
## 🎯 Main API
|
||||
|
||||
### `scraperNetflix(inputUrl, options?)`
|
||||
|
||||
Netflix metadata extraction function with automatic fallback and Turkish localization.
|
||||
|
||||
#### Parameters
|
||||
|
||||
| Parameter | Type | Required | Default | Description |
|
||||
|-----------|------|----------|---------|-------------|
|
||||
| `inputUrl` | `string` | ✅ | - | Netflix title URL (any format) |
|
||||
| `options` | `object` | ❌ | `{}` | Configuration options |
|
||||
|
||||
#### Options
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `headless` | `boolean` | `true` | Enable Playwright fallback for missing data |
|
||||
| `timeoutMs` | `number` | `15000` | Request timeout in milliseconds |
|
||||
| `userAgent` | `string` | Chrome 118 User-Agent | Custom User-Agent string |
|
||||
|
||||
#### Returns
|
||||
|
||||
```typescript
|
||||
Promise<{
|
||||
url: string; // Normalized Netflix URL
|
||||
id: string; // Netflix title ID
|
||||
name: string; // Clean title (Turkish UI removed)
|
||||
year: string \| number \| undefined; // Release year
|
||||
seasons: string \| null; // Season info for TV series
|
||||
}>
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
**Basic Usage**
|
||||
```javascript
|
||||
import { scraperNetflix } from 'metascraper';
|
||||
|
||||
const result = await scraperNetflix('https://www.netflix.com/tr/title/82123114');
|
||||
console.log(result);
|
||||
// {
|
||||
// "url": "https://www.netflix.com/title/82123114",
|
||||
// "id": "82123114",
|
||||
// "name": "ONE SHOT with Ed Sheeran",
|
||||
// "year": "2025",
|
||||
// "seasons": null
|
||||
// }
|
||||
```
|
||||
|
||||
**Advanced Configuration**
|
||||
```javascript
|
||||
import { scraperNetflix } from 'metascraper';
|
||||
|
||||
const result = await scraperNetflix(
|
||||
'https://www.netflix.com/title/80189685',
|
||||
{
|
||||
headless: false, // Disable browser fallback
|
||||
timeoutMs: 30000, // 30 second timeout
|
||||
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
}
|
||||
);
|
||||
```
|
||||
|
||||
**Error Handling**
|
||||
```javascript
|
||||
import { scraperNetflix } from 'metascraper';
|
||||
|
||||
try {
|
||||
const result = await scraperNetflix('https://www.netflix.com/title/80189685');
|
||||
console.log('Success:', result);
|
||||
} catch (error) {
|
||||
console.error('Scraping failed:', error.message);
|
||||
// Turkish error messages for Turkish users
|
||||
// "Netflix scraping başarısız: Netflix URL'i gereklidir."
|
||||
}
|
||||
```
|
||||
|
||||
## 🧩 Internal APIs
|
||||
|
||||
### `parseNetflixHtml(html)` - Parser API
|
||||
|
||||
Parse Netflix HTML content to extract metadata without network requests.
|
||||
|
||||
#### Parameters
|
||||
|
||||
| Parameter | Type | Required | Description |
|
||||
|-----------|------|----------|-------------|
|
||||
| `html` | `string` | ✅ | Raw HTML content from Netflix page |
|
||||
|
||||
#### Returns
|
||||
|
||||
```typescript
|
||||
{
|
||||
name?: string; // Clean title
|
||||
year?: string \| number; // Release year
|
||||
seasons?: string \| null; // Season information
|
||||
}
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
```javascript
|
||||
import { parseNetflixHtml } from 'metascraper/parser';
|
||||
|
||||
// With cached HTML
|
||||
const fs = await import('node:fs');
|
||||
const html = fs.readFileSync('netflix-page.html', 'utf8');
|
||||
const metadata = parseNetflixHtml(html);
|
||||
|
||||
console.log(metadata);
|
||||
// {
|
||||
// "name": "The Witcher",
|
||||
// "year": "2025",
|
||||
// "seasons": "4 Sezon"
|
||||
// }
|
||||
```
|
||||
|
||||
### `fetchPageContentWithPlaywright(url, options)` - Headless API
|
||||
|
||||
Fetch Netflix page content using Playwright browser automation.
|
||||
|
||||
#### Parameters
|
||||
|
||||
| Parameter | Type | Required | Description |
|
||||
|-----------|------|----------|-------------|
|
||||
| `url` | `string` | ✅ | Complete URL to fetch |
|
||||
| `options` | `object` | ✅ | Browser configuration |
|
||||
|
||||
#### Options
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `timeoutMs` | `number` | `15000` | Page load timeout |
|
||||
| `userAgent` | `string` | Chrome 118 | Browser User-Agent |
|
||||
| `headless` | `boolean` | `true` | Run browser in headless mode |
|
||||
|
||||
#### Returns
|
||||
|
||||
```typescript
|
||||
Promise<string> // HTML content of the page
|
||||
```
|
||||
|
||||
#### Examples
|
||||
|
||||
```javascript
|
||||
import { fetchPageContentWithPlaywright } from 'metascraper/headless';
|
||||
|
||||
try {
|
||||
const html = await fetchPageContentWithPlaywright(
|
||||
'https://www.netflix.com/title/80189685',
|
||||
{
|
||||
timeoutMs: 30000,
|
||||
headless: false // Show browser (useful for debugging)
|
||||
}
|
||||
);
|
||||
|
||||
// Process the HTML with parser
|
||||
const metadata = parseNetflixHtml(html);
|
||||
console.log(metadata);
|
||||
} catch (error) {
|
||||
console.error('Browser automation failed:', error.message);
|
||||
}
|
||||
```
|
||||
|
||||
## 🔧 URL Processing
|
||||
|
||||
### Supported URL Formats
|
||||
|
||||
The `scraperNetflix` function automatically normalizes various Netflix URL formats:
|
||||
|
||||
| Input Format | Normalized Output | Notes |
|
||||
|--------------|-------------------|-------|
|
||||
| `https://www.netflix.com/title/80189685` | `https://www.netflix.com/title/80189685` | Standard format |
|
||||
| `https://www.netflix.com/tr/title/80189685` | `https://www.netflix.com/title/80189685` | Turkish locale |
|
||||
| `https://www.netflix.com/tr/title/80189685?s=i&trkid=264356104&vlang=tr` | `https://www.netflix.com/title/80189685` | With parameters |
|
||||
| `https://www.netflix.com/title/80189685?trackId=12345` | `https://www.netflix.com/title/80189685` | With tracking |
|
||||
|
||||
### URL Validation
|
||||
|
||||
The function validates URLs with these rules:
|
||||
|
||||
1. **Format**: Must be a valid URL
|
||||
2. **Domain**: Must contain `netflix.com`
|
||||
3. **Path**: Must contain `title/` followed by numeric ID
|
||||
4. **ID Extraction**: Uses regex to extract title ID
|
||||
|
||||
```javascript
|
||||
// These will work:
|
||||
'https://www.netflix.com/title/80189685'
|
||||
'https://www.netflix.com/tr/title/80189685?s=i&vlang=tr'
|
||||
|
||||
// These will fail:
|
||||
'https://google.com' // Wrong domain
|
||||
'https://www.netflix.com/browse' // No title ID
|
||||
'not-a-url' // Invalid format
|
||||
'https://www.netflix.com/title/abc' // Non-numeric ID
|
||||
```
|
||||
|
||||
## 🌍 Localization Features
|
||||
|
||||
### Turkish UI Text Removal
|
||||
|
||||
The parser automatically removes Turkish Netflix UI text from titles:
|
||||
|
||||
| Original Title | Cleaned Title | Removed Pattern |
|
||||
|----------------|---------------|-----------------|
|
||||
| "The Witcher izlemenizi bekliyor" | "The Witcher | `izlemenizi bekliyor` |
|
||||
| "Stranger Things izleyin" | "Stranger Things" | `izleyin` |
|
||||
| "Sezon 4 devam et" | "Sezon 4" | `devam et` |
|
||||
| "Dark başla" | "Dark" | `başla` |
|
||||
| "The Crown izlemeye devam" | "The Crown" | `izlemeye devam` |
|
||||
|
||||
### Supported Turkish Patterns
|
||||
|
||||
```javascript
|
||||
const TURKISH_UI_PATTERNS = [
|
||||
/\s+izlemenizi bekliyor$/i, // "waiting for you to watch"
|
||||
/\s+izleyin$/i, // "watch"
|
||||
/\s+devam et$/i, // "continue"
|
||||
/\s+başla$/i, // "start"
|
||||
/\s+izlemeye devam$/i, // "continue watching"
|
||||
/\s+Sezon\s+\d+.*izlemeye devam$/i, // "Sezon X izlemeye devam"
|
||||
/\s+Sezon\s+\d+.*başla$/i, // "Sezon X başla"
|
||||
];
|
||||
```
|
||||
|
||||
### English UI Pattern Removal
|
||||
|
||||
Also removes universal English UI text:
|
||||
|
||||
| Original Title | Cleaned Title | Removed Pattern |
|
||||
|----------------|---------------|-----------------|
|
||||
| "Watch Now The Witcher" | "The Witcher" | `Watch Now` |
|
||||
| "The Witcher Continue Watching" | "The Witcher" | `Continue Watching` |
|
||||
| "Season 4 Play" | "Season 4" | `Season X Play` |
|
||||
|
||||
## 📊 Data Extraction Patterns
|
||||
|
||||
### JSON-LD Processing
|
||||
|
||||
The parser extracts metadata from JSON-LD structured data:
|
||||
|
||||
```javascript
|
||||
// Looks for these JSON-LD fields:
|
||||
const YEAR_FIELDS = [
|
||||
'datePublished', 'startDate', 'uploadDate',
|
||||
'copyrightYear', 'releasedEvent', 'releaseYear', 'dateCreated'
|
||||
];
|
||||
|
||||
const SEASON_TYPES = ['TVSeries', 'TVShow', 'Series'];
|
||||
```
|
||||
|
||||
### Meta Tag Fallbacks
|
||||
|
||||
If JSON-LD is unavailable, falls back to HTML meta tags:
|
||||
|
||||
```html
|
||||
<meta property="og:title" content="The Witcher izlemenizi bekliyor | Netflix">
|
||||
<meta name="title" content="The Witcher | Netflix">
|
||||
<title>The Witcher izlemenizi bekliyor | Netflix</title>
|
||||
```
|
||||
|
||||
### Season Detection
|
||||
|
||||
For TV series, extracts season information:
|
||||
|
||||
```javascript
|
||||
// Example JSON-LD for TV series:
|
||||
{
|
||||
"@type": "TVSeries",
|
||||
"name": "The Witcher",
|
||||
"numberOfSeasons": 4,
|
||||
"datePublished": "2025"
|
||||
}
|
||||
|
||||
// Result: "4 Sezon"
|
||||
```
|
||||
|
||||
## ⚡ Performance Characteristics
|
||||
|
||||
### Response Times by Mode
|
||||
|
||||
| Mode | Typical Response | Success Rate | Resource Usage |
|
||||
|------|------------------|--------------|----------------|
|
||||
| Static Only | 200-500ms | ~85% | Very Low |
|
||||
| Static + Headless Fallback | 2-5s | ~95% | Medium |
|
||||
| Headless Only | 2-3s | ~90% | High |
|
||||
|
||||
### Resource Requirements
|
||||
|
||||
**Static Mode:**
|
||||
- CPU: Low (< 5%)
|
||||
- Memory: < 20MB
|
||||
- Network: 1 HTTP request
|
||||
|
||||
**Headless Mode:**
|
||||
- CPU: Medium (10-20%)
|
||||
- Memory: 100-200MB
|
||||
- Network: Multiple requests
|
||||
- Browser: Chromium instance
|
||||
|
||||
## 🚨 Error Types & Handling
|
||||
|
||||
### Common Error Scenarios
|
||||
|
||||
#### 1. Invalid URL
|
||||
```javascript
|
||||
await scraperNetflix('invalid-url');
|
||||
// Throws: "Geçersiz URL sağlandı."
|
||||
```
|
||||
|
||||
#### 2. Non-Netflix URL
|
||||
```javascript
|
||||
await scraperNetflix('https://google.com');
|
||||
// Throws: "URL netflix.com adresini göstermelidir."
|
||||
```
|
||||
|
||||
#### 3. Missing Title ID
|
||||
```javascript
|
||||
await scraperNetflix('https://www.netflix.com/browse');
|
||||
// Throws: "URL'de Netflix başlık ID'si bulunamadı."
|
||||
```
|
||||
|
||||
#### 4. Network Timeout
|
||||
```javascript
|
||||
await scraperNetflix('https://www.netflix.com/title/80189685', { timeoutMs: 1 });
|
||||
// Throws: "Request timed out while reaching Netflix."
|
||||
```
|
||||
|
||||
#### 5. 404 Not Found
|
||||
```javascript
|
||||
await scraperNetflix('https://www.netflix.com/title/99999999');
|
||||
// Throws: "Netflix title not found (404)."
|
||||
```
|
||||
|
||||
#### 6. Playwright Not Available
|
||||
```javascript
|
||||
// When headless mode needed but Playwright not installed
|
||||
// Throws: "Playwright is not installed. Install the optional dependency..."
|
||||
```
|
||||
|
||||
#### 7. Parsing Failed
|
||||
```javascript
|
||||
// When HTML cannot be parsed for metadata
|
||||
// Throws: "Netflix sayfa meta verisi parse edilemedi."
|
||||
```
|
||||
|
||||
### Error Object Structure
|
||||
|
||||
```javascript
|
||||
{
|
||||
name: "Error",
|
||||
message: "Netflix scraping başarısız: Geçersiz URL sağlandı.",
|
||||
stack: "Error: Netflix scraping başarısız: Geçersiz URL sağlandı.\n at scraperNetflix...",
|
||||
// Additional context for debugging
|
||||
}
|
||||
```
|
||||
|
||||
## 🔧 Advanced Usage Patterns
|
||||
|
||||
### Batch Processing
|
||||
|
||||
```javascript
|
||||
import { scraperNetflix } from 'metascraper';
|
||||
|
||||
const urls = [
|
||||
'https://www.netflix.com/title/80189685',
|
||||
'https://www.netflix.com/title/82123114',
|
||||
'https://www.netflix.com/title/70177057'
|
||||
];
|
||||
|
||||
const results = await Promise.allSettled(
|
||||
urls.map(url => scraperNetflix(url))
|
||||
);
|
||||
|
||||
results.forEach((result, index) => {
|
||||
if (result.status === 'fulfilled') {
|
||||
console.log(`✅ ${urls[index]}:`, result.value.name);
|
||||
} else {
|
||||
console.log(`❌ ${urls[index]}:`, result.reason.message);
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
### Custom User-Agent Rotation
|
||||
|
||||
```javascript
|
||||
const userAgents = [
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
|
||||
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
|
||||
];
|
||||
|
||||
const getRandomUA = () => userAgents[Math.floor(Math.random() * userAgents.length)];
|
||||
|
||||
const result = await scraperNetflix(url, {
|
||||
userAgent: getRandomUA()
|
||||
});
|
||||
```
|
||||
|
||||
### Retry Logic Implementation
|
||||
|
||||
```javascript
|
||||
async function scrapeWithRetry(url, maxRetries = 3) {
|
||||
for (let attempt = 1; attempt <= maxRetries; attempt++) {
|
||||
try {
|
||||
return await scraperNetflix(url);
|
||||
} catch (error) {
|
||||
if (attempt === maxRetries) throw error;
|
||||
|
||||
console.log(`Attempt ${attempt} failed, retrying in ${attempt * 1000}ms...`);
|
||||
await new Promise(resolve => setTimeout(resolve, attempt * 1000));
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Caching Integration
|
||||
|
||||
```javascript
|
||||
const cache = new Map();
|
||||
|
||||
async function scrapeWithCache(url) {
|
||||
const cacheKey = `netflix:${url}`;
|
||||
|
||||
if (cache.has(cacheKey)) {
|
||||
console.log('Cache hit for:', url);
|
||||
return cache.get(cacheKey);
|
||||
}
|
||||
|
||||
const result = await scraperNetflix(url);
|
||||
cache.set(cacheKey, result);
|
||||
|
||||
// Optional: Cache expiration
|
||||
setTimeout(() => cache.delete(cacheKey), 30 * 60 * 1000); // 30 minutes
|
||||
|
||||
return result;
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
*API documentation last updated: 2025-11-23*
|
||||
Reference in New Issue
Block a user