614 lines
14 KiB
Markdown
614 lines
14 KiB
Markdown
# MetaScraper Development Guide
|
|
|
|
## 🚀 Getting Started
|
|
|
|
### Prerequisites
|
|
|
|
- **Node.js**: 18+ (tested on 18.18.2 and 24.x)
|
|
- **npm**: 8+ (comes with Node.js)
|
|
- **Git**: For version control
|
|
|
|
### Development Setup
|
|
|
|
```bash
|
|
# Clone the repository
|
|
git clone <repository-url>
|
|
cd metascraper
|
|
|
|
# Install dependencies
|
|
npm install
|
|
|
|
# Run tests to verify setup
|
|
npm test
|
|
|
|
# Run demo to test functionality
|
|
npm run demo
|
|
```
|
|
|
|
### IDE Configuration
|
|
|
|
#### VS Code Setup
|
|
|
|
Create `.vscode/settings.json`:
|
|
|
|
```json
|
|
{
|
|
"editor.formatOnSave": true,
|
|
"editor.defaultFormatter": "esbenp.prettier-vscode",
|
|
"files.associations": {
|
|
"*.js": "javascript"
|
|
},
|
|
"typescript.preferences.importModuleSpecifier": "relative"
|
|
}
|
|
```
|
|
|
|
#### Recommended Extensions
|
|
|
|
- **ESLint**: `dbaeumer.vscode-eslint`
|
|
- **Prettier**: `esbenp.prettier-vscode`
|
|
- **Vitest**: `ZixuanChen.vitest-explorer`
|
|
|
|
## 📁 Project Structure
|
|
|
|
```
|
|
metascraper/
|
|
├── src/ # Source code
|
|
│ ├── index.js # Main scraperNetflix function
|
|
│ ├── parser.js # HTML parsing and title cleaning
|
|
│ ├── headless.js # Playwright browser automation
|
|
│ └── polyfill.js # File/Blob polyfill for Node.js
|
|
├── tests/ # Test files
|
|
│ ├── scrape.test.js # Integration tests
|
|
│ └── fixtures/ # Test data and HTML samples
|
|
├── doc/ # Documentation (this directory)
|
|
│ ├── README.md # Documentation index
|
|
│ ├── ARCHITECTURE.md # System design and patterns
|
|
│ ├── API.md # Complete API reference
|
|
│ ├── DEVELOPMENT.md # Development guide (this file)
|
|
│ ├── TESTING.md # Testing patterns and procedures
|
|
│ ├── TROUBLESHOOTING.md # Common issues and solutions
|
|
│ ├── FAQ.md # Frequently asked questions
|
|
│ └── DEPLOYMENT.md # Packaging and publishing
|
|
├── local-demo.js # Demo application for testing
|
|
├── package.json # Project configuration
|
|
├── vitest.config.js # Test configuration (if present)
|
|
└── README.md # Project README
|
|
```
|
|
|
|
## 🧱 Code Style & Conventions
|
|
|
|
### JavaScript Standards
|
|
|
|
```javascript
|
|
// Use ES6+ modules
|
|
import { scraperNetflix } from './index.js';
|
|
import { parseNetflixHtml } from './parser.js';
|
|
|
|
// Prefer async/await over Promise chains
|
|
async function scrapeNetflixTitle(url) {
|
|
try {
|
|
const result = await scraperNetflix(url);
|
|
return result;
|
|
} catch (error) {
|
|
console.error('Scraping failed:', error.message);
|
|
throw error;
|
|
}
|
|
}
|
|
|
|
// Use template literals for strings
|
|
const message = `Scraping ${url} completed in ${duration}ms`;
|
|
|
|
// Destructure objects and arrays
|
|
const { url, id, name, year } = result;
|
|
const [first, second] = urls;
|
|
```
|
|
|
|
### Naming Conventions
|
|
|
|
```javascript
|
|
// Functions: camelCase with descriptive names
|
|
function normalizeNetflixUrl(inputUrl) { }
|
|
function extractYearFromJsonLd(jsonData) { }
|
|
|
|
// Constants: UPPER_SNAKE_CASE
|
|
const DEFAULT_TIMEOUT_MS = 15000;
|
|
const TURKISH_UI_PATTERNS = [/pattern/, /another/];
|
|
|
|
// Variables: camelCase, meaningful names
|
|
const normalizedUrl = normalizeNetflixUrl(inputUrl);
|
|
const seasonCount = extractNumberOfSeasons(metadata);
|
|
|
|
// Files: kebab-case for utilities, camelCase for modules
|
|
// parser.js, headless.js, polyfill.js
|
|
// netflix-url-utils.js, html-cleaner.js
|
|
```
|
|
|
|
### Error Handling Patterns
|
|
|
|
```javascript
|
|
// Always include context in error messages
|
|
function validateNetflixUrl(url) {
|
|
if (!url) {
|
|
throw new Error('Netflix URL\'i gereklidir.');
|
|
}
|
|
|
|
if (!url.includes('netflix')) {
|
|
throw new Error('URL netflix.com adresini göstermelidir.');
|
|
}
|
|
}
|
|
|
|
// Use Turkish error messages for Turkish users
|
|
function logError(message, error) {
|
|
console.error(`❌ ${message}: ${error.message}`);
|
|
}
|
|
|
|
// Chain error context
|
|
async function fetchWithRetry(url, attempts = 3) {
|
|
try {
|
|
return await fetch(url);
|
|
} catch (error) {
|
|
if (attempts === 1) {
|
|
throw new Error(`Failed to fetch ${url}: ${error.message}`, { cause: error });
|
|
}
|
|
await delay(1000);
|
|
return fetchWithRetry(url, attempts - 1);
|
|
}
|
|
}
|
|
```
|
|
|
|
### JSDoc Documentation
|
|
|
|
```javascript
|
|
/**
|
|
* Netflix meta verilerini scrape eder.
|
|
* @param {string} inputUrl Netflix URL'si
|
|
* @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
|
|
* @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
|
|
* @throws {Error} URL invalid, network error, or parsing failure
|
|
*/
|
|
export async function scraperNetflix(inputUrl, options = {}) {
|
|
// Implementation
|
|
}
|
|
|
|
/**
|
|
* Clean titles by removing Netflix suffixes and UI text.
|
|
* Handles patterns like "The Witcher izlemenizi bekliyor | Netflix" → "The Witcher"
|
|
* @param {string | undefined | null} title - Raw title from Netflix
|
|
* @returns {string | undefined} Cleaned title
|
|
*/
|
|
function cleanTitle(title) {
|
|
if (!title) return undefined;
|
|
// Implementation
|
|
}
|
|
```
|
|
|
|
## 🧪 Testing Standards
|
|
|
|
### Test Structure
|
|
|
|
```javascript
|
|
import { describe, it, expect, beforeAll, beforeEach, afterEach } from 'vitest';
|
|
import { scraperNetflix, parseNetflixHtml } from '../src/index.js';
|
|
|
|
describe('scraperNetflix', () => {
|
|
// Setup before tests
|
|
beforeAll(async () => {
|
|
// One-time setup
|
|
});
|
|
|
|
beforeEach(() => {
|
|
// Reset before each test
|
|
});
|
|
|
|
afterEach(() => {
|
|
// Cleanup after each test
|
|
});
|
|
|
|
describe('URL normalization', () => {
|
|
it('normalizes Turkish Netflix URLs', () => {
|
|
const input = 'https://www.netflix.com/tr/title/80189685?s=i&vlang=tr';
|
|
const expected = 'https://www.netflix.com/title/80189685';
|
|
// Test implementation
|
|
});
|
|
|
|
it('throws error for invalid URLs', async () => {
|
|
await expect(scraperNetflix('invalid-url')).rejects.toThrow();
|
|
});
|
|
});
|
|
|
|
describe('metadata extraction', () => {
|
|
it('extracts clean title without Turkish UI text', async () => {
|
|
const result = await scraperNetflix(TEST_URL);
|
|
expect(result.name).toBeTruthy();
|
|
expect(result.name).not.toContain('izlemenizi bekliyor');
|
|
});
|
|
});
|
|
});
|
|
```
|
|
|
|
### Test Data Management
|
|
|
|
```javascript
|
|
// Use fixtures for consistent test data
|
|
import fs from 'node:fs';
|
|
|
|
function loadFixture(filename) {
|
|
return fs.readFileSync(`tests/fixtures/${filename}`, 'utf8');
|
|
}
|
|
|
|
const TEST_HTML = loadFixture('sample-title.html');
|
|
const TEST_URLS = JSON.parse(loadFixture('test-urls.json'));
|
|
|
|
// Mock external dependencies
|
|
vi.mock('playwright', () => ({
|
|
chromium: {
|
|
launch: vi.fn(() => ({
|
|
newContext: vi.fn(() => ({
|
|
newPage: vi.fn(() => ({
|
|
goto: vi.fn(),
|
|
content: vi.fn().mockResolvedValue(TEST_HTML),
|
|
waitForLoadState: vi.fn()
|
|
}))
|
|
})),
|
|
close: vi.fn()
|
|
}))
|
|
}
|
|
}));
|
|
```
|
|
|
|
### Performance Testing
|
|
|
|
```javascript
|
|
import { performance } from 'node:perf_hooks';
|
|
|
|
describe('performance', () => {
|
|
it('completes static scraping within 1 second', async () => {
|
|
const start = performance.now();
|
|
await scraperNetflix(TEST_URL, { headless: false });
|
|
const duration = performance.now() - start;
|
|
|
|
expect(duration).toBeLessThan(1000);
|
|
}, 10000);
|
|
|
|
it('handles concurrent requests efficiently', async () => {
|
|
const urls = Array(10).fill(TEST_URL);
|
|
const start = performance.now();
|
|
|
|
await Promise.all(urls.map(url => scraperNetflix(url, { headless: false })));
|
|
|
|
const duration = performance.now() - start;
|
|
expect(duration).toBeLessThan(5000); // Should be much faster than sequential
|
|
}, 30000);
|
|
});
|
|
```
|
|
|
|
## 🔄 Development Workflow
|
|
|
|
### 1. Feature Development
|
|
|
|
```bash
|
|
# Create feature branch
|
|
git checkout -b feature/turkish-title-cleaning
|
|
|
|
# Make changes
|
|
# Write tests
|
|
npm test
|
|
|
|
# Run demo to verify
|
|
npm run demo
|
|
|
|
# Commit changes
|
|
git add .
|
|
git commit -m "feat: add Turkish UI text pattern removal"
|
|
|
|
# Push and create PR
|
|
git push origin feature/turkish-title-cleaning
|
|
```
|
|
|
|
### 2. Bug Fix Process
|
|
|
|
```bash
|
|
# Create bugfix branch
|
|
git checkout -b fix/handle-missing-title-field
|
|
|
|
# Reproduce issue with test
|
|
npm test -- -t "missing title"
|
|
|
|
# Fix the issue
|
|
# Add failing test first
|
|
npm test
|
|
|
|
# Implement fix
|
|
# Make test pass
|
|
npm test
|
|
|
|
# Verify with demo
|
|
npm run demo
|
|
|
|
# Commit with conventional commit
|
|
git commit -m "fix: handle missing title field in JSON-LD parsing"
|
|
```
|
|
|
|
### 3. Code Review Checklist
|
|
|
|
#### Functionality
|
|
- [ ] Feature works as expected
|
|
- [ ] Edge cases are handled
|
|
- [ ] Error messages are helpful
|
|
- [ ] Turkish localization works
|
|
|
|
#### Code Quality
|
|
- [ ] Code follows style conventions
|
|
- [ ] Functions are single-responsibility
|
|
- [ ] Variables have meaningful names
|
|
- [ ] JSDoc documentation is complete
|
|
|
|
#### Testing
|
|
- [ ] Tests cover happy path
|
|
- [ ] Tests cover error cases
|
|
- [ ] Tests are maintainable
|
|
- [ ] Performance tests if applicable
|
|
|
|
#### Documentation
|
|
- [ ] API documentation updated
|
|
- [ ] README examples work
|
|
- [ ] Architecture document reflects changes
|
|
- [ ] Changelog updated
|
|
|
|
## 🛠️ Debugging Guidelines
|
|
|
|
### Common Debugging Techniques
|
|
|
|
#### 1. Enable Verbose Logging
|
|
|
|
```javascript
|
|
// Add debug logging to aid investigation
|
|
function debugNetflixScraping(url, options) {
|
|
console.log('🔍 Input URL:', url);
|
|
console.log('⚙️ Options:', options);
|
|
|
|
const normalized = normalizeNetflixUrl(url);
|
|
console.log('🔗 Normalized:', normalized);
|
|
|
|
// Continue with debugging
|
|
}
|
|
```
|
|
|
|
#### 2. Test with Real Data
|
|
|
|
```javascript
|
|
// Create debug script
|
|
import { scraperNetflix, parseNetflixHtml } from './src/index.js';
|
|
|
|
async function debugUrl(url) {
|
|
try {
|
|
console.log('🚀 Testing URL:', url);
|
|
|
|
// Test normalization
|
|
const normalized = normalizeNetflixUrl(url);
|
|
console.log('📝 Normalized:', normalized);
|
|
|
|
// Test scraping
|
|
const result = await scraperNetflix(url);
|
|
console.log('✅ Result:', JSON.stringify(result, null, 2));
|
|
|
|
} catch (error) {
|
|
console.error('❌ Error:', error.message);
|
|
console.error('Stack:', error.stack);
|
|
}
|
|
}
|
|
|
|
debugUrl('https://www.netflix.com/title/80189685');
|
|
```
|
|
|
|
#### 3. Browser Debugging
|
|
|
|
```javascript
|
|
// Test headless mode with visible browser
|
|
const result = await scraperNetflix(url, {
|
|
headless: false, // Show browser
|
|
timeoutMs: 60000 // Longer timeout for debugging
|
|
});
|
|
```
|
|
|
|
#### 4. HTML Inspection
|
|
|
|
```javascript
|
|
// Save HTML for manual inspection
|
|
import fs from 'node:fs';
|
|
|
|
async function debugHtml(url) {
|
|
const html = await fetchStaticHtml(url);
|
|
fs.writeFileSync('debug-page.html', html);
|
|
console.log('HTML saved to debug-page.html');
|
|
|
|
const parsed = parseNetflixHtml(html);
|
|
console.log('Parsed:', parsed);
|
|
}
|
|
```
|
|
|
|
### Debugging Netflix Changes
|
|
|
|
#### Netflix UI Pattern Changes
|
|
|
|
```javascript
|
|
// When Netflix changes their UI text patterns
|
|
function updateTurkishPatterns(newPatterns) {
|
|
const updatedPatterns = [
|
|
...TURKISH_UI_PATTERNS,
|
|
...newPatterns
|
|
];
|
|
|
|
console.log('🔄 Updated Turkish patterns:', newPatterns);
|
|
}
|
|
```
|
|
|
|
#### JSON-LD Structure Changes
|
|
|
|
```javascript
|
|
// Debug JSON-LD extraction
|
|
function debugJsonLd(html) {
|
|
const $ = load(html);
|
|
|
|
$('script[type="application/ld+json"]').each((i, el) => {
|
|
const raw = $(el).contents().text();
|
|
try {
|
|
const parsed = JSON.parse(raw);
|
|
console.log(`JSON-LD ${i}:`, JSON.stringify(parsed, null, 2));
|
|
} catch (error) {
|
|
console.log(`JSON-LD ${i} parse error:`, error.message);
|
|
}
|
|
});
|
|
}
|
|
```
|
|
|
|
## 📦 Dependency Management
|
|
|
|
### Adding Dependencies
|
|
|
|
```bash
|
|
# Production dependency
|
|
npm install cheerio@^1.0.0-rc.12
|
|
|
|
# Optional dependency
|
|
npm install playwright --save-optional
|
|
|
|
# Development dependency
|
|
npm install vitest --save-dev
|
|
|
|
# Update package.json exports
|
|
```
|
|
|
|
### Updating Dependencies
|
|
|
|
```bash
|
|
# Check for outdated packages
|
|
npm outdated
|
|
|
|
# Update specific package
|
|
npm update cheerio
|
|
|
|
# Update all packages
|
|
npm update
|
|
|
|
# Test after updates
|
|
npm test
|
|
```
|
|
|
|
### Polyfill Management
|
|
|
|
```javascript
|
|
// src/polyfill.js - Keep minimal and targeted
|
|
import { Blob } from 'node:buffer';
|
|
|
|
// Only polyfill what's needed for undici/fetch
|
|
class PolyfillFile extends Blob {
|
|
constructor(parts, name, options = {}) {
|
|
super(parts, options);
|
|
this.name = String(name);
|
|
this.lastModified = options.lastModified ?? Date.now();
|
|
}
|
|
}
|
|
|
|
globalThis.File = globalThis.File || PolyfillFile;
|
|
globalThis.Blob = globalThis.Blob || Blob;
|
|
```
|
|
|
|
## 🚀 Performance Optimization
|
|
|
|
### Profiling
|
|
|
|
```javascript
|
|
import { performance } from 'node:perf_hooks';
|
|
|
|
async function profileScraping(url) {
|
|
const start = performance.now();
|
|
|
|
// Profile URL normalization
|
|
const normStart = performance.now();
|
|
const normalized = normalizeNetflixUrl(url);
|
|
console.log('Normalization:', performance.now() - normStart, 'ms');
|
|
|
|
// Profile HTML fetch
|
|
const fetchStart = performance.now();
|
|
const html = await fetchStaticHtml(normalized);
|
|
console.log('HTML fetch:', performance.now() - fetchStart, 'ms');
|
|
|
|
// Profile parsing
|
|
const parseStart = performance.now();
|
|
const parsed = parseNetflixHtml(html);
|
|
console.log('Parsing:', performance.now() - parseStart, 'ms');
|
|
|
|
const total = performance.now() - start;
|
|
console.log('Total:', total, 'ms');
|
|
|
|
return parsed;
|
|
}
|
|
```
|
|
|
|
### Memory Optimization
|
|
|
|
```javascript
|
|
// Clean up browser resources properly
|
|
export async function fetchPageContentWithPlaywright(url, options) {
|
|
const browser = await chromium.launch({ headless: options.headless !== false });
|
|
|
|
try {
|
|
const context = await browser.newContext({ userAgent: options.userAgent });
|
|
const page = await context.newPage();
|
|
|
|
await page.goto(url, { timeout: options.timeoutMs });
|
|
return await page.content();
|
|
} finally {
|
|
// Always close browser to prevent memory leaks
|
|
await browser.close();
|
|
}
|
|
}
|
|
```
|
|
|
|
## 🤝 Contribution Process
|
|
|
|
### Before Contributing
|
|
|
|
1. **Read Documentation**: Familiarize yourself with the codebase
|
|
2. **Run Tests**: Ensure existing tests pass
|
|
3. **Understand Scope**: Keep changes focused and minimal
|
|
|
|
### Submitting Changes
|
|
|
|
1. **Fork Repository**: Create your own fork
|
|
2. **Create Branch**: Use descriptive branch names
|
|
3. **Write Tests**: Ensure new code is tested
|
|
4. **Update Docs**: Update relevant documentation
|
|
5. **Submit PR**: Include clear description and testing instructions
|
|
|
|
### Pull Request Template
|
|
|
|
```markdown
|
|
## Description
|
|
Brief description of changes made
|
|
|
|
## Type of Change
|
|
- [ ] Bug fix
|
|
- [ ] New feature
|
|
- [ ] Breaking change
|
|
- [ ] Documentation update
|
|
|
|
## Testing
|
|
- [ ] All tests pass
|
|
- [ ] New tests added
|
|
- [ ] Manual testing completed
|
|
|
|
## Checklist
|
|
- [ ] Code follows style guidelines
|
|
- [ ] Self-review completed
|
|
- [ ] Documentation updated
|
|
- [ ] Performance considered
|
|
|
|
## Additional Notes
|
|
Any additional context or considerations
|
|
```
|
|
|
|
---
|
|
|
|
*Development guide last updated: 2025-11-23*