14 KiB
14 KiB
MetaScraper Development Guide
🚀 Getting Started
Prerequisites
- Node.js: 18+ (tested on 18.18.2 and 24.x)
- npm: 8+ (comes with Node.js)
- Git: For version control
Development Setup
# Clone the repository
git clone <repository-url>
cd metascraper
# Install dependencies
npm install
# Run tests to verify setup
npm test
# Run demo to test functionality
npm run demo
IDE Configuration
VS Code Setup
Create .vscode/settings.json:
{
"editor.formatOnSave": true,
"editor.defaultFormatter": "esbenp.prettier-vscode",
"files.associations": {
"*.js": "javascript"
},
"typescript.preferences.importModuleSpecifier": "relative"
}
Recommended Extensions
- ESLint: dbaeumer.vscode-eslint
- Prettier: esbenp.prettier-vscode
- Vitest: ZixuanChen.vitest-explorer
📁 Project Structure
metascraper/
├── src/ # Source code
│ ├── index.js # Main scraperNetflix function
│ ├── parser.js # HTML parsing and title cleaning
│ ├── headless.js # Playwright browser automation
│ └── polyfill.js # File/Blob polyfill for Node.js
├── tests/ # Test files
│ ├── scrape.test.js # Integration tests
│ └── fixtures/ # Test data and HTML samples
├── doc/ # Documentation (this directory)
│ ├── README.md # Documentation index
│ ├── ARCHITECTURE.md # System design and patterns
│ ├── API.md # Complete API reference
│ ├── DEVELOPMENT.md # Development guide (this file)
│ ├── TESTING.md # Testing patterns and procedures
│ ├── TROUBLESHOOTING.md # Common issues and solutions
│ ├── FAQ.md # Frequently asked questions
│ └── DEPLOYMENT.md # Packaging and publishing
├── local-demo.js # Demo application for testing
├── package.json # Project configuration
├── vitest.config.js # Test configuration (if exists)
└── README.md # Project README
🧱 Code Style & Conventions
JavaScript Standards
// Use ES6+ modules
import { scraperNetflix } from './index.js';
import { parseNetflixHtml } from './parser.js';
// Prefer async/await over Promise chains
async function scrapeNetflixTitle(url) {
try {
const result = await scraperNetflix(url);
return result;
} catch (error) {
console.error('Scraping failed:', error.message);
throw error;
}
}
// Use template literals for strings
const message = `Scraping ${url} completed in ${duration}ms`;
// Destructure objects and arrays
const { url, id, name, year } = result;
const [first, second] = urls;
Naming Conventions
// Functions: camelCase with descriptive names
function normalizeNetflixUrl(inputUrl) { }
function extractYearFromJsonLd(jsonData) { }
// Constants: UPPER_SNAKE_CASE
const DEFAULT_TIMEOUT_MS = 15000;
const TURKISH_UI_PATTERNS = [/pattern/, /another/];
// Variables: camelCase, meaningful names
const normalizedUrl = normalizeNetflixUrl(inputUrl);
const seasonCount = extractNumberOfSeasons(metadata);
// Files: kebab-case for utilities, camelCase for modules
// parser.js, headless.js, polyfill.js
// netflix-url-utils.js, html-cleaner.js
Error Handling Patterns
// Always include context in error messages
function validateNetflixUrl(url) {
if (!url) {
throw new Error('Netflix URL\'i gereklidir.');
}
if (!url.includes('netflix')) {
throw new Error('URL netflix.com adresini göstermelidir.');
}
}
// Use Turkish error messages for Turkish users
function logError(message, error) {
console.error(`❌ ${message}: ${error.message}`);
}
// Chain error context
async function fetchWithRetry(url, attempts = 3) {
try {
return await fetch(url);
} catch (error) {
if (attempts === 1) {
throw new Error(`Failed to fetch ${url}: ${error.message}`);
}
await delay(1000);
return fetchWithRetry(url, attempts - 1);
}
}
JSDoc Documentation
/**
* Netflix meta verilerini scrape eder.
* @param {string} inputUrl Netflix URL'si
* @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
* @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
* @throws {Error} URL invalid, network error, or parsing failure
*/
export async function scraperNetflix(inputUrl, options = {}) {
// Implementation
}
/**
* Clean titles by removing Netflix suffixes and UI text.
* Handles patterns like "The Witcher izlemenizi bekliyor | Netflix" → "The Witcher"
* @param {string | undefined | null} title - Raw title from Netflix
* @returns {string | undefined} Cleaned title
*/
function cleanTitle(title) {
if (!title) return undefined;
// Implementation
}
🧪 Testing Standards
Test Structure
import { describe, it, expect, beforeAll, beforeEach, afterEach } from 'vitest';
import { scraperNetflix, parseNetflixHtml } from '../src/index.js';
describe('scraperNetflix', () => {
// Setup before tests
beforeAll(async () => {
// One-time setup
});
beforeEach(() => {
// Reset before each test
});
afterEach(() => {
// Cleanup after each test
});
describe('URL normalization', () => {
it('normalizes Turkish Netflix URLs', () => {
const input = 'https://www.netflix.com/tr/title/80189685?s=i&vlang=tr';
const expected = 'https://www.netflix.com/title/80189685';
// Test implementation
});
it('throws error for invalid URLs', async () => {
await expect(scraperNetflix('invalid-url')).rejects.toThrow();
});
});
describe('metadata extraction', () => {
it('extracts clean title without Turkish UI text', async () => {
const result = await scraperNetflix(TEST_URL);
expect(result.name).toBeTruthy();
expect(result.name).not.toContain('izlemenizi bekliyor');
});
});
});
Test Data Management
// Use fixtures for consistent test data
import fs from 'node:fs';
function loadFixture(filename) {
return fs.readFileSync(`tests/fixtures/${filename}`, 'utf8');
}
const TEST_HTML = loadFixture('sample-title.html');
const TEST_URLS = JSON.parse(loadFixture('test-urls.json'));
// Mock external dependencies
vi.mock('playwright', () => ({
chromium: {
launch: vi.fn(() => ({
newContext: vi.fn(() => ({
newPage: vi.fn(() => ({
goto: vi.fn(),
content: vi.fn().mockResolvedValue(TEST_HTML),
waitForLoadState: vi.fn()
}))
})),
close: vi.fn()
}))
}
}));
Performance Testing
import { performance } from 'node:perf_hooks';
describe('performance', () => {
it('completes static scraping within 1 second', async () => {
const start = performance.now();
await scraperNetflix(TEST_URL, { headless: false });
const duration = performance.now() - start;
expect(duration).toBeLessThan(1000);
}, 10000);
it('handles concurrent requests efficiently', async () => {
const urls = Array(10).fill(TEST_URL);
const start = performance.now();
await Promise.all(urls.map(url => scraperNetflix(url, { headless: false })));
const duration = performance.now() - start;
expect(duration).toBeLessThan(5000); // Should be much faster than sequential
}, 30000);
});
🔄 Development Workflow
1. Feature Development
# Create feature branch
git checkout -b feature/turkish-title-cleaning
# Make changes
# Write tests
npm test
# Run demo to verify
npm run demo
# Commit changes
git add .
git commit -m "feat: add Turkish UI text pattern removal"
# Push and create PR
git push origin feature/turkish-title-cleaning
2. Bug Fix Process
# Create bugfix branch
git checkout -b fix/handle-missing-title-field
# Reproduce issue with test
npm test -- -t "missing title"
# Fix the issue
# Add failing test first
npm test
# Implement fix
# Make test pass
npm test
# Verify with demo
npm run demo
# Commit with conventional commit
git commit -m "fix: handle missing title field in JSON-LD parsing"
3. Code Review Checklist
Functionality
- Feature works as expected
- Edge cases are handled
- Error messages are helpful
- Turkish localization works
Code Quality
- Code follows style conventions
- Functions are single-responsibility
- Variables have meaningful names
- JSDoc documentation is complete
Testing
- Tests cover happy path
- Tests cover error cases
- Tests are maintainable
- Performance tests if applicable
Documentation
- API documentation updated
- README examples work
- Architecture document reflects changes
- Changelog updated
🛠️ Debugging Guidelines
Common Debugging Techniques
1. Enable Verbose Logging
// Add debug logging to investigation
function debugNetflixScraping(url, options) {
console.log('🔍 Input URL:', url);
console.log('⚙️ Options:', options);
const normalized = normalizeNetflixUrl(url);
console.log('🔗 Normalized:', normalized);
// Continue with debugging
}
2. Test with Real Data
// Create debug script
import { scraperNetflix, parseNetflixHtml } from './src/index.js';
async function debugUrl(url) {
try {
console.log('🚀 Testing URL:', url);
// Test normalization
const normalized = normalizeNetflixUrl(url);
console.log('📝 Normalized:', normalized);
// Test scraping
const result = await scraperNetflix(url);
console.log('✅ Result:', JSON.stringify(result, null, 2));
} catch (error) {
console.error('❌ Error:', error.message);
console.error('Stack:', error.stack);
}
}
debugUrl('https://www.netflix.com/title/80189685');
3. Browser Debugging
// Test headless mode with visible browser
const result = await scraperNetflix(url, {
headless: false, // Show browser
timeoutMs: 60000 // Longer timeout for debugging
});
4. HTML Inspection
// Save HTML for manual inspection
import fs from 'node:fs';
async function debugHtml(url) {
const html = await fetchStaticHtml(url);
fs.writeFileSync('debug-page.html', html);
console.log('HTML saved to debug-page.html');
const parsed = parseNetflixHtml(html);
console.log('Parsed:', parsed);
}
Debugging Netflix Changes
Netflix UI Pattern Changes
// When Netflix changes their UI text patterns
function updateTurkishPatterns(newPatterns) {
// Mutate the existing array in place. Redeclaring a `const` with the same
// name inside this function would throw a ReferenceError (temporal dead
// zone) and would only shadow the module-level constant anyway.
TURKISH_UI_PATTERNS.push(...newPatterns);
console.log('🔄 Updated Turkish patterns:', newPatterns);
}
JSON-LD Structure Changes
// Debug JSON-LD extraction
function debugJsonLd(html) {
const $ = load(html);
$('script[type="application/ld+json"]').each((i, el) => {
const raw = $(el).contents().text();
try {
const parsed = JSON.parse(raw);
console.log(`JSON-LD ${i}:`, JSON.stringify(parsed, null, 2));
} catch (error) {
console.log(`JSON-LD ${i} parse error:`, error.message);
}
});
}
📦 Dependency Management
Adding Dependencies
# Production dependency
npm install cheerio@^1.0.0-rc.12
# Optional dependency
npm install playwright --save-optional
# Development dependency
npm install vitest --save-dev
# Update package.json exports
Updating Dependencies
# Check for outdated packages
npm outdated
# Update specific package
npm update cheerio
# Update all packages
npm update
# Test after updates
npm test
Polyfill Management
// src/polyfill.js - Keep minimal and targeted
import { Blob } from 'node:buffer';
// Only polyfill what's needed for undici/fetch
class PolyfillFile extends Blob {
constructor(parts, name, options = {}) {
super(parts, options);
this.name = String(name);
this.lastModified = options.lastModified ?? Date.now();
}
}
globalThis.File = globalThis.File || PolyfillFile;
globalThis.Blob = globalThis.Blob || Blob;
🚀 Performance Optimization
Profiling
import { performance } from 'node:perf_hooks';
async function profileScraping(url) {
const start = performance.now();
// Profile URL normalization
const normStart = performance.now();
const normalized = normalizeNetflixUrl(url);
console.log('Normalization:', performance.now() - normStart, 'ms');
// Profile HTML fetch
const fetchStart = performance.now();
const html = await fetchStaticHtml(normalized);
console.log('HTML fetch:', performance.now() - fetchStart, 'ms');
// Profile parsing
const parseStart = performance.now();
const parsed = parseNetflixHtml(html);
console.log('Parsing:', performance.now() - parseStart, 'ms');
const total = performance.now() - start;
console.log('Total:', total, 'ms');
return parsed;
}
Memory Optimization
// Clean up browser resources properly
export async function fetchPageContentWithPlaywright(url, options) {
const browser = await chromium.launch({ headless: options.headless !== false });
try {
const context = await browser.newContext({ userAgent: options.userAgent });
const page = await context.newPage();
await page.goto(url, { timeout: options.timeoutMs });
return await page.content();
} finally {
// Always close browser to prevent memory leaks
await browser.close();
}
}
🤝 Contribution Process
Before Contributing
- Read Documentation: Familiarize yourself with the codebase
- Run Tests: Ensure existing tests pass
- Understand Scope: Keep changes focused and minimal
Submitting Changes
- Fork Repository: Create your own fork
- Create Branch: Use descriptive branch names
- Write Tests: Ensure new code is tested
- Update Docs: Update relevant documentation
- Submit PR: Include clear description and testing instructions
Pull Request Template
## Description
Brief description of changes made
## Type of Change
- [ ] Bug fix
- [ ] New feature
- [ ] Breaking change
- [ ] Documentation update
## Testing
- [ ] All tests pass
- [ ] New tests added
- [ ] Manual testing completed
## Checklist
- [ ] Code follows style guidelines
- [ ] Self-review completed
- [ ] Documentation updated
- [ ] Performance considered
## Additional Notes
Any additional context or considerations
Development guide last updated: 2025-11-23