# MetaScraper Development Guide

## 🚀 Getting Started

### Prerequisites

- **Node.js**: 18+ (tested on 18.18.2 and 24.x)
- **npm**: 8+ (comes with Node.js)
- **Git**: For version control

### Development Setup

```bash
# Clone the repository
git clone <repository-url>
cd metascraper

# Install dependencies
npm install

# Run tests to verify setup
npm test

# Run demo to test functionality
npm run demo
```

### IDE Configuration

#### VS Code Setup

Create `.vscode/settings.json`:

```json
{
  "editor.formatOnSave": true,
  "editor.defaultFormatter": "esbenp.prettier-vscode",
  "files.associations": {
    "*.js": "javascript"
  },
  "typescript.preferences.importModuleSpecifier": "relative"
}
```

#### Recommended Extensions

- **ESLint**: `dbaeumer.vscode-eslint`
- **Prettier**: `esbenp.prettier-vscode`
- **Vitest**: `ZixuanChen.vitest-explorer`

## 📁 Project Structure

```
metascraper/
├── src/                   # Source code
│   ├── index.js           # Main scraperNetflix function
│   ├── parser.js          # HTML parsing and title cleaning
│   ├── headless.js        # Playwright browser automation
│   └── polyfill.js        # File/Blob polyfill for Node.js
├── tests/                 # Test files
│   ├── scrape.test.js     # Integration tests
│   └── fixtures/          # Test data and HTML samples
├── doc/                   # Documentation (this directory)
│   ├── README.md          # Documentation index
│   ├── ARCHITECTURE.md    # System design and patterns
│   ├── API.md             # Complete API reference
│   ├── DEVELOPMENT.md     # Development guide (this file)
│   ├── TESTING.md         # Testing patterns and procedures
│   ├── TROUBLESHOOTING.md # Common issues and solutions
│   ├── FAQ.md             # Frequently asked questions
│   └── DEPLOYMENT.md      # Packaging and publishing
├── local-demo.js          # Demo application for testing
├── package.json           # Project configuration
├── vitest.config.js       # Test configuration (optional; see the sketch below)
└── README.md              # Project README
```

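The tree lists `vitest.config.js` as optional. If you want one, a minimal sketch (the options shown here are assumptions, not project requirements; Vitest also runs fine without a config file):

```javascript
// vitest.config.js – minimal, optional sketch; adjust to taste
import { defineConfig } from 'vitest/config';

export default defineConfig({
  test: {
    include: ['tests/**/*.test.js'], // matches tests/scrape.test.js
    testTimeout: 30000               // headless scraping tests can be slow
  }
});
```
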
## 🧱 Code Style & Conventions

### JavaScript Standards

```javascript
// Use ES6+ modules
import { scraperNetflix } from './index.js';
import { parseNetflixHtml } from './parser.js';

// Prefer async/await over Promise chains
async function scrapeNetflixTitle(url) {
  try {
    const result = await scraperNetflix(url);
    return result;
  } catch (error) {
    console.error('Scraping failed:', error.message);
    throw error;
  }
}

// Use template literals for strings
const message = `Scraping ${url} completed in ${duration}ms`;

// Destructure objects and arrays
const { url, id, name, year } = result;
const [first, second] = urls;
```

### Naming Conventions

```javascript
// Functions: camelCase with descriptive names
function normalizeNetflixUrl(inputUrl) { }
function extractYearFromJsonLd(jsonData) { }

// Constants: UPPER_SNAKE_CASE
const DEFAULT_TIMEOUT_MS = 15000;
const TURKISH_UI_PATTERNS = [/pattern/, /another/];

// Variables: camelCase, meaningful names
const normalizedUrl = normalizeNetflixUrl(inputUrl);
const seasonCount = extractNumberOfSeasons(metadata);

// Files: single lowercase words for core modules, kebab-case for multi-word utilities
// Modules:   parser.js, headless.js, polyfill.js
// Utilities: netflix-url-utils.js, html-cleaner.js
```

### Error Handling Patterns

```javascript
// Always include context in error messages.
// User-facing error messages stay in Turkish for Turkish users.
function validateNetflixUrl(url) {
  if (!url) {
    throw new Error('Netflix URL\'i gereklidir.'); // "A Netflix URL is required."
  }

  if (!url.includes('netflix')) {
    throw new Error('URL netflix.com adresini göstermelidir.'); // "The URL must point to netflix.com."
  }
}

// Log errors with a consistent, prefixed format
function logError(message, error) {
  console.error(`❌ ${message}: ${error.message}`);
}

// Chain error context
async function fetchWithRetry(url, attempts = 3) {
  try {
    return await fetch(url);
  } catch (error) {
    if (attempts === 1) {
      throw new Error(`Failed to fetch ${url}: ${error.message}`);
    }
    await delay(1000); // small sleep helper – see the sketch below
    return fetchWithRetry(url, attempts - 1);
  }
}
```

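The retry example above assumes a small `delay` helper that is not shown in the snippet; a minimal sketch:

```javascript
// Minimal sleep helper assumed by fetchWithRetry above
function delay(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
```
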
### JSDoc Documentation

```javascript
/**
 * Scrapes Netflix metadata for a title URL.
 * @param {string} inputUrl The Netflix title URL
 * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
 * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
 * @throws {Error} URL invalid, network error, or parsing failure
 */
export async function scraperNetflix(inputUrl, options = {}) {
  // Implementation
}

/**
 * Clean titles by removing Netflix suffixes and UI text.
 * Handles patterns like "The Witcher izlemenizi bekliyor | Netflix" → "The Witcher"
 * @param {string | undefined | null} title - Raw title from Netflix
 * @returns {string | undefined} Cleaned title
 */
function cleanTitle(title) {
  if (!title) return undefined;
  // Implementation
}
```

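For reference, a short usage sketch that follows the documented signature above. The title ID is the example used elsewhere in this guide; the option shown is an assumption for illustration, not a requirement:

```javascript
// Usage sketch based on the JSDoc above (ESM, Node 18+)
import { scraperNetflix } from './src/index.js';

const meta = await scraperNetflix('https://www.netflix.com/title/80189685', {
  timeoutMs: 15000 // same value as DEFAULT_TIMEOUT_MS in the conventions above
});

console.log(meta.name, meta.year, meta.seasons);
```
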
## 🧪 Testing Standards

### Test Structure

```javascript
import { describe, it, expect, beforeAll, beforeEach, afterEach } from 'vitest';
import { scraperNetflix, parseNetflixHtml } from '../src/index.js';

// Example title used throughout the tests
const TEST_URL = 'https://www.netflix.com/title/80189685';

describe('scraperNetflix', () => {
  // Setup before tests
  beforeAll(async () => {
    // One-time setup
  });

  beforeEach(() => {
    // Reset before each test
  });

  afterEach(() => {
    // Cleanup after each test
  });

  describe('URL normalization', () => {
    it('normalizes Turkish Netflix URLs', () => {
      const input = 'https://www.netflix.com/tr/title/80189685?s=i&vlang=tr';
      const expected = 'https://www.netflix.com/title/80189685';
      // Test implementation
    });

    it('throws error for invalid URLs', async () => {
      await expect(scraperNetflix('invalid-url')).rejects.toThrow();
    });
  });

  describe('metadata extraction', () => {
    it('extracts clean title without Turkish UI text', async () => {
      const result = await scraperNetflix(TEST_URL);
      expect(result.name).toBeTruthy();
      expect(result.name).not.toContain('izlemenizi bekliyor');
    });
  });
});
```

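The normalization test above pins down the expected behaviour: locale path segments (like `/tr`) and query parameters are dropped, leaving only the canonical `/title/<id>` URL. A hypothetical sketch of that behaviour, for orientation only (the real implementation lives in `src/index.js` and may differ):

```javascript
// Hypothetical sketch of the normalization the test above expects – not the real implementation
function normalizeNetflixUrlSketch(inputUrl) {
  const { pathname } = new URL(inputUrl);
  const match = pathname.match(/\/title\/(\d+)/);
  if (!match) {
    throw new Error(`Unrecognized Netflix URL: ${inputUrl}`);
  }
  return `https://www.netflix.com/title/${match[1]}`;
}
```
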
### Test Data Management

```javascript
// Use fixtures for consistent test data
import fs from 'node:fs';
import { vi } from 'vitest'; // or rely on globals if enabled in the Vitest config

function loadFixture(filename) {
  return fs.readFileSync(`tests/fixtures/${filename}`, 'utf8');
}

const TEST_HTML = loadFixture('sample-title.html');
const TEST_URLS = JSON.parse(loadFixture('test-urls.json'));

// Mock external dependencies
vi.mock('playwright', () => ({
  chromium: {
    launch: vi.fn(() => ({
      newContext: vi.fn(() => ({
        newPage: vi.fn(() => ({
          goto: vi.fn(),
          content: vi.fn().mockResolvedValue(TEST_HTML),
          waitForLoadState: vi.fn()
        }))
      })),
      close: vi.fn()
    }))
  }
}));
```

### Performance Testing

```javascript
import { performance } from 'node:perf_hooks';

describe('performance', () => {
  it('completes static scraping within 1 second', async () => {
    const start = performance.now();
    await scraperNetflix(TEST_URL, { headless: false });
    const duration = performance.now() - start;

    expect(duration).toBeLessThan(1000);
  }, 10000);

  it('handles concurrent requests efficiently', async () => {
    const urls = Array(10).fill(TEST_URL);
    const start = performance.now();

    await Promise.all(urls.map(url => scraperNetflix(url, { headless: false })));

    const duration = performance.now() - start;
    expect(duration).toBeLessThan(5000); // Should be much faster than sequential
  }, 30000);
});
```

## 🔄 Development Workflow

### 1. Feature Development

```bash
# Create feature branch
git checkout -b feature/turkish-title-cleaning

# Make changes
# Write tests
npm test

# Run demo to verify
npm run demo

# Commit changes
git add .
git commit -m "feat: add Turkish UI text pattern removal"

# Push and create PR
git push origin feature/turkish-title-cleaning
```

### 2. Bug Fix Process

```bash
# Create bugfix branch
git checkout -b fix/handle-missing-title-field

# Reproduce the issue with a focused test run (Vitest name pattern)
npm test -- -t "missing title"

# Add a failing test that reproduces the issue
npm test

# Implement the fix and make the test pass
npm test

# Verify with demo
npm run demo

# Commit with conventional commit
git commit -m "fix: handle missing title field in JSON-LD parsing"
```

### 3. Code Review Checklist

#### Functionality
- [ ] Feature works as expected
- [ ] Edge cases are handled
- [ ] Error messages are helpful
- [ ] Turkish localization works

#### Code Quality
- [ ] Code follows style conventions
- [ ] Functions are single-responsibility
- [ ] Variables have meaningful names
- [ ] JSDoc documentation is complete

#### Testing
- [ ] Tests cover happy path
- [ ] Tests cover error cases
- [ ] Tests are maintainable
- [ ] Performance tests if applicable

#### Documentation
- [ ] API documentation updated
- [ ] README examples work
- [ ] Architecture document reflects changes
- [ ] Changelog updated

## 🛠️ Debugging Guidelines

### Common Debugging Techniques

#### 1. Enable Verbose Logging

```javascript
// Add debug logging while investigating an issue
function debugNetflixScraping(url, options) {
  console.log('🔍 Input URL:', url);
  console.log('⚙️ Options:', options);

  const normalized = normalizeNetflixUrl(url);
  console.log('🔗 Normalized:', normalized);

  // Continue with debugging
}
```

#### 2. Test with Real Data

```javascript
// Create debug script
// (assumes normalizeNetflixUrl is exported from src/index.js; adjust the import if it is not)
import { scraperNetflix, parseNetflixHtml, normalizeNetflixUrl } from './src/index.js';

async function debugUrl(url) {
  try {
    console.log('🚀 Testing URL:', url);

    // Test normalization
    const normalized = normalizeNetflixUrl(url);
    console.log('📝 Normalized:', normalized);

    // Test scraping
    const result = await scraperNetflix(url);
    console.log('✅ Result:', JSON.stringify(result, null, 2));

  } catch (error) {
    console.error('❌ Error:', error.message);
    console.error('Stack:', error.stack);
  }
}

debugUrl('https://www.netflix.com/title/80189685');
```

#### 3. Browser Debugging

```javascript
// Test headless mode with visible browser
const result = await scraperNetflix(url, {
  headless: false, // Show browser
  timeoutMs: 60000 // Longer timeout for debugging
});
```

#### 4. HTML Inspection

```javascript
// Save HTML for manual inspection
import fs from 'node:fs';
// parseNetflixHtml is exported from src/index.js; fetchStaticHtml is an internal
// helper – import it from its module or copy it into the debug script.

async function debugHtml(url) {
  const html = await fetchStaticHtml(url);
  fs.writeFileSync('debug-page.html', html);
  console.log('HTML saved to debug-page.html');

  const parsed = parseNetflixHtml(html);
  console.log('Parsed:', parsed);
}
```

### Debugging Netflix Changes

#### Netflix UI Pattern Changes

```javascript
// When Netflix changes their UI text patterns, extend the existing list.
// TURKISH_UI_PATTERNS is a const binding, but the array contents can be extended.
function updateTurkishPatterns(newPatterns) {
  TURKISH_UI_PATTERNS.push(...newPatterns);

  console.log('🔄 Updated Turkish patterns:', newPatterns);
}
```

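For orientation, hypothetical examples of what entries in `TURKISH_UI_PATTERNS` can look like, based on the title-cleaning example earlier in this guide (illustrative only; the real list lives in the parser module):

```javascript
// Hypothetical pattern entries – not the actual list shipped in src/parser.js
const EXAMPLE_TURKISH_UI_PATTERNS = [
  /\s*izlemenizi bekliyor.*$/i, // "… is waiting for you to watch"
  /\s*\|\s*Netflix\s*$/         // trailing "| Netflix" suffix
];
```
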
#### JSON-LD Structure Changes

```javascript
// Debug JSON-LD extraction
import { load } from 'cheerio';

function debugJsonLd(html) {
  const $ = load(html);

  $('script[type="application/ld+json"]').each((i, el) => {
    const raw = $(el).contents().text();
    try {
      const parsed = JSON.parse(raw);
      console.log(`JSON-LD ${i}:`, JSON.stringify(parsed, null, 2));
    } catch (error) {
      console.log(`JSON-LD ${i} parse error:`, error.message);
    }
  });
}
```

## 📦 Dependency Management

### Adding Dependencies

```bash
# Production dependency
npm install cheerio@^1.0.0-rc.12

# Optional dependency
npm install playwright --save-optional

# Development dependency
npm install vitest --save-dev

# Then update package.json (e.g. exports, optionalDependencies) if the public surface changed
```

### Updating Dependencies

```bash
# Check for outdated packages
npm outdated

# Update specific package
npm update cheerio

# Update all packages
npm update

# Test after updates
npm test
```

### Polyfill Management

```javascript
// src/polyfill.js - Keep minimal and targeted
import { Blob } from 'node:buffer';

// Only polyfill what's needed for undici/fetch
class PolyfillFile extends Blob {
  constructor(parts, name, options = {}) {
    super(parts, options);
    this.name = String(name);
    this.lastModified = options.lastModified ?? Date.now();
  }
}

globalThis.File = globalThis.File || PolyfillFile;
globalThis.Blob = globalThis.Blob || Blob;
```

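The polyfill only helps if its globals are in place before the first `fetch`/`File` use. A minimal usage sketch, assuming the entry module imports the polyfill first (the exact import site in this project may differ):

```javascript
// Entry module sketch – load the polyfill before any code that relies on fetch/File
import './polyfill.js';

// ...the rest of the module can now assume globalThis.File and globalThis.Blob exist
```
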
## 🚀 Performance Optimization

### Profiling

```javascript
import { performance } from 'node:perf_hooks';
// Assumes normalizeNetflixUrl, fetchStaticHtml and parseNetflixHtml are in scope
// (internal helpers from src/).

async function profileScraping(url) {
  const start = performance.now();

  // Profile URL normalization
  const normStart = performance.now();
  const normalized = normalizeNetflixUrl(url);
  console.log('Normalization:', performance.now() - normStart, 'ms');

  // Profile HTML fetch
  const fetchStart = performance.now();
  const html = await fetchStaticHtml(normalized);
  console.log('HTML fetch:', performance.now() - fetchStart, 'ms');

  // Profile parsing
  const parseStart = performance.now();
  const parsed = parseNetflixHtml(html);
  console.log('Parsing:', performance.now() - parseStart, 'ms');

  const total = performance.now() - start;
  console.log('Total:', total, 'ms');

  return parsed;
}
```

### Memory Optimization

```javascript
import { chromium } from 'playwright';

// Clean up browser resources properly
export async function fetchPageContentWithPlaywright(url, options) {
  const browser = await chromium.launch({ headless: options.headless !== false });

  try {
    const context = await browser.newContext({ userAgent: options.userAgent });
    const page = await context.newPage();

    await page.goto(url, { timeout: options.timeoutMs });
    return await page.content();
  } finally {
    // Always close browser to prevent memory leaks
    await browser.close();
  }
}
```

## 🤝 Contribution Process

### Before Contributing

1. **Read Documentation**: Familiarize yourself with the codebase
2. **Run Tests**: Ensure existing tests pass
3. **Understand Scope**: Keep changes focused and minimal

### Submitting Changes

1. **Fork Repository**: Create your own fork
2. **Create Branch**: Use descriptive branch names
3. **Write Tests**: Ensure new code is tested
4. **Update Docs**: Update relevant documentation
5. **Submit PR**: Include clear description and testing instructions

### Pull Request Template

```markdown
## Description
Brief description of changes made

## Type of Change
- [ ] Bug fix
- [ ] New feature
- [ ] Breaking change
- [ ] Documentation update

## Testing
- [ ] All tests pass
- [ ] New tests added
- [ ] Manual testing completed

## Checklist
- [ ] Code follows style guidelines
- [ ] Self-review completed
- [ ] Documentation updated
- [ ] Performance considered

## Additional Notes
Any additional context or considerations
```

---

*Development guide last updated: 2025-11-23*