Files
metascraper/doc/DEVELOPMENT.md
2025-11-23 14:25:09 +03:00

14 KiB

MetaScraper Development Guide

🚀 Getting Started

Prerequisites

  • Node.js: 18+ (tested on 18.18.2 and 24.x)
  • npm: 8+ (comes with Node.js)
  • Git: For version control

Development Setup

# Clone the repository
git clone <repository-url>
cd metascraper

# Install dependencies
npm install

# Run tests to verify setup
npm test

# Run demo to test functionality
npm run demo

IDE Configuration

VS Code Setup

Create .vscode/settings.json:

{
  "editor.formatOnSave": true,
  "editor.defaultFormatter": "esbenp.prettier-vscode",
  "files.associations": {
    "*.js": "javascript"
  },
  "javascript.preferences.importModuleSpecifier": "relative"
}

Recommended extensions:

  • ESLint: dbaeumer.vscode-eslint
  • Prettier: esbenp.prettier-vscode
  • Vitest: ZixuanChen.vitest-explorer

📁 Project Structure

metascraper/
├── src/                      # Source code
│   ├── index.js             # Main scraperNetflix function
│   ├── parser.js            # HTML parsing and title cleaning
│   ├── headless.js          # Playwright browser automation
│   └── polyfill.js          # File/Blob polyfill for Node.js
├── tests/                    # Test files
│   ├── scrape.test.js       # Integration tests
│   └── fixtures/            # Test data and HTML samples
├── doc/                      # Documentation (this directory)
│   ├── README.md            # Documentation index
│   ├── ARCHITECTURE.md      # System design and patterns
│   ├── API.md               # Complete API reference
│   ├── DEVELOPMENT.md       # Development guide (this file)
│   ├── TESTING.md           # Testing patterns and procedures
│   ├── TROUBLESHOOTING.md   # Common issues and solutions
│   ├── FAQ.md               # Frequently asked questions
│   └── DEPLOYMENT.md        # Packaging and publishing
├── local-demo.js             # Demo application for testing
├── package.json              # Project configuration
├── vitest.config.js         # Test configuration (if exists)
└── README.md               # Project README

🧱 Code Style & Conventions

JavaScript Standards

// Use ES6+ modules
import { scraperNetflix } from './index.js';
import { parseNetflixHtml } from './parser.js';

// Prefer async/await over Promise chains
/**
 * Scrape one Netflix title, logging any failure before re-raising it.
 *
 * @param {string} url - Address handed straight to `scraperNetflix`.
 * @returns {Promise<*>} Whatever `scraperNetflix` resolves with.
 * @throws Re-throws the original error after logging its message.
 */
async function scrapeNetflixTitle(url) {
  let scraped;

  try {
    scraped = await scraperNetflix(url);
  } catch (failure) {
    // Log for visibility, then let the caller decide how to recover.
    console.error('Scraping failed:', failure.message);
    throw failure;
  }

  return scraped;
}

// Use template literals for strings
const message = `Scraping ${url} completed in ${duration}ms`;

// Destructure objects and arrays
const { url, id, name, year } = result;
const [first, second] = urls;

Naming Conventions

// Functions: camelCase with descriptive names
function normalizeNetflixUrl(inputUrl) { }
function extractYearFromJsonLd(jsonData) { }

// Constants: UPPER_SNAKE_CASE
const DEFAULT_TIMEOUT_MS = 15000;
const TURKISH_UI_PATTERNS = [/pattern/, /another/];

// Variables: camelCase, meaningful names
const normalizedUrl = normalizeNetflixUrl(inputUrl);
const seasonCount = extractNumberOfSeasons(metadata);

// Files: kebab-case for utilities, camelCase for modules
// parser.js, headless.js, polyfill.js
// netflix-url-utils.js, html-cleaner.js

Error Handling Patterns

// Always include context in error messages
/**
 * Guard that rejects missing or non-Netflix URLs.
 *
 * @param {string} url - Candidate URL to check.
 * @returns {void}
 * @throws {Error} When `url` is falsy, or when it does not contain "netflix".
 */
function validateNetflixUrl(url) {
  const isMissing = !url;
  if (isMissing) {
    throw new Error('Netflix URL\'i gereklidir.');
  }

  const mentionsNetflix = url.includes('netflix');
  if (!mentionsNetflix) {
    throw new Error('URL netflix.com adresini göstermelidir.');
  }
}

// Use Turkish error messages for Turkish users
/**
 * Print a failure to stderr in the form "❌ <message>: <error message>".
 *
 * @param {string} message - Context describing what was being attempted.
 * @param {Error} error - The failure whose `message` is appended.
 * @returns {void}
 */
function logError(message, error) {
  const { message: reason } = error;
  console.error(`❌ ${message}: ${reason}`);
}

// Chain error context
/**
 * Fetch a URL, retrying after a fixed pause whenever the request throws.
 *
 * @param {string} url - Resource to request with the global `fetch`.
 * @param {number} [attempts=3] - Total number of tries. Anything that is not
 *   greater than 1 (0, negatives, NaN) is treated as a single try.
 * @param {number} [delayMs=1000] - Pause between tries, in milliseconds.
 * @returns {Promise<*>} Whatever `fetch` resolves with.
 * @throws {Error} After the last try fails; the underlying error is kept on `cause`.
 */
async function fetchWithRetry(url, attempts = 3, delayMs = 1000) {
  try {
    return await fetch(url);
  } catch (error) {
    // Not `=== 1`: a count of 0, a negative number or NaN must stop here
    // instead of recursing without end.
    if (!(attempts > 1)) {
      throw new Error(`Failed to fetch ${url}: ${error.message}`, { cause: error });
    }

    // Inline timer so the snippet does not depend on an undeclared `delay` helper.
    await new Promise((resolve) => setTimeout(resolve, delayMs));
    return fetchWithRetry(url, attempts - 1, delayMs);
  }
}

JSDoc Documentation

/**
 * Scrapes Netflix metadata.
 * @param {string} inputUrl Netflix URL
 * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
 * @returns {Promise<{ url: string, id: string, name: string, year: string | number | undefined, seasons: string | null }>}
 * @throws {Error} URL invalid, network error, or parsing failure
 */
export async function scraperNetflix(inputUrl, options = {}) {
  // Implementation
}

/**
 * Clean titles by removing Netflix suffixes and UI text.
 * Handles patterns like "The Witcher izlemenizi bekliyor | Netflix" → "The Witcher"
 * @param {string | undefined | null} title - Raw title from Netflix
 * @returns {string | undefined} Cleaned title
 */
function cleanTitle(title) {
  if (!title) return undefined;
  // Implementation
}

🧪 Testing Standards

Test Structure

import { describe, it, expect, beforeAll, beforeEach, afterEach } from 'vitest';
import { scraperNetflix, parseNetflixHtml } from '../src/index.js';

describe('scraperNetflix', () => {
  // Setup before tests
  beforeAll(async () => {
    // One-time setup
  });

  beforeEach(() => {
    // Reset before each test
  });

  afterEach(() => {
    // Cleanup after each test
  });

  describe('URL normalization', () => {
    it('normalizes Turkish Netflix URLs', () => {
      const input = 'https://www.netflix.com/tr/title/80189685?s=i&vlang=tr';
      const expected = 'https://www.netflix.com/title/80189685';
      // Test implementation
    });

    it('throws error for invalid URLs', async () => {
      await expect(scraperNetflix('invalid-url')).rejects.toThrow();
    });
  });

  describe('metadata extraction', () => {
    it('extracts clean title without Turkish UI text', async () => {
      const result = await scraperNetflix(TEST_URL);
      expect(result.name).toBeTruthy();
      expect(result.name).not.toContain('izlemenizi bekliyor');
    });
  });
});

Test Data Management

// Use fixtures for consistent test data
import fs from 'node:fs';

/**
 * Read a fixture file as UTF-8 text.
 *
 * @param {string} filename - File name inside `tests/fixtures/`, resolved
 *   against the current working directory.
 * @returns {string} The file's contents.
 */
function loadFixture(filename) {
  const fixturePath = `tests/fixtures/${filename}`;
  const contents = fs.readFileSync(fixturePath, { encoding: 'utf8' });
  return contents;
}

const TEST_HTML = loadFixture('sample-title.html');
const TEST_URLS = JSON.parse(loadFixture('test-urls.json'));

// Mock external dependencies
vi.mock('playwright', () => ({
  chromium: {
    launch: vi.fn(() => ({
      newContext: vi.fn(() => ({
        newPage: vi.fn(() => ({
          goto: vi.fn(),
          content: vi.fn().mockResolvedValue(TEST_HTML),
          waitForLoadState: vi.fn()
        }))
      })),
      close: vi.fn()
    }))
  }
}));

Performance Testing

import { performance } from 'node:perf_hooks';

describe('performance', () => {
  it('completes static scraping within 1 second', async () => {
    const start = performance.now();
    await scraperNetflix(TEST_URL, { headless: false });
    const duration = performance.now() - start;

    expect(duration).toBeLessThan(1000);
  }, 10000);

  it('handles concurrent requests efficiently', async () => {
    const urls = Array(10).fill(TEST_URL);
    const start = performance.now();

    await Promise.all(urls.map(url => scraperNetflix(url, { headless: false })));

    const duration = performance.now() - start;
    expect(duration).toBeLessThan(5000); // Should be much faster than sequential
  }, 30000);
});

🔄 Development Workflow

1. Feature Development

# Create feature branch
git checkout -b feature/turkish-title-cleaning

# Make changes
# Write tests
npm test

# Run demo to verify
npm run demo

# Commit changes
git add .
git commit -m "feat: add Turkish UI text pattern removal"

# Push and create PR
git push origin feature/turkish-title-cleaning

2. Bug Fix Process

# Create bugfix branch
git checkout -b fix/handle-missing-title-field

# Add a failing test that reproduces the issue and confirm it fails
# (-t / --testNamePattern is Vitest's test-name filter)
npm test -- -t "missing title"

# Implement the fix and re-run the reproduction until it passes
npm test -- -t "missing title"

# Run the full suite to check for regressions
npm test

# Verify with demo
npm run demo

# Stage and commit with a conventional commit message
git add .
git commit -m "fix: handle missing title field in JSON-LD parsing"

3. Code Review Checklist

Functionality

  • Feature works as expected
  • Edge cases are handled
  • Error messages are helpful
  • Turkish localization works

Code Quality

  • Code follows style conventions
  • Functions are single-responsibility
  • Variables have meaningful names
  • JSDoc documentation is complete

Testing

  • Tests cover happy path
  • Tests cover error cases
  • Tests are maintainable
  • Performance tests if applicable

Documentation

  • API documentation updated
  • README examples work
  • Architecture document reflects changes
  • Changelog updated

🛠️ Debugging Guidelines

Common Debugging Techniques

1. Enable Verbose Logging

// Add debug logging while investigating an issue
/**
 * Print the raw URL, the options and the normalized URL for one scrape call.
 *
 * @param {string} url - URL as supplied by the caller.
 * @param {object} options - Scraper options, logged as-is.
 * @returns {void}
 */
function debugNetflixScraping(url, options) {
  const trace = (label, value) => console.log(label, value);

  trace('🔍 Input URL:', url);
  trace('⚙️ Options:', options);
  trace('🔗 Normalized:', normalizeNetflixUrl(url));

  // Continue with debugging
}

2. Test with Real Data

// Create debug script
import { scraperNetflix, parseNetflixHtml } from './src/index.js';

/**
 * Run one URL through normalization and scraping, printing each stage.
 * Failures are reported (message and stack) rather than re-thrown.
 *
 * NOTE(review): `normalizeNetflixUrl` is called here but is not among the names
 * imported at the top of this script; confirm which module exports it and
 * import it before running.
 *
 * @param {string} url - URL to test.
 * @returns {Promise<void>}
 */
async function debugUrl(url) {
  try {
    console.log('🚀 Testing URL:', url);

    // Step 1: normalization
    const cleanedUrl = normalizeNetflixUrl(url);
    console.log('📝 Normalized:', cleanedUrl);

    // Step 2: scraping
    const scraped = await scraperNetflix(url);
    const pretty = JSON.stringify(scraped, null, 2);
    console.log('✅ Result:', pretty);
  } catch ({ message, stack }) {
    console.error('❌ Error:', message);
    console.error('Stack:', stack);
  }
}

debugUrl('https://www.netflix.com/title/80189685');

3. Browser Debugging

// Test headless mode with visible browser
const result = await scraperNetflix(url, {
  headless: false,  // Show browser
  timeoutMs: 60000  // Longer timeout for debugging
});

4. HTML Inspection

// Save HTML for manual inspection
import fs from 'node:fs';

/**
 * Dump a page's static HTML to `debug-page.html` in the working directory and
 * print what `parseNetflixHtml` extracts from it.
 *
 * @param {string} url - Page passed to `fetchStaticHtml`.
 * @returns {Promise<void>}
 */
async function debugHtml(url) {
  const markup = await fetchStaticHtml(url);

  // Keep a copy on disk so the markup can be inspected by hand.
  fs.writeFileSync('debug-page.html', markup);
  console.log('HTML saved to debug-page.html');

  console.log('Parsed:', parseNetflixHtml(markup));
}

Debugging Netflix Changes

Netflix UI Pattern Changes

// When Netflix changes their UI text patterns
/**
 * Build the pattern list that results from appending new UI-text patterns to
 * the existing `TURKISH_UI_PATTERNS`. The existing array is not modified.
 *
 * @param {RegExp[]} newPatterns - Patterns to append.
 * @returns {RegExp[]} A new array: the current patterns followed by `newPatterns`.
 */
function updateTurkishPatterns(newPatterns) {
  // The local must not be named TURKISH_UI_PATTERNS: that would shadow the outer
  // constant and make the spread read it before initialization (ReferenceError).
  const updatedPatterns = [...TURKISH_UI_PATTERNS, ...newPatterns];

  console.log('🔄 Updated Turkish patterns:', newPatterns);
  return updatedPatterns;
}

JSON-LD Structure Changes

// Debug JSON-LD extraction
/**
 * Print every `application/ld+json` script block found in an HTML document,
 * pretty-printed when it parses and with the parser's message when it does not.
 *
 * @param {string} html - Markup handed to `load`.
 * @returns {void}
 */
function debugJsonLd(html) {
  const $ = load(html);
  const ldScripts = $('script[type="application/ld+json"]');

  ldScripts.each((index, node) => {
    const source = $(node).contents().text();

    try {
      const pretty = JSON.stringify(JSON.parse(source), null, 2);
      console.log(`JSON-LD ${index}:`, pretty);
    } catch ({ message }) {
      console.log(`JSON-LD ${index} parse error:`, message);
    }
  });
}

📦 Dependency Management

Adding Dependencies

# Production dependency
npm install cheerio@^1.0.0-rc.12

# Optional dependency
npm install playwright --save-optional

# Development dependency
npm install vitest --save-dev

# Update package.json exports

Updating Dependencies

# Check for outdated packages
npm outdated

# Update specific package
npm update cheerio

# Update all packages
npm update

# Test after updates
npm test

Polyfill Management

// src/polyfill.js - Keep minimal and targeted
import { Blob } from 'node:buffer';

// Only polyfill what's needed for undici/fetch
/**
 * Minimal stand-in for a `File` class, layered on Node's `Blob`.
 */
class PolyfillFile extends Blob {
  /**
   * @param {Array} parts - Chunks forwarded to the `Blob` constructor.
   * @param {*} name - File name; coerced with `String()`.
   * @param {object} [options] - Forwarded to `Blob`. `lastModified` falls back
   *   to the current time when it is null or undefined.
   */
  constructor(parts, name, options = {}) {
    super(parts, options);

    const fileName = String(name);
    const modifiedAt = options.lastModified ?? Date.now();
    Object.assign(this, { name: fileName, lastModified: modifiedAt });
  }
}

// Register the globals only where the runtime does not already provide them.
globalThis.File ||= PolyfillFile;
globalThis.Blob ||= Blob;

🚀 Performance Optimization

Profiling

import { performance } from 'node:perf_hooks';

/**
 * Time the three scraping stages (URL normalization, HTML fetch, parsing)
 * and the overall run, logging each duration in milliseconds.
 *
 * @param {string} url - URL to normalize, fetch and parse.
 * @returns {Promise<*>} The value returned by `parseNetflixHtml`.
 */
async function profileScraping(url) {
  const report = (label, mark) => console.log(label, performance.now() - mark, 'ms');

  const overallMark = performance.now();

  // Stage 1: URL normalization
  let stageMark = performance.now();
  const normalized = normalizeNetflixUrl(url);
  report('Normalization:', stageMark);

  // Stage 2: HTML fetch
  stageMark = performance.now();
  const html = await fetchStaticHtml(normalized);
  report('HTML fetch:', stageMark);

  // Stage 3: parsing
  stageMark = performance.now();
  const parsed = parseNetflixHtml(html);
  report('Parsing:', stageMark);

  report('Total:', overallMark);

  return parsed;
}

Memory Optimization

// Clean up browser resources properly
/**
 * Load a page in Chromium via Playwright and return its HTML.
 *
 * @param {string} url - Page to open.
 * @param {{ headless?: boolean, timeoutMs?: number, userAgent?: string }} [options]
 *   Chromium runs headless unless `headless` is exactly `false`; `timeoutMs` is
 *   passed to `page.goto` and `userAgent` to the browser context. Defaults to
 *   `{}` so the function can be called with only a URL.
 * @returns {Promise<string>} Result of `page.content()`.
 */
export async function fetchPageContentWithPlaywright(url, options = {}) {
  const { headless, userAgent, timeoutMs } = options;
  const browser = await chromium.launch({ headless: headless !== false });

  try {
    const context = await browser.newContext({ userAgent });
    const page = await context.newPage();

    await page.goto(url, { timeout: timeoutMs });
    return await page.content();
  } finally {
    // Always close browser to prevent memory leaks, even when navigation throws.
    await browser.close();
  }
}

🤝 Contribution Process

Before Contributing

  1. Read Documentation: Familiarize yourself with the codebase
  2. Run Tests: Ensure existing tests pass
  3. Understand Scope: Keep changes focused and minimal

Submitting Changes

  1. Fork Repository: Create your own fork
  2. Create Branch: Use descriptive branch names
  3. Write Tests: Ensure new code is tested
  4. Update Docs: Update relevant documentation
  5. Submit PR: Include clear description and testing instructions

Pull Request Template

## Description
Brief description of changes made

## Type of Change
- [ ] Bug fix
- [ ] New feature
- [ ] Breaking change
- [ ] Documentation update

## Testing
- [ ] All tests pass
- [ ] New tests added
- [ ] Manual testing completed

## Checklist
- [ ] Code follows style guidelines
- [ ] Self-review completed
- [ ] Documentation updated
- [ ] Performance considered

## Additional Notes
Any additional context or considerations

Development guide last updated: 2025-11-23