Files
subwatcher/services/api/src/lib/subtitleEngine.ts
wisecolt 9f07ff445e feat(api): turkcealtyazi gerçek modu desteği ekle
TurkceAltyazi sağlayıcısı için gerçek HTTP istekleri ve HTML
parsing özelliği eklendi. Özellik bayrak ile açılıp kapatılabilir
ve hata durumunda mock moduna dönüş yapabilir.

Yapılan değişiklikler:
- Yeni ortam değişkenleri eklendi (ENABLE_TURKCEALTYAZI_REAL, vb.)
- axios ve cheerio bağımlılıkları eklendi
- Gerçek indirme ve arama işlemleri için turkcealtyaziReal.ts modülü eklendi
- Dokümantasyon güncellendi
- Detaylı trace logging desteği eklendi
2026-02-16 09:29:01 +03:00

221 lines
8.0 KiB
TypeScript

import fs from 'node:fs/promises';
import path from 'node:path';
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import fse from 'fs-extra';
import { env } from '../config/env.js';
import type { SearchParams, TraceLog } from '../types/index.js';
import { SubtitleProvider, Candidate } from '../types/index.js';
import { TurkceAltyaziProvider } from '../providers/TurkceAltyaziProvider.js';
import { OpenSubtitlesProvider } from '../providers/OpenSubtitlesProvider.js';
import { collectFilesRecursive, ensureInsideRoot, validateExtractionLimits } from './security.js';
import { detectSubtitleType, isProbablyText } from './validators.js';
import { chooseBest, scoreCandidateFile } from './scoring.js';
// Promise-returning form of child_process.execFile, so external commands can be awaited.
const execFileAsync = promisify(execFile);

// Registered subtitle providers, listed in the order they are queried.
// `name` is the value candidates carry in their `provider` field.
const providerEntries: { name: Candidate['provider']; impl: SubtitleProvider }[] = [
  {
    name: 'turkcealtyazi',
    impl: new TurkceAltyaziProvider()
  },
  {
    name: 'opensubtitles',
    impl: new OpenSubtitlesProvider()
  }
];
/**
 * Builds the fallback extraction limits.
 *
 * @returns A fresh object on every call with `maxFiles` = 300,
 *          `maxTotalBytes` = 250 MiB and `maxSingleBytes` = 10 MiB.
 */
function defaultLimits() {
  const bytesPerMiB = 1024 * 1024;
  const limits = {
    maxFiles: 300,
    maxTotalBytes: 250 * bytesPerMiB,
    maxSingleBytes: 10 * bytesPerMiB
  };
  return limits;
}
/**
 * Creates (if missing) the working directories of a job below `env.tempRoot`.
 *
 * @param jobToken Name of the job directory; must resolve to a location strictly
 *                 inside `env.tempRoot`.
 * @returns The job's base directory plus its `download` and `extracted` sub-directories.
 * @throws Error when `jobToken` resolves to `env.tempRoot` itself or to a path outside it.
 */
async function ensureJobDirs(jobToken: string) {
  const base = path.join(env.tempRoot, jobToken);

  // The token may come from the request, so refuse values such as '', '.', '..'
  // or '../x' that would make the job directory alias or escape the temp root.
  const relativeToRoot = path.relative(path.resolve(env.tempRoot), path.resolve(base));
  const escapesRoot =
    relativeToRoot === '' ||
    relativeToRoot === '..' ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot);
  if (escapesRoot) {
    throw new Error('Invalid job token');
  }

  const download = path.join(base, 'download');
  const extracted = path.join(base, 'extracted');
  await fs.mkdir(download, { recursive: true });
  await fs.mkdir(extracted, { recursive: true });
  return { base, download, extracted };
}
/**
 * Extracts an archive with the external `7z` tool and lists the resulting files.
 *
 * @param archivePath  Archive to extract.
 * @param extractedDir Directory passed to 7z as the output location (`-o`).
 * @param trace        Trace log that receives EXTRACT_STARTED / EXTRACT_DONE entries.
 * @returns The files found below `extractedDir` by `collectFilesRecursive` after extraction.
 * @throws Rejects when 7z exits with a non-zero code, exceeds the time limit
 *         or produces more output than `maxBuffer` allows.
 */
async function extractArchive(archivePath: string, extractedDir: string, trace: TraceLog[]): Promise<string[]> {
  // Archives are downloaded from third parties: bound the run time so a stalled 7z
  // (e.g. one waiting for a password) cannot block the job forever, and give stdout
  // more room than the 1 MiB default because 7z reports every extracted entry.
  const timeoutMs = 2 * 60 * 1000;
  const maxOutputBytes = 16 * 1024 * 1024;

  trace.push({ level: 'info', step: 'EXTRACT_STARTED', message: archivePath });
  await execFileAsync(
    '7z',
    // `--` ends switch parsing so an archive path starting with '-' is never read as a switch.
    ['x', '-y', `-o${extractedDir}`, '--', archivePath],
    { timeout: timeoutMs, maxBuffer: maxOutputBytes }
  );
  const files = await collectFilesRecursive(extractedDir);
  trace.push({ level: 'info', step: 'EXTRACT_DONE', message: `Extracted ${files.length} files` });
  return files;
}
/**
 * Builds the name of the directory that receives one candidate's extracted files.
 * The provider name and the candidate's position keep the name unique per candidate,
 * and every character of the id outside [A-Za-z0-9._-] is replaced so the
 * provider-supplied id cannot introduce path separators.
 */
function extractionDirName(candidate: Candidate, index: number): string {
  const safeId = String(candidate.id).replace(/[^A-Za-z0-9._-]/g, '_').slice(0, 64);
  return `${candidate.provider}-${index}-${safeId}`;
}

/**
 * Searches every registered provider, downloads and (for archives) extracts each
 * candidate, drops files that are not recognisable subtitles, scores the rest and
 * picks the best one.
 *
 * Side effects: creates the job directories, writes `manifest.json` into the job's
 * base directory and, when a winner exists, copies it to `best.<ext>` there.
 *
 * @param input Search parameters; `jobToken` and `securityLimits` are optional and
 *              fall back to a generated token and `defaultLimits()`.
 * @returns An object whose `status` is 'FOUND' (with `bestPath`), 'AMBIGUOUS' or
 *          'NOT_FOUND', always including `jobToken`, `confidence`, `source`,
 *          `candidates` and the collected `trace`.
 *
 * NOTE(review): errors thrown by a provider's search/download or by extraction are
 * not caught here and abort the whole search.
 */
export async function searchSubtitles(input: SearchParams) {
  // The random suffix keeps generated tokens distinct for requests arriving within
  // the same millisecond, which would otherwise share one job directory.
  const jobToken = input.jobToken ?? `job-${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
  const trace: TraceLog[] = [];
  const limits = input.securityLimits ?? defaultLimits();
  const dirs = await ensureJobDirs(jobToken);

  // Phase 1: collect candidates from every provider, in registration order.
  const allCandidates: Candidate[] = [];
  for (const p of providerEntries) {
    if (p.name === 'turkcealtyazi') {
      trace.push({ level: 'info', step: 'TA_SEARCH_REQUEST', message: 'TurkceAltyazi provider search started' });
    }
    trace.push({ level: 'info', step: 'SUBTITLE_SEARCH_STARTED', message: `Provider search started: ${p.name}` });
    const c = await p.impl.search(input);
    trace.push({ level: 'info', step: 'SUBTITLE_SEARCH_DONE', message: `Provider search done: ${p.name}`, meta: { count: c.length } });
    if (p.name === 'turkcealtyazi') {
      // Candidates carrying the 'real_provider' hint are counted as real, the rest as mock.
      const realCount = c.filter((item) => item.scoreHints.includes('real_provider')).length;
      trace.push({
        level: 'info',
        step: 'TA_SEARCH_PARSED',
        message: `TurkceAltyazi candidates parsed`,
        meta: { total: c.length, real: realCount, mock: c.length - realCount }
      });
    }
    allCandidates.push(...c);
  }

  // Phase 2: download each candidate, extract archives, validate and score the files.
  const scored: any[] = [];
  for (const [index, candidate] of allCandidates.entries()) {
    const provider = providerEntries.find((p) => p.name === candidate.provider)?.impl;
    if (!provider) continue;
    const dl = await provider.download(candidate, input, jobToken);
    if (Array.isArray(dl.trace)) {
      trace.push(...dl.trace);
    }
    trace.push({ level: 'info', step: 'ARCHIVE_DOWNLOADED', message: `${candidate.provider}:${candidate.id}`, meta: { path: dl.filePath, type: dl.type } });

    let files: string[] = [];
    if (dl.type === 'archive') {
      // Never use the raw candidate id as a path segment: it originates from the
      // provider and two candidates may share an id, which would mix their files.
      const perCandidateExtractDir = path.join(dirs.extracted, extractionDirName(candidate, index));
      await fs.mkdir(perCandidateExtractDir, { recursive: true });
      files = await extractArchive(dl.filePath, perCandidateExtractDir, trace);
      for (const file of files) {
        const inside = await ensureInsideRoot(perCandidateExtractDir, file);
        if (!inside) {
          trace.push({ level: 'warn', step: 'ZIPSLIP_REJECTED', message: `Rejected path traversal candidate: ${file}` });
          await fse.remove(file);
        }
      }
      // Re-list so that files removed above are not processed further.
      files = await collectFilesRecursive(perCandidateExtractDir);
      const lim = await validateExtractionLimits(files, limits);
      if (!lim.ok) {
        trace.push({ level: 'warn', step: 'LIMIT_REJECTED', message: lim.reason ?? 'limit rejected' });
        // Do not leave the rejected extraction on disk.
        await fse.remove(perCandidateExtractDir);
        continue;
      }
    } else {
      files = [dl.filePath];
    }

    for (const file of files) {
      const buf = await fs.readFile(file);
      if (!isProbablyText(buf)) {
        await fse.remove(file);
        trace.push({ level: 'warn', step: 'INVALID_SUBTITLE_DELETED', message: `Deleted binary/invalid: ${file}` });
        continue;
      }
      const text = buf.toString('utf8');
      const ext = detectSubtitleType(text);
      if (!ext) {
        await fse.remove(file);
        trace.push({ level: 'warn', step: 'INVALID_SUBTITLE_DELETED', message: `Deleted unknown subtitle content: ${file}` });
        continue;
      }
      const s = scoreCandidateFile(file, ext, candidate, input);
      if (s) scored.push(s);
    }
  }
  trace.push({ level: 'info', step: 'CANDIDATES_SCANNED', message: `Scored ${scored.length} subtitle files` });

  // Phase 3: decide, persist the manifest (read back later by chooseSubtitle) and report.
  const decision = chooseBest(scored);
  const manifestPath = path.join(dirs.base, 'manifest.json');
  await fs.writeFile(manifestPath, JSON.stringify({ jobToken, input, scored: decision.candidates }, null, 2), 'utf8');

  if (decision.status === 'FOUND' && decision.best) {
    const bestPath = path.join(dirs.base, `best.${decision.best.ext}`);
    await fs.copyFile(decision.best.filePath, bestPath);
    trace.push({ level: 'info', step: 'BEST_SELECTED', message: `Selected ${decision.best.filePath}`, meta: { score: decision.best.score } });
    return {
      status: 'FOUND',
      jobToken,
      bestPath,
      confidence: decision.confidence,
      source: decision.best.provider,
      candidates: decision.candidates,
      trace
    };
  }

  if (decision.status === 'AMBIGUOUS') {
    trace.push({ level: 'warn', step: 'AMBIGUOUS_NEEDS_REVIEW', message: 'Top candidates too close' });
    return {
      status: 'AMBIGUOUS',
      jobToken,
      confidence: 0.5,
      source: 'multi',
      candidates: decision.candidates,
      trace
    };
  }

  trace.push({ level: 'warn', step: 'NOT_FOUND_NEEDS_REVIEW', message: 'No valid subtitle file found' });
  return {
    status: 'NOT_FOUND',
    jobToken,
    confidence: 0,
    source: 'none',
    candidates: [],
    trace
  };
}
/**
 * Applies a manual choice among the candidates recorded in a job's `manifest.json`
 * and copies the chosen file to `best.<ext>` in the job directory.
 *
 * @param jobToken          Job whose manifest is read; must resolve strictly inside `env.tempRoot`.
 * @param chosenCandidateId Matched against an entry's `id` or `candidateId`; used only
 *                          when `chosenPath` is not given.
 * @param chosenPath        Matched against an entry's `filePath` or `id`; takes precedence.
 * @returns `{ status: 'FOUND', bestPath, confidence, source }` on success, or
 *          `{ status: 'NOT_FOUND', message }` when no entry matches or no choice was supplied.
 * @throws Error for a job token that escapes the temp root; rejects as well when the
 *         manifest cannot be read or parsed, or the chosen file cannot be copied.
 */
export async function chooseSubtitle(jobToken: string, chosenCandidateId?: string, chosenPath?: string) {
  const base = path.join(env.tempRoot, jobToken);

  // Refuse tokens such as '', '..' or '../x' that would point at or outside the temp root.
  const relativeToRoot = path.relative(path.resolve(env.tempRoot), path.resolve(base));
  if (
    relativeToRoot === '' ||
    relativeToRoot === '..' ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot)
  ) {
    throw new Error('Invalid job token');
  }

  const manifestPath = path.join(base, 'manifest.json');
  const raw = await fs.readFile(manifestPath, 'utf8');
  const manifest = JSON.parse(raw);
  // Tolerate a manifest without a usable `scored` array instead of failing on `.find`.
  const list: any[] = Array.isArray(manifest?.scored) ? manifest.scored : [];

  let found: any;
  if (chosenPath) {
    found = list.find((x: any) => x?.filePath === chosenPath || x?.id === chosenPath);
  } else if (chosenCandidateId != null) {
    // Only search when an id was actually supplied: comparing against `undefined`
    // would match any entry that simply lacks an `id` or `candidateId` field.
    found = list.find((x: any) => x?.id === chosenCandidateId || x?.candidateId === chosenCandidateId);
  }
  if (!found) {
    return { status: 'NOT_FOUND', message: 'Chosen candidate not found' };
  }

  const bestPath = path.join(base, `best.${found.ext}`);
  await fs.copyFile(found.filePath, bestPath);

  // Score is scaled by 130 and clamped to [0.5, 0.98]; a missing score would give NaN,
  // so fall back to the lower bound in that case.
  const ratio = found.score / 130;
  const confidence = Number.isNaN(ratio) ? 0.5 : Math.max(0.5, Math.min(0.98, ratio));
  return {
    status: 'FOUND',
    bestPath,
    confidence,
    source: found.provider
  };
}
/**
 * Removes the working directory of a single job.
 *
 * @param jobToken Name of the job directory below `env.tempRoot`.
 * @throws Error when `jobToken` resolves to `env.tempRoot` itself or to a path outside
 *         it, so that an empty or `..`-style token can never delete unrelated data.
 */
export async function cleanupJobToken(jobToken: string) {
  const dir = path.join(env.tempRoot, jobToken);

  const relativeToRoot = path.relative(path.resolve(env.tempRoot), path.resolve(dir));
  const escapesRoot =
    relativeToRoot === '' ||
    relativeToRoot === '..' ||
    relativeToRoot.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relativeToRoot);
  if (escapesRoot) {
    throw new Error('Invalid job token');
  }

  await fse.remove(dir);
}
/**
 * Deletes directories directly below `env.tempRoot` whose modification time is older
 * than the given age. The temp root is created first if it does not exist.
 *
 * @param hours Age threshold in hours (default 24); directories strictly older are removed.
 * @returns The number of directories removed.
 */
export async function cleanupOldTemp(hours = 24): Promise<number> {
  await fs.mkdir(env.tempRoot, { recursive: true });
  const entries = await fs.readdir(env.tempRoot, { withFileTypes: true });
  const now = Date.now();
  let count = 0;
  for (const e of entries) {
    if (!e.isDirectory()) continue;
    const p = path.join(env.tempRoot, e.name);

    let mtimeMs: number;
    try {
      mtimeMs = (await fs.stat(p)).mtimeMs;
    } catch (err: unknown) {
      // The directory may have been removed (e.g. by a per-job cleanup) after it was
      // listed; that must not abort the sweep of the remaining entries.
      const vanished =
        typeof err === 'object' && err !== null && (err as { code?: unknown }).code === 'ENOENT';
      if (vanished) continue;
      throw err;
    }

    const ageHours = (now - mtimeMs) / 1000 / 3600;
    if (ageHours > hours) {
      await fse.remove(p);
      count += 1;
    }
  }
  return count;
}