import fs from 'node:fs/promises';
import path from 'node:path';
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import fse from 'fs-extra';
import { env } from '../config/env.js';
import type { SearchParams, TraceLog } from '../types/index.js';
import { SubtitleProvider, Candidate } from '../types/index.js';
import { TurkceAltyaziProvider } from '../providers/TurkceAltyaziProvider.js';
import { OpenSubtitlesProvider } from '../providers/OpenSubtitlesProvider.js';
import { collectFilesRecursive, ensureInsideRoot, validateExtractionLimits } from './security.js';
import { detectSubtitleType, isProbablyText } from './validators.js';
import { chooseBest, scoreCandidateFile } from './scoring.js';

const execFileAsync = promisify(execFile);

/** Providers queried by {@link searchSubtitles}, in the order they are searched. */
const providerEntries: Array<{ name: Candidate['provider']; impl: SubtitleProvider }> = [
  { name: 'turkcealtyazi', impl: new TurkceAltyaziProvider() },
  { name: 'opensubtitles', impl: new OpenSubtitlesProvider() }
];

/** Extraction limits applied when the caller does not pass `securityLimits`. */
function defaultLimits() {
  return {
    maxFiles: 300,
    maxTotalBytes: 250 * 1024 * 1024,
    maxSingleBytes: 10 * 1024 * 1024
  };
}

/**
 * Maps a job token to its working directory under `env.tempRoot`.
 *
 * The token is used as a path component and the resulting directory is later
 * removed recursively, so a token that resolves to the temp root itself
 * (`''`, `'.'`) or outside of it (`'..'`, absolute paths) is rejected.
 *
 * @param jobToken Caller-supplied or generated job identifier.
 * @returns `path.join(env.tempRoot, jobToken)`.
 * @throws Error when the token does not resolve strictly inside `env.tempRoot`.
 */
function resolveJobDir(jobToken: string): string {
  const root = path.resolve(env.tempRoot);
  const relative = path.relative(root, path.resolve(root, jobToken));
  const escapesRoot =
    relative === '' ||
    relative === '..' ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative);
  if (escapesRoot) {
    throw new Error(`Invalid job token: ${JSON.stringify(jobToken)}`);
  }
  return path.join(env.tempRoot, jobToken);
}

/**
 * Creates the `download` and `extracted` sub-directories for a job.
 *
 * @param jobToken Job identifier; validated by {@link resolveJobDir}.
 * @returns The job base directory and its two sub-directories.
 */
async function ensureJobDirs(jobToken: string) {
  const base = resolveJobDir(jobToken);
  const download = path.join(base, 'download');
  const extracted = path.join(base, 'extracted');
  await fs.mkdir(download, { recursive: true });
  await fs.mkdir(extracted, { recursive: true });
  return { base, download, extracted };
}

/**
 * Extracts an archive with the external `7z` binary and lists the result.
 *
 * @param archivePath Archive to extract.
 * @param extractedDir Destination directory (passed to 7z via `-o`).
 * @param trace Trace log that receives EXTRACT_STARTED / EXTRACT_DONE entries.
 * @returns The files found under `extractedDir` after extraction.
 */
async function extractArchive(archivePath: string, extractedDir: string, trace: TraceLog[]): Promise<string[]> {
  trace.push({ level: 'info', step: 'EXTRACT_STARTED', message: archivePath });
  await execFileAsync('7z', ['x', '-y', archivePath, `-o${extractedDir}`]);
  const files = await collectFilesRecursive(extractedDir);
  trace.push({ level: 'info', step: 'EXTRACT_DONE', message: `Extracted ${files.length} files` });
  return files;
}

/**
 * Builds the extraction directory name for a candidate.
 *
 * The provider name is part of the directory so that two providers returning
 * the same id do not extract into (and later re-scan) the same directory, and
 * the id is reduced to filename-safe characters so it cannot introduce extra
 * path segments.
 */
function candidateDirName(candidate: Candidate): string {
  const safeId = String(candidate.id).replace(/[^A-Za-z0-9._-]/g, '_');
  return `${candidate.provider}-${safeId}`;
}

/**
 * Runs every provider's search sequentially and concatenates the candidates.
 * Trace entries are pushed in provider order.
 */
async function searchAllProviders(input: SearchParams, trace: TraceLog[]): Promise<Candidate[]> {
  const collected: Candidate[] = [];
  for (const entry of providerEntries) {
    const isTurkceAltyazi = entry.name === 'turkcealtyazi';
    if (isTurkceAltyazi) {
      trace.push({ level: 'info', step: 'TA_SEARCH_REQUEST', message: 'TurkceAltyazi provider search started' });
    }
    trace.push({ level: 'info', step: 'SUBTITLE_SEARCH_STARTED', message: `Provider search started: ${entry.name}` });
    const results = await entry.impl.search(input);
    trace.push({
      level: 'info',
      step: 'SUBTITLE_SEARCH_DONE',
      message: `Provider search done: ${entry.name}`,
      meta: { count: results.length }
    });
    if (isTurkceAltyazi) {
      const realCount = results.filter((item) => item.scoreHints.includes('real_provider')).length;
      trace.push({
        level: 'info',
        step: 'TA_SEARCH_PARSED',
        message: `TurkceAltyazi candidates parsed`,
        meta: { total: results.length, real: realCount, mock: results.length - realCount }
      });
    }
    collected.push(...results);
  }
  return collected;
}

/**
 * Extracts an archive into `extractDir`, removes entries that
 * `ensureInsideRoot` rejects, and checks the remainder against `limits`.
 *
 * @returns The surviving file list, or `null` when the limits check fails
 *          (in which case the extraction directory is removed).
 */
async function extractAndVetArchive(
  archivePath: string,
  extractDir: string,
  limits: Parameters<typeof validateExtractionLimits>[1],
  trace: TraceLog[]
): Promise<string[] | null> {
  await fs.mkdir(extractDir, { recursive: true });
  const extractedFiles = await extractArchive(archivePath, extractDir, trace);
  for (const file of extractedFiles) {
    const inside = await ensureInsideRoot(extractDir, file);
    if (!inside) {
      trace.push({ level: 'warn', step: 'ZIPSLIP_REJECTED', message: `Rejected path traversal candidate: ${file}` });
      await fse.remove(file);
    }
  }

  // Re-list so entries removed above are not processed further.
  const remaining = await collectFilesRecursive(extractDir);
  const verdict = await validateExtractionLimits(remaining, limits);
  if (!verdict.ok) {
    trace.push({ level: 'warn', step: 'LIMIT_REJECTED', message: verdict.reason ?? 'limit rejected' });
    // The rejected payload is never used; do not leave it on disk until job cleanup.
    await fse.remove(extractDir);
    return null;
  }
  return remaining;
}

/**
 * Validates one file as a subtitle and scores it.
 *
 * Files that are not text, or whose content type cannot be detected, are
 * deleted and reported in the trace.
 *
 * @returns The result of `scoreCandidateFile`, or `null` when the file was rejected.
 */
async function scoreSubtitleFile(
  file: string,
  candidate: Candidate,
  input: SearchParams,
  trace: TraceLog[]
): Promise<ReturnType<typeof scoreCandidateFile> | null> {
  const buf = await fs.readFile(file);
  if (!isProbablyText(buf)) {
    await fse.remove(file);
    trace.push({ level: 'warn', step: 'INVALID_SUBTITLE_DELETED', message: `Deleted binary/invalid: ${file}` });
    return null;
  }
  const ext = detectSubtitleType(buf.toString('utf8'));
  if (!ext) {
    await fse.remove(file);
    trace.push({ level: 'warn', step: 'INVALID_SUBTITLE_DELETED', message: `Deleted unknown subtitle content: ${file}` });
    return null;
  }
  return scoreCandidateFile(file, ext, candidate, input);
}

/**
 * Searches all providers, downloads and validates every candidate, scores the
 * resulting subtitle files and writes `manifest.json` into the job directory.
 *
 * When the decision is `FOUND`, the winning file is also copied to
 * `best.<ext>` in the job directory.
 *
 * @param input Search parameters; `jobToken` defaults to `job-<timestamp>` and
 *              `securityLimits` to {@link defaultLimits}.
 * @returns A result object whose `status` is `FOUND`, `AMBIGUOUS` or
 *          `NOT_FOUND`, always including the job token and the trace log.
 * @throws Error when `input.jobToken` does not resolve inside `env.tempRoot`.
 */
export async function searchSubtitles(input: SearchParams) {
  const jobToken = input.jobToken ?? `job-${Date.now()}`;
  const trace: TraceLog[] = [];
  const limits = input.securityLimits ?? defaultLimits();
  const dirs = await ensureJobDirs(jobToken);

  const allCandidates = await searchAllProviders(input, trace);

  // NOTE(review): element type is whatever scoreCandidateFile returns / chooseBest accepts;
  // kept as `any` because neither signature is visible from this file.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const scored: any[] = [];
  for (const candidate of allCandidates) {
    const provider = providerEntries.find((p) => p.name === candidate.provider)?.impl;
    if (!provider) continue;

    const dl = await provider.download(candidate, input, jobToken);
    if (Array.isArray(dl.trace)) {
      trace.push(...dl.trace);
    }
    trace.push({
      level: 'info',
      step: 'ARCHIVE_DOWNLOADED',
      message: `${candidate.provider}:${candidate.id}`,
      meta: { path: dl.filePath, type: dl.type }
    });

    let files: string[];
    if (dl.type === 'archive') {
      const perCandidateExtractDir = path.join(dirs.extracted, candidateDirName(candidate));
      const vetted = await extractAndVetArchive(dl.filePath, perCandidateExtractDir, limits, trace);
      if (vetted === null) continue;
      files = vetted;
    } else {
      files = [dl.filePath];
    }

    for (const file of files) {
      const result = await scoreSubtitleFile(file, candidate, input, trace);
      if (result) scored.push(result);
    }
  }

  trace.push({ level: 'info', step: 'CANDIDATES_SCANNED', message: `Scored ${scored.length} subtitle files` });
  const decision = chooseBest(scored);

  // The manifest is what chooseSubtitle() reads back for a manual selection.
  const manifestPath = path.join(dirs.base, 'manifest.json');
  await fs.writeFile(manifestPath, JSON.stringify({ jobToken, input, scored: decision.candidates }, null, 2), 'utf8');

  if (decision.status === 'FOUND' && decision.best) {
    const bestPath = path.join(dirs.base, `best.${decision.best.ext}`);
    await fs.copyFile(decision.best.filePath, bestPath);
    trace.push({
      level: 'info',
      step: 'BEST_SELECTED',
      message: `Selected ${decision.best.filePath}`,
      meta: { score: decision.best.score }
    });
    return {
      status: 'FOUND',
      jobToken,
      bestPath,
      confidence: decision.confidence,
      source: decision.best.provider,
      candidates: decision.candidates,
      trace
    };
  }

  if (decision.status === 'AMBIGUOUS') {
    trace.push({ level: 'warn', step: 'AMBIGUOUS_NEEDS_REVIEW', message: 'Top candidates too close' });
    return { status: 'AMBIGUOUS', jobToken, confidence: 0.5, source: 'multi', candidates: decision.candidates, trace };
  }

  trace.push({ level: 'warn', step: 'NOT_FOUND_NEEDS_REVIEW', message: 'No valid subtitle file found' });
  return { status: 'NOT_FOUND', jobToken, confidence: 0, source: 'none', candidates: [], trace };
}

/**
 * Applies a manual selection from a job's `manifest.json`.
 *
 * `chosenPath` takes precedence and is matched against an entry's `filePath`
 * or `id`; otherwise `chosenCandidateId` is matched against `id` or
 * `candidateId`. The selected file is copied to `best.<ext>` in the job
 * directory.
 *
 * @param jobToken Job whose manifest is read.
 * @param chosenCandidateId Candidate id to select (used when `chosenPath` is not given).
 * @param chosenPath File path (or id) to select.
 * @returns `FOUND` with the copied path, a confidence clamped to [0.5, 0.98]
 *          and the provider, or `NOT_FOUND` when nothing matches or no
 *          selector was supplied.
 * @throws Error when the job token is invalid; file-system and JSON errors
 *         from reading the manifest propagate unchanged.
 */
export async function chooseSubtitle(jobToken: string, chosenCandidateId?: string, chosenPath?: string) {
  const base = resolveJobDir(jobToken);
  const manifestPath = path.join(base, 'manifest.json');
  const raw = await fs.readFile(manifestPath, 'utf8');
  const manifest = JSON.parse(raw);
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const list: any[] = Array.isArray(manifest?.scored) ? manifest.scored : [];

  // Without a selector nothing may match: comparing against `undefined` would
  // otherwise select the first entry that lacks an `id` or `candidateId` field.
  let found;
  if (chosenPath) {
    found = list.find((x) => x.filePath === chosenPath || x.id === chosenPath);
  } else if (chosenCandidateId) {
    found = list.find((x) => x.id === chosenCandidateId || x.candidateId === chosenCandidateId);
  }

  if (!found) {
    return { status: 'NOT_FOUND', message: 'Chosen candidate not found' };
  }

  const bestPath = path.join(base, `best.${found.ext}`);
  await fs.copyFile(found.filePath, bestPath);
  return {
    status: 'FOUND',
    bestPath,
    confidence: Math.max(0.5, Math.min(0.98, found.score / 130)),
    source: found.provider
  };
}

/**
 * Removes a job's working directory and everything in it.
 *
 * @param jobToken Job identifier.
 * @throws Error when the token does not resolve strictly inside `env.tempRoot`,
 *         so the temp root itself or a directory outside it is never removed.
 */
export async function cleanupJobToken(jobToken: string) {
  const dir = resolveJobDir(jobToken);
  await fse.remove(dir);
}

/**
 * Removes top-level directories in `env.tempRoot` whose modification time is
 * older than the given age. The temp root is created if it does not exist.
 *
 * @param hours Maximum age in hours; directories older than this are removed.
 * @returns The number of directories removed.
 */
export async function cleanupOldTemp(hours = 24): Promise<number> {
  await fs.mkdir(env.tempRoot, { recursive: true });
  const entries = await fs.readdir(env.tempRoot, { withFileTypes: true });
  const now = Date.now();
  let count = 0;
  for (const entry of entries) {
    if (!entry.isDirectory()) continue;
    const dirPath = path.join(env.tempRoot, entry.name);
    const stats = await fs.stat(dirPath);
    const ageHours = (now - stats.mtimeMs) / 1000 / 3600;
    if (ageHours > hours) {
      await fse.remove(dirPath);
      count += 1;
    }
  }
  return count;
}