/*
 * TurkceAltyazi sağlayıcısı için gerçek HTTP istekleri ve HTML parsing özelliği
 * eklendi. Özellik bayrak ile açılıp kapatılabilir ve hata durumunda mock
 * moduna dönüş yapabilir. Yapılan değişiklikler:
 *  - Yeni ortam değişkenleri eklendi (ENABLE_TURKCEALTYAZI_REAL, vb.)
 *  - axios ve cheerio bağımlılıkları eklendi
 *  - Gerçek indirme ve arama işlemleri için turkcealtyaziReal.ts modülü eklendi
 *  - Dokümantasyon güncellendi
 *  - Detaylı trace logging desteği eklendi
 *
 * 221 lines · 8.0 KiB · TypeScript
 */
import fs from 'node:fs/promises';
|
|
import path from 'node:path';
|
|
import { execFile } from 'node:child_process';
|
|
import { promisify } from 'node:util';
|
|
import fse from 'fs-extra';
|
|
import { env } from '../config/env.js';
|
|
import type { SearchParams, TraceLog } from '../types/index.js';
|
|
import { SubtitleProvider, Candidate } from '../types/index.js';
|
|
import { TurkceAltyaziProvider } from '../providers/TurkceAltyaziProvider.js';
|
|
import { OpenSubtitlesProvider } from '../providers/OpenSubtitlesProvider.js';
|
|
import { collectFilesRecursive, ensureInsideRoot, validateExtractionLimits } from './security.js';
|
|
import { detectSubtitleType, isProbablyText } from './validators.js';
|
|
import { chooseBest, scoreCandidateFile } from './scoring.js';
|
|
|
|
// Promise-returning wrapper around child_process.execFile (used to run 7z).
const execFileAsync = promisify(execFile);

// Registry of subtitle providers; searchSubtitles queries them in this order.
const providerEntries: Array<{ name: Candidate['provider']; impl: SubtitleProvider }> = [
  { name: 'turkcealtyazi', impl: new TurkceAltyaziProvider() },
  { name: 'opensubtitles', impl: new OpenSubtitlesProvider() }
];
|
|
|
|
function defaultLimits() {
|
|
return { maxFiles: 300, maxTotalBytes: 250 * 1024 * 1024, maxSingleBytes: 10 * 1024 * 1024 };
|
|
}
|
|
|
|
async function ensureJobDirs(jobToken: string) {
|
|
const base = path.join(env.tempRoot, jobToken);
|
|
const download = path.join(base, 'download');
|
|
const extracted = path.join(base, 'extracted');
|
|
await fs.mkdir(download, { recursive: true });
|
|
await fs.mkdir(extracted, { recursive: true });
|
|
return { base, download, extracted };
|
|
}
|
|
|
|
async function extractArchive(archivePath: string, extractedDir: string, trace: TraceLog[]): Promise<string[]> {
|
|
trace.push({ level: 'info', step: 'EXTRACT_STARTED', message: archivePath });
|
|
await execFileAsync('7z', ['x', '-y', archivePath, `-o${extractedDir}`]);
|
|
const files = await collectFilesRecursive(extractedDir);
|
|
trace.push({ level: 'info', step: 'EXTRACT_DONE', message: `Extracted ${files.length} files` });
|
|
return files;
|
|
}
|
|
|
|
export async function searchSubtitles(input: SearchParams) {
|
|
const jobToken = input.jobToken ?? `job-${Date.now()}`;
|
|
const trace: TraceLog[] = [];
|
|
const limits = input.securityLimits ?? defaultLimits();
|
|
const dirs = await ensureJobDirs(jobToken);
|
|
|
|
const allCandidates: Candidate[] = [];
|
|
for (const p of providerEntries) {
|
|
if (p.name === 'turkcealtyazi') {
|
|
trace.push({ level: 'info', step: 'TA_SEARCH_REQUEST', message: 'TurkceAltyazi provider search started' });
|
|
}
|
|
trace.push({ level: 'info', step: 'SUBTITLE_SEARCH_STARTED', message: `Provider search started: ${p.name}` });
|
|
const c = await p.impl.search(input);
|
|
trace.push({ level: 'info', step: 'SUBTITLE_SEARCH_DONE', message: `Provider search done: ${p.name}`, meta: { count: c.length } });
|
|
if (p.name === 'turkcealtyazi') {
|
|
const realCount = c.filter((item) => item.scoreHints.includes('real_provider')).length;
|
|
trace.push({
|
|
level: 'info',
|
|
step: 'TA_SEARCH_PARSED',
|
|
message: `TurkceAltyazi candidates parsed`,
|
|
meta: { total: c.length, real: realCount, mock: c.length - realCount }
|
|
});
|
|
}
|
|
allCandidates.push(...c);
|
|
}
|
|
|
|
const scored: any[] = [];
|
|
|
|
for (const candidate of allCandidates) {
|
|
const provider = providerEntries.find((p) => p.name === candidate.provider)?.impl;
|
|
if (!provider) continue;
|
|
|
|
const dl = await provider.download(candidate, input, jobToken);
|
|
if (Array.isArray(dl.trace)) {
|
|
trace.push(...dl.trace);
|
|
}
|
|
trace.push({ level: 'info', step: 'ARCHIVE_DOWNLOADED', message: `${candidate.provider}:${candidate.id}`, meta: { path: dl.filePath, type: dl.type } });
|
|
|
|
let files: string[] = [];
|
|
if (dl.type === 'archive') {
|
|
const perCandidateExtractDir = path.join(dirs.extracted, candidate.id);
|
|
await fs.mkdir(perCandidateExtractDir, { recursive: true });
|
|
files = await extractArchive(dl.filePath, perCandidateExtractDir, trace);
|
|
|
|
for (const file of files) {
|
|
const inside = await ensureInsideRoot(perCandidateExtractDir, file);
|
|
if (!inside) {
|
|
trace.push({ level: 'warn', step: 'ZIPSLIP_REJECTED', message: `Rejected path traversal candidate: ${file}` });
|
|
await fse.remove(file);
|
|
}
|
|
}
|
|
|
|
files = await collectFilesRecursive(perCandidateExtractDir);
|
|
const lim = await validateExtractionLimits(files, limits);
|
|
if (!lim.ok) {
|
|
trace.push({ level: 'warn', step: 'LIMIT_REJECTED', message: lim.reason ?? 'limit rejected' });
|
|
continue;
|
|
}
|
|
} else {
|
|
files = [dl.filePath];
|
|
}
|
|
|
|
for (const file of files) {
|
|
const buf = await fs.readFile(file);
|
|
if (!isProbablyText(buf)) {
|
|
await fse.remove(file);
|
|
trace.push({ level: 'warn', step: 'INVALID_SUBTITLE_DELETED', message: `Deleted binary/invalid: ${file}` });
|
|
continue;
|
|
}
|
|
|
|
const text = buf.toString('utf8');
|
|
const ext = detectSubtitleType(text);
|
|
if (!ext) {
|
|
await fse.remove(file);
|
|
trace.push({ level: 'warn', step: 'INVALID_SUBTITLE_DELETED', message: `Deleted unknown subtitle content: ${file}` });
|
|
continue;
|
|
}
|
|
|
|
const s = scoreCandidateFile(file, ext, candidate, input);
|
|
if (s) scored.push(s);
|
|
}
|
|
}
|
|
|
|
trace.push({ level: 'info', step: 'CANDIDATES_SCANNED', message: `Scored ${scored.length} subtitle files` });
|
|
|
|
const decision = chooseBest(scored);
|
|
const manifestPath = path.join(dirs.base, 'manifest.json');
|
|
await fs.writeFile(manifestPath, JSON.stringify({ jobToken, input, scored: decision.candidates }, null, 2), 'utf8');
|
|
|
|
if (decision.status === 'FOUND' && decision.best) {
|
|
const bestPath = path.join(dirs.base, `best.${decision.best.ext}`);
|
|
await fs.copyFile(decision.best.filePath, bestPath);
|
|
trace.push({ level: 'info', step: 'BEST_SELECTED', message: `Selected ${decision.best.filePath}`, meta: { score: decision.best.score } });
|
|
|
|
return {
|
|
status: 'FOUND',
|
|
jobToken,
|
|
bestPath,
|
|
confidence: decision.confidence,
|
|
source: decision.best.provider,
|
|
candidates: decision.candidates,
|
|
trace
|
|
};
|
|
}
|
|
|
|
if (decision.status === 'AMBIGUOUS') {
|
|
trace.push({ level: 'warn', step: 'AMBIGUOUS_NEEDS_REVIEW', message: 'Top candidates too close' });
|
|
return {
|
|
status: 'AMBIGUOUS',
|
|
jobToken,
|
|
confidence: 0.5,
|
|
source: 'multi',
|
|
candidates: decision.candidates,
|
|
trace
|
|
};
|
|
}
|
|
|
|
trace.push({ level: 'warn', step: 'NOT_FOUND_NEEDS_REVIEW', message: 'No valid subtitle file found' });
|
|
return {
|
|
status: 'NOT_FOUND',
|
|
jobToken,
|
|
confidence: 0,
|
|
source: 'none',
|
|
candidates: [],
|
|
trace
|
|
};
|
|
}
|
|
|
|
export async function chooseSubtitle(jobToken: string, chosenCandidateId?: string, chosenPath?: string) {
|
|
const base = path.join(env.tempRoot, jobToken);
|
|
const manifestPath = path.join(base, 'manifest.json');
|
|
const raw = await fs.readFile(manifestPath, 'utf8');
|
|
const manifest = JSON.parse(raw);
|
|
const list = manifest.scored ?? [];
|
|
|
|
const found = chosenPath
|
|
? list.find((x: any) => x.filePath === chosenPath || x.id === chosenPath)
|
|
: list.find((x: any) => x.id === chosenCandidateId || x.candidateId === chosenCandidateId);
|
|
|
|
if (!found) {
|
|
return { status: 'NOT_FOUND', message: 'Chosen candidate not found' };
|
|
}
|
|
|
|
const bestPath = path.join(base, `best.${found.ext}`);
|
|
await fs.copyFile(found.filePath, bestPath);
|
|
|
|
return {
|
|
status: 'FOUND',
|
|
bestPath,
|
|
confidence: Math.max(0.5, Math.min(0.98, found.score / 130)),
|
|
source: found.provider
|
|
};
|
|
}
|
|
|
|
export async function cleanupJobToken(jobToken: string) {
|
|
const dir = path.join(env.tempRoot, jobToken);
|
|
await fse.remove(dir);
|
|
}
|
|
|
|
export async function cleanupOldTemp(hours = 24): Promise<number> {
|
|
await fs.mkdir(env.tempRoot, { recursive: true });
|
|
const entries = await fs.readdir(env.tempRoot, { withFileTypes: true });
|
|
const now = Date.now();
|
|
let count = 0;
|
|
|
|
for (const e of entries) {
|
|
if (!e.isDirectory()) continue;
|
|
const p = path.join(env.tempRoot, e.name);
|
|
const st = await fs.stat(p);
|
|
const ageHours = (now - st.mtimeMs) / 1000 / 3600;
|
|
if (ageHours > hours) {
|
|
await fse.remove(p);
|
|
count += 1;
|
|
}
|
|
}
|
|
|
|
return count;
|
|
}
|