128 lines
4.6 KiB
Python
128 lines
4.6 KiB
Python
# worker/translator.py
|
|
from abc import ABC, abstractmethod
|
|
from typing import Iterable, List
|
|
import os, re
|
|
|
|
SEP = "\n" # Item separator used when joining texts into one request: a newline
|
|
MAX_CHARS = 4500 # Maximum length sent in one request (stays clear of the web engine's URL/query limit)
|
|
|
|
# ===== Interface =====
|
|
class ITranslator(ABC):
    """Abstract interface shared by the translation backends in this module."""

    @abstractmethod
    def translate_batch(self, texts: Iterable[str], src: str, dest: str) -> List[str]:
        """Translate the strings in *texts* from language *src* to *dest*.

        Args:
            texts: Strings to translate.
            src: Source language code.
            dest: Target language code.

        Returns:
            The translated strings as a list.
        """
|
|
|
|
# ===== Current default: deep-translator =====
|
|
class DeepTranslatorAdapter(ITranslator):
    """Translator backed by deep-translator's ``GoogleTranslator``.

    Items are joined with ``SEP`` and sent in batches of roughly ``MAX_CHARS``
    characters; the reply is split on ``SEP`` again. Whenever a batch cannot
    be mapped back one-to-one, every item of that batch is translated on its
    own instead.
    """

    def __init__(self):
        # Imported here rather than at module level, so importing this module
        # does not require deep-translator to be installed.
        from deep_translator import GoogleTranslator

        # Initial language pair only; _translate_one overwrites it per call.
        self.client = GoogleTranslator(source="zh-CN", target="ko")

    def _translate_one(self, text: str, src: str, dest: str) -> str:
        """Translate one string from *src* to *dest* with the shared client."""
        # The client object is reused, so the language pair is reassigned
        # before every request.
        self.client.source = src
        self.client.target = dest
        return self.client.translate(text)

    def translate_batch(self, texts: Iterable[str], src: str, dest: str) -> List[str]:
        """Translate *texts* and return one result per input, in input order.

        Args:
            texts: Strings to translate; consumed once and materialised.
            src: Source language code.
            dest: Target language code.

        Returns:
            A list with the same length as *texts*. Blank items yield ``""``
            without a request. Items that fail to translate are returned
            unchanged (see ``_safe_translate_item``).
        """
        texts = list(texts)
        print(f"translate_batch in: {texts}")
        if not texts:
            return []

        # Pre-sized so each result is written at its input's index; blank
        # items simply keep the "" default.
        out: List[str] = [""] * len(texts)
        pending: List[int] = []  # indices waiting in the current batch
        total = 0

        for i, t in enumerate(texts):
            if not t.strip():
                # Nothing to translate. Sending it would only produce an
                # empty segment that can vanish from the reply and shift the
                # alignment of the remaining items.
                continue
            if SEP in t:
                # An item with an embedded separator cannot be recovered from
                # a joined reply (and may even produce a matching but
                # misaligned split), so it is translated on its own.
                out[i] = self._safe_translate_item(t, src, dest)
                continue

            add_len = len(t) + len(SEP)
            if pending and total + add_len > MAX_CHARS:
                self._translate_pending(texts, pending, out, src, dest)
                pending, total = [], 0
            pending.append(i)
            total += add_len

        if pending:
            self._translate_pending(texts, pending, out, src, dest)

        print(f"translate_batch out: {out}")
        return out

    def _translate_pending(
        self, texts: List[str], pending: List[int], out: List[str], src: str, dest: str
    ) -> None:
        """Translate the items at indices *pending* as one batch into *out*."""
        parts = [texts[i] for i in pending]
        # _translate_joined always returns exactly len(parts) items.
        for i, translated in zip(pending, self._translate_joined(parts, src, dest)):
            out[i] = translated

    def _safe_translate_item(self, t: str, src: str, dest: str) -> str:
        """Translate a single item, falling back to the original text.

        Returns the whitespace-normalised translation, or *t* unchanged when
        the request raises or does not produce a string.
        """
        try:
            translated = self._translate_one(t, src, dest)
        except Exception as e:
            # Best effort by design: keep the untranslated text, but leave a
            # trace so failures are not invisible.
            print(f"[WARN] item translate error -> keep original: {e}")
            return t
        if not isinstance(translated, str):
            return t
        return self._normalize_spaces(translated)

    def _normalize_spaces(self, s: str) -> str:
        """Clean up whitespace in a translated string.

        Removes zero-width spaces, collapses runs of spaces/tabs into one
        space, drops whitespace adjacent to newlines (which also collapses
        blank lines), and strips both ends.
        """
        s = s.replace("\u200b", "")
        s = re.sub(r"[ \t]+", " ", s)
        s = re.sub(r"\s+\n", "\n", s)
        s = re.sub(r"\n\s+", "\n", s)
        return s.strip()

    def _translate_joined(self, parts: List[str], src: str, dest: str) -> List[str]:
        """Translate *parts* in one request and split the reply per item.

        *parts* are expected to be non-blank and free of ``SEP``; otherwise
        the reply cannot be mapped back reliably. Always returns exactly
        ``len(parts)`` strings: if the request raises, yields no string, or
        splits into a different number of pieces, each part is translated
        individually instead.
        """
        joined_text = SEP.join(parts)
        try:
            tr = self._translate_one(joined_text, src, dest)
        except Exception as e:
            print(f"[WARN] batch translate error -> per-item fallback: {e}")
            return [self._safe_translate_item(p, src, dest) for p in parts]

        if not isinstance(tr, str):
            # Guard before .split(): a non-string reply (e.g. None) would
            # otherwise raise outside the try block and abort the batch.
            print("[WARN] batch translate returned no text -> per-item fallback")
            return [self._safe_translate_item(p, src, dest) for p in parts]

        results = [self._normalize_spaces(s) for s in tr.split(SEP)]

        if len(results) != len(parts):
            print(f"[WARN] split size mismatch: expected={len(parts)} got={len(results)} -> per-item fallback")
            return [self._safe_translate_item(p, src, dest) for p in parts]

        return results
|
|
|
|
# ===== Azure =====
|
|
class AzureTranslatorAdapter(ITranslator):
    """Translator backed by the Azure Translator REST API (api-version 3.0).

    Configuration is read from the environment when the adapter is created:
    ``AZURE_TRANSLATOR_KEY`` (required), ``AZURE_TRANSLATOR_REGION`` and
    ``AZURE_TRANSLATOR_ENDPOINT`` (both optional).
    """

    # Total number of POST attempts per call.
    _MAX_ATTEMPTS = 3
    # Statuses worth retrying: timeouts, throttling and server-side errors.
    # Other failures (bad request, bad credentials, ...) will not change on
    # a retry, so they are raised immediately.
    _RETRYABLE_STATUS = frozenset({408, 429, 500, 502, 503, 504})

    def __init__(self):
        # Imported here rather than at module level, so importing this module
        # does not require requests to be installed.
        import requests

        self.session = requests.Session()
        # Raises KeyError when the key is not configured.
        self.key = os.environ["AZURE_TRANSLATOR_KEY"]
        self.region = os.environ.get("AZURE_TRANSLATOR_REGION", "")
        self.endpoint = os.environ.get(
            "AZURE_TRANSLATOR_ENDPOINT",
            "https://api.cognitive.microsofttranslator.com",
        )

    def translate_batch(self, texts: Iterable[str], src: str, dest: str) -> List[str]:
        """Translate *texts* from *src* to *dest* in a single API request.

        Args:
            texts: Strings to translate; consumed once.
            src: Source language code (sent as the ``from`` parameter).
            dest: Target language code (sent as the ``to`` parameter).

        Returns:
            The first translation of every item in the response, in response
            order. An empty input returns ``[]`` without a request.

        Raises:
            requests.HTTPError: when the final attempt, or any attempt with a
                non-retryable status, returns an error status.
        """
        import json, time

        # The API accepts many sentences per request, which helps with cost
        # and rate limits.
        body = [{"text": t} for t in texts]
        if not body:
            # Nothing to translate; posting an empty array is pointless.
            return []

        url = f"{self.endpoint.rstrip('/')}/translate"
        # Passed as params so requests URL-encodes the values.
        params = {"api-version": "3.0", "from": src, "to": dest}
        headers = {
            "Ocp-Apim-Subscription-Key": self.key,
            "Content-Type": "application/json",
        }
        if self.region:
            # Only send the region header when a region is configured.
            headers["Ocp-Apim-Subscription-Region"] = self.region

        payload = json.dumps(body)

        # Simple retry with exponential backoff (0.5s, 1s, ...).
        for attempt in range(self._MAX_ATTEMPTS):
            r = self.session.post(url, params=params, headers=headers, data=payload, timeout=10)
            if r.ok:
                return [item["translations"][0]["text"] for item in r.json()]
            is_last = attempt == self._MAX_ATTEMPTS - 1
            if is_last or r.status_code not in self._RETRYABLE_STATUS:
                # Do not sleep when no further attempt will follow.
                break
            time.sleep(0.5 * (2 ** attempt))

        r.raise_for_status()  # raises on failure
|
|
|
|
# Module-wide cache for the adapter created by get_translator().
_provider_singleton: ITranslator | None = None


def get_translator() -> ITranslator:
    """Return the module's shared translator, creating it on first use.

    The backend is chosen from the ``TRANSLATOR_PROVIDER`` environment
    variable (compared case-insensitively): ``"azure"`` selects
    ``AzureTranslatorAdapter``; any other value, or no value, selects
    ``DeepTranslatorAdapter``. The variable is only read when no adapter has
    been cached yet.
    """
    global _provider_singleton

    if _provider_singleton is None:
        wanted = os.environ.get("TRANSLATOR_PROVIDER", "deep").lower()
        adapter_cls = AzureTranslatorAdapter if wanted == "azure" else DeepTranslatorAdapter
        # Assigned only after construction succeeds, so a failing constructor
        # leaves the cache empty.
        _provider_singleton = adapter_cls()

    return _provider_singleton
|