# ImageProcessor_MainServer/worker/translator.py
#
# 128 lines
# 4.6 KiB
# Python

# worker/translator.py
from abc import ABC, abstractmethod
from typing import Iterable, List
import os, re
SEP = "\n" # Sentence separator: a newline
MAX_CHARS = 4500 # Maximum length sent in one request (avoids the web engine's URL/query limit)
# ===== Interface =====
class ITranslator(ABC):
    """Common interface implemented by every translation backend in this module."""

    @abstractmethod
    def translate_batch(self, texts: Iterable[str], src: str, dest: str) -> List[str]:
        """Translate a batch of texts from ``src`` into ``dest``.

        Args:
            texts: The strings to translate.
            src: Source language code, passed through to the backend.
            dest: Target language code, passed through to the backend.

        Returns:
            The translated strings, one per input text and in input order.
        """
# ===== Current default: deep-translator =====
class DeepTranslatorAdapter(ITranslator):
    """Translate through deep-translator's ``GoogleTranslator`` client.

    Texts are joined with ``SEP`` into chunks of at most ``MAX_CHARS``
    characters so that many short texts cost a single request.  Whenever a
    chunk cannot be split back into the same number of pieces, its texts are
    translated one at a time instead.
    """

    def __init__(self):
        # Imported here rather than at module level, so importing this module
        # does not require deep-translator.
        from deep_translator import GoogleTranslator

        # The language pair given here is only an initial value;
        # _translate_one() overwrites it on every call.
        self.client = GoogleTranslator(source="zh-CN", target="ko")

    def _translate_one(self, text: str, src: str, dest: str) -> str:
        """Send ``text`` to the client as one request and return its result."""
        self.client.source = src
        self.client.target = dest
        return self.client.translate(text)

    def translate_batch(self, texts: Iterable[str], src: str, dest: str) -> List[str]:
        """Translate ``texts`` from ``src`` to ``dest``, batching where possible.

        Args:
            texts: The strings to translate.
            src: Source language code handed to the client.
            dest: Target language code handed to the client.

        Returns:
            One string per input text, in input order.  A text whose
            translation fails is returned untranslated.
        """
        texts = list(texts)
        print(f"translate_batch in: {texts}")
        if not texts:
            return []

        out: List[str] = []
        buf: List[str] = []
        total = 0
        for t in texts:
            if not self._is_batchable(t):
                # Flush first so the output order keeps matching the input order.
                if buf:
                    out.extend(self._translate_joined(buf, src, dest))
                    buf, total = [], 0
                out.append(self._safe_translate_item(t, src, dest))
                continue
            add_len = len(t) + len(SEP)
            if total + add_len > MAX_CHARS and buf:
                out.extend(self._translate_joined(buf, src, dest))
                buf, total = [], 0
            buf.append(t)
            total += add_len
        if buf:
            out.extend(self._translate_joined(buf, src, dest))

        # On a count mismatch, play safe and re-translate every text individually.
        if len(out) != len(texts):
            print(f"[WARN] translate_batch count mismatch: in={len(texts)} out={len(out)} -> fallback per-sentence")
            out = self._translate_each(texts, src, dest)
        print(f"translate_batch out: {out}")
        return out

    def _is_batchable(self, t: str) -> bool:
        """Return True when ``t`` survives a join/split round trip on ``SEP``.

        A text that already contains ``SEP`` splits into several pieces, and a
        blank text contributes no piece of its own; either one can shift every
        following result onto the wrong input, so such texts are sent alone.
        """
        return bool(t.strip()) and SEP not in t

    def _translate_each(self, parts: List[str], src: str, dest: str) -> List[str]:
        """Translate every text in ``parts`` with its own request."""
        return [self._safe_translate_item(p, src, dest) for p in parts]

    def _safe_translate_item(self, t: str, src: str, dest: str) -> str:
        """Translate one text, returning it unchanged if translation fails."""
        try:
            return self._normalize_spaces(self._translate_one(t, src, dest))
        except Exception as e:
            # Deliberately best-effort: one bad text must not abort the batch,
            # but the failure is reported instead of being dropped silently.
            print(f"[WARN] translate failed -> keeping original text: {e}")
            return t

    def _normalize_spaces(self, s: str) -> str:
        """Drop zero-width spaces and tidy whitespace in a translated string."""
        s = s.replace("\u200b", "")
        s = re.sub(r"[ \t]+", " ", s)  # collapse runs of spaces/tabs
        s = re.sub(r"\s+\n", "\n", s)  # whitespace before a newline
        s = re.sub(r"\n\s+", "\n", s)  # whitespace after a newline
        return s.strip()

    def _translate_joined(self, parts: List[str], src: str, dest: str) -> List[str]:
        """Translate ``parts`` in one request by joining them with ``SEP``.

        Returns exactly ``len(parts)`` strings.  If the request fails, returns
        something other than a string, or does not split back into
        ``len(parts)`` pieces, each part is translated individually instead.
        """
        joined_text = SEP.join(parts)
        try:
            tr = self._translate_one(joined_text, src, dest)
        except Exception as e:
            print(f"[WARN] batch translate error -> per-item fallback: {e}")
            return self._translate_each(parts, src, dest)

        if not isinstance(tr, str):
            # Splitting a non-string would raise outside the try block above.
            print(f"[WARN] batch translate returned {type(tr).__name__} -> per-item fallback")
            return self._translate_each(parts, src, dest)

        results = [self._normalize_spaces(s) for s in tr.split(SEP)]
        if len(results) != len(parts):
            print(f"[WARN] split size mismatch: expected={len(parts)} got={len(results)} -> per-item fallback")
            return self._translate_each(parts, src, dest)
        return results
# ===== Azure =====
class AzureTranslatorAdapter(ITranslator):
    """Translate through the Azure Translator REST endpoint (api-version 3.0).

    Configuration is read from the environment when the adapter is created:
    ``AZURE_TRANSLATOR_KEY`` (required; ``KeyError`` if missing),
    ``AZURE_TRANSLATOR_REGION`` and ``AZURE_TRANSLATOR_ENDPOINT`` (optional).
    """

    _MAX_ATTEMPTS = 3
    # Statuses worth retrying; any other error status is raised immediately.
    _RETRYABLE_STATUS = frozenset({408, 429, 500, 502, 503, 504})

    def __init__(self):
        # Imported here rather than at module level, so importing this module
        # does not require requests.
        import requests

        self.session = requests.Session()
        self.key = os.environ["AZURE_TRANSLATOR_KEY"]
        self.region = os.environ.get("AZURE_TRANSLATOR_REGION", "")
        self.endpoint = os.environ.get(
            "AZURE_TRANSLATOR_ENDPOINT",
            "https://api.cognitive.microsofttranslator.com",
        )

    def translate_batch(self, texts: Iterable[str], src: str, dest: str) -> List[str]:
        """Translate ``texts`` from ``src`` to ``dest`` in a single request.

        Args:
            texts: The strings to translate.
            src: Source language code; when empty, the ``from`` parameter is
                omitted from the request.
            dest: Target language code.

        Returns:
            The first translation of each input text, in input order.  An
            empty input returns ``[]`` without contacting the service.

        Raises:
            requests.HTTPError: The service answered with an error status
                that is not retryable, or still failed on the last attempt.
            requests.RequestException: The request itself kept failing
                (connection error, timeout) through the last attempt.
        """
        import time

        import requests

        # The API accepts many texts per request (better for cost and rate limits).
        # NOTE(review): the service documents per-request size limits; confirm
        # them and chunk here if callers can send very large batches.
        body = [{"text": t} for t in texts]
        if not body:
            return []

        url = f"{self.endpoint.rstrip('/')}/translate"
        # Passed as params so requests URL-encodes the language codes.
        params = {"api-version": "3.0", "to": dest}
        if src:
            params["from"] = src
        headers = {
            "Ocp-Apim-Subscription-Key": self.key,
            "Content-Type": "application/json",
        }
        if self.region:
            headers["Ocp-Apim-Subscription-Region"] = self.region

        # Simple retry with exponential backoff (0.5s, 1s) between attempts.
        for attempt in range(self._MAX_ATTEMPTS):
            if attempt:
                time.sleep(0.5 * (2 ** (attempt - 1)))
            is_last = attempt == self._MAX_ATTEMPTS - 1
            try:
                r = self.session.post(
                    url, params=params, headers=headers, json=body, timeout=10
                )
            except requests.RequestException:
                if is_last:
                    raise
                continue
            if r.ok:
                return [item["translations"][0]["text"] for item in r.json()]
            if is_last or r.status_code not in self._RETRYABLE_STATUS:
                r.raise_for_status()  # raise on failure

        # Not reachable: the last attempt always returns or raises above.
        raise RuntimeError("Azure translate request failed")
# Translator instance shared by every get_translator() call; created on first use.
_provider_singleton: ITranslator | None = None


def get_translator() -> ITranslator:
    """Return the shared translator, creating it on the first call.

    The backend is chosen by the ``TRANSLATOR_PROVIDER`` environment variable,
    compared case-insensitively and ignoring surrounding whitespace:
    ``"azure"`` selects ``AzureTranslatorAdapter``; anything else, including
    an unset variable, selects ``DeepTranslatorAdapter``.

    Returns:
        The same ``ITranslator`` instance on every call once one has been
        created.  If construction raises, nothing is cached and the next call
        tries again.
    """
    global _provider_singleton
    if _provider_singleton is None:
        provider = os.environ.get("TRANSLATOR_PROVIDER", "deep").strip().lower()
        if provider == "azure":
            _provider_singleton = AzureTranslatorAdapter()
        else:
            if provider not in ("", "deep"):
                # Report the fallback so a misspelled provider name is visible.
                print(f"[WARN] unknown TRANSLATOR_PROVIDER={provider!r} -> using deep-translator")
            _provider_singleton = DeepTranslatorAdapter()
    return _provider_singleton