# HUTAMS_AUDIO/app/core/dictionary.py

import json
import os
import csv
import logging
from pydantic import BaseModel

# Messages from this module go through the logger named "uvicorn.error"
# (presumably so they show up alongside uvicorn's own output -- confirm).
logger = logging.getLogger("uvicorn.error")

# Unique term keywords filled in by load_terms_from_csv(); used as extra
# fuzzy-match candidates by DomainDictionary.post_process_correction().
RAILWAY_TERMS_LIST: list[str] = []
# keyword -> list of {"desc", "category"} entries. The value is a list because
# the same keyword may appear on several CSV rows (homonyms).
RAILWAY_TERMS_DICT: dict[str, list[dict]] = {}

def _read_csv_rows(file_path: str, encoding: str) -> list[dict]:
    """Return every row of the CSV at *file_path* as a dict, decoded with *encoding*."""
    # newline="" is what the csv module asks for: it lets the reader handle
    # line endings itself so quoted cells containing newlines stay intact.
    with open(file_path, mode="r", encoding=encoding, newline="") as f:
        return list(csv.DictReader(f))


def load_terms_from_csv(file_path: str) -> None:
    """
    [Chapter 6.1] Load the large domain dictionary (CSV) into the module globals.

    Each row is read through the columns "용어명" (keyword), "내용"
    (description) and "관련분야" (category). Rows without a keyword are
    skipped. Duplicate keywords (homonyms) are kept: RAILWAY_TERMS_DICT maps
    each keyword to a list of {"desc", "category"} entries, while
    RAILWAY_TERMS_LIST receives each keyword once, in first-seen file order,
    for bulk RapidFuzz matching.

    The file is decoded as utf-8-sig first and as cp949 if that fails.

    The globals are only replaced after the file has been read and parsed
    successfully, so a failed (re)load leaves previously loaded terms intact.

    Args:
        file_path: Path of the CSV dictionary file.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        UnicodeDecodeError: If the file decodes with neither encoding.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"CSV 사전을 찾을 수 없습니다: {file_path}")

    # Encoding fallback: utf-8-sig first, cp949 when decoding fails.
    try:
        rows = _read_csv_rows(file_path, "utf-8-sig")
    except UnicodeDecodeError:
        logger.warning(f"UTF-8-SIG 디코딩 실패. CP949로 재시도합니다: {file_path}")
        rows = _read_csv_rows(file_path, "cp949")

    # Build into a local dict first; dict insertion order doubles as the
    # de-duplicated, deterministic keyword order.
    loaded: dict[str, list[dict]] = {}
    count = 0

    for row in rows:
        # DictReader fills cells missing from a short row with None, so
        # `row.get(col, "")` can still return None -- normalise with `or ""`.
        keyword = (row.get("용어명") or "").strip()
        desc = (row.get("내용") or "").strip()
        category = (row.get("관련분야") or "").strip()

        if not keyword:
            continue

        loaded.setdefault(keyword, []).append({
            "desc": desc,
            "category": category
        })
        count += 1

    # Mutate the existing objects in place (never rebind them) so that modules
    # which imported these names keep seeing the current contents.
    RAILWAY_TERMS_LIST.clear()
    RAILWAY_TERMS_DICT.clear()
    RAILWAY_TERMS_DICT.update(loaded)
    RAILWAY_TERMS_LIST.extend(loaded)
    logger.info(f"✅ 대규모 사전 로드 완료: 총 {count}행 처리, 매칭 대상 {len(RAILWAY_TERMS_LIST)}단어 (파일: {file_path})")


# ── Externally defined call signs for speaker classification ────────────────
# speaker_classifier.py imports and uses these lists.
# When a new field call sign is introduced, editing this section alone
# propagates the change to the whole pipeline.

# Phrases that mark an utterance as coming from the control side.
# Call codes are listed both with and without the inner space.
CALLSIGNS_CONTROL: list[str] = [
    # ── Busan urban railway control call codes ──
    "전철 보안", "전철 범일", "전철 호포", "전철 신평", "전철 관제", "전철 통제",
    "전철보안", "전철범일", "전철호포", "전철신평",
    # Phrases characteristic of controllers
    "진로 확인 부탁", "통과 허가", "신호진로 확인",
]

# Phrases that mark an utterance as coming from the train/vehicle side.
# NOTE(review): "검축차" is listed next to "검측차"; presumably a frequent
# misrecognition kept on purpose -- confirm before "fixing" it.
CALLSIGNS_TRAIN: list[str] = [
    # ── Vehicle / train types ──
    "모터카", "전기 모터카", "신호 모터카", "검측차", "검축차", "궤도 검측차",
    # Phrases characteristic of train-side utterances
    "출발 합니다", "출발하겠습니다", "통과 하겠습니다", "통과하겠습니다",
    "확인하고 통과", "신호 확인 후 통과",
]

class DomainDictionary(BaseModel):
    """Built-in station/term vocabulary with helpers for STT prompting and post-correction."""

    # Station names; part of the STT prompt and of the fuzzy-match candidates.
    stations: list[str] = [
        "다대포해수욕장", "다대포항", "낫개", "신장림", "장림", "동매", "신평", "하단", "당리", "사하", "괴정",
        "대티", "서대신", "동대신", "토성", "자갈치", "남포", "중앙", "노포", "범어사", "남산", "두실", "구서", "장전",
        "부산대", "온천장", "명륜", "동래", "교대", "연산", "시청", "양정", "부전", "서면", "범내골", "범일", "좌천", "부산진",
        "초량", "부산역"
    ]
    # Built-in railway terms; used the same way as `stations`.
    railway_terms: list[str] = [
        "모터카", "분기기", "신호기", "궤도", "검축차", "하선", "상선", "입고", "출고", "무전", "수신", "양호",
        "신호 모터카", "전기 모터카", "궤도 검측차"
    ]
    # Literal substring replacements applied before fuzzy matching
    # (misrecognised text -> intended text).
    HARDCODED_FIXES: dict[str, str] = {
        "신호질로": "신호 진로",
        "멀티플": "멀티플 타이탬퍼"
    }

    def get_prompt(self) -> str:
        """Return the initial_prompt string to inject into the STT model.

        The prompt is every station name followed by every built-in railway
        term, joined with single spaces.
        """
        return " ".join(self.stations + self.railway_terms)

    def post_process_correction(self, text: str, threshold: float = 85.0) -> str:
        """
        [Chapter 6.1] Correct likely misrecognised words in *text* using RapidFuzz.

        Steps:
          1. Apply the literal replacements in HARDCODED_FIXES.
          2. Split on whitespace; strip non-alphanumeric characters from each
             token and, if at least 2 characters remain, fuzzy-match it
             (fuzz.WRatio) against the built-in stations and terms plus any
             terms loaded into RAILWAY_TERMS_LIST. A match scoring at least
             *threshold* replaces the token.
          3. Append "역" to any token that contains "다대포", does not already
             end with "역", and does not contain "해수욕".

        Note that whitespace runs collapse to single spaces, and a replaced
        token loses any punctuation that was attached to it.

        Args:
            text: Transcribed sentence to correct.
            threshold: Minimum WRatio score (0-100) required for a replacement.

        Returns:
            The corrected sentence, tokens joined with single spaces.
        """
        from rapidfuzz import process, fuzz

        # 1. Hard-coded literal fixes.
        for bad_word, good_word in self.HARDCODED_FIXES.items():
            if good_word and bad_word in good_word:
                # The replacement contains the pattern itself (e.g. "멀티플" ->
                # "멀티플 타이탬퍼"), so a plain replace would expand text that
                # is already correct. Split on the correct form to protect it
                # and only rewrite the remaining segments.
                text = good_word.join(
                    segment.replace(bad_word, good_word)
                    for segment in text.split(good_word)
                )
            else:
                text = text.replace(bad_word, good_word)

        # Candidate list: built-ins first, then CSV-loaded terms (if any).
        # dict.fromkeys de-duplicates while keeping this order, so that
        # extractOne breaks score ties the same way on every run (a set's
        # iteration order changes with the per-process string hash seed).
        candidates = list(
            dict.fromkeys(self.stations + self.railway_terms + RAILWAY_TERMS_LIST)
        )

        corrected_words = []

        # 2. Token-by-token fuzzy correction.
        for word in text.split():
            # Match on the alphanumeric characters only (simplified handling of
            # attached punctuation).
            clean_word = "".join(c for c in word if c.isalnum())
            best_match = None

            # Skip very short tokens: only compare when 2+ characters remain.
            if len(clean_word) >= 2 and candidates:
                match_result = process.extractOne(
                    clean_word,
                    candidates,
                    scorer=fuzz.WRatio,
                    score_cutoff=threshold
                )
                if match_result:
                    best_match = match_result[0]

            corrected_words.append(best_match if best_match else word)

        # 3. Extra adjustment (optional): add the "역" suffix to "다대포" tokens.
        for i, cw in enumerate(corrected_words):
            if "다대포" in cw and not cw.endswith("역") and "해수욕" not in cw:
                corrected_words[i] = cw + "역"

        return " ".join(corrected_words)

# Global instance, created once at import time with the default vocabulary.
domain_dict = DomainDictionary()