# TransWorker/modules/ocr_module.py

import cv2
import numpy as np
import os
import logging
from typing import List, Dict, Any

class OCRModule:
    def __init__(self, logger=None, base_dir=None):
        """Store the collaborators and eagerly build the PaddleOCR engine.

        Args:
            logger: Object exposing ``log(message, level=..., exc_info=...)``.
                When omitted, a minimal adapter around the standard
                ``logging`` module is used, so the ``self.logger.log(...)``
                calls made throughout this class do not fail on ``None``.
            base_dir: Root directory under which ``initialize_ocr`` looks
                for the ``modules/PP_Models`` model folders.

        Raises:
            RuntimeError: If ``initialize_ocr`` returns ``None``.
        """
        if logger is None:
            std_logger = logging.getLogger(__name__)

            class _FallbackLogger:
                """Adapt the stdlib logger to the ``log(msg, level=...)`` call shape."""

                @staticmethod
                def log(message, level=logging.INFO, exc_info=False):
                    std_logger.log(level, message, exc_info=exc_info)

            logger = _FallbackLogger()

        self.logger = logger
        self.base_dir = base_dir

        # CUDA_VISIBLE_DEVICES is deliberately left untouched so that CUDA
        # remains usable (an earlier revision blanked it out).

        self.ocr = self.initialize_ocr()
        if self.ocr is None:
            # RuntimeError is still an Exception, so existing handlers keep working.
            raise RuntimeError("PaddleOCR 초기화 실패")

    def initialize_ocr(self):
        """Build the PaddleOCR engine from the model folders under ``base_dir``.

        The recognition, detection and angle-classification model directories
        are resolved as ``<base_dir>/modules/PP_Models/{rec,det,cls}`` and
        stored on the instance. GPU inference is requested only when Paddle
        was compiled with CUDA *and* reports at least one CUDA device;
        TensorRT is requested only when, in addition, the ``tensorrt``
        package can be imported.

        Returns:
            The configured ``PaddleOCR`` instance, or ``None`` when anything
            goes wrong (the failure is logged instead of raised).
        """
        try:
            # Resolved inside the ``try`` so that an unusable ``base_dir``
            # (e.g. ``None``) is reported like any other initialization
            # failure instead of escaping as a raw TypeError.
            models_root = os.path.join(self.base_dir, "modules", "PP_Models")
            self.rec_model_dir = os.path.join(models_root, "rec")
            self.det_model_dir = os.path.join(models_root, "det")
            self.cls_model_dir = os.path.join(models_root, "cls")

            from paddleocr import PaddleOCR
            import paddle

            use_gpu = False
            use_tensorrt = False
            try:
                # A CUDA-enabled build alone is not enough: the machine must
                # also expose at least one CUDA device.
                use_gpu = bool(
                    paddle.is_compiled_with_cuda()
                    and paddle.device.cuda.device_count() > 0
                )
                # Probe for TensorRT only when the GPU path is usable.
                if use_gpu:
                    try:
                        import tensorrt as trt
                        use_tensorrt = True
                        self.logger.log(f"TensorRT 사용 가능: {trt.__version__}", level=logging.INFO)
                    except ImportError:
                        self.logger.log("TensorRT 패키지가 설치되지 않음", level=logging.WARNING)
                        use_tensorrt = False
            except Exception as e:
                # Any failure while probing falls back to plain CPU inference.
                self.logger.log(f"GPU 사용 가능 여부 확인 중 오류: {e}", level=logging.WARNING)
                use_gpu = False
                use_tensorrt = False

            self.logger.log(f"PaddleOCR use_gpu: {use_gpu}, use_tensorrt: {use_tensorrt}", level=logging.INFO)

            ocr = PaddleOCR(
                use_gpu=use_gpu,            # enabled only when a CUDA device is present
                use_tensorrt=use_tensorrt,  # enabled only when tensorrt imports
                use_angle_cls=True,         # run the text-angle classifier
                lang="ch",
                precision='fp16',           # request FP16 precision
                use_mp=True,                # enable multiprocessing
                show_log=True,
                det_model_dir=self.det_model_dir,
                rec_model_dir=self.rec_model_dir,
                cls_model_dir=self.cls_model_dir
            )
            return ocr
        except Exception as e:
            # Best effort by design: log and let the caller decide what to do
            # with the ``None`` result.
            self.logger.log(f"❌ PaddleOCR 초기화 실패: {e}", level=logging.ERROR, exc_info=True)
            return None


    def detect_text(self, image_path: str, method: str = 'polygon') -> List[Dict[str, Any]]:
        """Run OCR on an image file and return the detected text regions.

        Args:
            image_path (str): Path of the image file to analyse.
            method (str): How each region is reported. One of ``'polygon'``,
                ``'bbox'``, ``'expanded_bbox'``, ``'rotated_bbox'`` or
                ``'contour'``; any other value logs a warning and falls back
                to ``'polygon'``.

        Returns:
            List[Dict]: One dictionary per detected text line, with keys
                - text: recognized text
                - confidence: recognition score
                - polygon: region outline as a list of points
                - bbox: bounding box as ``(x, y, w, h)``
                - method: name of the method that produced the entry
            An empty list is returned when the file is missing or unreadable,
            when nothing is detected, or when any error occurs (errors are
            logged, never raised).
        """
        if not os.path.exists(image_path):
            self.logger.log(f"이미지 파일을 찾을 수 없습니다: {image_path}", level=logging.ERROR)
            return []

        try:
            image = cv2.imread(image_path)
            if image is None:
                self.logger.log(f"이미지를 읽을 수 없습니다: {image_path}", level=logging.ERROR)
                return []

            self.logger.log(f"🔍 OCR 감지 방식: {method}", level=logging.INFO)

            ocr_raw_results = self.ocr.ocr(image)
            self.logger.log(f"ocr_raw_results: {ocr_raw_results}", level=logging.INFO)

            # Check for an empty/None result *before* iterating over it.
            if not ocr_raw_results:
                self.logger.log("⚠️ OCR 결과가 비어있습니다.", level=logging.WARNING)
                return []

            # Flatten the PaddleOCR 2.x layout (one list of
            # [polygon, (text, score)] entries per page) into a single list.
            converted_results = []
            for page in ocr_raw_results:
                self.logger.log(f"line: {page}", level=logging.INFO)
                if not page:
                    # A page without any detected text can be reported as
                    # None; that means "no text", not a processing error.
                    continue
                for line in page:
                    poly = line[0]
                    text = line[1][0]
                    score = line[1][1]
                    converted_results.append([poly, [text, score]])

            if not converted_results:
                self.logger.log("⚠️ OCR 결과가 비어있습니다.", level=logging.WARNING)
                return []

            # Map each supported method name to the helper implementing it.
            handlers = {
                'polygon': self._detect_with_polygon,
                'bbox': self._detect_with_bbox,
                'expanded_bbox': self._detect_with_expanded_bbox,
                'rotated_bbox': self._detect_with_rotated_bbox,
                'contour': self._detect_with_contour,
            }
            handler = handlers.get(method)
            if handler is None:
                self.logger.log(f"⚠️ 지원하지 않는 감지 방식: {method}, 기본 polygon 방식 사용", level=logging.WARNING)
                handler = self._detect_with_polygon

            return handler(image, converted_results)

        except Exception as e:
            # Best effort by design: callers receive an empty result on failure.
            self.logger.log(f"❌ OCR 처리 중 오류 발생: {e}", level=logging.ERROR, exc_info=True)
            return []

    def filter_chinese_text(self, ocr_results: List[Dict]) -> List[Dict]:
        """Keep only the OCR entries whose text contains a Chinese character.

        A text counts as Chinese when at least one of its characters lies in
        the range U+4E00..U+9FFF (used here for both simplified and
        traditional characters).

        Args:
            ocr_results (List[Dict]): OCR entries; each must have a ``'text'`` key.

        Returns:
            List[Dict]: The matching entries, in their original order.
        """
        def contains_chinese(candidate) -> bool:
            # True as soon as one character falls inside the range.
            for ch in candidate:
                if '\u4e00' <= ch <= '\u9fff':
                    return True
            return False

        kept = [entry for entry in ocr_results if contains_chinese(entry['text'])]

        self.logger.log(f"중국어 텍스트 {len(kept)}개 필터링 완료", level=logging.INFO)
        return kept


    def _detect_with_polygon(self, image: np.ndarray, ocr_raw_results: List) -> List[Dict[str, Any]]:
        """Report each text line with its polygon as given (default method).

        Entries with fewer than two elements, or whose second element holds
        fewer than two values, are skipped. ``image`` is not used here.
        """
        detections = []

        for entry in ocr_raw_results:
            if len(entry) < 2:
                continue

            quad, recognized = entry[0], entry[1]  # outline points, (text, score)
            if len(recognized) < 2:
                continue

            # Axis-aligned box around the (integer-cast) polygon points.
            left, top, width, height = cv2.boundingRect(np.array(quad, dtype=np.int32))

            detections.append({
                'text': recognized[0],
                'confidence': recognized[1],
                'polygon': quad,
                'bbox': (left, top, width, height),
                'method': 'polygon'
            })

        return detections

    def _detect_with_bbox(self, image: np.ndarray, ocr_raw_results: List) -> List[Dict[str, Any]]:
        """Report each text line as its axis-aligned bounding rectangle.

        The ``'polygon'`` of every entry is the four corners of that
        rectangle (top-left, top-right, bottom-right, bottom-left). Entries
        with fewer than two elements, or whose second element holds fewer
        than two values, are skipped. ``image`` is not used here.
        """
        detections = []

        for entry in ocr_raw_results:
            if len(entry) < 2:
                continue

            quad, recognized = entry[0], entry[1]
            if len(recognized) < 2:
                continue

            left, top, width, height = cv2.boundingRect(np.array(quad, dtype=np.int32))
            right = left + width
            bottom = top + height

            # Rectangle corners, starting at the top-left and going clockwise.
            corners = [[left, top], [right, top], [right, bottom], [left, bottom]]

            detections.append({
                'text': recognized[0],
                'confidence': recognized[1],
                'polygon': corners,
                'bbox': (left, top, width, height),
                'method': 'bbox'
            })

        return detections

    def _detect_with_expanded_bbox(self, image: np.ndarray, ocr_raw_results: List) -> List[Dict[str, Any]]:
        """확장된 바운딩 박스 방식으로 텍스트 영역 감지"""
        ocr_results = []
        h_img, w_img = image.shape[:2]

        for line in ocr_raw_results:
            if len(line) >= 2:
                polygon = line[0]
                text_info = line[1]

                if len(text_info) >= 2:
                    text = text_info[0]
                    confidence = text_info[1]

                    # 기본 바운딩 박스
                    polygon_np = np.array(polygon, dtype=np.int32)
                    x, y, w, h = cv2.boundingRect(polygon_np)

                    # 확장 크기 계산 (텍스트 크기의 20%)
                    expand_x = max(1, int(w * 0.2))
                    expand_y = max(1, int(h * 0.2))

                    # 확장된 바운딩 박스
                    x_exp = max(0, x - expand_x)
                    y_exp = max(0, y - expand_y)
                    w_exp = min(w_img - x_exp, w + 2 * expand_x)
                    h_exp = min(h_img - y_exp, h + 2 * expand_y)

                    # 확장된 바운딩 박스를 폴리곤으로 변환
                    expanded_polygon = [
                        [x_exp, y_exp],
                        [x_exp + w_exp, y_exp],
                        [x_exp + w_exp, y_exp + h_exp],
                        [x_exp, y_exp + h_exp]
                    ]

                    ocr_result = {
                        'text': text,
                        'confidence': confidence,
                        'polygon': expanded_polygon,
                        'bbox': (x_exp, y_exp, w_exp, h_exp),
                        'method': 'expanded_bbox'
                    }
                    ocr_results.append(ocr_result)

        return ocr_results

    def _detect_with_rotated_bbox(self, image: np.ndarray, ocr_raw_results: List) -> List[Dict[str, Any]]:
        """Report each text line as its minimum-area (rotated) rectangle.

        The ``'polygon'`` of every entry holds the integer corner points of
        the rectangle returned by ``cv2.minAreaRect``; ``'rotation_info'``
        carries that rectangle's center, size and angle, and ``'bbox'`` is
        the axis-aligned bounding rectangle of the original polygon. Entries
        with fewer than two elements, or whose second element holds fewer
        than two values, are skipped. ``image`` is not used here.
        """
        detections = []

        for entry in ocr_raw_results:
            if len(entry) < 2:
                continue

            quad, recognized = entry[0], entry[1]
            if len(recognized) < 2:
                continue

            points = np.array(quad, dtype=np.float32)

            # Minimum-area rectangle and its four corners as integers.
            min_rect = cv2.minAreaRect(points)
            corners = np.int32(cv2.boxPoints(min_rect))

            # Axis-aligned box of the original (integer-cast) polygon.
            left, top, width, height = cv2.boundingRect(points.astype(np.int32))

            center, size, angle = min_rect[0], min_rect[1], min_rect[2]

            detections.append({
                'text': recognized[0],
                'confidence': recognized[1],
                'polygon': corners.tolist(),
                'bbox': (left, top, width, height),
                'method': 'rotated_bbox',
                'rotation_info': {
                    'center': center,
                    'size': size,
                    'angle': angle
                }
            })

        return detections

    def _detect_with_contour(self, image: np.ndarray, ocr_raw_results: List) -> List[Dict[str, Any]]:
        """Report each text line as a simplified contour of its polygon.

        The polygon is approximated with ``cv2.approxPolyDP`` using a
        tolerance of 2 % of its closed perimeter; the resulting points become
        the entry's ``'polygon'`` and their count is stored in
        ``'contour_points'``. ``'bbox'`` is the axis-aligned bounding
        rectangle of the original polygon. Entries with fewer than two
        elements, or whose second element holds fewer than two values, are
        skipped. ``image`` is not used here.
        """
        detections = []

        for entry in ocr_raw_results:
            if len(entry) < 2:
                continue

            quad, recognized = entry[0], entry[1]
            if len(recognized) < 2:
                continue

            contour = np.array(quad, dtype=np.int32)

            # Simplify the closed contour; tolerance is 2 % of its perimeter.
            tolerance = 0.02 * cv2.arcLength(contour, True)
            simplified = cv2.approxPolyDP(contour, tolerance, True)

            # Back to a plain list of [x, y] points.
            vertices = simplified.reshape(-1, 2).tolist()

            left, top, width, height = cv2.boundingRect(contour)

            detections.append({
                'text': recognized[0],
                'confidence': recognized[1],
                'polygon': vertices,
                'bbox': (left, top, width, height),
                'method': 'contour',
                'contour_points': len(vertices)
            })

        return detections