# AutoPercenty3/test/ch_img_test.py

import requests
from PIL import Image
from io import BytesIO
import pytesseract
import re
import os
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import cv2

# Browser-like request headers used for the image download request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "en-US,en;q=0.9",
    # "br" is deliberately not advertised: requests/urllib3 can only decode
    # Brotli-encoded responses when an optional brotli package is installed,
    # and an undecoded body could not be opened as an image.
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0"
}

# Directory that contains this script; the bundled Tesseract install is
# resolved relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Bundled Tesseract location: <current_dir>/src/Tesseract-OCR
# NOTE(review): the path is built from this file's own directory and targets a
# Windows executable name — confirm that matches where the bundle really lives.
_tesseract_dir = os.path.join(current_dir, 'src', 'Tesseract-OCR')
tesseract_path = os.path.join(_tesseract_dir, 'tesseract.exe')
# The trailing empty component makes the joined path end with a separator.
tessdata_path = os.path.join(_tesseract_dir, 'tessdata', '')

# Point the language-data lookup and pytesseract at the bundled install.
os.environ['TESSDATA_PREFIX'] = tessdata_path
pytesseract.pytesseract.tesseract_cmd = tesseract_path

def preprocess_image(image):
    """Turn an image into a black-and-white version before running OCR on it.

    The image is converted to grayscale (mode 'L'), its contrast is enhanced
    by a factor of 2, and the result is thresholded with OpenCV.

    Args:
        image: A PIL image.

    Returns:
        A new PIL image created from the thresholded pixel array.
    """
    grayscale = image.convert('L')
    boosted = ImageEnhance.Contrast(grayscale).enhance(2)

    # OpenCV works on numpy arrays, so leave PIL for the thresholding step.
    pixels = np.array(boosted)

    # NOTE(review): with THRESH_OTSU set, OpenCV documents that the threshold
    # is chosen automatically, so the literal 150 is presumably not the value
    # actually applied — confirm before tuning it.
    threshold_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    binarized = cv2.threshold(pixels, 150, 255, threshold_mode)[1]

    # pytesseract is handed a PIL image again.
    return Image.fromarray(binarized)


def has_chinese_text_from_url(image_url, timeout=10):
    """Download an image and report whether OCR finds Chinese characters in it.

    The image is fetched with the module-level ``headers``, run through
    ``preprocess_image``, and read with Tesseract's simplified-Chinese model.
    Only characters in the range U+4E00-U+9FFF are kept from the OCR output.
    The outcome is also printed.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get``. Without it a stalled server would block the
            call indefinitely.

    Returns:
        A ``(detected_text, True)`` tuple when at least one matching character
        was recognised, where ``detected_text`` is all matches concatenated;
        otherwise ``("", False)``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Download the image from the given URL.
    response = requests.get(image_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Raise on a failed request.

    image = Image.open(BytesIO(response.content))
    image = preprocess_image(image)

    # OCR with the simplified-Chinese language data.
    extracted_text = pytesseract.image_to_string(image, lang='chi_sim')
    # Alternative Tesseract configurations that were tried:
    # extracted_text = pytesseract.image_to_string(image, lang='chi_sim', config='--psm 6')
    # extracted_text = pytesseract.image_to_string(image, lang='chi_sim', config='--psm 11 --oem 3')

    # Keep only runs of characters in the U+4E00-U+9FFF range.
    chinese_text_only = re.findall(r'[\u4e00-\u9fff]+', extracted_text)

    if not chinese_text_only:
        print("중국어 텍스트가 감지되지 않았습니다.")
        return "", False

    # Report the filtered text as a single string.
    detected_text = ''.join(chinese_text_only)
    print("감지된 중국어 텍스트:", detected_text)
    return detected_text, True

# Usage example
# image_url = 'http://img.alicdn.com/bao/uploaded/i4/2135832463/O1CN01DQvqMd1U46T2FwASX_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2212366255626/O1CN01xbWLX51rQl25kKKhJ_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i2/3102536893/O1CN01IkHwDy20n2wskvDJr_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2204176702764/O1CN01NnOqzA1WHxjPnJ1wq_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2200695998050/O1CN01g33zH129Kx6fNAo48_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/382860353/O1CN019YDA451ETifJzz0FT_!!382860353.jpg' # ???
image_url = 'https://img.alicdn.com/bao/uploaded/i3/2201294982631/O1CN016BlLOY1VJ2n8wQAtZ_!!0-item_pic.jpg'

# Run the demo only when this file is executed as a script, so that merely
# importing the module (for example by a test collector) does not trigger a
# network download and an OCR pass.
if __name__ == "__main__":
    has_chinese_text_from_url(image_url)