"""Detect Simplified Chinese text in a remote image using Tesseract OCR.

The module downloads an image over HTTP, binarizes it, runs Tesseract with
the ``chi_sim`` language data, and keeps only the characters that fall in
the CJK Unified Ideographs range (U+4E00-U+9FFF).

Importing the module configures pytesseract to use the Tesseract binary and
tessdata directory expected under ``src/Tesseract-OCR`` next to this file.
"""

import os
import re
from io import BytesIO

import cv2
import numpy as np
import pytesseract
import requests
from PIL import Image, ImageEnhance, ImageFilter

# Fixed, browser-like headers sent with every image download.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

# Project folder, resolved from the location of this file (the running main.py).
current_dir = os.path.dirname(os.path.abspath(__file__))

# Paths inside the src/Tesseract-OCR folder.
tesseract_path = os.path.join(current_dir, 'src', 'Tesseract-OCR', 'tesseract.exe')
# A trailing path separator is appended to the tessdata directory on purpose.
tessdata_path = os.path.join(current_dir, 'src', 'Tesseract-OCR', 'tessdata') + os.sep

# Point pytesseract (and the Tesseract process it spawns) at those paths.
pytesseract.pytesseract.tesseract_cmd = tesseract_path
os.environ['TESSDATA_PREFIX'] = tessdata_path

# Runs of characters in the CJK Unified Ideographs block (U+4E00-U+9FFF).
_CHINESE_RUN_RE = re.compile(r'[\u4e00-\u9fff]+')


def preprocess_image(image):
    """Return a binarized copy of *image* prepared for OCR.

    The image is converted to 8-bit grayscale, its contrast is doubled, and
    the result is thresholded to black/white with OpenCV.

    Args:
        image: A ``PIL.Image.Image`` in any mode that ``convert('L')`` accepts.

    Returns:
        A new ``PIL.Image.Image`` holding the thresholded pixels.
    """
    gray = image.convert('L')
    contrasted = ImageEnhance.Contrast(gray).enhance(2)

    # OpenCV works on numpy arrays, so round-trip through one.
    pixels = np.array(contrasted)

    # With THRESH_OTSU set, OpenCV picks the threshold itself; the literal
    # 150 is ignored and kept only so the call stays unchanged.
    _, binary = cv2.threshold(pixels, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    return Image.fromarray(binary)


def has_chinese_text_from_url(image_url, *, config='', timeout=10):
    """Download an image and report whether OCR finds Chinese characters.

    Args:
        image_url: URL of the image to download.
        config: Extra Tesseract command-line flags passed through to
            ``pytesseract.image_to_string`` (for example ``'--psm 6'`` or
            ``'--psm 11 --oem 3'``). The default sends no extra flags.
        timeout: Seconds to wait for the HTTP server before giving up, so a
            stalled download cannot block forever.

    Returns:
        A ``(text, found)`` tuple. When at least one character in the range
        U+4E00-U+9FFF is recognized, ``text`` is all such characters joined
        together and ``found`` is ``True``; otherwise ``("", False)``.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    response = requests.get(image_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # surface HTTP error statuses as exceptions

    image = preprocess_image(Image.open(BytesIO(response.content)))

    # OCR with the Simplified Chinese language data.
    extracted_text = pytesseract.image_to_string(image, lang='chi_sim', config=config)

    # Keep only the Chinese characters from the OCR output.
    chinese_runs = _CHINESE_RUN_RE.findall(extracted_text)
    if not chinese_runs:
        print("중국어 텍스트가 감지되지 않았습니다.")
        return "", False

    detected_text = ''.join(chinese_runs)
    print("감지된 중국어 텍스트:", detected_text)
    return detected_text, True


# Usage example. Other sample images that were tried:
# image_url = 'http://img.alicdn.com/bao/uploaded/i4/2135832463/O1CN01DQvqMd1U46T2FwASX_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2212366255626/O1CN01xbWLX51rQl25kKKhJ_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i2/3102536893/O1CN01IkHwDy20n2wskvDJr_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2204176702764/O1CN01NnOqzA1WHxjPnJ1wq_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2200695998050/O1CN01g33zH129Kx6fNAo48_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/382860353/O1CN019YDA451ETifJzz0FT_!!382860353.jpg' # ???
image_url = 'https://img.alicdn.com/bao/uploaded/i3/2201294982631/O1CN016BlLOY1VJ2n8wQAtZ_!!0-item_pic.jpg'

# Only run the demo when executed as a script, so importing this module does
# not trigger a network download and an OCR pass.
if __name__ == "__main__":
    has_chinese_text_from_url(image_url)