# AutoPercenty3/test/ch_img_test.py
#
# Listing metadata from the source viewer: 90 lines, 3.6 KiB, Python.

import requests
from PIL import Image
from io import BytesIO
import pytesseract
import re
import os
from PIL import Image, ImageEnhance, ImageFilter
import numpy as np
import cv2
# 지정된 헤더
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
"Cache-Control": "max-age=0"
}
# Locate the Tesseract-OCR distribution under src/, relative to the directory
# that contains this file.
# NOTE(review): the original comment mentions main.py; confirm that a
# src/Tesseract-OCR folder really sits next to this file.
current_dir = os.path.dirname(os.path.abspath(__file__))
_tesseract_root = os.path.join(current_dir, 'src', 'Tesseract-OCR')

# Executable that pytesseract should launch.
tesseract_path = os.path.join(_tesseract_root, 'tesseract.exe')
# Language-data directory, written with a trailing path separator.
tessdata_path = os.path.join(_tesseract_root, 'tessdata') + os.sep

# Publish both locations: the data directory through the environment and the
# executable through pytesseract's module-level setting.
os.environ['TESSDATA_PREFIX'] = tessdata_path
pytesseract.pytesseract.tesseract_cmd = tesseract_path
def preprocess_image(image):
    """Return a high-contrast, black-and-white version of *image* for OCR.

    The image is converted to grayscale, its contrast is enhanced by a factor
    of 2, and the result is binarized with OpenCV before being wrapped in a
    PIL image again.

    Args:
        image: PIL image to prepare.

    Returns:
        A new PIL image built from the thresholded pixel array.
    """
    grayscale = image.convert('L')
    boosted = ImageEnhance.Contrast(grayscale).enhance(2)

    # OpenCV works on numpy arrays, so leave PIL for the thresholding step.
    pixels = np.array(boosted)

    # Per the OpenCV documentation, THRESH_OTSU makes the library compute the
    # cut-off itself and use it instead of the supplied value (150).
    binarize_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    binary_pixels = cv2.threshold(pixels, 150, 255, binarize_flags)[1]

    # pytesseract is fed a PIL image, so convert back.
    return Image.fromarray(binary_pixels)
def has_chinese_text_from_url(image_url, timeout=10):
    """Download an image and report whether OCR finds Chinese characters in it.

    The image is fetched with the module-level ``headers``, run through
    ``preprocess_image`` and passed to Tesseract with the Simplified Chinese
    model (``chi_sim``). Only characters in the range U+4E00-U+9FFF are kept
    from the OCR output.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout requests waits indefinitely, so a stalled
            server would hang the caller.

    Returns:
        A ``(detected_text, found)`` tuple: the concatenated Chinese
        characters and ``True`` when any were recognized, otherwise
        ``("", False)``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(image_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # surface HTTP errors instead of OCR-ing an error page

    image = preprocess_image(Image.open(BytesIO(response.content)))

    # Other Tesseract configurations that were tried here:
    # config='--psm 6' and config='--psm 11 --oem 3'.
    extracted_text = pytesseract.image_to_string(image, lang='chi_sim')

    # Keep only runs of CJK ideographs; OCR noise in other scripts is dropped.
    chinese_runs = re.findall(r'[\u4e00-\u9fff]+', extracted_text)
    if not chinese_runs:
        print("중국어 텍스트가 감지되지 않았습니다.")
        return "", False

    detected_text = ''.join(chinese_runs)
    print("감지된 중국어 텍스트:", detected_text)
    return detected_text, True
# Usage example. Alternative sample images that can be swapped in by hand:
# image_url = 'http://img.alicdn.com/bao/uploaded/i4/2135832463/O1CN01DQvqMd1U46T2FwASX_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2212366255626/O1CN01xbWLX51rQl25kKKhJ_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i2/3102536893/O1CN01IkHwDy20n2wskvDJr_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2204176702764/O1CN01NnOqzA1WHxjPnJ1wq_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/2200695998050/O1CN01g33zH129Kx6fNAo48_!!0-item_pic.jpg'
# image_url = 'https://img.alicdn.com/bao/uploaded/i3/382860353/O1CN019YDA451ETifJzz0FT_!!382860353.jpg'  # ???
image_url = 'https://img.alicdn.com/bao/uploaded/i3/2201294982631/O1CN016BlLOY1VJ2n8wQAtZ_!!0-item_pic.jpg'

# Run the download + OCR only when executed as a script. This file's name
# matches pytest's default *_test.py pattern, so an unguarded call would fire
# a network request as soon as the module is imported during collection.
if __name__ == "__main__":
    has_chinese_text_from_url(image_url)