90 lines
3.6 KiB
Python
90 lines
3.6 KiB
Python
import requests
|
|
from PIL import Image
|
|
from io import BytesIO
|
|
import pytesseract
|
|
import re
|
|
import os
|
|
from PIL import Image, ImageEnhance, ImageFilter
|
|
import numpy as np
|
|
import cv2
|
|
|
|
# Browser-like request headers sent with every image download.
# NOTE(review): "br" is advertised in Accept-Encoding; presumably the HTTP
# stack can only decode Brotli responses when a brotli package is installed —
# confirm, or drop "br" if downloads ever come back undecodable.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/88.0.4324.150 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "DNT": "1",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}
|
|
|
|
# Directory that contains this file; the bundled Tesseract install is looked
# up relative to it (under src/Tesseract-OCR).
current_dir = os.path.dirname(os.path.abspath(__file__))

# Tell pytesseract which executable to launch.
tesseract_path = os.path.join(current_dir, 'src', 'Tesseract-OCR', 'tesseract.exe')
pytesseract.pytesseract.tesseract_cmd = tesseract_path

# Export the language-data folder through TESSDATA_PREFIX; a trailing path
# separator is appended to the folder path on purpose.
tessdata_path = os.path.join(current_dir, 'src', 'Tesseract-OCR', 'tessdata') + os.sep
os.environ['TESSDATA_PREFIX'] = tessdata_path
|
|
|
|
def preprocess_image(image):
    """Clean up an image before handing it to OCR.

    The image is converted to grayscale, its contrast is doubled, and the
    result is binarised with OpenCV.

    Args:
        image: A PIL image.

    Returns:
        A new PIL image built from the thresholded pixel array.
    """
    # Grayscale first, then boost contrast by a factor of 2.
    grayscale = image.convert('L')
    boosted = ImageEnhance.Contrast(grayscale).enhance(2)

    # OpenCV works on numpy arrays, not PIL images.
    pixels = np.array(boosted)

    # cv2.threshold returns (threshold_used, thresholded_image); only the
    # image is needed.
    # NOTE(review): with THRESH_OTSU set, OpenCV is documented to choose the
    # threshold itself, so the 150 passed here is presumably ignored — confirm.
    binarised = cv2.threshold(
        pixels, 150, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    # pytesseract is given a PIL image again.
    return Image.fromarray(binarised)
|
|
|
|
|
|
def has_chinese_text_from_url(image_url, timeout=10):
    """Download an image and report whether OCR finds Chinese characters in it.

    Args:
        image_url: URL of the image to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without it a stalled connection would block this call forever.

    Returns:
        A ``(detected_text, found)`` tuple. ``detected_text`` is every matched
        run of Chinese characters joined together and ``found`` is ``True``;
        when nothing matches the result is ``("", False)``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Fetch the image bytes; fail loudly on HTTP errors.
    response = requests.get(image_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    image = Image.open(BytesIO(response.content))
    image = preprocess_image(image)

    # OCR with the Simplified Chinese language model.
    # Alternative configs that were tried earlier: '--psm 6' and
    # '--psm 11 --oem 3'.
    extracted_text = pytesseract.image_to_string(image, lang='chi_sim')

    # Keep only characters in the U+4E00..U+9FFF range, discarding Latin
    # text, digits, punctuation and whitespace from the OCR output.
    chinese_text_only = re.findall(r'[\u4e00-\u9fff]+', extracted_text)

    if not chinese_text_only:
        print("중국어 텍스트가 감지되지 않았습니다.")
        return "", False

    detected_text = ''.join(chinese_text_only)
    print("감지된 중국어 텍스트:", detected_text)
    return detected_text, True
|
|
|
|
# Usage example.
# NOTE(review): this call runs at module level, so it also executes (network
# download + OCR) whenever this file is imported — confirm that is intended.
#
# Other sample images that were tried; assign one to image_url to test it:
#   'http://img.alicdn.com/bao/uploaded/i4/2135832463/O1CN01DQvqMd1U46T2FwASX_!!0-item_pic.jpg'
#   'https://img.alicdn.com/bao/uploaded/i3/2212366255626/O1CN01xbWLX51rQl25kKKhJ_!!0-item_pic.jpg'
#   'https://img.alicdn.com/bao/uploaded/i2/3102536893/O1CN01IkHwDy20n2wskvDJr_!!0-item_pic.jpg'
#   'https://img.alicdn.com/bao/uploaded/i3/2204176702764/O1CN01NnOqzA1WHxjPnJ1wq_!!0-item_pic.jpg'
#   'https://img.alicdn.com/bao/uploaded/i3/2200695998050/O1CN01g33zH129Kx6fNAo48_!!0-item_pic.jpg'
#   'https://img.alicdn.com/bao/uploaded/i3/382860353/O1CN019YDA451ETifJzz0FT_!!382860353.jpg'  # ???
image_url = 'https://img.alicdn.com/bao/uploaded/i3/2201294982631/O1CN016BlLOY1VJ2n8wQAtZ_!!0-item_pic.jpg'

has_chinese_text_from_url(image_url)
|
|
|