522 lines
18 KiB
Python
522 lines
18 KiB
Python
# ocr_runtime_module.py
|
|
# PP-OCRv4-family ONNX inference pipeline (DET/CLS/REC) + PaddleOCR comparison + execution-provider selection
|
|
# Designed to run out of the box in a Windows/PowerShell environment
|
|
|
|
import os
|
|
import time
|
|
import glob
|
|
import math
|
|
import cv2
|
|
import numpy as np
|
|
import onnxruntime as ort
|
|
|
|
# ----------------------------
|
|
# Utils
|
|
# ----------------------------
|
|
|
|
def read_image_bgr(path_or_bgr):
    """Return an image array, loading it from disk when a path is given.

    Args:
        path_or_bgr: A filesystem path (``str`` or ``os.PathLike``) or an
            already-decoded image.

    Returns:
        For path input, the image decoded with ``cv2.IMREAD_COLOR``.
        Any other input is returned unchanged.

    Raises:
        FileNotFoundError: If the path does not exist, the file is empty,
            or OpenCV cannot decode its contents.
    """
    if not isinstance(path_or_bgr, (str, os.PathLike)):
        return path_or_bgr

    # The bytes are read with NumPy and decoded in memory instead of handing
    # the path to OpenCV; presumably so non-ASCII Windows paths work.
    raw = np.fromfile(os.fspath(path_or_bgr), dtype=np.uint8)

    # imdecode raises cv2.error on an empty buffer; report an empty file the
    # same way as any other unreadable image.
    img = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {path_or_bgr}")
    return img
|
|
|
|
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    The value is computed from ``exp(-|x|)``, which never exceeds 1, so large
    magnitude inputs cannot overflow (the naive form overflows in ``exp`` for
    very negative ``x`` and emits a RuntimeWarning).

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Array with the same shape as ``x`` (a NumPy scalar for scalar input).
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both use z = e^-|x|.
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as-is.
    return out[()]
|
|
|
|
def softmax(x, axis=-1):
    """Softmax of ``x`` along ``axis``.

    The per-slice maximum is subtracted before exponentiating so the
    exponentials cannot overflow.
    """
    weights = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return weights / weights.sum(axis=axis, keepdims=True)
|
|
|
|
def to_chw(img):
    """Reorder an array's axes from (0, 1, 2) to (2, 0, 1), e.g. HWC -> CHW."""
    return np.transpose(img, (2, 0, 1))
|
|
|
|
def to_nchw(img):
    """Prepend a batch axis to a 3-D array; other ranks are returned unchanged."""
    if img.ndim != 3:
        return img
    return np.expand_dims(img, axis=0)
|
|
|
|
def timer_ms(fn):
    """Call ``fn()`` and return ``(result, elapsed_milliseconds)``.

    Elapsed time is measured with ``time.perf_counter``.
    """
    started = time.perf_counter()
    result = fn()
    finished = time.perf_counter()
    return result, (finished - started) * 1000.0
|
|
|
|
def ensure_exists(p, msg="File not found"):
    """Raise ``FileNotFoundError`` if ``p`` is a path that does not exist.

    ``None`` is treated as "nothing to check" and is accepted silently.

    Args:
        p: Path to check, or ``None``.
        msg: Prefix of the error message; the path is appended after ``": "``.
    """
    if p is not None and not os.path.exists(p):
        raise FileNotFoundError(f"{msg}: {p}")
|
|
|
|
def find_best_onnx(onnx_dir, stem_hint):
    """Find the preferred ONNX file in ``onnx_dir`` whose name contains ``stem_hint``.

    Candidates are searched in this priority order; within one pattern the
    alphabetically first match wins:

        1) *{stem_hint}*.fp16.onnx
        2) *{stem_hint}*.opt.onnx
        3) *{stem_hint}*.simp.onnx
        4) *{stem_hint}*.onnx

    Args:
        onnx_dir: Directory to search (``str`` or ``os.PathLike``), or ``None``.
        stem_hint: Substring the file name must contain, e.g. ``"det"``.

    Returns:
        Path of the best match, or ``None`` when nothing matches or
        ``onnx_dir`` is ``None``.
    """
    if onnx_dir is None:
        # No directory to search; previously this crashed inside os.path.join.
        return None

    # Escape the directory so characters such as "[" in its name are taken
    # literally; only the file-name part should act as a glob pattern.
    base = glob.escape(os.fspath(onnx_dir))
    for suffix in (".fp16.onnx", ".opt.onnx", ".simp.onnx", ".onnx"):
        matches = sorted(glob.glob(os.path.join(base, f"*{stem_hint}*{suffix}")))
        if matches:
            return matches[0]
    return None
|
|
|
|
# ----------------------------
|
|
# Provider selection
|
|
# ----------------------------
|
|
|
|
def resolve_providers(choice="auto", use_trt_fp16=False, trt_max_workspace=2 << 30):
    """Choose ONNX Runtime execution providers and their options.

    Args:
        choice: One of ``auto|cpu|cuda|trt`` (case-insensitive). ``None`` and
            any unrecognised value behave like ``auto``.
        use_trt_fp16: Value for the TensorRT ``trt_fp16_enable`` option.
        trt_max_workspace: Value for the TensorRT ``trt_max_workspace_size``
            option.

    Returns:
        ``(used_det, used_rec, used_cls, provider_options, available)``.
        The first three entries are the same list object, so every session
        shares one provider list. ``provider_options`` maps a provider name
        to its option dict. ``available`` is the list reported by
        ``ort.get_available_providers()``.
    """
    trt = "TensorrtExecutionProvider"
    cuda = "CUDAExecutionProvider"
    cpu = "CPUExecutionProvider"

    avail = ort.get_available_providers()
    mode = (choice or "auto").lower()

    if mode == "cpu":
        used = [cpu]
    else:
        # "cuda" prefers CUDA -> CPU. "trt" and the auto fallback prefer
        # TensorRT -> CUDA -> CPU. Unavailable providers are dropped, and CPU
        # is the last resort if nothing remains.
        preference = [cuda, cpu] if mode == "cuda" else [trt, cuda, cpu]
        used = [name for name in preference if name in avail] or [cpu]

    # Per-provider options, only for providers that are actually selected.
    po = {}
    if trt in used:
        po[trt] = {
            "trt_engine_cache_enable": True,
            "trt_fp16_enable": bool(use_trt_fp16),
            "trt_max_workspace_size": int(trt_max_workspace),
        }
    if cuda in used:
        po[cuda] = {
            "arena_extend_strategy": "kNextPowerOfTwo"
        }

    return used, used, used, po, avail
|
|
|
|
def create_sess(model_path, providers, provider_options=None, intra_op=0, inter_op=0, graph_optim=True):
    """Create an ``onnxruntime.InferenceSession`` for ``model_path``.

    No warm-up inference is run here; the session is returned as soon as it
    has been constructed.

    Args:
        model_path: Path to the ONNX model file.
        providers: Ordered list of execution-provider names.
        provider_options: Optional mapping of provider name to option dict.
            Providers without an entry get an empty option dict.
        intra_op: Intra-op thread count; values <= 0 keep the ORT default.
        inter_op: Inter-op thread count; values <= 0 keep the ORT default.
        graph_optim: ``True`` enables all graph optimizations, ``False``
            disables them.

    Returns:
        The created ``InferenceSession``.

    Raises:
        FileNotFoundError: If ``model_path`` is ``None`` or does not exist.
    """
    if model_path is None:
        # ensure_exists() deliberately ignores None, so reject it here with a
        # clear error instead of failing inside ONNX Runtime.
        raise FileNotFoundError("ONNX model missing: None")
    ensure_exists(model_path, "ONNX model missing")

    so = ort.SessionOptions()
    # ORT's default level is already "enable all", so turning optimization off
    # has to be requested explicitly; otherwise graph_optim=False does nothing.
    so.graph_optimization_level = (
        ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        if graph_optim
        else ort.GraphOptimizationLevel.ORT_DISABLE_ALL
    )
    if intra_op > 0:
        so.intra_op_num_threads = intra_op
    if inter_op > 0:
        so.inter_op_num_threads = inter_op

    # ORT expects one option dict per provider, in the same order as `providers`.
    options_by_name = provider_options or {}
    prov_opts = [options_by_name.get(p, {}) for p in providers]

    return ort.InferenceSession(
        model_path, sess_options=so, providers=providers, provider_options=prov_opts
    )
|
|
|
|
# ----------------------------
|
|
# Geometry / crop
|
|
# ----------------------------
|
|
|
|
def order_points_clockwise(pts):
    """Sort 2-D points by their angle around the centroid.

    Points are ordered by ascending ``arctan2(dy, dx)`` relative to the mean
    point. In image coordinates (y pointing down) an axis-aligned rectangle
    comes out as top-left, top-right, bottom-right, bottom-left.

    Args:
        pts: Array-like of shape (N, 2).

    Returns:
        ``float32`` array of the same points in sorted order.
    """
    pts = np.asarray(pts, dtype=np.float32)
    centroid = pts.mean(axis=0)
    dx = pts[:, 0] - centroid[0]
    dy = pts[:, 1] - centroid[1]
    order = np.argsort(np.arctan2(dy, dx))
    return pts[order]
|
|
|
|
def four_point_warp(bgr, box, out_h=48):
    """Perspective-warp a quadrilateral region to an upright patch of height ``out_h``.

    Args:
        bgr: Source image.
        box: Four (x, y) corner points in any order.
        out_h: Height of the returned patch; the width keeps the aspect ratio
            of the rectified region.

    Returns:
        The rectified, resized patch, or ``None`` when the box has zero width
        or height after truncating the edge lengths to integers.
    """
    quad = order_points_clockwise(box)
    tl, tr, br, bl = quad

    # Target size: the longer of each pair of opposite edges.
    width = int(max(np.linalg.norm(br - bl), np.linalg.norm(tr - tl)))
    height = int(max(np.linalg.norm(tr - br), np.linalg.norm(tl - bl)))
    if width <= 0 or height <= 0:
        return None

    dst = np.array(
        [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]],
        dtype=np.float32,
    )
    M = cv2.getPerspectiveTransform(quad.astype(np.float32), dst)
    warped = cv2.warpPerspective(bgr, M, (width, height))

    # Resize to the requested height for the recognizer, keeping the aspect ratio.
    src_h, src_w = warped.shape[:2]
    scale = out_h / max(1, src_h)
    out_w = max(1, int(src_w * scale))
    return cv2.resize(warped, (out_w, out_h), interpolation=cv2.INTER_LINEAR)
|
|
|
|
def crop_by_boxes(img_bgr, boxes, use_warp=True, out_h=48):
    """Cut one patch per box out of ``img_bgr``.

    Args:
        img_bgr: Source image.
        boxes: Iterable of boxes, each a set of (x, y) points.
        use_warp: ``True`` rectifies each box with ``four_point_warp``;
            ``False`` copies the axis-aligned bounding rectangle instead.
        out_h: Patch height passed to ``four_point_warp`` (warp mode only).

    Returns:
        List of patches. Boxes that produce no patch (degenerate warp, or an
        empty rectangle after clamping to the image) are skipped, so the
        list may be shorter than ``boxes``.
    """
    crops = []
    for box in boxes:
        pts = np.array(box)

        if use_warp:
            patch = four_point_warp(img_bgr, pts, out_h=out_h)
            if patch is not None:
                crops.append(patch)
            continue

        # Axis-aligned bounding rectangle, clamped to the image bounds.
        left, top = np.min(pts, axis=0).astype(int)
        right, bottom = np.max(pts, axis=0).astype(int)
        left, top = max(0, left), max(0, top)
        right = min(img_bgr.shape[1] - 1, right)
        bottom = min(img_bgr.shape[0] - 1, bottom)
        if right > left and bottom > top:
            crops.append(img_bgr[top:bottom, left:right].copy())
    return crops
|
|
|
|
# ----------------------------
|
|
# DET preprocess/postprocess (DB)
|
|
# ----------------------------
|
|
|
|
def det_resize(img, max_side=960, limit_type="max"):
    """Resize an image so both sides are multiples of 32.

    Args:
        img: Input image (H, W, ...).
        max_side: Target length of the longer side.
        limit_type: ``"max"`` only shrinks images whose longer side exceeds
            ``max_side``; any other value always scales the longer side to
            ``max_side``.

    Returns:
        ``(resized, (ratio_h, ratio_w))`` where each ratio is the new size
        divided by the original size.
    """
    h, w = img.shape[:2]
    longest = max(h, w)
    if limit_type == "max":
        scale = min(max_side / longest, 1.0)
    else:
        scale = max_side / float(longest)

    def _snap(side):
        # At least 32 pixels, then rounded down to a multiple of 32.
        return max(int(side * scale), 32) // 32 * 32

    nh, nw = _snap(h), _snap(w)
    resized = cv2.resize(img, (nw, nh), interpolation=cv2.INTER_LINEAR)
    return resized, (nh / h, nw / w)
|
|
|
|
def det_preprocess(img_bgr):
    """Normalize an HWC image for the detector and return it as CHW ``float32``.

    Pixels are scaled to [0, 1] and then normalized per channel with the
    mean/std constants below (the ImageNet values). The constants are applied
    to the channels in the order they appear in the input; no channel swap is
    done here.
    """
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 1, 3)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 1, 3)

    scaled = img_bgr.astype(np.float32) / 255.0
    normalized = (scaled - mean) / std
    return normalized.transpose(2, 0, 1)  # (H, W, 3) -> (3, H, W)
|
|
|
|
def boxes_from_bitmap(pred, thresh=0.3, box_thresh=0.6, unclip_ratio=1.5, min_size=3):
    """Turn a probability map into text boxes (simplified DB post-processing).

    Pipeline: binarize -> contours -> ``minAreaRect`` box -> score filter ->
    expand ("unclip") -> order corners.

    Args:
        pred: Probability map of shape (H, W) or (1, H, W). Both layouts are
            accepted; taking ``pred[0]`` of a 2-D map would process only its
            first row.
        thresh: Binarization threshold applied to the map.
        box_thresh: Minimum mean probability inside a box for it to be kept.
        unclip_ratio: Controls how far each box is expanded about its centre.
        min_size: Contours with an area below this value are ignored.

    Returns:
        ``(boxes, scores)``: a list of (4, 2) ``int32`` corner arrays and the
        matching list of float scores.

    Raises:
        ValueError: If ``pred`` is neither 2-D nor 3-D.
    """
    pred = np.asarray(pred)
    if pred.ndim == 3:
        prob_map = pred[0]
    elif pred.ndim == 2:
        prob_map = pred
    else:
        raise ValueError(f"pred must be (H, W) or (1, H, W), got shape {pred.shape}")

    _, bin_map = cv2.threshold(prob_map.astype(np.float32), thresh, 1, cv2.THRESH_BINARY)
    bin_map = (bin_map * 255).astype(np.uint8)

    contours, _ = cv2.findContours(bin_map, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    scores = []
    for cnt in contours:
        if cv2.contourArea(cnt) < min_size:
            continue
        rect = cv2.minAreaRect(cnt)
        box = np.array(cv2.boxPoints(rect), dtype=np.float32)  # (4, 2)
        box_i = box.astype(np.int32)

        # Score: mean probability inside the (integer-rounded) box polygon.
        mask = np.zeros_like(bin_map, dtype=np.uint8)
        cv2.fillPoly(mask, [box_i], 1)
        score = float((prob_map * mask).sum() / (mask.sum() + 1e-6))
        if score < box_thresh:
            continue

        # Unclip (expand). This is a simple approximation of polygon
        # offsetting: the corners are scaled away from the box centre by a
        # factor derived from area * unclip_ratio / perimeter.
        area = cv2.contourArea(box_i)
        length = cv2.arcLength(box_i, True)
        if length > 0:
            distance = area * unclip_ratio / length
            c = box.mean(axis=0)
            box = (box - c) * (1.0 + distance / (np.linalg.norm(box[0] - c) + 1e-6)) + c

        # Order the corners and convert to integer pixel coordinates.
        box = order_points_clockwise(box).astype(np.int32)
        boxes.append(box)
        scores.append(score)
    return boxes, scores
|
|
|
|
# ----------------------------
|
|
# CTC decoder (rec)
|
|
# ----------------------------
|
|
|
|
class CTCLabelDecoder:
    """Greedy CTC decoder driven by a character dictionary file.

    Index 0 is the CTC blank; dictionary characters follow in file order.
    """

    def __init__(self, dict_path, use_space_char=True):
        """Load the dictionary.

        Args:
            dict_path: UTF-8 text file with one character per line. Lines
                that are empty after removing line terminators are skipped.
            use_space_char: Append ``" "`` to the character list unless the
                dictionary already contains it.
        """
        with open(dict_path, 'r', encoding='utf-8') as f:
            stripped = (line.strip('\n\r') for line in f)
            chars = [ch for ch in stripped if ch]
        if use_space_char and " " not in chars:
            chars.append(" ")

        self.blank_idx = 0
        self.idx2char = ["<blank>", *chars]
        self.char2idx = {ch: pos for pos, ch in enumerate(self.idx2char)}

    def decode(self, probs):  # probs: (T, C)
        """Greedy-decode one sequence.

        Takes the arg-max class at every time step, drops blanks and
        collapses consecutive repeats.

        Returns:
            ``(text, confidence)`` where confidence is the mean of the
            per-step maxima over the emitted characters (0.0 if none).
        """
        best_idx = probs.argmax(axis=1)
        best_conf = probs.max(axis=1)

        chars = []
        kept_conf = []
        prev = self.blank_idx
        for step, (ix, cf) in enumerate(zip(best_idx, best_conf)):
            repeated = step > 0 and ix == prev
            prev = ix
            if ix == self.blank_idx or repeated:
                continue
            chars.append(self.idx2char[ix])
            kept_conf.append(float(cf))

        conf = float(np.mean(kept_conf)) if kept_conf else 0.0
        return "".join(chars), conf
|
|
|
|
# ----------------------------
|
|
# ONNX models: Det / Cls / Rec
|
|
# ----------------------------
|
|
|
|
class ORTDet:
    """Text detector: ONNX Runtime session plus DB-style post-processing."""

    def __init__(self, model_path, providers, provider_options=None, max_side=960):
        """Create the session and remember the first input/output names.

        Args:
            model_path: Path to the detection ONNX model.
            providers: Execution-provider names passed to ``create_sess``.
            provider_options: Optional per-provider option mapping.
            max_side: Longest image side used when resizing for detection.
        """
        self.sess = create_sess(model_path, providers, provider_options)
        self.inp = self.sess.get_inputs()[0].name
        self.out = self.sess.get_outputs()[0].name
        self.max_side = max_side

    @staticmethod
    def _score_map(y):
        """Reduce the raw model output to a 2-D probability map in [0, 1]."""
        y = np.asarray(y)
        if y.ndim == 4:
            # (N, 1, H, W) or (N, H, W, 1); anything else falls back to [0, 0].
            if y.shape[1] != 1 and y.shape[-1] == 1:
                pm = y[0, ..., 0]
            else:
                pm = y[0, 0]
        elif y.ndim == 3:
            pm = y[0]
        else:
            pm = y

        # NOTE(review): exported PaddleOCR DB detectors typically already end
        # in a sigmoid. Squashing probabilities a second time maps them into
        # roughly [0.5, 0.73], which defeats the 0.3 / 0.6 thresholds used
        # below. Only apply the sigmoid when the values look like raw logits.
        # Confirm against the actual model.
        if pm.size and (pm.min() < 0.0 or pm.max() > 1.0):
            pm = sigmoid(pm)
        return pm

    def infer(self, img_bgr):
        """Detect text boxes in ``img_bgr``.

        Returns:
            ``(boxes, timings)``: ``boxes`` is a list of (4, 2) ``int32``
            corner arrays in original-image coordinates; ``timings`` holds
            the ``"resize"``, ``"det"`` and ``"post"`` durations in ms.
        """
        # resize
        (resized, scale), t_resize = timer_ms(lambda: det_resize(img_bgr, self.max_side, "max"))

        # normalize and add the batch axis
        det_x = to_nchw(det_preprocess(resized)).astype(np.float32)

        # run
        (y,), t_run = timer_ms(lambda: self.sess.run([self.out], {self.inp: det_x}))

        # post-process
        def _post():
            pm = self._score_map(y)
            # boxes_from_bitmap is documented to take a (1, H, W) map, so the
            # 2-D map is given a leading axis. Passing it bare would make
            # pred[0] select only the first row.
            boxes, scores = boxes_from_bitmap(
                pm[np.newaxis, ...], thresh=0.3, box_thresh=0.6, unclip_ratio=1.5
            )

            # Map boxes back to the original image scale and clamp to its bounds.
            sy, sx = 1.0 / scale[0], 1.0 / scale[1]
            img_h, img_w = img_bgr.shape[:2]
            boxes_orig = []
            for b in boxes:
                b = b.astype(np.float32)
                b[:, 0] = np.clip(b[:, 0] * sx, 0, img_w - 1)
                b[:, 1] = np.clip(b[:, 1] * sy, 0, img_h - 1)
                boxes_orig.append(b.astype(np.int32))
            return boxes_orig, scores

        (boxes, _scores), t_post = timer_ms(_post)

        T = {"resize": t_resize, "det": t_run, "post": t_post}
        return boxes, T
|
|
|
|
class ORTCls:
    """Text-direction classifier backed by an ONNX Runtime session."""

    def __init__(self, model_path, providers, provider_options=None, thresh=0.9):
        """Create the session.

        Args:
            model_path: Path to the classifier ONNX model.
            providers: Execution-provider names passed to ``create_sess``.
            provider_options: Optional per-provider option mapping.
            thresh: Minimum probability of label 1 required to request a
                rotation.
        """
        self.sess = create_sess(model_path, providers, provider_options)
        self.inp = self.sess.get_inputs()[0].name
        self.out = self.sess.get_outputs()[0].name
        self.thresh = float(thresh)

    def preprocess(self, crop_bgr):
        """Resize/pad a crop to 3x48x192 and normalize with mean=0.5, std=0.5.

        The crop is scaled to height 48 keeping its aspect ratio (width capped
        at 192), placed at the left of a white canvas, converted BGR->RGB,
        normalized to [-1, 1] and returned as CHW ``float32``.
        """
        h, w = crop_bgr.shape[:2]
        target_h, target_w = 48, 192
        scale = target_h / max(1, h)
        nw = min(target_w, max(1, int(w * scale)))
        resized = cv2.resize(crop_bgr, (nw, target_h), interpolation=cv2.INTER_LINEAR)
        pad = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
        pad[:, :nw, :] = resized
        img = pad[:, :, ::-1].astype(np.float32) / 255.0  # BGR->RGB
        img = (img - 0.5) / 0.5
        return to_chw(img)

    @staticmethod
    def _to_probs(scores, axis):
        """Return ``scores`` as probabilities along ``axis``.

        NOTE(review): exported PaddleOCR classifier heads typically already
        apply softmax. A second softmax over two classes caps the top
        probability near 0.73, so the default 0.9 threshold could never be
        reached. Softmax is therefore skipped when the values already form a
        distribution (all within [0, 1], summing to ~1). Confirm against the
        actual model.
        """
        scores = np.asarray(scores)
        already_normalized = (
            scores.size > 0
            and scores.min() >= 0.0
            and scores.max() <= 1.0
            and np.allclose(scores.sum(axis=axis), 1.0, atol=1e-2)
        )
        return scores if already_normalized else softmax(scores, axis=axis)

    def infer(self, crops):
        """Classify every crop in a single batch.

        Returns:
            One ``(need_rotate, prob)`` tuple per crop. ``need_rotate`` is
            true when label 1 wins with probability above ``self.thresh``;
            ``prob`` is the winning probability.
        """
        if not crops:
            return []
        batch = np.stack([self.preprocess(c) for c in crops], axis=0).astype(np.float32)
        (y,) = self.sess.run([self.out], {self.inp: batch})

        # The output is assumed to be (N, 2).
        probs = self._to_probs(y, axis=1)
        res = []
        for row in probs:
            lbl = int(np.argmax(row))
            prob = float(np.max(row))
            res.append((lbl == 1 and prob > self.thresh, prob))
        return res
|
|
|
|
class ORTRec:
    """Text recognizer: ONNX Runtime session plus greedy CTC decoding."""

    def __init__(self, model_path, providers, provider_options=None, dict_path=None, use_space_char=True, img_h=48, img_w=320, batch_size=6):
        """Create the session and the label decoder.

        Args:
            model_path: Path to the recognition ONNX model.
            providers: Execution-provider names passed to ``create_sess``.
            provider_options: Optional per-provider option mapping.
            dict_path: Character dictionary file for ``CTCLabelDecoder``.
            use_space_char: Forwarded to ``CTCLabelDecoder``.
            img_h: Input height fed to the model.
            img_w: Input (padded) width fed to the model.
            batch_size: Crops per ``run`` call (values below 1 become 1).
        """
        self.sess = create_sess(model_path, providers, provider_options)
        self.inp = self.sess.get_inputs()[0].name
        # Only the first output is used.
        self.out = self.sess.get_outputs()[0].name
        self.decoder = CTCLabelDecoder(dict_path, use_space_char)
        self.img_h = img_h
        self.img_w = img_w
        self.batch_size = max(1, int(batch_size))

    def _prep_one(self, crop_bgr):
        """Resize/pad one crop to ``3 x img_h x img_w``, normalized to [-1, 1].

        The crop is scaled to ``img_h`` keeping its aspect ratio (width capped
        at ``img_w``), placed at the left of a white canvas, converted
        BGR->RGB and returned as CHW ``float32``.
        """
        h, w = crop_bgr.shape[:2]
        scale = self.img_h / max(1, h)
        nw = min(self.img_w, max(1, int(w * scale)))
        resized = cv2.resize(crop_bgr, (nw, self.img_h), interpolation=cv2.INTER_LINEAR)
        pad = np.ones((self.img_h, self.img_w, 3), dtype=np.uint8) * 255
        pad[:, :nw, :] = resized
        img = pad[:, :, ::-1].astype(np.float32) / 255.0
        img = (img - 0.5) / 0.5
        return to_chw(img)

    def _to_ntc(self, y):
        """Return the model output laid out as (N, T, C).

        The class axis is identified by matching the decoder's class count.
        If neither axis matches, the larger axis is assumed to be the class
        axis, with ties read as (N, T, C). A 2-D (N, C) output is treated as
        a single time step.
        """
        y = np.asarray(y)
        if y.ndim == 2:
            return y[:, np.newaxis, :]
        if y.ndim != 3:
            raise ValueError(f"Unexpected recognizer output shape: {y.shape}")

        num_classes = len(self.decoder.idx2char)
        if y.shape[2] == num_classes:
            return y
        if y.shape[1] == num_classes:
            return np.transpose(y, (0, 2, 1))
        # Fallback heuristic when the dictionary size matches neither axis.
        return y if y.shape[1] <= y.shape[2] else np.transpose(y, (0, 2, 1))

    @staticmethod
    def _to_probs(scores, axis):
        """Return ``scores`` as probabilities along ``axis``.

        NOTE(review): exported PaddleOCR CTC heads typically already apply
        softmax. Applying it again keeps the arg-max but flattens the
        distribution, so reported confidences collapse towards 1/C. Softmax
        is therefore skipped when the values already form a distribution.
        Confirm against the actual model.
        """
        scores = np.asarray(scores)
        already_normalized = (
            scores.size > 0
            and scores.min() >= 0.0
            and scores.max() <= 1.0
            and np.allclose(scores.sum(axis=axis), 1.0, atol=1e-2)
        )
        return scores if already_normalized else softmax(scores, axis=axis)

    def infer(self, crops):
        """Recognize text in every crop.

        Returns:
            ``(texts, timings)``: ``texts`` is a list of ``(text, confidence)``
            tuples, one per crop in order; ``timings`` holds ``"prep"`` and
            ``"rec"`` durations in ms.
        """
        if not crops:
            return [], {"prep": 0.0, "rec": 0.0}

        batch, t_prep = timer_ms(
            lambda: np.stack([self._prep_one(c) for c in crops], axis=0).astype(np.float32)
        )

        texts = []
        t_rec_total = 0.0
        # Batched inference.
        for start in range(0, batch.shape[0], self.batch_size):
            chunk = batch[start:start + self.batch_size]
            (y,), t_rec = timer_ms(
                lambda chunk=chunk: self.sess.run([self.out], {self.inp: chunk})
            )
            t_rec_total += t_rec

            probs = self._to_probs(self._to_ntc(y), axis=2)
            texts.extend(self.decoder.decode(seq) for seq in probs)

        return texts, {"prep": t_prep, "rec": t_rec_total}
|
|
|
|
# ----------------------------
|
|
# Orchestrator
|
|
# ----------------------------
|
|
|
|
class ONNXOCR:
    """End-to-end OCR pipeline: detection -> crop -> optional direction cls -> recognition."""

    def __init__(self, onnx_dir=None, det_path=None, rec_path=None, cls_path=None,
                 dict_path=None, ep="auto", trt_fp16=False, trt_workspace=2 << 30,
                 rec_bs=6, use_warp_crop=True):
        """Locate the models and build the sessions.

        Args:
            onnx_dir: Directory searched with ``find_best_onnx`` for any model
                path that is not given explicitly. May be ``None`` when the
                det and rec paths are supplied.
            det_path: Detection model path (required, directly or via ``onnx_dir``).
            rec_path: Recognition model path (required, directly or via ``onnx_dir``).
            cls_path: Direction-classifier model path (optional).
            dict_path: Character dictionary file (required).
            ep: Execution-provider choice passed to ``resolve_providers``.
            trt_fp16: TensorRT FP16 flag passed to ``resolve_providers``.
            trt_workspace: TensorRT workspace size passed to ``resolve_providers``.
            rec_bs: Recognition batch size.
            use_warp_crop: Use perspective-warped crops instead of
                axis-aligned rectangles.

        Raises:
            FileNotFoundError: If the det or rec model cannot be located, or
                ``dict_path`` is not given.
        """
        # Only search the directory when one was given. Searching with
        # onnx_dir=None used to crash even if det/rec were passed explicitly.
        if onnx_dir is not None:
            if det_path is None:
                det_path = find_best_onnx(onnx_dir, "det")
            if rec_path is None:
                rec_path = find_best_onnx(onnx_dir, "rec")
            # cls is optional
            if cls_path is None:
                cls_path = find_best_onnx(onnx_dir, "cls")

        for label, path in (("det", det_path), ("rec", rec_path)):
            if path is None:
                # ensure_exists() ignores None, so a missing required model
                # must be rejected explicitly.
                raise FileNotFoundError(
                    f"{label} model missing: pass {label}_path or an onnx_dir "
                    f"containing a *{label}*.onnx file"
                )
            ensure_exists(path, f"{label} model missing")

        if dict_path is None:
            # The dictionary could be looked up inside PaddleOCR, but passing
            # it explicitly is the recommended route.
            raise FileNotFoundError("Provide --dict path to ppocr_keys_v1.txt")

        self.providers_det, self.providers_rec, self.providers_cls, self.po, self.avail = resolve_providers(
            ep, trt_fp16, trt_workspace
        )
        po = self.po if self.po else None

        self.det = ORTDet(det_path, self.providers_det, po)
        self.rec = ORTRec(rec_path, self.providers_rec, po,
                          dict_path=dict_path, img_h=48, img_w=320, batch_size=rec_bs)
        self.cls = None
        if cls_path and os.path.exists(cls_path):
            self.cls = ORTCls(cls_path, self.providers_cls, po, thresh=0.9)

        self.use_warp_crop = bool(use_warp_crop)

    def run(self, img_bgr, use_cls=False, rec_bs=None):
        """Run the full pipeline on one image.

        Args:
            img_bgr: Input image array.
            use_cls: Apply the direction classifier (if one was loaded) and
                rotate crops flagged as upside-down by 180 degrees.
            rec_bs: If given, replaces the recognizer's batch size. The new
                value stays in effect for later calls.

        Returns:
            ``(boxes, rec_res, timings)``. ``boxes`` and ``rec_res`` are
            index-aligned: boxes that yield no crop are dropped from both.
        """
        # DET
        det_boxes, t_det = self.det.infer(img_bgr)

        # CROP - one box at a time so that a box whose crop fails is dropped
        # from the box list too, keeping boxes and recognition results aligned.
        boxes = []
        crops = []
        for box in det_boxes:
            patch = crop_by_boxes(img_bgr, [box], use_warp=self.use_warp_crop, out_h=48)
            if patch:
                boxes.append(box)
                crops.append(patch[0])

        # CLS (optional)
        if use_cls and self.cls and crops:
            for i, (need_rot, _prob) in enumerate(self.cls.infer(crops)):
                if need_rot:
                    crops[i] = cv2.rotate(crops[i], cv2.ROTATE_180)

        # REC
        if rec_bs is not None:
            self.rec.batch_size = max(1, int(rec_bs))
        rec_res, t_rec = self.rec.infer(crops)

        T = {
            "det_resize_ms": t_det["resize"],
            "det_ms": t_det["det"],
            "det_post_ms": t_det["post"],
            "rec_prep_ms": t_rec["prep"],
            "rec_ms": t_rec["rec"],
        }
        return boxes, rec_res, T
|
|
|
|
# ----------------------------
|
|
# PaddleOCR comparison runner (optional)
|
|
# ----------------------------
|
|
|
|
class PaddleRunner:
    """Reference runner that times the stock PaddleOCR pipeline for comparison."""

    def __init__(self, use_angle_cls=False, lang='ch'):
        """Build the PaddleOCR engine.

        ``paddleocr`` is imported here so the rest of the module works
        without it installed.

        Args:
            use_angle_cls: Enable PaddleOCR's direction classifier.
            lang: PaddleOCR language code.
        """
        from paddleocr import PaddleOCR
        self.use_angle_cls = bool(use_angle_cls)
        self.ocr = PaddleOCR(use_angle_cls=use_angle_cls, lang=lang, show_log=True)

    def run(self, img_bgr):
        """Run PaddleOCR on ``img_bgr`` and time the second (warm) call.

        The image array is passed to PaddleOCR directly. Re-encoding it as
        JPEG first would change the pixels and add decode time to the
        measurement.

        Returns:
            ``(results, elapsed_ms)`` where ``results`` is a list of
            ``(text, confidence)`` tuples (empty when nothing is detected).
        """
        # Only request classification when the engine was built with it.
        use_cls = self.use_angle_cls

        # warmup
        _ = self.ocr.ocr(img_bgr, cls=use_cls)

        t0 = time.perf_counter()
        res = self.ocr.ocr(img_bgr, cls=use_cls)
        t1 = time.perf_counter()

        # The per-image entry can be None when no text is found, so guard it
        # before iterating.
        lines = res[0] if res else None
        out = [(line[1][0], float(line[1][1])) for line in (lines or [])]
        return out, (t1 - t0) * 1000.0
|