# Source path: AutoPercenty3/test/paddle2onnx/ocr_runtime_module.py
#
# 522 lines
# 18 KiB
# Python

# ocr_runtime_module.py
# PP-OCRv4-family ONNX inference pipeline (DET/CLS/REC) + PaddleOCR comparison + execution-provider selection
# Designed to run as-is in a Windows/PowerShell environment
import os
import time
import glob
import math
import cv2
import numpy as np
import onnxruntime as ort
# ----------------------------
# Utils
# ----------------------------
def read_image_bgr(path_or_bgr):
    """Return an image, loading it from disk when given a path.

    Args:
        path_or_bgr: A filesystem path (``str`` or ``os.PathLike``) or any
            other object (typically an already-decoded array), which is
            returned untouched.

    Returns:
        For a path input, the array produced by ``cv2.imdecode`` with
        ``cv2.IMREAD_COLOR``; otherwise ``path_or_bgr`` itself.

    Raises:
        FileNotFoundError: If the file is missing, empty, or cannot be
            decoded as an image.
    """
    if not isinstance(path_or_bgr, (str, os.PathLike)):
        return path_or_bgr

    path = os.fspath(path_or_bgr)
    # Read the raw bytes with NumPy and decode in memory.
    # NOTE(review): presumably chosen over cv2.imread so that non-ASCII
    # Windows paths work -- confirm.
    raw = np.fromfile(path, dtype=np.uint8)
    # An empty buffer cannot be decoded; report it the same way as an
    # undecodable file rather than letting imdecode fail on it.
    img = cv2.imdecode(raw, cv2.IMREAD_COLOR) if raw.size else None
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {path}")
    return img
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``.

    Computed in an overflow-free form: only ``exp(-|x|)`` (always <= 1) is
    evaluated, so large-magnitude inputs do not trigger overflow warnings.

    Args:
        x: Scalar or array-like. Floating dtypes are preserved; other dtypes
            are promoted to ``float64``.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    arr = np.asarray(x)
    if not np.issubdtype(arr.dtype, np.floating):
        arr = arr.astype(np.float64)
    decay = np.exp(-np.abs(arr))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- algebraically equal.
    out = np.where(arr >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return out[()] if out.ndim == 0 else out
def softmax(x, axis=-1):
    """Softmax along ``axis``.

    The per-slice maximum is subtracted before exponentiation so the
    exponentials stay bounded by 1.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    exps = np.exp(x - peak)
    total = np.sum(exps, axis=axis, keepdims=True)
    return exps / total
def to_chw(img):
    """Reorder a 3-axis array from (axis0, axis1, axis2) to (axis2, axis0, axis1)."""
    return np.transpose(img, (2, 0, 1))
def to_nchw(img):
    """Prepend a length-1 leading axis to a 3-axis array; return anything else as-is."""
    if img.ndim != 3:
        return img
    return img[None]
def timer_ms(fn):
    """Call ``fn()`` and return ``(result, elapsed_milliseconds)``.

    Elapsed time is measured with ``time.perf_counter``.
    """
    started = time.perf_counter()
    result = fn()
    finished = time.perf_counter()
    return result, (finished - started) * 1000.0
def ensure_exists(p, msg="File not found"):
    """Raise ``FileNotFoundError`` if ``p`` is given but does not exist.

    Args:
        p: Path to check; ``None`` is accepted and skips the check.
        msg: Prefix of the error message (``"<msg>: <p>"``).
    """
    if p is not None and not os.path.exists(p):
        raise FileNotFoundError(f"{msg}: {p}")
def find_best_onnx(onnx_dir, stem_hint):
    """Pick the preferred ONNX file in ``onnx_dir`` whose name contains ``stem_hint``.

    Candidates are searched in this priority order, and the alphabetically
    first match of the first non-empty tier is returned:

        1) *{stem_hint}*.fp16.onnx
        2) *{stem_hint}*.opt.onnx
        3) *{stem_hint}*.simp.onnx
        4) *{stem_hint}*.onnx

    Args:
        onnx_dir: Directory to search (``str`` or ``os.PathLike``). ``None``
            means "no directory" and yields ``None``.
        stem_hint: Substring of the file name; it is inserted into the glob
            pattern as-is.

    Returns:
        The matching path, or ``None`` when nothing matches.
    """
    if onnx_dir is None:
        return None
    # Escape the directory so characters such as '[' or '*' in a folder name
    # are matched literally instead of being read as glob syntax.
    root = glob.escape(os.fspath(onnx_dir))
    for suffix in (".fp16.onnx", ".opt.onnx", ".simp.onnx", ".onnx"):
        matches = sorted(glob.glob(os.path.join(root, f"*{stem_hint}*{suffix}")))
        if matches:
            return matches[0]
    return None
# ----------------------------
# Provider selection
# ----------------------------
def resolve_providers(choice="auto", use_trt_fp16=False, trt_max_workspace=2<<30):
    """Choose ONNX Runtime execution providers and their options.

    Args:
        choice: ``"auto"``, ``"cpu"``, ``"cuda"`` or ``"trt"`` (case-insensitive;
            ``None``/empty and any unrecognised value behave like ``"auto"``).
        use_trt_fp16: Value for the TensorRT ``trt_fp16_enable`` option.
        trt_max_workspace: Value for the TensorRT ``trt_max_workspace_size`` option.

    Returns:
        A 5-tuple ``(providers_det, providers_rec, providers_cls,
        provider_options, available)``. The three provider entries are the
        same list object (all sessions share one provider list, favouring
        simplicity and stability); ``provider_options`` maps provider name to
        its option dict; ``available`` is what
        ``ort.get_available_providers()`` reported.
    """
    cpu = "CPUExecutionProvider"
    cuda = "CUDAExecutionProvider"
    trt = "TensorrtExecutionProvider"

    available = ort.get_available_providers()
    mode = (choice or "auto").lower()

    if mode == "cpu":
        selected = [cpu]
    else:
        # "cuda": CUDA -> CPU.  "trt" and "auto": TRT -> CUDA -> CPU.
        preference = [cuda, cpu] if mode == "cuda" else [trt, cuda, cpu]
        # Keep only what this build offers; fall back to CPU if nothing is left.
        selected = [name for name in preference if name in available] or [cpu]

    # Per-provider options, only for providers actually selected.
    options = {}
    if trt in selected:
        options[trt] = {
            "trt_engine_cache_enable": True,
            "trt_fp16_enable": bool(use_trt_fp16),
            "trt_max_workspace_size": int(trt_max_workspace),
        }
    if cuda in selected:
        options[cuda] = {
            "arena_extend_strategy": "kNextPowerOfTwo"
        }
    return selected, selected, selected, options, available
def create_sess(model_path, providers, provider_options=None, intra_op=0, inter_op=0, graph_optim=True):
    """Build an ``onnxruntime.InferenceSession`` for ``model_path``.

    Args:
        model_path: Path to the ONNX model; must exist.
        providers: Ordered list of execution-provider names.
        provider_options: Optional ``{provider_name: options_dict}``; providers
            without an entry get an empty options dict.
        intra_op: Intra-op thread count; values <= 0 keep the runtime default.
        inter_op: Inter-op thread count; values <= 0 keep the runtime default.
        graph_optim: ``True`` enables all graph optimizations; ``False``
            disables them.

    Returns:
        The created session. No warm-up inference is run here.

    Raises:
        FileNotFoundError: If ``model_path`` is ``None`` or does not exist.
    """
    # ensure_exists() deliberately ignores None, but a session needs a model.
    if model_path is None:
        raise FileNotFoundError("ONNX model missing: None")
    ensure_exists(model_path, "ONNX model missing")

    so = ort.SessionOptions()
    # Set the level explicitly in both cases: leaving it untouched when
    # graph_optim is False would keep the runtime default, making the flag a
    # no-op.
    so.graph_optimization_level = (
        ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        if graph_optim
        else ort.GraphOptimizationLevel.ORT_DISABLE_ALL
    )
    if intra_op > 0:
        so.intra_op_num_threads = intra_op
    if inter_op > 0:
        so.inter_op_num_threads = inter_op

    # InferenceSession expects one options dict per provider, in the same order.
    known = provider_options or {}
    prov_opts = [known.get(name, {}) for name in providers]

    return ort.InferenceSession(
        model_path, sess_options=so, providers=providers, provider_options=prov_opts
    )
# ----------------------------
# Geometry / crop
# ----------------------------
def order_points_clockwise(pts):
    """Sort 2-D points by their angle around the centroid.

    Points are converted to ``float32`` and ordered by ascending
    ``arctan2(dy, dx)`` relative to their mean. With image coordinates (y
    pointing down) an upright rectangle comes out as top-left, top-right,
    bottom-right, bottom-left.

    Returns:
        ``float32`` array with the same rows as ``pts``, reordered.
    """
    points = np.array(pts).astype(np.float32)
    centroid = np.mean(points, axis=0)
    dx = points[:, 0] - centroid[0]
    dy = points[:, 1] - centroid[1]
    order = np.argsort(np.arctan2(dy, dx))
    return points[order]
def four_point_warp(bgr, box, out_h=48):
    """Perspective-rectify a quadrilateral region and resize it to height ``out_h``.

    Args:
        bgr: Source image.
        box: Four (x, y) corner points; they are re-ordered with
            ``order_points_clockwise`` before use.
        out_h: Height of the returned crop; width follows the aspect ratio
            of the rectified patch (at least 1).

    Returns:
        The rectified, resized crop, or ``None`` when the box has zero
        (truncated) width or height.
    """
    quad = order_points_clockwise(box)
    tl, tr, br, bl = quad

    # Use the longer of each pair of opposite edges as the patch size.
    width = int(max(np.linalg.norm(br - bl), np.linalg.norm(tr - tl)))
    height = int(max(np.linalg.norm(tr - br), np.linalg.norm(tl - bl)))
    if width <= 0 or height <= 0:
        return None

    right, bottom = width - 1, height - 1
    target = np.array(
        [[0, 0], [right, 0], [right, bottom], [0, bottom]], dtype=np.float32
    )
    # quad is already float32 (order_points_clockwise converts it).
    transform = cv2.getPerspectiveTransform(quad, target)
    rectified = cv2.warpPerspective(bgr, transform, (width, height))

    # Bring the patch to the recognizer's input height, keeping aspect ratio.
    ratio = out_h / max(1, rectified.shape[0])
    out_w = max(1, int(rectified.shape[1] * ratio))
    return cv2.resize(rectified, (out_w, out_h), interpolation=cv2.INTER_LINEAR)
def crop_by_boxes(img_bgr, boxes, use_warp=True, out_h=48):
    """Cut one crop per box out of ``img_bgr``.

    Args:
        img_bgr: Source image array (rows, cols[, channels]).
        boxes: Iterable of boxes, each a sequence of (x, y) points.
        use_warp: ``True`` rectifies each box with ``four_point_warp``;
            ``False`` takes the axis-aligned bounding rectangle instead.
        out_h: Crop height passed to ``four_point_warp`` (warp mode only).

    Returns:
        List of crops. Boxes that yield no crop (degenerate, or entirely
        outside the image) are skipped, so the list may be shorter than
        ``boxes``.
    """
    img_h, img_w = img_bgr.shape[:2]
    crops = []
    for box in boxes:
        pts = np.array(box)
        if use_warp:
            crop = four_point_warp(img_bgr, pts, out_h=out_h)
            if crop is not None:
                crops.append(crop)
            continue

        x0, y0 = np.min(pts, axis=0).astype(int)
        x1, y1 = np.max(pts, axis=0).astype(int)
        x0, y0 = max(0, x0), max(0, y0)
        # The slice end is exclusive, so the valid upper bound is the image
        # size itself; clamping to size-1 would make the last row/column
        # unreachable.
        x1, y1 = min(img_w, x1), min(img_h, y1)
        if x1 > x0 and y1 > y0:
            crops.append(img_bgr[y0:y1, x0:x1].copy())
    return crops
# ----------------------------
# DET preprocess/postprocess (DB)
# ----------------------------
def det_resize(img, max_side=960, limit_type="max"):
    """Resize an image for the detector so both sides are multiples of 32.

    Args:
        img: Source image array.
        max_side: Target length for the longer side.
        limit_type: ``"max"`` only ever shrinks (scale capped at 1.0); any
            other value scales the longer side to ``max_side`` unconditionally.

    Returns:
        ``(resized, (ratio_h, ratio_w))`` where the ratios are
        ``new_size / original_size`` for each axis. Each side is floored to a
        multiple of 32, with a minimum of 32.
    """
    src_h, src_w = img.shape[:2]
    longest = max(src_h, src_w)
    if limit_type == "max":
        ratio = min(max_side / longest, 1.0)
    else:
        ratio = max_side / float(longest)

    def _snap(length):
        # Scale, enforce the 32-pixel minimum, then floor to a multiple of 32.
        return max(int(length * ratio), 32) // 32 * 32

    dst_h = _snap(src_h)
    dst_w = _snap(src_w)
    resized = cv2.resize(img, (dst_w, dst_h), interpolation=cv2.INTER_LINEAR)
    return resized, (dst_h / src_h, dst_w / src_w)
def det_preprocess(img_bgr):
    """Normalize an image for the detector and return it channel-first.

    Pixels are scaled to [0, 1], then standardized per channel with the
    ImageNet mean/std constants, then transposed via ``to_chw``.
    NOTE(review): the constants are applied to the channels in the order the
    array stores them (no channel swap happens here) -- confirm this matches
    the model's training setup.
    """
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)[None, None, :]
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)[None, None, :]
    unit = img_bgr.astype(np.float32) / 255.0
    standardized = (unit - mean) / std
    return to_chw(standardized)  # (3, H, W)
def boxes_from_bitmap(pred, thresh=0.3, box_thresh=0.6, unclip_ratio=1.5, min_size=3):
    """Turn a text-probability map into rotated text boxes.

    Pipeline: binarize -> contours -> minimum-area rectangle -> mean-probability
    score -> expand ("unclip") -> order corners.

    Args:
        pred: Probability map shaped ``(H, W)`` or ``(1, H, W)``.
        thresh: Pixels strictly above this value are foreground.
        box_thresh: Boxes whose mean probability is below this are dropped.
        unclip_ratio: Expansion factor; each rectangle side is pushed outward
            by ``area * unclip_ratio / perimeter``.
        min_size: Contours with area below this are ignored.

    Returns:
        ``(boxes, scores)``: ``boxes`` is a list of ``int32`` ``(4, 2)`` arrays
        ordered by ``order_points_clockwise``; ``scores`` the matching means.

    Raises:
        ValueError: If ``pred`` is neither 2-D nor 3-D.
    """
    prob_map = np.asarray(pred)
    # Accept both layouts: indexing [0] on an (H, W) map would silently take
    # its first row instead of the map.
    if prob_map.ndim == 3:
        prob_map = prob_map[0]
    if prob_map.ndim != 2:
        raise ValueError(f"pred must be (H, W) or (1, H, W), got shape {np.shape(pred)}")
    map_h, map_w = prob_map.shape

    bin_map = (prob_map > thresh).astype(np.uint8) * 255
    # [-2] selects the contour list whether findContours returns two values
    # (OpenCV 4.x) or three (OpenCV 3.x).
    contours = cv2.findContours(bin_map, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    boxes = []
    scores = []
    for cnt in contours:
        if cv2.contourArea(cnt) < min_size:
            continue
        center, (rect_w, rect_h), angle = cv2.minAreaRect(cnt)
        box = cv2.boxPoints((center, (rect_w, rect_h), angle))  # (4, 2)

        # Score: mean probability inside the box. Only the box's bounding
        # window is rasterized, not a full-size mask per contour.
        poly = np.array(box, dtype=np.float32).astype(np.int32)
        x0 = int(np.clip(poly[:, 0].min(), 0, map_w - 1))
        x1 = int(np.clip(poly[:, 0].max(), 0, map_w - 1))
        y0 = int(np.clip(poly[:, 1].min(), 0, map_h - 1))
        y1 = int(np.clip(poly[:, 1].max(), 0, map_h - 1))
        mask = np.zeros((y1 - y0 + 1, x1 - x0 + 1), dtype=np.uint8)
        cv2.fillPoly(mask, [poly - np.array([x0, y0], dtype=np.int32)], 1)
        window = prob_map[y0:y1 + 1, x0:x1 + 1]
        score = float((window * mask).sum() / (mask.sum() + 1e-6))
        if score < box_thresh:
            continue

        # Unclip: offsetting a rectangle outward by `distance` gives the same
        # rectangle with both sides enlarged by 2 * distance, so rebuild the
        # corners from the enlarged rect instead of scaling about the center
        # (which would grow long boxes mostly along their length).
        perimeter = 2.0 * (rect_w + rect_h)
        if perimeter > 0:
            distance = rect_w * rect_h * unclip_ratio / perimeter
            box = cv2.boxPoints(
                (center, (rect_w + 2.0 * distance, rect_h + 2.0 * distance), angle)
            )

        # Consistent corner order, integer pixel coordinates.
        boxes.append(order_points_clockwise(box).astype(np.int32))
        scores.append(score)
    return boxes, scores
# ----------------------------
# CTC decoder (rec)
# ----------------------------
class CTCLabelDecoder:
    """Greedy CTC decoder backed by a one-character-per-line dictionary file.

    Index 0 is the CTC blank (``"<blank>"``); dictionary entries follow in
    file order, and a space is appended when ``use_space_char`` is set and the
    dictionary does not already contain one.
    """

    def __init__(self, dict_path, use_space_char=True):
        """Load the dictionary.

        Args:
            dict_path: Path to a UTF-8 text file, one entry per line. Empty
                lines are skipped.
            use_space_char: Append ``" "`` as the last class if absent.
        """
        # utf-8-sig also reads plain UTF-8, but strips a leading BOM so the
        # first dictionary entry is not polluted by '\ufeff'.
        with open(dict_path, "r", encoding="utf-8-sig") as f:
            stripped = (line.strip("\n\r") for line in f)
            chars = [entry for entry in stripped if entry]
        if use_space_char and " " not in chars:
            chars.append(" ")
        self.blank_idx = 0
        self.idx2char = ["<blank>"] + chars
        self.char2idx = {c: i for i, c in enumerate(self.idx2char)}

    def decode(self, probs):  # probs: (T, C)
        """Greedy-decode one sequence.

        Takes the arg-max class per time step, drops blanks and collapses
        consecutive repeats.

        Args:
            probs: Array shaped ``(T, C)`` of per-step class scores.

        Returns:
            ``(text, confidence)`` where confidence is the mean of the
            per-step maxima over the kept steps, or ``0.0`` if none is kept.
        """
        probs = np.asarray(probs)
        idxs = probs.argmax(axis=1)
        confs = probs.max(axis=1)

        # Keep a step when it is not blank and differs from the previous step.
        keep = idxs != self.blank_idx
        keep[1:] &= idxs[1:] != idxs[:-1]

        text = "".join(self.idx2char[i] for i in idxs[keep].tolist())
        conf = float(confs[keep].astype(np.float64).mean()) if keep.any() else 0.0
        return text, conf
# ----------------------------
# ONNX models: Det / Cls / Rec
# ----------------------------
class ORTDet:
    """Text detector: ONNX Runtime session plus pre/post-processing."""

    def __init__(self, model_path, providers, provider_options=None, max_side=960):
        """Create the session and remember the first input/output names.

        Args:
            model_path: Path to the detection ONNX model.
            providers: Execution-provider names passed to ``create_sess``.
            provider_options: Optional per-provider options for ``create_sess``.
            max_side: Longest image side fed to the model (see ``det_resize``).
        """
        self.sess = create_sess(model_path, providers, provider_options)
        self.inp = self.sess.get_inputs()[0].name
        self.out = self.sess.get_outputs()[0].name
        self.max_side = max_side

    @staticmethod
    def _probability_map(y):
        """Reduce a raw model output to a 2-D probability map.

        Handles ``(N, 1, H, W)``, ``(N, H, W, 1)``, ``(N, H, W)`` and
        ``(H, W)`` layouts (first batch item only). Sigmoid is applied only
        when the values fall outside [0, 1], i.e. look like logits; applying
        it to values that are already probabilities would squeeze them into
        [0.5, 0.73] and defeat the binarization threshold.
        NOTE(review): assumes a real logit map always contains values outside
        [0, 1] -- confirm against the exported model.
        """
        y = np.asarray(y)
        if y.ndim == 4:
            if y.shape[1] != 1 and y.shape[-1] == 1:
                pm = y[0, ..., 0]
            else:
                pm = y[0, 0]
        elif y.ndim == 3:
            pm = y[0]
        else:
            pm = y
        if pm.size and (pm.min() < 0.0 or pm.max() > 1.0):
            pm = sigmoid(pm)
        return pm

    def _postprocess(self, y, scale, orig_hw):
        """Extract boxes from the model output and map them to original pixels.

        Args:
            y: Raw model output.
            scale: ``(ratio_h, ratio_w)`` returned by ``det_resize``.
            orig_hw: ``(height, width)`` of the original image.

        Returns:
            ``(boxes, scores)`` with boxes as ``int32`` ``(4, 2)`` arrays
            clipped to the original image.
        """
        pm = self._probability_map(y)
        # Pass the map with a leading axis: boxes_from_bitmap documents a
        # (1, H, W) input and indexes [0] to get the 2-D map.
        boxes, scores = boxes_from_bitmap(
            pm[np.newaxis, ...], thresh=0.3, box_thresh=0.6, unclip_ratio=1.5
        )
        inv_y, inv_x = 1.0 / scale[0], 1.0 / scale[1]
        orig_h, orig_w = orig_hw
        restored = []
        for b in boxes:
            b = b.astype(np.float32)
            b[:, 0] = np.clip(b[:, 0] * inv_x, 0, orig_w - 1)
            b[:, 1] = np.clip(b[:, 1] * inv_y, 0, orig_h - 1)
            restored.append(b.astype(np.int32))
        return restored, scores

    def infer(self, img_bgr):
        """Detect text boxes in ``img_bgr``.

        Returns:
            ``(boxes, timings)`` where ``timings`` has millisecond entries
            ``"resize"``, ``"det"`` and ``"post"``. Normalization time is not
            included in any of them.
        """
        (resized, scale), t_resize = timer_ms(
            lambda: det_resize(img_bgr, self.max_side, "max")
        )
        det_x = to_nchw(det_preprocess(resized)).astype(np.float32)
        (y,), t_run = timer_ms(lambda: self.sess.run([self.out], {self.inp: det_x}))
        (boxes, _scores), t_post = timer_ms(
            lambda: self._postprocess(y, scale, img_bgr.shape[:2])
        )
        T = {"resize": t_resize, "det": t_run, "post": t_post}
        return boxes, T
class ORTCls:
    """Text-direction classifier: decides whether a crop should be rotated 180 degrees."""

    def __init__(self, model_path, providers, provider_options=None, thresh=0.9):
        """Create the session and remember the first input/output names.

        Args:
            model_path: Path to the classifier ONNX model.
            providers: Execution-provider names passed to ``create_sess``.
            provider_options: Optional per-provider options for ``create_sess``.
            thresh: Minimum probability of class 1 required to request rotation.
        """
        self.sess = create_sess(model_path, providers, provider_options)
        self.inp = self.sess.get_inputs()[0].name
        self.out = self.sess.get_outputs()[0].name
        self.thresh = float(thresh)

    def preprocess(self, crop_bgr):
        """Resize a crop to height 48, right-pad to 48x192, normalize to [-1, 1].

        The crop is scaled to height 48 keeping aspect ratio (width capped at
        192), pasted onto a white canvas, channel-reversed, scaled with
        mean 0.5 / std 0.5 and returned channel-first.
        NOTE(review): upstream PaddleOCR preprocessing appears to keep the BGR
        channel order and pad with zeros after normalization -- confirm
        against the reference implementation.
        """
        canvas_h, canvas_w = 48, 192
        src_h, src_w = crop_bgr.shape[:2]
        ratio = canvas_h / max(1, src_h)
        fit_w = min(max(1, int(src_w * ratio)), canvas_w)
        scaled = cv2.resize(crop_bgr, (fit_w, canvas_h), interpolation=cv2.INTER_LINEAR)

        canvas = np.full((canvas_h, canvas_w, 3), 255, dtype=np.uint8)
        canvas[:, :fit_w, :] = scaled

        rgb = canvas[:, :, ::-1].astype(np.float32) / 255.0  # reverse channel order
        return to_chw((rgb - 0.5) / 0.5)

    def infer(self, crops):
        """Classify every crop in one batch.

        Returns:
            List of ``(need_rotate, prob)`` per crop, where ``prob`` is the
            top class probability and ``need_rotate`` is true when class 1
            wins with probability above ``self.thresh``. Empty input gives
            an empty list.
        """
        if not crops:
            return []
        batch = np.stack([self.preprocess(c) for c in crops], axis=0).astype(np.float32)
        (logits,) = self.sess.run([self.out], {self.inp: batch})
        # assumes logits is (N, 2) -- TODO confirm
        dist = softmax(logits, axis=1)

        results = []
        for row in dist:
            label = int(np.argmax(row))
            top = float(np.max(row))
            results.append((label == 1 and top > self.thresh, top))
        return results
class ORTRec:
    """Text recognizer: ONNX Runtime session, preprocessing and CTC decoding."""

    def __init__(self, model_path, providers, provider_options=None, dict_path=None, use_space_char=True, img_h=48, img_w=320, batch_size=6):
        """Create the session and the label decoder.

        Args:
            model_path: Path to the recognition ONNX model.
            providers: Execution-provider names passed to ``create_sess``.
            provider_options: Optional per-provider options for ``create_sess``.
            dict_path: Character dictionary for ``CTCLabelDecoder``.
            use_space_char: Forwarded to ``CTCLabelDecoder``.
            img_h: Model input height.
            img_w: Model input width (crops are padded / capped to this).
            batch_size: Number of crops per session run (at least 1).
        """
        self.sess = create_sess(model_path, providers, provider_options)
        self.inp = self.sess.get_inputs()[0].name
        outs = self.sess.get_outputs()
        # Only the first output is consumed.
        self.out = outs[0].name
        self.decoder = CTCLabelDecoder(dict_path, use_space_char)
        self.img_h = img_h
        self.img_w = img_w
        self.batch_size = max(1, int(batch_size))

    def _prep_one(self, crop_bgr):
        """Resize one crop to the input height, right-pad, normalize to [-1, 1].

        The crop is scaled to ``img_h`` keeping aspect ratio, with the width
        capped at ``img_w``, pasted onto a white canvas, channel-reversed,
        scaled with mean 0.5 / std 0.5 and returned channel-first.
        NOTE(review): crops wider than ``img_w`` are squeezed horizontally,
        and upstream PaddleOCR appears to keep BGR order and zero-pad after
        normalization -- confirm against the reference implementation.
        """
        h, w = crop_bgr.shape[:2]
        scale = self.img_h / max(1, h)
        nw = min(max(1, int(w * scale)), self.img_w)
        resized = cv2.resize(crop_bgr, (nw, self.img_h), interpolation=cv2.INTER_LINEAR)
        pad = np.full((self.img_h, self.img_w, 3), 255, dtype=np.uint8)
        pad[:, :nw, :] = resized
        img = pad[:, :, ::-1].astype(np.float32) / 255.0
        img = (img - 0.5) / 0.5
        return to_chw(img)

    @staticmethod
    def _as_probs(y, num_classes):
        """Normalize a raw recognizer output to ``(N, T, C)`` probabilities.

        Layout is decided by matching an axis length against ``num_classes``
        (the decoder's class count); if neither axis matches, the longer of
        the two trailing axes is taken as the class axis. A 2-D ``(N, C)``
        output becomes ``(N, 1, C)``.

        Softmax is applied only when the values are not already a probability
        distribution over the class axis; re-normalizing probabilities would
        flatten them and make the reported confidences meaningless.

        Raises:
            ValueError: If ``y`` is neither 2-D nor 3-D.
        """
        y = np.asarray(y)
        if y.ndim == 2:
            y = y[:, np.newaxis, :]
        elif y.ndim == 3:
            if y.shape[2] == num_classes:
                pass  # already (N, T, C)
            elif y.shape[1] == num_classes or y.shape[1] > y.shape[2]:
                y = np.transpose(y, (0, 2, 1))  # (N, C, T) -> (N, T, C)
        else:
            raise ValueError(f"unexpected recognizer output shape {y.shape}")

        already_normalized = (
            y.size > 0
            and y.min() >= 0.0
            and y.max() <= 1.0
            # Loose tolerance so reduced-precision outputs still qualify.
            and np.allclose(y.sum(axis=2, dtype=np.float64), 1.0, atol=1e-2)
        )
        if already_normalized:
            return y
        return softmax(y, axis=2)

    def infer(self, crops):
        """Recognize text in every crop.

        Args:
            crops: List of image crops.

        Returns:
            ``(texts, timings)``: ``texts`` is a list of ``(text, confidence)``
            in crop order; ``timings`` has millisecond entries ``"prep"`` and
            ``"rec"`` (summed over all batches).
        """
        if not crops:
            return [], {"prep": 0.0, "rec": 0.0}

        batch, t_prep = timer_ms(
            lambda: np.stack([self._prep_one(c) for c in crops], axis=0).astype(np.float32)
        )

        num_classes = len(self.decoder.idx2char)
        texts = []
        t_rec_total = 0.0
        for start in range(0, batch.shape[0], self.batch_size):
            chunk = batch[start:start + self.batch_size]
            # Bind chunk as a default so the lambda does not capture the loop variable.
            (y,), t_rec = timer_ms(
                lambda c=chunk: self.sess.run([self.out], {self.inp: c})
            )
            t_rec_total += t_rec
            probs = self._as_probs(y, num_classes)
            texts.extend(self.decoder.decode(seq) for seq in probs)
        return texts, {"prep": t_prep, "rec": t_rec_total}
# ----------------------------
# Orchestrator
# ----------------------------
class ONNXOCR:
    """End-to-end OCR: detection -> cropping -> optional direction cls -> recognition."""

    def __init__(self, onnx_dir=None, det_path=None, rec_path=None, cls_path=None,
                 dict_path=None, ep="auto", trt_fp16=False, trt_workspace=2<<30,
                 rec_bs=6, use_warp_crop=True):
        """Locate the models and build the three stages.

        Args:
            onnx_dir: Directory searched (via ``find_best_onnx``) for any
                model whose explicit path is not given.
            det_path: Detection model path (required, directly or via ``onnx_dir``).
            rec_path: Recognition model path (required, directly or via ``onnx_dir``).
            cls_path: Direction-classifier model path (optional).
            dict_path: Recognition character dictionary (required).
            ep: Execution-provider choice forwarded to ``resolve_providers``.
            trt_fp16: Forwarded to ``resolve_providers``.
            trt_workspace: Forwarded to ``resolve_providers``.
            rec_bs: Recognition batch size.
            use_warp_crop: Crop with perspective warp instead of axis-aligned boxes.

        Raises:
            FileNotFoundError: If the det/rec model or the dictionary is
                missing.
        """
        # Search the directory only when there is one.
        if onnx_dir is not None:
            if det_path is None:
                det_path = find_best_onnx(onnx_dir, "det")
            if rec_path is None:
                rec_path = find_best_onnx(onnx_dir, "rec")
            # The classifier is optional.
            if cls_path is None:
                cls_path = find_best_onnx(onnx_dir, "cls")

        # ensure_exists() lets None through, so reject unresolved required
        # models explicitly instead of failing later inside the runtime.
        for stage, path in (("det", det_path), ("rec", rec_path)):
            if path is None:
                raise FileNotFoundError(
                    f"{stage} model missing: pass {stage}_path or an onnx_dir containing it"
                )
        ensure_exists(det_path, "det model missing")
        ensure_exists(rec_path, "rec model missing")
        if dict_path is None:
            # The dictionary must be supplied by the caller.
            raise FileNotFoundError("Provide --dict path to ppocr_keys_v1.txt")

        (self.providers_det, self.providers_rec, self.providers_cls,
         self.po, self.avail) = resolve_providers(ep, trt_fp16, trt_workspace)

        po = self.po or None
        self.det = ORTDet(det_path, self.providers_det, po)
        self.rec = ORTRec(rec_path, self.providers_rec, po,
                          dict_path=dict_path, img_h=48, img_w=320, batch_size=rec_bs)
        self.cls = None
        if cls_path and os.path.exists(cls_path):
            self.cls = ORTCls(cls_path, self.providers_cls, po, thresh=0.9)
        self.use_warp_crop = bool(use_warp_crop)

    def run(self, img_bgr, use_cls=False, rec_bs=None):
        """Run the full pipeline on one image.

        Args:
            img_bgr: Input image array.
            use_cls: Apply the direction classifier (if one was loaded) and
                rotate flagged crops by 180 degrees.
            rec_bs: If given, replaces the recognizer's batch size; the new
                value stays in effect for later calls.

        Returns:
            ``(boxes, rec_res, timings)``. ``boxes[i]`` corresponds to
            ``rec_res[i]`` (a ``(text, confidence)`` pair); boxes that
            produced no crop are omitted. ``timings`` holds
            ``det_resize_ms``, ``det_ms``, ``det_post_ms``, ``rec_prep_ms``
            and ``rec_ms``.
        """
        # DET
        det_boxes, t_det = self.det.infer(img_bgr)

        # CROP -- one box at a time so a box whose crop is rejected is dropped
        # together with its crop, keeping boxes and texts index-aligned.
        boxes = []
        crops = []
        for box in det_boxes:
            piece = crop_by_boxes(img_bgr, [box], use_warp=self.use_warp_crop, out_h=48)
            if piece:
                boxes.append(box)
                crops.append(piece[0])

        # CLS (optional)
        if use_cls and self.cls and crops:
            for i, (need_rot, _prob) in enumerate(self.cls.infer(crops)):
                if need_rot:
                    crops[i] = cv2.rotate(crops[i], cv2.ROTATE_180)

        # REC
        if rec_bs is not None:
            self.rec.batch_size = max(1, int(rec_bs))
        rec_res, t_rec = self.rec.infer(crops)

        T = {
            "det_resize_ms": t_det["resize"],
            "det_ms": t_det["det"],
            "det_post_ms": t_det["post"],
            "rec_prep_ms": t_rec["prep"],
            "rec_ms": t_rec["rec"],
        }
        return boxes, rec_res, T
# ----------------------------
# PaddleOCR comparison runner (optional)
# ----------------------------
class PaddleRunner:
    """Reference runner that times the stock PaddleOCR pipeline for comparison."""

    def __init__(self, use_angle_cls=False, lang='ch'):
        """Create the PaddleOCR engine.

        ``paddleocr`` is imported here so the rest of the module works
        without it installed.

        Args:
            use_angle_cls: Load and use PaddleOCR's angle classifier.
            lang: PaddleOCR language code.
        """
        from paddleocr import PaddleOCR
        self.use_angle_cls = bool(use_angle_cls)
        self.ocr = PaddleOCR(use_angle_cls=use_angle_cls, lang=lang, show_log=True)

    def run(self, img_bgr):
        """OCR one image and time the second (post-warm-up) pass.

        The array is handed to PaddleOCR directly; re-encoding it as JPEG
        first would be lossy and would give PaddleOCR different pixels than
        the ONNX pipeline it is compared against.

        Args:
            img_bgr: Input image array.

        Returns:
            ``(results, elapsed_ms)`` where ``results`` is a list of
            ``(text, confidence)`` and ``elapsed_ms`` covers only the timed
            pass.
        """
        # Request classification only if the classifier was actually loaded.
        use_cls = self.use_angle_cls

        # Warm-up pass, excluded from the measurement.
        _ = self.ocr.ocr(img_bgr, cls=use_cls)

        t0 = time.perf_counter()
        res = self.ocr.ocr(img_bgr, cls=use_cls)
        t1 = time.perf_counter()

        out = []
        # The per-image entry is guarded because it may be None/empty when no
        # text is found.
        if res and res[0]:
            for line in res[0]:
                txt = line[1][0]
                conf = float(line[1][1])
                out.append((txt, conf))
        return out, (t1 - t0) * 1000.0