# AI_MMI_Analyser/app/data/fast_parser.py
#
# 254 lines
# 12 KiB
# Python

import os
import numpy as np
import os
import numpy as np
import polars as pl
from typing import Optional
class FastLogParser:
CHUNK_SIZE = 84
@staticmethod
def parse_to_parquet(input_path: str, output_path: str = None) -> Optional[str]:
"""
Numpy 벡터 연산을 사용하여 MMI 바이너리 로그를 초고속으로 파싱 후 Parquet 저장.
(VDI/VDO 포함 모든 필드 완벽 구현)
"""
if not os.path.exists(input_path):
return None
file_size = os.path.getsize(input_path)
n_records = file_size // FastLogParser.CHUNK_SIZE
if n_records == 0:
return None
try:
with open(input_path, 'rb') as f:
# 파일 전체를 한번에 메모리로 로드 (가장 빠름)
raw_data = np.fromfile(f, dtype=np.uint8)
# 잘린 마지막 청크 제거 및 Reshape
limit = n_records * FastLogParser.CHUNK_SIZE
raw_data = raw_data[:limit]
# (Row수, 84바이트) 형태의 2차원 배열로 변환
m = raw_data.reshape(n_records, FastLogParser.CHUNK_SIZE)
# --- [1. Basic Info] ---
seq = m[:, 1]
source = m[:, 3]
# 16(0x10) or 80(0x50) -> 1계, 그 외 -> 2계
system_id = np.where((source == 16) | (source == 80), 1, 2).astype(np.uint8)
# 이중계 판단 (32가 아니면 Break/Standby)
is_break = (source != 32)
# Time (BCD-like hex display: 0x25 -> 25)
# (val // 16) * 10 + (val % 16) 로직이 맞음
def bcd(arr): return (arr // 16) * 10 + (arr % 16)
year = bcd(m[:, 4]).astype(np.int16) + 2000
month = bcd(m[:, 5])
day = bcd(m[:, 6])
hour = bcd(m[:, 7])
minute = bcd(m[:, 8])
second = bcd(m[:, 9])
# --- [2. Speed & Analog] ---
# uint8 두 개를 합쳐 uint16으로 변환 시 반드시 먼저 astype(uint16)을 해야 함 (안 그러면 오버플로우)
trainspeed = ((m[:, 10].astype(np.uint16) << 8) | m[:, 11].astype(np.uint16)) / 10.0
limitspeed = m[:, 12]
pwm_value = m[:, 22]
ato_limitSpeed = m[:, 33]
tasc_value = m[:, 33]
# --- [3. Digital Flags (Bitwise Vectorized)] ---
# Byte 13: DO ATC
b13 = m[:, 13]
do_zvr = (b13 & 32) > 0; do_edl = (b13 & 16) > 0; do_edr = (b13 & 8) > 0
do_fsb = (b13 & 4) > 0; do_ebm = (b13 & 2) > 0; do_ebp = (b13 & 1) > 0
# Byte 14: Status
b14 = m[:, 14]
system_active = (b14 & 128) > 0; over_spd_warning = (b14 & 64) > 0
tcr = (b14 & 32) > 0; hcr = (b14 & 16) > 0
door_open = (b14 & 8) > 0; door_close = (b14 & 4) > 0
psd_open = (b14 & 2) > 0; psd_close = (b14 & 1) > 0
# Byte 15: Mode
b15 = m[:, 15]
fa = (b15 & 128) > 0; auto = (b15 & 64) > 0; mcs = (b15 & 32) > 0
yard = (b15 & 16) > 0; fmc = (b15 & 8) > 0
reverser_rvs = (b15 & 4) > 0; reverser_fwd = (b15 & 2) > 0; reverser_neu = (b15 & 1) > 0
# Byte 16: Mascon
b16 = m[:, 16]
ato_start_btn = (b16 & 128) > 0; ato_eb_req = (b16 & 64) > 0
tacho_dir_a = (b16 & 32) > 0; tacho_dir_b = (b16 & 16) > 0
mascon_dr = (b16 & 2) > 0; mascon_br = (b16 & 4) > 0; mascon_eb = (b16 & 8) > 0
# Byte 19: ATC Status
b19 = m[:, 19]
val_19 = b19 & 192 # 0, 192, 64, 128
wheelcheck = (b19 & 32) > 0
# Byte 20: Fail
b20 = m[:, 20]
fail_atcr = (b20 & 128) > 0; fail_atoc = (b20 & 64) > 0; fail_tcms = (b20 & 32) > 0
fail_tacho2 = (b20 & 2) > 0; fail_tacho1 = (b20 & 1) > 0
# Byte 21: Marker
b21 = m[:, 21]
recovery = (b21 & 128) > 0; nomal = (b21 & 64) > 0; tasc = (b21 & 32) > 0
marker_val = b21 & 31 # 16, 8, 4, 2, 1
# Byte 23: RLY ATO
b23 = m[:, 23]
trac_dr = (b23 & 128) > 0; trac_br = (b23 & 64) > 0; trac_cs = (b23 & 32) > 0
ador = (b23 & 16) > 0; adol = (b23 & 8) > 0; adc = (b23 & 4) > 0
start_enable = (b23 & 2) > 0; trainberth = (b23 & 1) > 0
# Byte 24: TCMS
b24 = m[:, 24]
tc2 = (b24 & 128) > 0; tc1 = (b24 & 64) > 0; tascdb = (b24 & 32) > 0
# Byte 25: ETC
b25 = m[:, 25]
pre_brake = (b25 & 128) > 0; limit_drive = (b25 & 64) > 0
ov_stop1 = (b25 & 8) > 0; ov_stop2 = (b25 & 4) > 0
sh_stop1 = (b25 & 2) > 0; sh_stop2 = (b25 & 1) > 0
# Byte 26-27: Train No
trainno_int = (m[:, 26].astype(np.uint16) << 8) | m[:, 27].astype(np.uint16)
# Byte 28-30: Stations
pstn = m[:, 28]; nstn = m[:, 29]; dstn = m[:, 30]
# Byte 31-32: DTG (Distance to Go) - 정밀 로직 구현
num_dtg = (m[:, 31].astype(np.uint16) << 8) | m[:, 32].astype(np.uint16)
# DTG 부호 처리: uint16을 int16으로 해석해서 음수 여부 판단
# C# 로직: if (num_dtg & 32768) != 0: num_dtg -= 65536
# Numpy에서는 그냥 .view(np.int16)하거나 astype(np.int16)하면 32768(0x8000) 이상은 자동으로 음수가 됨.
dtg_signed = num_dtg.astype(np.int16)
# 조건: TASC 모드가 아니면서 속도가 0이고 역행(Dr)이 아닐 때 부호 반전
cond_reverse = (~tasc) & (trainspeed == 0.0) & (~trac_dr)
# np.where(조건, 참일때값, 거짓일때값)
# 참일 때: (값 * -1) / 100.0
# 거짓일 때: TASC면 / 100.0, 아니면 / 10.0
dtg = np.where(
cond_reverse,
dtg_signed * -1.0 / 100.0,
np.where(tasc, num_dtg / 100.0, num_dtg / 10.0)
)
# Byte 34: TWC
b34 = m[:, 34]
twct_enable = (b34 & 16) > 0; door_close_warning = (b34 & 8) > 0; wrongdoor = (b34 & 4) > 0
# Byte 35: ATC Code
b35 = m[:, 35]
atc_idx = (b35 & 240) >> 4
osc_f0_ok = (b35 & 8) > 0
# Byte 36-39, 55, 58: Freq (XOR)
atc_code_carrier_f = (((m[:, 36]^65).astype(np.uint16) << 8) | (m[:, 37]^82).astype(np.uint16)) * 10.0
atc_code_f = (((m[:, 38]^99).astype(np.uint16) << 8) | (m[:, 39]^116).astype(np.uint16)) / 10.0
osc_f = (((m[:, 55]^99).astype(np.uint16) << 8) | (m[:, 58]^100).astype(np.uint16)) / 10.0
# --- [Missing Fields Restored: VDI/VDO] ---
# VDI A (63-65)
vdia_rvs = (m[:, 63] & 128) > 0; vdia_neu = (m[:, 63] & 64) > 0; vdia_fwd = (m[:, 63] & 32) > 0
vdia_mascondr = (m[:, 63] & 16) > 0; vdia_masconbr = (m[:, 63] & 8) > 0; vdia_masconeb = (m[:, 63] & 4) > 0
vdia_doorclose = (m[:, 63] & 2) > 0; vdia_dooropen = (m[:, 63] & 1) > 0
vdia_fmc = (m[:, 64] & 64) > 0; vdia_yard = (m[:, 64] & 32) > 0; vdia_mcs = (m[:, 64] & 16) > 0
vdia_auto = (m[:, 64] & 8) > 0; vdia_fa = (m[:, 64] & 4) > 0
# VDI B (66-68)
vdib_rvs = (m[:, 66] & 128) > 0; vdib_neu = (m[:, 66] & 64) > 0; vdib_fwd = (m[:, 66] & 32) > 0
vdib_mascondr = (m[:, 66] & 16) > 0; vdib_masconbr = (m[:, 66] & 8) > 0; vdib_masconeb = (m[:, 66] & 4) > 0
vdib_doorclose = (m[:, 66] & 2) > 0; vdib_dooropen = (m[:, 66] & 1) > 0
# VDI C (69-71)
vdic_tc2 = (m[:, 69] & 128) > 0; vdic_tc1 = (m[:, 69] & 64) > 0
vdic_edlfb = (m[:, 69] & 32) > 0; vdic_edrfb = (m[:, 69] & 16) > 0
vdic_psdclose = (m[:, 70] & 2) > 0; vdic_psdopen = (m[:, 70] & 1) > 0
# VDO A (75-76)
vdoa_edl = (m[:, 75] & 32) > 0; vdoa_edr = (m[:, 75] & 16) > 0
vdoa_zvr = (m[:, 75] & 8) > 0; vdoa_fsb = (m[:, 75] & 4) > 0
# --- [4. Create DataFrame] ---
data_dict = {
"seq": seq, "source": source, "system_id": system_id, "is_break": is_break,
"year": year, "month": month, "day": day, "hour": hour, "minute": minute, "second": second,
"trainspeed": trainspeed, "limitspeed": limitspeed, "pwm_value": pwm_value,
"ato_limitSpeed": ato_limitSpeed, "tasc_value": tasc_value,
"do_zvr": do_zvr, "do_edl": do_edl, "do_edr": do_edr, "do_fsb": do_fsb, "do_ebm": do_ebm, "do_ebp": do_ebp,
"system_active": system_active, "over_spd_warning": over_spd_warning, "tcr": tcr, "hcr": hcr,
"door_open": door_open, "door_close": door_close, "psd_open": psd_open, "psd_close": psd_close,
"fa": fa, "auto": auto, "mcs": mcs, "yard": yard, "fmc": fmc,
"reverser_rvs": reverser_rvs, "reverser_fwd": reverser_fwd, "reverser_neu": reverser_neu,
"ato_start_btn": ato_start_btn, "ato_eb_req": ato_eb_req,
"tacho_dir_a": tacho_dir_a, "tacho_dir_b": tacho_dir_b,
"mascon_dr": mascon_dr, "mascon_br": mascon_br, "mascon_eb": mascon_eb,
"atc_status_code": val_19,
"fail_atcr": fail_atcr, "fail_atoc": fail_atoc, "fail_tcms": fail_tcms,
"recovery": recovery, "nomal": nomal, "tasc": tasc, "marker_val": marker_val,
"trac_dr": trac_dr, "trac_br": trac_br, "trac_cs": trac_cs,
"tc2": tc2, "tc1": tc1, "tascdb": tascdb,
"pre_brake": pre_brake, "limit_drive": limit_drive,
"trainno_int": trainno_int,
"pstn": pstn, "nstn": nstn, "dstn": dstn,
"dtg": dtg,
"twct_enable": twct_enable, "door_close_warning": door_close_warning, "wrongdoor": wrongdoor,
"atc_idx": atc_idx, "osc_f0_ok": osc_f0_ok,
"atc_code_carrier_f": atc_code_carrier_f, "atc_code_f": atc_code_f, "osc_f": osc_f,
# 복원된 VDI/VDO
"vdia_rvs": vdia_rvs, "vdia_neu": vdia_neu, "vdia_fwd": vdia_fwd,
"vdia_mascondr": vdia_mascondr, "vdia_masconbr": vdia_masconbr,
"vdia_doorclose": vdia_doorclose, "vdia_dooropen": vdia_dooropen,
"vdib_doorclose": vdib_doorclose, "vdib_dooropen": vdib_dooropen,
"vdoa_edl": vdoa_edl, "vdoa_edr": vdoa_edr, "vdoa_zvr": vdoa_zvr
}
df = pl.DataFrame(data_dict)
# --- [5. Post-Processing] ---
# Polars의 강력한 문자열 처리 기능 사용 (Lambda 루프 제거 -> 속도 향상)
# 시간 문자열 생성
df = df.with_columns(
pl.format("{}-{}-{} {}:{}:{}",
pl.col("year"),
pl.col("month").cast(pl.String).str.zfill(2),
pl.col("day").cast(pl.String).str.zfill(2),
pl.col("hour").cast(pl.String).str.zfill(2),
pl.col("minute").cast(pl.String).str.zfill(2),
pl.col("second").cast(pl.String).str.zfill(2)
).alias("time")
)
# 열차번호 Hex String 변환
df = df.with_columns(
pl.col("trainno_int")
.map_elements(lambda x: f"{x:04X}", return_dtype=pl.String)
.alias("trainno")
)
# [Optional] 여기서 역 이름 매핑이나 코드 변환을 수행할 수 있음
# (이전 대화의 Processor 로직을 여기에 통합해도 됨)
if output_path is None:
output_path = input_path.replace(".dat", ".parquet")
df.write_parquet(output_path, compression="zstd")
return output_path
except Exception as e:
print(f"[FastParser] Error: {e}")
import traceback
traceback.print_exc()
return None