AI_MMI_Analyser/app/data/fast_parser.py

275 lines
12 KiB
Python

import os
import numpy as np
import os
import numpy as np
import os
import numpy as np
import os
import numpy as np
import polars as pl
from typing import Optional
class FastLogParser:
CHUNK_SIZE = 84
@staticmethod
def parse(input_path: str) -> Optional[pl.DataFrame]:
"""
Numpy 벡터 연산을 사용하여 MMI 바이너리 로그를 초고속으로 파싱 후 DataFrame 반환.
"""
if not os.path.exists(input_path):
return None
file_size = os.path.getsize(input_path)
n_records = file_size // FastLogParser.CHUNK_SIZE
if n_records == 0:
return None
try:
with open(input_path, 'rb') as f:
# 파일 전체를 한번에 메모리로 로드 (가장 빠름)
raw_data = np.fromfile(f, dtype=np.uint8)
# 잘린 마지막 청크 제거 및 Reshape
limit = n_records * FastLogParser.CHUNK_SIZE
raw_data = raw_data[:limit]
# (Row수, 84바이트) 형태의 2차원 배열로 변환
m = raw_data.reshape(n_records, FastLogParser.CHUNK_SIZE)
# --- [1. Basic Info] ---
seq = m[:, 1]
source = m[:, 3]
# 16(0x10) or 80(0x50) -> 1계, 그 외 -> 2계
system_id = np.where((source == 16) | (source == 80), 1, 2).astype(np.uint8)
# 이중계 판단 (32가 아니면 Break/Standby)
is_break = (source != 32)
# Time (BCD-like hex display: 0x25 -> 25)
# (val // 16) * 10 + (val % 16) 로직이 맞음
def bcd(arr): return (arr // 16) * 10 + (arr % 16)
year = bcd(m[:, 4]).astype(np.int16) + 2000
month = bcd(m[:, 5])
day = bcd(m[:, 6])
hour = bcd(m[:, 7])
minute = bcd(m[:, 8])
second = bcd(m[:, 9])
# --- [2. Speed & Analog] ---
# uint8 두 개를 합쳐 uint16으로 변환 시 반드시 먼저 astype(uint16)을 해야 함 (안 그러면 오버플로우)
trainspeed = ((m[:, 10].astype(np.uint16) << 8) | m[:, 11].astype(np.uint16)) / 10.0
limitspeed = m[:, 12]
pwm_value = m[:, 22]
ato_limitSpeed = m[:, 33]
tasc_value = m[:, 33]
# --- [3. Digital Flags (Bitwise Vectorized)] ---
# Byte 13: DO ATC
b13 = m[:, 13]
do_zvr = (b13 & 32) > 0; do_edl = (b13 & 16) > 0; do_edr = (b13 & 8) > 0
do_fsb = (b13 & 4) > 0; do_ebm = (b13 & 2) > 0; do_ebp = (b13 & 1) > 0
# Byte 14: Status
b14 = m[:, 14]
system_active = (b14 & 128) > 0; over_spd_warning = (b14 & 64) > 0
tcr = (b14 & 32) > 0; hcr = (b14 & 16) > 0
door_open = (b14 & 8) > 0; door_close = (b14 & 4) > 0
psd_open = (b14 & 2) > 0; psd_close = (b14 & 1) > 0
# Byte 15: Mode
b15 = m[:, 15]
fa = (b15 & 128) > 0; auto = (b15 & 64) > 0; mcs = (b15 & 32) > 0
yard = (b15 & 16) > 0; fmc = (b15 & 8) > 0
reverser_rvs = (b15 & 4) > 0; reverser_fwd = (b15 & 2) > 0; reverser_neu = (b15 & 1) > 0
# Byte 16: Mascon
b16 = m[:, 16]
ato_start_btn = (b16 & 128) > 0; ato_eb_req = (b16 & 64) > 0
tacho_dir_a = (b16 & 32) > 0; tacho_dir_b = (b16 & 16) > 0
mascon_dr = (b16 & 2) > 0; mascon_br = (b16 & 4) > 0; mascon_eb = (b16 & 8) > 0
# Byte 19: ATC Status
b19 = m[:, 19]
val_19 = b19 & 192 # 0, 192, 64, 128
wheelcheck = (b19 & 32) > 0
# Byte 20: Fail
b20 = m[:, 20]
fail_atcr = (b20 & 128) > 0; fail_atoc = (b20 & 64) > 0; fail_tcms = (b20 & 32) > 0
fail_tacho2 = (b20 & 2) > 0; fail_tacho1 = (b20 & 1) > 0
# Byte 21: Marker
b21 = m[:, 21]
recovery = (b21 & 128) > 0; nomal = (b21 & 64) > 0; tasc = (b21 & 32) > 0
marker_val = b21 & 31 # 16, 8, 4, 2, 1
# Byte 23: RLY ATO
b23 = m[:, 23]
trac_dr = (b23 & 128) > 0; trac_br = (b23 & 64) > 0; trac_cs = (b23 & 32) > 0
ador = (b23 & 16) > 0; adol = (b23 & 8) > 0; adc = (b23 & 4) > 0
start_enable = (b23 & 2) > 0; trainberth = (b23 & 1) > 0
# Byte 24: TCMS
b24 = m[:, 24]
tc2 = (b24 & 128) > 0; tc1 = (b24 & 64) > 0; tascdb = (b24 & 32) > 0
# Byte 25: ETC
b25 = m[:, 25]
pre_brake = (b25 & 128) > 0; limit_drive = (b25 & 64) > 0
ov_stop1 = (b25 & 8) > 0; ov_stop2 = (b25 & 4) > 0
sh_stop1 = (b25 & 2) > 0; sh_stop2 = (b25 & 1) > 0
# Byte 26-27: Train No
trainno_int = (m[:, 26].astype(np.uint16) << 8) | m[:, 27].astype(np.uint16)
# Byte 28-30: Stations
pstn = m[:, 28]; nstn = m[:, 29]; dstn = m[:, 30]
# Byte 31-32: DTG (Distance to Go) - 정밀 로직 구현
num_dtg = (m[:, 31].astype(np.uint16) << 8) | m[:, 32].astype(np.uint16)
# DTG 부호 처리: uint16을 int16으로 해석해서 음수 여부 판단
# C# 로직: if (num_dtg & 32768) != 0: num_dtg -= 65536
# Numpy에서는 그냥 .view(np.int16)하거나 astype(np.int16)하면 32768(0x8000) 이상은 자동으로 음수가 됨.
dtg_signed = num_dtg.astype(np.int16)
# 조건: TASC 모드가 아니면서 속도가 0이고 역행(Dr)이 아닐 때 부호 반전
cond_reverse = (~tasc) & (trainspeed == 0.0) & (~trac_dr)
# np.where(조건, 참일때값, 거짓일때값)
# 참일 때: (값 * -1) / 100.0
# 거짓일 때: TASC면 / 100.0, 아니면 / 10.0
dtg = np.where(
cond_reverse,
dtg_signed * -1.0 / 100.0,
np.where(tasc, num_dtg / 100.0, num_dtg / 10.0)
)
# Byte 34: TWC
b34 = m[:, 34]
twct_enable = (b34 & 16) > 0; door_close_warning = (b34 & 8) > 0; wrongdoor = (b34 & 4) > 0
# Byte 35: ATC Code
b35 = m[:, 35]
atc_idx = (b35 & 240) >> 4
osc_f0_ok = (b35 & 8) > 0
# Byte 36-39, 55, 58: Freq (XOR)
atc_code_carrier_f = (((m[:, 36]^65).astype(np.uint16) << 8) | (m[:, 37]^82).astype(np.uint16)) * 10.0
atc_code_f = (((m[:, 38]^99).astype(np.uint16) << 8) | (m[:, 39]^116).astype(np.uint16)) / 10.0
osc_f = (((m[:, 55]^99).astype(np.uint16) << 8) | (m[:, 58]^100).astype(np.uint16)) / 10.0
# --- [Missing Fields Restored: VDI/VDO] ---
# VDI A (63-65)
vdia_rvs = (m[:, 63] & 128) > 0; vdia_neu = (m[:, 63] & 64) > 0; vdia_fwd = (m[:, 63] & 32) > 0
vdia_mascondr = (m[:, 63] & 16) > 0; vdia_masconbr = (m[:, 63] & 8) > 0; vdia_masconeb = (m[:, 63] & 4) > 0
vdia_doorclose = (m[:, 63] & 2) > 0; vdia_dooropen = (m[:, 63] & 1) > 0
vdia_fmc = (m[:, 64] & 64) > 0; vdia_yard = (m[:, 64] & 32) > 0; vdia_mcs = (m[:, 64] & 16) > 0
vdia_auto = (m[:, 64] & 8) > 0; vdia_fa = (m[:, 64] & 4) > 0
# VDI B (66-68)
vdib_rvs = (m[:, 66] & 128) > 0; vdib_neu = (m[:, 66] & 64) > 0; vdib_fwd = (m[:, 66] & 32) > 0
vdib_mascondr = (m[:, 66] & 16) > 0; vdib_masconbr = (m[:, 66] & 8) > 0; vdib_masconeb = (m[:, 66] & 4) > 0
vdib_doorclose = (m[:, 66] & 2) > 0; vdib_dooropen = (m[:, 66] & 1) > 0
# VDI C (69-71)
vdic_tc2 = (m[:, 69] & 128) > 0; vdic_tc1 = (m[:, 69] & 64) > 0
vdic_edlfb = (m[:, 69] & 32) > 0; vdic_edrfb = (m[:, 69] & 16) > 0
vdic_psdclose = (m[:, 70] & 2) > 0; vdic_psdopen = (m[:, 70] & 1) > 0
# VDO A (75-76)
vdoa_edl = (m[:, 75] & 32) > 0; vdoa_edr = (m[:, 75] & 16) > 0
vdoa_zvr = (m[:, 75] & 8) > 0; vdoa_fsb = (m[:, 75] & 4) > 0
# --- [4. Create DataFrame] ---
data_dict = {
"seq": seq, "source": source, "system_id": system_id, "is_break": is_break,
"year": year, "month": month, "day": day, "hour": hour, "minute": minute, "second": second,
"trainspeed": trainspeed, "limitspeed": limitspeed, "pwm_value": pwm_value,
"ato_limitSpeed": ato_limitSpeed, "tasc_value": tasc_value,
"do_zvr": do_zvr, "do_edl": do_edl, "do_edr": do_edr, "do_fsb": do_fsb, "do_ebm": do_ebm, "do_ebp": do_ebp,
"system_active": system_active, "over_spd_warning": over_spd_warning, "tcr": tcr, "hcr": hcr,
"door_open": door_open, "door_close": door_close, "psd_open": psd_open, "psd_close": psd_close,
"fa": fa, "auto": auto, "mcs": mcs, "yard": yard, "fmc": fmc,
"reverser_rvs": reverser_rvs, "reverser_fwd": reverser_fwd, "reverser_neu": reverser_neu,
"ato_start_btn": ato_start_btn, "ato_eb_req": ato_eb_req,
"tacho_dir_a": tacho_dir_a, "tacho_dir_b": tacho_dir_b,
"mascon_dr": mascon_dr, "mascon_br": mascon_br, "mascon_eb": mascon_eb,
"atc_status_code": val_19,
"fail_atcr": fail_atcr, "fail_atoc": fail_atoc, "fail_tcms": fail_tcms,
"recovery": recovery, "nomal": nomal, "tasc": tasc, "marker_val": marker_val,
"trac_dr": trac_dr, "trac_br": trac_br, "trac_cs": trac_cs,
"tc2": tc2, "tc1": tc1, "tascdb": tascdb,
"pre_brake": pre_brake, "limit_drive": limit_drive,
"trainno_int": trainno_int,
"pstn": pstn, "nstn": nstn, "dstn": dstn,
"dtg": dtg,
"twct_enable": twct_enable, "door_close_warning": door_close_warning, "wrongdoor": wrongdoor,
"atc_idx": atc_idx, "osc_f0_ok": osc_f0_ok,
"atc_code_carrier_f": atc_code_carrier_f, "atc_code_f": atc_code_f, "osc_f": osc_f,
# 복원된 VDI/VDO
"vdia_rvs": vdia_rvs, "vdia_neu": vdia_neu, "vdia_fwd": vdia_fwd,
"vdia_mascondr": vdia_mascondr, "vdia_masconbr": vdia_masconbr,
"vdia_doorclose": vdia_doorclose, "vdia_dooropen": vdia_dooropen,
"vdib_doorclose": vdib_doorclose, "vdib_dooropen": vdib_dooropen,
"vdoa_edl": vdoa_edl, "vdoa_edr": vdoa_edr, "vdoa_zvr": vdoa_zvr
}
df = pl.DataFrame(data_dict)
# --- [5. Post-Processing] ---
# Polars의 강력한 문자열 처리 기능 사용 (Lambda 루프 제거 -> 속도 향상)
# 시간 문자열 생성
df = df.with_columns(
pl.format("{}-{}-{} {}:{}:{}",
pl.col("year"),
pl.col("month").cast(pl.String).str.zfill(2),
pl.col("day").cast(pl.String).str.zfill(2),
pl.col("hour").cast(pl.String).str.zfill(2),
pl.col("minute").cast(pl.String).str.zfill(2),
pl.col("second").cast(pl.String).str.zfill(2)
).alias("time")
)
# 열차번호 Hex String 변환
df = df.with_columns(
pl.col("trainno_int")
.map_elements(lambda x: f"{x:04X}", return_dtype=pl.String)
.alias("trainno")
)
# [Optional] 여기서 역 이름 매핑이나 코드 변환을 수행할 수 있음
# (이전 대화의 Processor 로직을 여기에 통합해도 됨)
return df
except Exception as e:
print(f"[FastParser] Error: {e}")
import traceback
traceback.print_exc()
return None
@staticmethod
def parse_to_parquet(input_path: str, output_path: str = None) -> Optional[str]:
"""
MMI 바이너리 로그를 파싱하여 Parquet로 저장 (Wrapper)
"""
df = FastLogParser.parse(input_path)
if df is None:
return None
try:
if output_path is None:
output_path = input_path.replace(".dat", ".parquet")
df.write_parquet(output_path, compression="zstd")
return output_path
except Exception as e:
print(f"[FastParser] Error: {e}")
import traceback
traceback.print_exc()
return None