import os import numpy as np import os import numpy as np import polars as pl from typing import Optional class FastLogParser: CHUNK_SIZE = 84 @staticmethod def parse_to_parquet(input_path: str, output_path: str = None) -> Optional[str]: """ Numpy 벡터 연산을 사용하여 MMI 바이너리 로그를 초고속으로 파싱 후 Parquet 저장. (VDI/VDO 포함 모든 필드 완벽 구현) """ if not os.path.exists(input_path): return None file_size = os.path.getsize(input_path) n_records = file_size // FastLogParser.CHUNK_SIZE if n_records == 0: return None try: with open(input_path, 'rb') as f: # 파일 전체를 한번에 메모리로 로드 (가장 빠름) raw_data = np.fromfile(f, dtype=np.uint8) # 잘린 마지막 청크 제거 및 Reshape limit = n_records * FastLogParser.CHUNK_SIZE raw_data = raw_data[:limit] # (Row수, 84바이트) 형태의 2차원 배열로 변환 m = raw_data.reshape(n_records, FastLogParser.CHUNK_SIZE) # --- [1. Basic Info] --- seq = m[:, 1] source = m[:, 3] # 16(0x10) or 80(0x50) -> 1계, 그 외 -> 2계 system_id = np.where((source == 16) | (source == 80), 1, 2).astype(np.uint8) # 이중계 판단 (32가 아니면 Break/Standby) is_break = (source != 32) # Time (BCD-like hex display: 0x25 -> 25) # (val // 16) * 10 + (val % 16) 로직이 맞음 def bcd(arr): return (arr // 16) * 10 + (arr % 16) year = bcd(m[:, 4]).astype(np.int16) + 2000 month = bcd(m[:, 5]) day = bcd(m[:, 6]) hour = bcd(m[:, 7]) minute = bcd(m[:, 8]) second = bcd(m[:, 9]) # --- [2. Speed & Analog] --- # uint8 두 개를 합쳐 uint16으로 변환 시 반드시 먼저 astype(uint16)을 해야 함 (안 그러면 오버플로우) trainspeed = ((m[:, 10].astype(np.uint16) << 8) | m[:, 11].astype(np.uint16)) / 10.0 limitspeed = m[:, 12] pwm_value = m[:, 22] ato_limitSpeed = m[:, 33] tasc_value = m[:, 33] # --- [3. Digital Flags (Bitwise Vectorized)] --- # Byte 13: DO ATC b13 = m[:, 13] do_zvr = (b13 & 32) > 0; do_edl = (b13 & 16) > 0; do_edr = (b13 & 8) > 0 do_fsb = (b13 & 4) > 0; do_ebm = (b13 & 2) > 0; do_ebp = (b13 & 1) > 0 # Byte 14: Status b14 = m[:, 14] system_active = (b14 & 128) > 0; over_spd_warning = (b14 & 64) > 0 tcr = (b14 & 32) > 0; hcr = (b14 & 16) > 0 door_open = (b14 & 8) > 0; door_close = (b14 & 4) > 0 psd_open = (b14 & 2) > 0; psd_close = (b14 & 1) > 0 # Byte 15: Mode b15 = m[:, 15] fa = (b15 & 128) > 0; auto = (b15 & 64) > 0; mcs = (b15 & 32) > 0 yard = (b15 & 16) > 0; fmc = (b15 & 8) > 0 reverser_rvs = (b15 & 4) > 0; reverser_fwd = (b15 & 2) > 0; reverser_neu = (b15 & 1) > 0 # Byte 16: Mascon b16 = m[:, 16] ato_start_btn = (b16 & 128) > 0; ato_eb_req = (b16 & 64) > 0 tacho_dir_a = (b16 & 32) > 0; tacho_dir_b = (b16 & 16) > 0 mascon_dr = (b16 & 2) > 0; mascon_br = (b16 & 4) > 0; mascon_eb = (b16 & 8) > 0 # Byte 19: ATC Status b19 = m[:, 19] val_19 = b19 & 192 # 0, 192, 64, 128 wheelcheck = (b19 & 32) > 0 # Byte 20: Fail b20 = m[:, 20] fail_atcr = (b20 & 128) > 0; fail_atoc = (b20 & 64) > 0; fail_tcms = (b20 & 32) > 0 fail_tacho2 = (b20 & 2) > 0; fail_tacho1 = (b20 & 1) > 0 # Byte 21: Marker b21 = m[:, 21] recovery = (b21 & 128) > 0; nomal = (b21 & 64) > 0; tasc = (b21 & 32) > 0 marker_val = b21 & 31 # 16, 8, 4, 2, 1 # Byte 23: RLY ATO b23 = m[:, 23] trac_dr = (b23 & 128) > 0; trac_br = (b23 & 64) > 0; trac_cs = (b23 & 32) > 0 ador = (b23 & 16) > 0; adol = (b23 & 8) > 0; adc = (b23 & 4) > 0 start_enable = (b23 & 2) > 0; trainberth = (b23 & 1) > 0 # Byte 24: TCMS b24 = m[:, 24] tc2 = (b24 & 128) > 0; tc1 = (b24 & 64) > 0; tascdb = (b24 & 32) > 0 # Byte 25: ETC b25 = m[:, 25] pre_brake = (b25 & 128) > 0; limit_drive = (b25 & 64) > 0 ov_stop1 = (b25 & 8) > 0; ov_stop2 = (b25 & 4) > 0 sh_stop1 = (b25 & 2) > 0; sh_stop2 = (b25 & 1) > 0 # Byte 26-27: Train No trainno_int = (m[:, 26].astype(np.uint16) << 8) | m[:, 27].astype(np.uint16) # Byte 28-30: Stations pstn = m[:, 28]; nstn = m[:, 29]; dstn = m[:, 30] # Byte 31-32: DTG (Distance to Go) - 정밀 로직 구현 num_dtg = (m[:, 31].astype(np.uint16) << 8) | m[:, 32].astype(np.uint16) # DTG 부호 처리: uint16을 int16으로 해석해서 음수 여부 판단 # C# 로직: if (num_dtg & 32768) != 0: num_dtg -= 65536 # Numpy에서는 그냥 .view(np.int16)하거나 astype(np.int16)하면 32768(0x8000) 이상은 자동으로 음수가 됨. dtg_signed = num_dtg.astype(np.int16) # 조건: TASC 모드가 아니면서 속도가 0이고 역행(Dr)이 아닐 때 부호 반전 cond_reverse = (~tasc) & (trainspeed == 0.0) & (~trac_dr) # np.where(조건, 참일때값, 거짓일때값) # 참일 때: (값 * -1) / 100.0 # 거짓일 때: TASC면 / 100.0, 아니면 / 10.0 dtg = np.where( cond_reverse, dtg_signed * -1.0 / 100.0, np.where(tasc, num_dtg / 100.0, num_dtg / 10.0) ) # Byte 34: TWC b34 = m[:, 34] twct_enable = (b34 & 16) > 0; door_close_warning = (b34 & 8) > 0; wrongdoor = (b34 & 4) > 0 # Byte 35: ATC Code b35 = m[:, 35] atc_idx = (b35 & 240) >> 4 osc_f0_ok = (b35 & 8) > 0 # Byte 36-39, 55, 58: Freq (XOR) atc_code_carrier_f = (((m[:, 36]^65).astype(np.uint16) << 8) | (m[:, 37]^82).astype(np.uint16)) * 10.0 atc_code_f = (((m[:, 38]^99).astype(np.uint16) << 8) | (m[:, 39]^116).astype(np.uint16)) / 10.0 osc_f = (((m[:, 55]^99).astype(np.uint16) << 8) | (m[:, 58]^100).astype(np.uint16)) / 10.0 # --- [Missing Fields Restored: VDI/VDO] --- # VDI A (63-65) vdia_rvs = (m[:, 63] & 128) > 0; vdia_neu = (m[:, 63] & 64) > 0; vdia_fwd = (m[:, 63] & 32) > 0 vdia_mascondr = (m[:, 63] & 16) > 0; vdia_masconbr = (m[:, 63] & 8) > 0; vdia_masconeb = (m[:, 63] & 4) > 0 vdia_doorclose = (m[:, 63] & 2) > 0; vdia_dooropen = (m[:, 63] & 1) > 0 vdia_fmc = (m[:, 64] & 64) > 0; vdia_yard = (m[:, 64] & 32) > 0; vdia_mcs = (m[:, 64] & 16) > 0 vdia_auto = (m[:, 64] & 8) > 0; vdia_fa = (m[:, 64] & 4) > 0 # VDI B (66-68) vdib_rvs = (m[:, 66] & 128) > 0; vdib_neu = (m[:, 66] & 64) > 0; vdib_fwd = (m[:, 66] & 32) > 0 vdib_mascondr = (m[:, 66] & 16) > 0; vdib_masconbr = (m[:, 66] & 8) > 0; vdib_masconeb = (m[:, 66] & 4) > 0 vdib_doorclose = (m[:, 66] & 2) > 0; vdib_dooropen = (m[:, 66] & 1) > 0 # VDI C (69-71) vdic_tc2 = (m[:, 69] & 128) > 0; vdic_tc1 = (m[:, 69] & 64) > 0 vdic_edlfb = (m[:, 69] & 32) > 0; vdic_edrfb = (m[:, 69] & 16) > 0 vdic_psdclose = (m[:, 70] & 2) > 0; vdic_psdopen = (m[:, 70] & 1) > 0 # VDO A (75-76) vdoa_edl = (m[:, 75] & 32) > 0; vdoa_edr = (m[:, 75] & 16) > 0 vdoa_zvr = (m[:, 75] & 8) > 0; vdoa_fsb = (m[:, 75] & 4) > 0 # --- [4. Create DataFrame] --- data_dict = { "seq": seq, "source": source, "system_id": system_id, "is_break": is_break, "year": year, "month": month, "day": day, "hour": hour, "minute": minute, "second": second, "trainspeed": trainspeed, "limitspeed": limitspeed, "pwm_value": pwm_value, "ato_limitSpeed": ato_limitSpeed, "tasc_value": tasc_value, "do_zvr": do_zvr, "do_edl": do_edl, "do_edr": do_edr, "do_fsb": do_fsb, "do_ebm": do_ebm, "do_ebp": do_ebp, "system_active": system_active, "over_spd_warning": over_spd_warning, "tcr": tcr, "hcr": hcr, "door_open": door_open, "door_close": door_close, "psd_open": psd_open, "psd_close": psd_close, "fa": fa, "auto": auto, "mcs": mcs, "yard": yard, "fmc": fmc, "reverser_rvs": reverser_rvs, "reverser_fwd": reverser_fwd, "reverser_neu": reverser_neu, "ato_start_btn": ato_start_btn, "ato_eb_req": ato_eb_req, "tacho_dir_a": tacho_dir_a, "tacho_dir_b": tacho_dir_b, "mascon_dr": mascon_dr, "mascon_br": mascon_br, "mascon_eb": mascon_eb, "atc_status_code": val_19, "fail_atcr": fail_atcr, "fail_atoc": fail_atoc, "fail_tcms": fail_tcms, "recovery": recovery, "nomal": nomal, "tasc": tasc, "marker_val": marker_val, "trac_dr": trac_dr, "trac_br": trac_br, "trac_cs": trac_cs, "tc2": tc2, "tc1": tc1, "tascdb": tascdb, "pre_brake": pre_brake, "limit_drive": limit_drive, "trainno_int": trainno_int, "pstn": pstn, "nstn": nstn, "dstn": dstn, "dtg": dtg, "twct_enable": twct_enable, "door_close_warning": door_close_warning, "wrongdoor": wrongdoor, "atc_idx": atc_idx, "osc_f0_ok": osc_f0_ok, "atc_code_carrier_f": atc_code_carrier_f, "atc_code_f": atc_code_f, "osc_f": osc_f, # 복원된 VDI/VDO "vdia_rvs": vdia_rvs, "vdia_neu": vdia_neu, "vdia_fwd": vdia_fwd, "vdia_mascondr": vdia_mascondr, "vdia_masconbr": vdia_masconbr, "vdia_doorclose": vdia_doorclose, "vdia_dooropen": vdia_dooropen, "vdib_doorclose": vdib_doorclose, "vdib_dooropen": vdib_dooropen, "vdoa_edl": vdoa_edl, "vdoa_edr": vdoa_edr, "vdoa_zvr": vdoa_zvr } df = pl.DataFrame(data_dict) # --- [5. Post-Processing] --- # Polars의 강력한 문자열 처리 기능 사용 (Lambda 루프 제거 -> 속도 향상) # 시간 문자열 생성 df = df.with_columns( pl.format("{}-{}-{} {}:{}:{}", pl.col("year"), pl.col("month").cast(pl.String).str.zfill(2), pl.col("day").cast(pl.String).str.zfill(2), pl.col("hour").cast(pl.String).str.zfill(2), pl.col("minute").cast(pl.String).str.zfill(2), pl.col("second").cast(pl.String).str.zfill(2) ).alias("time") ) # 열차번호 Hex String 변환 df = df.with_columns( pl.col("trainno_int") .map_elements(lambda x: f"{x:04X}", return_dtype=pl.String) .alias("trainno") ) # [Optional] 여기서 역 이름 매핑이나 코드 변환을 수행할 수 있음 # (이전 대화의 Processor 로직을 여기에 통합해도 됨) if output_path is None: output_path = input_path.replace(".dat", ".parquet") df.write_parquet(output_path, compression="zstd") return output_path except Exception as e: print(f"[FastParser] Error: {e}") import traceback traceback.print_exc() return None