# VOC_Monitor/test/test_parser.py

from selectolax.parser import HTMLParser
import os

def test_list_parsing(path=None):
    """Parse a saved VOC list page and print the fields of every data row.

    This is a manual smoke test: it reads an HTML sample from disk, selects
    the table rows whose ``bgcolor`` is ``#ffffff`` and prints the cell
    values of each row that has at least seven ``td`` cells.

    Args:
        path: Location of the saved list-page HTML. When omitted, the
            original hard-coded sample location is used, so zero-argument
            callers keep their previous behavior.

    Returns:
        None. All results are written to stdout. If the sample file does not
        exist, a notice is printed and the function returns early.
    """
    print("=== 리스트 파싱 테스트 ===")
    if path is None:
        path = "d:\\py_train\\voc_noti\\test\\sample_list.html"
    if not os.path.exists(path):
        print("샘플 파일이 없습니다.")
        return

    with open(path, "r", encoding="utf-8") as f:
        html = f.read()

    tree = HTMLParser(html)
    rows = tree.css("tr[bgcolor='#ffffff']")
    print(f"발견된 행 개수: {len(rows)}")

    # The printed number is the row's 1-based position among all matched
    # rows, so skipped rows still consume a number.
    for i, row in enumerate(rows, start=1):
        cols = row.css("td")
        if len(cols) < 7:
            # Rows with fewer than seven cells are skipped.
            continue

        voc_id = cols[0].text(strip=True)
        channel = cols[1].text(strip=True)

        # Title: prefer the anchor's ``title`` attribute and fall back to the
        # anchor text when the attribute is missing or empty.
        title_a = cols[2].css_first("a")
        title_full = ""
        if title_a:
            title_full = title_a.attributes.get("title", "") or title_a.text(strip=True)

        # Title as rendered in the cell itself.
        title_display = cols[2].text(strip=True)

        # Column 4 is not read by this check.
        writer = cols[3].text(strip=True)
        is_public_txt = cols[5].text(strip=True)
        status = cols[6].text(strip=True)

        print(f"[{i}] ID: {voc_id} | 채널: {channel} | 이름: {writer} | 공개: {is_public_txt} | 상태: {status}")
        print(f"    제목(표시): {title_display}")
        print(f"    제목(전체): {title_full}")
        print("-" * 47)

def test_detail_parsing(path=None):
    """Parse a saved VOC detail page and print the label/value pairs found.

    This is a manual smoke test: it reads an HTML sample from disk, treats
    every ``td`` whose ``bgcolor`` is ``#E0EDEF`` as a label cell, takes the
    ``td`` that follows it inside the same parent as the value cell, and
    prints the collected mapping.

    Args:
        path: Location of the saved detail-page HTML. When omitted, the
            original hard-coded sample location is used, so zero-argument
            callers keep their previous behavior.

    Returns:
        None. All results are written to stdout. If the sample file does not
        exist, a notice is printed and the function returns early.
    """
    print("\n=== 상세 파싱 테스트 ===")
    if path is None:
        path = "d:\\py_train\\voc_noti\\test\\sample_content.html"
    if not os.path.exists(path):
        print("샘플 파일이 없습니다.")
        return

    with open(path, "r", encoding="utf-8") as f:
        html = f.read()

    tree = HTMLParser(html)

    # Response-type codes taken from the sample page; built once instead of
    # on every loop iteration.
    mapping = {'1':'게시판', '2':'우편', '3':'팩스', '4':'전화', '5':'E-mail', '6':'기타'}

    data = {}

    # Label-driven extraction: each label cell names the value in the next cell.
    for label in tree.css("td[bgcolor='#E0EDEF']"):
        key = label.text(strip=True)

        # Locate the label among the td cells under its parent and use the
        # following td as the value (the original comment states this mirrors
        # the logic in model.py).
        parent = label.parent
        if not parent:
            continue
        tds = parent.css("td")

        # NOTE(review): this relies on selectolax node equality; confirm how
        # it behaves when two cells under the same parent have identical markup.
        idx = next((i for i, td in enumerate(tds) if td == label), -1)
        if idx == -1 or idx + 1 >= len(tds):
            # No following cell to read a value from.
            continue
        val_node = tds[idx + 1]

        if "응답구분" in key:
            # Response type is special-cased: read the checked input's value
            # and translate it through the mapping above.
            checked = val_node.css_first("input[checked]")
            if checked:
                val = checked.attributes.get('value')
                val_txt = mapping.get(val, f"알수없음({val})")
            else:
                # Nothing is checked, so fall back to the cell's plain text.
                val_txt = val_node.text(strip=True)
        else:
            # Keep line breaks between text fragments (the original comment
            # states this mirrors the logic in model.py).
            val_txt = val_node.text(separator="\n", strip=True)

        data[key] = val_txt

    print("추출된 데이터:")
    for k, v in data.items():
        print(f"[{k}]:\n{v}\n")

if __name__ == "__main__":
    # When run as a script, execute both manual parsing checks in order:
    # the list page first, then the detail page.
    for _check in (test_list_parsing, test_detail_parsing):
        _check()