# baidu_web/test/test4.py

import time
import json
import re, os
import argparse
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright

# Command-line interface: --sources takes one or more source names; only
# products whose "source" field is among them are reported.
parser = argparse.ArgumentParser(
    description="상품 정보 필터링 및 구매 링크 변환",
)
parser.add_argument(
    '--sources',
    nargs='+',
    default=['淘宝', 'tmall', '1688'],
    help="필터링할 출처 (예: 淘宝 tmall 1688)",
)
args = parser.parse_args()

# Kept as a set so the per-product membership test is a hash lookup.
filtered_sources = {*args.sources}

# Playwright-driven image search and product extraction.

# Seconds to wait for a purchase link to respond. Without a timeout
# requests.get() can block indefinitely on an unresponsive host.
_BUY_URL_TIMEOUT_SECONDS = 10

# Browser-like User-Agent sent when resolving purchase links.
_BUY_URL_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36"
}


def _resolve_buy_url(buyurl):
    """Return the final URL after following redirects from *buyurl*.

    Returns the string "변환 실패" when the request fails, times out, or
    the final response status is not 200.
    """
    try:
        response = requests.get(
            buyurl,
            headers=_BUY_URL_HEADERS,
            timeout=_BUY_URL_TIMEOUT_SECONDS,
        )
    except requests.RequestException:
        return "변환 실패"
    return response.url if response.status_code == 200 else "변환 실패"


def _print_matching_products(cards):
    """Print every product in *cards* whose source is in ``filtered_sources``.

    Only cards whose "cardName" is "product" are inspected; their products
    are read from ``card["tplData"]["list"]``. A product card without that
    structure is treated as having no products instead of raising.
    """
    for card in cards:
        if card.get("cardName") != "product":
            continue

        products = (card.get("tplData") or {}).get("list") or []
        for product in products:
            source = product.get("source", "")
            # Skip sources the user did not ask for before doing any network I/O.
            if source not in filtered_sources:
                continue

            title = product.get("desc", "")
            price = product.get("text", "")
            buyurl = product.get("buyurl", "")
            original_url = _resolve_buy_url(buyurl)

            print(f"상품명: {title}, 출처: {source}, 가격: {price}")
            print(f"구매 링크 (암호화): {buyurl}")
            print(f"구매 링크 (원래 링크): {original_url}\n")


def process_image(page, file_name):
    """Upload an image to the search page and print the matching products.

    Uploads *file_name* through the page's file input, opens the second
    result tab, waits for product images to render, then parses the
    ``window.cardData`` JSON embedded in the page and prints each product
    whose source is in the module-level ``filtered_sources``.

    Args:
        page: A Playwright page object; the caller is expected to have
            navigated it to the image-search page already.
        file_name: Path of the image to upload; converted to an absolute
            path before being handed to the file input.

    Returns:
        None. Results and diagnostic messages are written to stdout.
    """
    # The file input is given an absolute path.
    absolute_file_path = os.path.abspath(file_name)

    # Open the upload panel, then feed the image to its file input.
    page.click('//*[@id="app"]/div/div[1]/div[7]/div/span[1]/span[1]')
    page.set_input_files('//*[@id="app"]/div/div[1]/div[7]/div/div/div[2]/div[2]/div/form/input', absolute_file_path)

    # Click the "expand" button once it appears.
    expand_button_xpath = '//*[@id="app"]/div/div[2]/div/div[1]/ul/li[2]'
    page.wait_for_selector(expand_button_xpath)
    page.click(expand_button_xpath)

    # Wait until the expanded product cards have loaded.
    product_card_selector = 'div.graph-product-list-img img'
    page.wait_for_selector(product_card_selector)

    # The product data is embedded as JSON in the second <script> of <head>.
    soup = BeautifulSoup(page.content(), 'html.parser')
    script_tag = soup.select_one("html > head > script:nth-of-type(2)")
    if script_tag is None:
        print("JSON 데이터가 포함된 스크립트를 찾지 못했습니다.")
        return

    # .string is None for an empty or src-only <script>; treat that as
    # "no data" rather than crashing on None.strip().
    raw_data = (script_tag.string or "").strip()
    match = re.search(r"window\.cardData\s*=\s*(\[\{.*\}\]);", raw_data, re.DOTALL)
    if not match:
        print("JSON 데이터를 찾을 수 없습니다.")
        return

    try:
        data = json.loads(match.group(1))
    except json.JSONDecodeError as e:
        print("JSON 디코딩 오류:", e)
        return

    _print_matching_products(data)

# Image files to search for, in order: 1.jpg through 5.jpg, resolved
# relative to the current working directory.
file_list = [f"{i}.jpg" for i in range(1, 6)]

# Process every file in a single browser session, pausing for the user
# after each one.
with sync_playwright() as p:
    browser = p.webkit.launch(headless=False)
    # try/finally so the browser is closed even when a step raises
    # (e.g. a selector wait times out).
    try:
        page = browser.new_page()
        url = "https://graph.baidu.com/pcpage/index?tpl_from=pc"
        page.goto(url)

        # XPath of the "back" button; loop-invariant, so defined once.
        back_button_xpath = '//*[@id="app"]/div/div[2]/div/div[1]/div[1]/a'

        for file_name in file_list:
            # A missing image would otherwise abort the whole batch inside
            # Playwright's file upload; skip it and keep going.
            if not os.path.isfile(file_name):
                print(f"{file_name} 파일을 찾을 수 없어 건너뜁니다.")
                continue

            print(f"{file_name} 처리 중...")
            process_image(page, file_name)

            # Click "back" once the button is present, so the next upload
            # starts from the page process_image expects.
            page.wait_for_selector(back_button_xpath)
            page.click(back_button_xpath)

            input("다음 파일로 진행하려면 아무 키나 누르세요...")
    finally:
        # Close the browser.
        browser.close()