# Listing metadata: 132 lines, 6.1 KiB, Python.
import json
import logging
from typing import List, Dict, Optional
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
|
|
|
|
# Get the logger instance
|
|
# NOTE(review): the logger is looked up by the fixed name 'default_logger'
# rather than __name__; presumably the application configures a logger under
# that exact name elsewhere — confirm before renaming.
logger = logging.getLogger('default_logger')
|
|
|
|
class NaverParser:
    """Scrape Naver Shopping search pages and pull product data out of them.

    The search page embeds its state as JSON inside a
    ``<script id="__NEXT_DATA__">`` tag. This class downloads the page,
    decodes that JSON and offers helpers that extract the product list,
    filter it by price, order it by rank and collect the related search tags.
    """

    def __init__(self, timeout: float = 10.0):
        """Set up the search URL prefix, request headers and HTTP timeout.

        Args:
            timeout: Seconds to wait for the HTTP response before the request
                is abandoned. Without a timeout ``requests`` may block
                indefinitely on an unresponsive server.
        """
        self.base_url = "https://search.shopping.naver.com/search/all?query="
        self.timeout = timeout
        # Browser-like headers sent with every search request.
        # NOTE(review): "br" is advertised in Accept-Encoding; requests can
        # only decode Brotli responses when a brotli package is installed —
        # confirm it is available wherever this runs.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Cache-Control": "max-age=0",
        }

    @staticmethod
    def _to_int(value, default: int = 0) -> int:
        """Convert *value* to ``int``, returning *default* if it cannot be converted."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    @staticmethod
    def _item_of(product) -> Dict:
        """Return the ``"item"`` mapping of a product entry, or ``{}`` if absent or malformed."""
        if not isinstance(product, dict):
            return {}
        item = product.get("item")
        return item if isinstance(item, dict) else {}

    def _build_search_url(self, keyword: str, product_set: str) -> str:
        """Return the search URL for *keyword* restricted to *product_set*.

        Both values are percent-encoded so characters such as ``&``, ``#`` or
        spaces cannot break or inject query parameters.
        """
        return (
            f"{self.base_url}{quote(str(keyword), safe='')}"
            "&frm=NVSHATC&pagingIndex=1&pagingSize=40"
            f"&productSet={quote(str(product_set), safe='')}"
            "&sort=rel&timestamp=&viewType=list"
        )

    def fetch_search_results(self, keyword: str, product_set: str = "overseas") -> Optional[Dict]:
        """Download the search page for *keyword* and return its embedded JSON.

        Args:
            keyword: Search term.
            product_set: Value sent as the ``productSet`` query parameter.

        Returns:
            The decoded content of the ``__NEXT_DATA__`` script tag, or
            ``None`` when the request fails, the tag is missing, or its
            content is not valid JSON. Failures are logged, not raised.
        """
        url = self._build_search_url(keyword, product_set)
        logger.debug("검색 URL: %s", url)

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error("네이버 쇼핑 HTML 가져오기 실패: %s", e)
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        next_data = soup.find("script", {"id": "__NEXT_DATA__"})
        if next_data is None:
            logger.error("검색 결과에서 '__NEXT_DATA__' 태그를 찾을 수 없음.")
            return None

        try:
            # ``.string`` is None for an empty tag, which json.loads rejects
            # with TypeError; malformed JSON raises a ValueError subclass.
            return json.loads(next_data.string)
        except (TypeError, ValueError) as e:
            logger.error("'__NEXT_DATA__' JSON 파싱 실패: %s", e)
            return None

    def get_product_list(self, data: Dict) -> List[Dict]:
        """Extract the product list from the decoded page data.

        Args:
            data: Mapping as returned by :meth:`fetch_search_results`.

        Returns:
            The value at ``props.pageProps.initialState.products.list``, or an
            empty list when any level of that path is missing or not
            subscriptable.
        """
        try:
            products_list = data["props"]["pageProps"]["initialState"]["products"]["list"]
            logger.debug("총 %s개의 제품이 검색됨.", len(products_list))
            return products_list
        except (KeyError, TypeError) as e:
            logger.error("제품 리스트를 추출하는 중 오류 발생: %s", e)
            return []

    def filter_products_by_price(self, products: List[Dict], min_price: int = 50000) -> List[Dict]:
        """Keep only the products whose price is strictly greater than *min_price*.

        Entries with a missing or non-numeric price are treated as price 0.

        NOTE(review): the comparison is strict (``>``) while the log message
        reads "이상" (at least); the comparison is kept as-is to preserve
        existing results — confirm which one is intended.
        """
        filtered_products = [
            product
            for product in products
            if self._to_int(self._item_of(product).get("price")) > min_price
        ]
        logger.debug("가격이 %s원 이상인 제품 %s개 필터링됨.", min_price, len(filtered_products))
        return filtered_products

    def extract_product_info(self, product: Dict) -> Dict:
        """Return a flat dict with the selected fields of one product entry.

        Fields that are absent from the entry's ``"item"`` mapping come back
        as ``None``; ``"category"`` is always a four-element list.
        """
        item = self._item_of(product)
        return {
            "title": item.get("productTitle"),
            "price": item.get("price"),
            "mall_name": item.get("mallName"),
            "image_url": item.get("imageUrl"),
            "product_url": item.get("mallProductUrl"),
            "category": [item.get(f"category{level}Name") for level in range(1, 5)],
            "rank": item.get("rank"),
            "review_count": item.get("reviewCount"),
            "review_count_sum": item.get("reviewCountSum"),
            "score_info": item.get("scoreInfo"),
            "mobile_low_price": item.get("mobileLowPrice"),
            "low_price": item.get("lowPrice"),
            "delivery_fee_content": item.get("deliveryFeeContent"),
            "dlvry_low_price": item.get("dlvryLowPrice"),
            "open_date": item.get("openDate"),
            "mall_count": item.get("mallCount"),
            "keep_count": item.get("keepCnt"),
            "oversea_tp": item.get("overseaTp"),
            "purchase_count": item.get("purchaseCnt"),
            "manu_tag": item.get("manuTag"),
            "img_size": item.get("imgSz"),
            "search_keyword": item.get("searchKeyword"),
            "mall_pc_url": item.get("mallPcUrl"),
        }

    def get_top_n_products(self, products: List[Dict], top_n: int = 5) -> List[Dict]:
        """Return the extracted info of the first *top_n* products by ascending rank.

        A missing or non-numeric rank counts as 0, so such entries sort
        first (this matches the previous default for a missing rank).
        """
        sorted_products = sorted(
            products,
            key=lambda p: self._to_int(self._item_of(p).get("rank")),
        )
        top_products = sorted_products[:top_n]
        logger.debug("상위 %s개 제품을 추출함.", top_n)
        return [self.extract_product_info(product) for product in top_products]

    def get_related_tags(self, data: Dict) -> List[str]:
        """Extract the related search tags from the decoded page data.

        Returns:
            The stripped, non-empty string entries of
            ``props.pageProps.relatedTags``, or an empty list when that path
            is missing or its value is not iterable.
        """
        try:
            related_tags = data["props"]["pageProps"]["relatedTags"]
            stripped = (tag.strip() for tag in related_tags if isinstance(tag, str))
            # Filter after stripping so whitespace-only tags are dropped too.
            filtered_tags = [tag for tag in stripped if tag]
        except (KeyError, TypeError):
            logger.error("연관 검색어를 추출하는 중 오류 발생.")
            return []
        logger.debug("연관 검색어: %s", filtered_tags)
        return filtered_tags

    def search_and_parse(
        self,
        keyword: str,
        min_price: int = 10000,
        top_n: int = 5,
        product_set: str = "overseas",
    ) -> Dict:
        """Search for *keyword* and return the top products plus related tags.

        Args:
            keyword: Search term.
            min_price: Products must cost strictly more than this to be kept.
            top_n: Maximum number of products to return.
            product_set: Forwarded to :meth:`fetch_search_results`.

        Returns:
            ``{"top_products": [...], "related_tags": [...]}``, or an empty
            dict when the search page could not be fetched or decoded.
        """
        data = self.fetch_search_results(keyword, product_set)
        if not data:
            return {}

        products = self.get_product_list(data)
        filtered_products = self.filter_products_by_price(products, min_price)
        return {
            "top_products": self.get_top_n_products(filtered_products, top_n),
            "related_tags": self.get_related_tags(data),
        }
|
|
|
|
# Usage example: run one search and pretty-print the parsed result.
if __name__ == "__main__":
    naver = NaverParser()
    outcome = naver.search_and_parse("순간접착제")
    print("검색 결과:", json.dumps(outcome, ensure_ascii=False, indent=4))
|