ConvenienceStore_Crawler/CU_crawler.py at master · GreenAppleSoda/ConvenienceStore_Crawler · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import time
import firebase_admin
from firebase_admin import credentials, firestore

# Initialize the Firebase Admin SDK and create the Firestore client that the
# rest of this script uses through the module-level name `db`.
SERVICE_ACCOUNT_KEY_PATH = "neomopyeonhaeng-firebase-adminsdk-fbsvc-65d84fd85a.json"  # Path to your own Firebase service-account key file.
cred = credentials.Certificate(SERVICE_ACCOUNT_KEY_PATH)
firebase_admin.initialize_app(cred)
db = firestore.client()

# 🔥 Firestore에서 문서를 50개씩 나눠서 가져오는 함수
def get_all_docs_in_batches(collection_name, batch_size=50):
    """Return every document of a Firestore collection, fetched page by page.

    The collection is read through the module-level ``db`` client in pages of
    at most ``batch_size`` documents, ordered by ``"__name__"``. Each page
    after the first starts after the last document of the previous page.

    Args:
        collection_name: Name of the collection to read.
        batch_size: Maximum number of documents requested per query.

    Returns:
        A list with all documents yielded by the paged queries, in query
        order. The list is empty when the collection has no documents.
    """
    ordered = db.collection(collection_name).order_by("__name__")
    collected = []

    page = list(ordered.limit(batch_size).stream())
    while page:
        collected.extend(page)
        # The last document of this page is the cursor for the next one.
        cursor = page[-1]
        page = list(ordered.start_after(cursor).limit(batch_size).stream())

    return collected

# Delete every document currently stored in the collection, one at a time.
# NOTE(review): this runs before the crawl below, so a failed crawl leaves the
# collection empty; confirm that this is acceptable.
collection_name = "CU_events"
for stale_doc in get_all_docs_in_batches(collection_name, batch_size=50):
    stale_doc.reference.delete()

print(f"{collection_name} 컬렉션 초기화 완료!")


# Launch the web driver.
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Do not open a browser window (can be removed while testing).
driver = webdriver.Chrome(options=options)

# The browser is only needed until the fully expanded page has been parsed.
# All driver usage lives inside try/finally so that Chrome is shut down even
# when loading or clicking raises, instead of leaking a headless process.
try:
    # CU promotion page URL.
    url = "https://cu.bgfretail.com/event/plus.do?category=event&depth2=1&sf=N"
    driver.get(url)

    # Wait for the page to load.
    time.sleep(3)

    # Keep clicking the "more" button until it no longer exists.
    while True:
        # Parse the current page to report how many products are loaded so far.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        products = soup.find_all("li", class_="prod_list")
        print(f"현재 상품 개수: {len(products)}개")

        # Only the button lookup is guarded: its failure is the signal that
        # the list is fully expanded.
        try:
            more_button = driver.find_element(By.CLASS_NAME, "prodListBtn-w")
        except NoSuchElementException:
            print("더보기 버튼이 없어 크롤링을 종료합니다.")
            break  # No button left, so stop expanding.

        more_button.click()
        time.sleep(3)  # Wait for the additional items to load.

    # Take the final snapshot of all products while the driver is still alive.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    products = soup.find_all("li", class_="prod_list")
finally:
    # Always release the browser, on success and on failure.
    driver.quit()

# Save to Firestore.
cu_ref = db.collection("CU_events")

for product in products:
    # Product name.
    name_tag = product.find("div", class_="name")
    product_name = name_tag.p.text.strip() if name_tag and name_tag.p else "정보 없음"

    # Price.
    price_tag = product.find("div", class_="price")
    product_price = price_tag.strong.text.strip() if price_tag and price_tag.strong else "정보 없음"

    # Image URL. The scheme is prepended only to protocol-relative URLs
    # ("//host/path"). Other values are stored as found, and a missing `src`
    # falls back to the placeholder instead of raising KeyError.
    img_tag = product.find("div", class_="prod_img")
    img_src = img_tag.img.get("src") if img_tag and img_tag.img else None
    if not img_src:
        img_url = "정보 없음"
    elif img_src.startswith("//"):
        img_url = "https:" + img_src
    else:
        img_url = img_src

    # Promotion type (1+1, 2+1, ...).
    event_tag = product.find("div", class_="badge")
    event_type = event_tag.span.text.strip() if event_tag and event_tag.span else "행사 없음"

    # Document structure stored in Firestore.
    product_data = {
        "store": "CU",
        "event_type": event_type,
        "name": product_name,
        "price": product_price,
        "image_url": img_url,
        "timestamp": firestore.SERVER_TIMESTAMP,  # Use the Firestore server timestamp.
    }

    # Add the document to the collection.
    cu_ref.add(product_data)
    print(f"✅ Firestore 저장 완료: {product_name}")