-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathyoutube_fetch.py
More file actions
109 lines (89 loc) · 3.68 KB
/
youtube_fetch.py
File metadata and controls
109 lines (89 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# youtube_fetch.py
# ➤ 유튜브 채널에서 최대 200개의 영상 메타데이터(제목, 길이 등)를 수집해 video_ids.json에 저장하는 스크립트
import json
import os
import re
from googleapiclient.discovery import build
from dotenv import load_dotenv
from datetime import datetime
# ✅ 환경 변수 로딩
load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
CHANNEL_ID = "UC2IIBYSTMSvJaK2UJzCC06g"
# ✅ @handle 또는 채널 URL → channel_id 변환
def resolve_channel_id(api_key, channel_url):
"""
다양한 채널 URL 형식에서 channel_id를 추출한다.
- https://www.youtube.com/@handle → forHandle API 사용
- https://www.youtube.com/channel/UCxxxx → 그대로 사용
"""
youtube = build("youtube", "v3", developerKey=api_key)
# @handle 형식
m = re.search(r"@([\w.-]+)", channel_url)
if m:
handle = m.group(1)
resp = youtube.channels().list(part="id", forHandle=handle).execute()
items = resp.get("items", [])
if items:
return items[0]["id"]
raise ValueError(f"채널을 찾을 수 없습니다: @{handle}")
# /channel/UCxxxx 형식
m = re.search(r"/channel/(UC[\w-]+)", channel_url)
if m:
return m.group(1)
raise ValueError(f"지원하지 않는 채널 URL 형식: {channel_url}")
# ✅ 유니코드 문자 제거
def sanitize(text):
return ''.join(c for c in text if not (0xD800 <= ord(c) <= 0xDFFF))
# ✅ 유튜브 영상 목록 가져오기
def get_video_list(api_key, channel_id, max_results=200):
youtube = build("youtube", "v3", developerKey=api_key)
videos = []
next_page_token = None
while len(videos) < max_results:
search_response = youtube.search().list(
channelId=channel_id,
part="id",
order="date",
maxResults=50,
type="video",
pageToken=next_page_token
).execute()
video_ids = [item["id"]["videoId"] for item in search_response["items"]]
video_response = youtube.videos().list(
part="snippet,contentDetails",
id=','.join(video_ids)
).execute()
for item in video_response["items"]:
snippet = item.get("snippet", {})
duration = item.get("contentDetails", {}).get("duration", "")
match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration)
if match:
hours = int(match.group(1) or 0)
minutes = int(match.group(2) or 0)
seconds = int(match.group(3) or 0)
else:
hours = minutes = seconds = 0
total_seconds = hours * 3600 + minutes * 60 + seconds
videos.append({
"video_id": item["id"],
"url": f"https://youtu.be/{item['id']}",
"title": sanitize(snippet.get("title", "")),
"description": sanitize(snippet.get("description", "")),
"channel_title": snippet.get("channelTitle", ""),
"published_at": snippet.get("publishedAt", ""),
"duration": total_seconds,
"processed": False,
"reason": None
})
next_page_token = search_response.get("nextPageToken")
if not next_page_token:
break
return videos[:max_results]
# ✅ 실행
if __name__ == "__main__":
results = get_video_list(API_KEY, CHANNEL_ID, max_results=200)
os.makedirs("data", exist_ok=True)
with open("data/video_ids.json", "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=2)
print(f"✅ 총 {len(results)}개의 영상 정보를 저장했습니다 → data/video_ids.json")