-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchunker.py
More file actions
162 lines (139 loc) · 5.73 KB
/
chunker.py
File metadata and controls
162 lines (139 loc) · 5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Video-based chunking module for transcript text.
Combines multiple complete videos per chunk (videos are never split).
"""
import tiktoken
from typing import List, Dict
def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens *text* occupies for the given model.

    Falls back to the cl100k_base encoding (used by gpt-3.5-turbo and
    gpt-4) when tiktoken does not recognize the model name.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except (KeyError, ValueError):
        # Unknown model name: use the gpt-3.5/gpt-4 family encoding instead
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
def chunk_video_transcripts(
    transcripts_data: List[Dict],
    max_tokens: int = 30000,
    model: str = "gpt-3.5-turbo"
) -> List[Dict]:
    """
    Chunk transcripts by combining multiple complete videos per chunk.

    Videos are never split - each chunk contains one or more complete
    videos. A single video whose transcript alone exceeds ``max_tokens``
    still becomes its own (oversized) chunk.

    Args:
        transcripts_data: List of video transcript dicts from JSON;
            entries need ``success`` and ``transcript_full_text`` keys
            to be included.
        max_tokens: Maximum tokens per chunk (default: 30000 for 32K context).
        model: Model name for token counting.

    Returns:
        List of chunk dicts, each with keys ``chunk_id``, ``text``,
        ``tokens``, ``videos`` (per-video metadata) and ``num_videos``.
    """
    # Get encoding for token counting
    try:
        encoding = tiktoken.encoding_for_model(model)
    except (KeyError, ValueError):
        # Fallback to cl100k_base (used by gpt-3.5-turbo and gpt-4)
        encoding = tiktoken.get_encoding("cl100k_base")

    # Keep only videos that were fetched successfully and have text
    valid_videos = [
        v for v in transcripts_data
        if v.get('success') and v.get('transcript_full_text')
    ]
    if not valid_videos:
        return []

    # Separator placed between videos inside a chunk (costs tokens too)
    video_separator = "\n\n---VIDEO---\n\n"
    separator_tokens = len(encoding.encode(video_separator))

    print(f"Encoding {len(valid_videos)} video transcripts...", flush=True)
    # Pre-compute each video's token count once, so the packing loop below
    # never re-encodes. BUG FIX: the chunk text prepends "VIDEO: <title>\n"
    # to every transcript, so that header must be part of the count —
    # otherwise chunks can silently exceed max_tokens and the reported
    # 'tokens' value understates the real chunk size.
    video_token_counts = [
        len(encoding.encode(
            f"VIDEO: {v.get('video_title', 'Unknown')}\n"
            f"{v.get('transcript_full_text', '')}"
        ))
        for v in valid_videos
    ]
    print(f"✓ Encoded all transcripts", flush=True)

    chunks: List[Dict] = []
    current_chunk_videos: List[Dict] = []
    current_chunk_tokens = 0

    def _finalize_chunk() -> None:
        """Append the accumulated videos as one chunk and reset the accumulator."""
        nonlocal current_chunk_videos, current_chunk_tokens
        chunk_text = video_separator.join(
            f"VIDEO: {v['video_title']}\n{v['transcript_text']}"
            for v in current_chunk_videos
        )
        chunks.append({
            'chunk_id': len(chunks),
            'text': chunk_text,
            'tokens': current_chunk_tokens,
            'videos': [  # List of videos in this chunk
                {
                    'video_id': v['video_id'],
                    'video_title': v['video_title'],
                    'video_url': v['video_url'],
                    'upload_date': v['upload_date']
                }
                for v in current_chunk_videos
            ],
            'num_videos': len(current_chunk_videos)
        })
        current_chunk_videos = []
        current_chunk_tokens = 0

    for idx, video in enumerate(valid_videos):
        # Use pre-computed token count (header + transcript)
        video_tokens = video_token_counts[idx]
        # A separator is only needed when the chunk already holds a video
        tokens_needed = video_tokens + (
            separator_tokens if current_chunk_videos else 0
        )

        if current_chunk_videos and current_chunk_tokens + tokens_needed > max_tokens:
            # Close the current chunk before placing this video in a fresh one
            _finalize_chunk()
            tokens_needed = video_tokens  # fresh chunk: no separator cost

        current_chunk_videos.append({
            'video_id': video.get('video_id', 'unknown'),
            'video_title': video.get('video_title', 'Unknown'),
            'video_url': video.get('video_url', ''),
            'upload_date': video.get('upload_date', ''),
            'transcript_text': video.get('transcript_full_text', '')
        })
        current_chunk_tokens += tokens_needed

    # Flush the final, possibly partial, chunk
    if current_chunk_videos:
        _finalize_chunk()

    return chunks