-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchunker.py
More file actions
162 lines (139 loc) · 5.73 KB
/
chunker.py
File metadata and controls
162 lines (139 loc) · 5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Video-based chunking module for transcript text.
Combines multiple complete videos per chunk (videos are never split).
"""
import tiktoken
from typing import List, Dict
def count_tokens(text: str, model: str = "gpt-3.5-turbo") -> int:
    """Return the number of tokens *text* occupies for the given model.

    Falls back to the cl100k_base encoding (used by gpt-3.5-turbo and
    gpt-4) when tiktoken does not recognize the model name.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except (KeyError, ValueError):
        # Unknown model name: use the gpt-3.5/gpt-4 family encoding instead
        enc = tiktoken.get_encoding("cl100k_base")
    return len(enc.encode(text))
def chunk_video_transcripts(
    transcripts_data: List[Dict],
    max_tokens: int = 30000,
    model: str = "gpt-3.5-turbo"
) -> List[Dict]:
    """
    Chunk transcripts by combining multiple complete videos per chunk.

    Videos are never split - each chunk contains one or more complete
    videos. A single video whose transcript alone exceeds ``max_tokens``
    still becomes its own (oversized) chunk.

    Args:
        transcripts_data: List of video transcript dicts from JSON;
            entries need ``success`` and ``transcript_full_text`` keys
            to be included.
        max_tokens: Maximum tokens per chunk (default: 30000 for 32K context).
        model: Model name for token counting.

    Returns:
        List of chunk dicts, each with keys ``chunk_id``, ``text``,
        ``tokens``, ``videos`` (per-video metadata) and ``num_videos``.
    """
    # Get encoding for token counting
    try:
        encoding = tiktoken.encoding_for_model(model)
    except (KeyError, ValueError):
        # Fallback to cl100k_base (used by gpt-3.5-turbo and gpt-4)
        encoding = tiktoken.get_encoding("cl100k_base")

    # Keep only videos that were fetched successfully and have text
    valid_videos = [
        v for v in transcripts_data
        if v.get('success') and v.get('transcript_full_text')
    ]
    if not valid_videos:
        return []

    # Separator placed between videos inside a chunk (costs tokens too)
    video_separator = "\n\n---VIDEO---\n\n"
    separator_tokens = len(encoding.encode(video_separator))

    print(f"Encoding {len(valid_videos)} video transcripts...", flush=True)
    # Pre-compute each video's token count once, so the packing loop below
    # never re-encodes. BUG FIX: the chunk text prepends "VIDEO: <title>\n"
    # to every transcript, so that header must be part of the count —
    # otherwise chunks can silently exceed max_tokens and the reported
    # 'tokens' value understates the real chunk size.
    video_token_counts = [
        len(encoding.encode(
            f"VIDEO: {v.get('video_title', 'Unknown')}\n"
            f"{v.get('transcript_full_text', '')}"
        ))
        for v in valid_videos
    ]
    print(f"✓ Encoded all transcripts", flush=True)

    chunks: List[Dict] = []
    current_chunk_videos: List[Dict] = []
    current_chunk_tokens = 0

    def _finalize_chunk() -> None:
        """Append the accumulated videos as one chunk and reset the accumulator."""
        nonlocal current_chunk_videos, current_chunk_tokens
        chunk_text = video_separator.join(
            f"VIDEO: {v['video_title']}\n{v['transcript_text']}"
            for v in current_chunk_videos
        )
        chunks.append({
            'chunk_id': len(chunks),
            'text': chunk_text,
            'tokens': current_chunk_tokens,
            'videos': [  # List of videos in this chunk
                {
                    'video_id': v['video_id'],
                    'video_title': v['video_title'],
                    'video_url': v['video_url'],
                    'upload_date': v['upload_date']
                }
                for v in current_chunk_videos
            ],
            'num_videos': len(current_chunk_videos)
        })
        current_chunk_videos = []
        current_chunk_tokens = 0

    for idx, video in enumerate(valid_videos):
        # Use pre-computed token count (header + transcript)
        video_tokens = video_token_counts[idx]
        # A separator is only needed when the chunk already holds a video
        tokens_needed = video_tokens + (
            separator_tokens if current_chunk_videos else 0
        )

        if current_chunk_videos and current_chunk_tokens + tokens_needed > max_tokens:
            # Close the current chunk before placing this video in a fresh one
            _finalize_chunk()
            tokens_needed = video_tokens  # fresh chunk: no separator cost

        current_chunk_videos.append({
            'video_id': video.get('video_id', 'unknown'),
            'video_title': video.get('video_title', 'Unknown'),
            'video_url': video.get('video_url', ''),
            'upload_date': video.get('upload_date', ''),
            'transcript_text': video.get('transcript_full_text', '')
        })
        current_chunk_tokens += tokens_needed

    # Flush the final, possibly partial, chunk
    if current_chunk_videos:
        _finalize_chunk()

    return chunks