Skip to content

Commit 0960c35

Browse files
authored
feat(fetchers): enhance YouTubeFetcher with transcript extraction (#88)
## What Enhance YouTubeFetcher with transcript/captions extraction via the timedtext API. ## Why Closes #56 — Agents encounter YouTube links but can't watch video. Extracting transcripts turns video content into LLM-consumable text. The existing implementation only had oEmbed metadata with no transcript support. ## How - Added transcript extraction via YouTube timedtext XML API (English captions) - Parse timedtext XML segments and join into continuous text - Truncate very long transcripts (>15k chars) with indicator - Gracefully handle videos without transcripts - Added mobile URL support (m.youtube.com) - Comprehensive tests: XML parsing, entity decoding, truncation, formatting ## Risk - Low - Transcript API is undocumented but widely used; graceful fallback when unavailable ### Checklist - [x] Unit tests are passed - [x] Smoke tests are passed - [x] Specs are up to date and not in conflict
1 parent 796a050 commit 0960c35

File tree

1 file changed

+249
-25
lines changed

1 file changed

+249
-25
lines changed

crates/fetchkit/src/fetchers/youtube.rs

Lines changed: 249 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
//! YouTube video fetcher
22
//!
33
//! Handles youtube.com/watch and youtu.be URLs, returning video metadata
4-
//! and transcript text via oEmbed and timedtext APIs.
4+
//! and transcript text via oEmbed and timedtext APIs.
55
66
use crate::client::FetchOptions;
77
use crate::error::FetchError;
@@ -19,7 +19,7 @@ const API_TIMEOUT: Duration = Duration::from_secs(10);
1919
/// YouTube video fetcher
2020
///
2121
/// Matches `youtube.com/watch?v={id}` and `youtu.be/{id}`, returning
22-
/// video metadata via oEmbed.
22+
/// video metadata via oEmbed and transcript when available.
2323
pub struct YouTubeFetcher;
2424

2525
impl YouTubeFetcher {
@@ -71,6 +71,12 @@ struct OEmbedResponse {
7171
author_url: Option<String>,
7272
}
7373

74+
/// A single caption entry taken from one `<text>` element of YouTube's
/// timedtext XML.
#[derive(Debug)]
struct TranscriptSegment {
    /// Caption text for this entry.
    text: String,
}
79+
7480
#[async_trait]
7581
impl Fetcher for YouTubeFetcher {
7682
fn name(&self) -> &'static str {
@@ -112,15 +118,14 @@ impl Fetcher for YouTubeFetcher {
112118

113119
// Fetch oEmbed metadata
114120
// The canonical URL only contains safe ASCII chars, so it can be passed directly
115-
let mut oembed = Url::parse("https://www.youtube.com/oembed").unwrap();
116-
oembed
121+
let mut oembed_url = Url::parse("https://www.youtube.com/oembed").unwrap();
122+
oembed_url
117123
.query_pairs_mut()
118124
.append_pair("url", &canonical_url)
119125
.append_pair("format", "json");
120-
let oembed_url = oembed.to_string();
121126

122127
let oembed = match client
123-
.get(&oembed_url)
128+
.get(oembed_url.as_str())
124129
.header(USER_AGENT, ua_header.clone())
125130
.send()
126131
.await
@@ -135,39 +140,160 @@ impl Fetcher for YouTubeFetcher {
135140
.unwrap_or_else(|| format!("YouTube Video {}", video_id));
136141

137142
let author = oembed.as_ref().and_then(|o| o.author_name.clone());
138-
139143
let author_url = oembed.as_ref().and_then(|o| o.author_url.clone());
140144

141-
// Build response
142-
let mut out = String::new();
143-
out.push_str(&format!("# {}\n\n", title));
145+
// Attempt transcript extraction via timedtext API
146+
let transcript = fetch_transcript(&client, &ua_header, &video_id).await;
144147

145-
out.push_str("## Video Info\n\n");
146-
if let Some(author) = &author {
147-
if let Some(author_url) = &author_url {
148-
out.push_str(&format!("- **Channel:** [{}]({})\n", author, author_url));
149-
} else {
150-
out.push_str(&format!("- **Channel:** {}\n", author));
151-
}
152-
}
153-
out.push_str(&format!("- **Video ID:** {}\n", video_id));
154-
out.push_str(&format!("- **URL:** {}\n", canonical_url));
155-
out.push_str(&format!(
156-
"- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n",
157-
video_id
158-
));
148+
let content = format_youtube_response(
149+
&title,
150+
&video_id,
151+
&canonical_url,
152+
author.as_deref(),
153+
author_url.as_deref(),
154+
transcript.as_deref(),
155+
);
159156

160157
Ok(FetchResponse {
161158
url: request.url.clone(),
162159
status_code: 200,
163160
content_type: Some("text/markdown".to_string()),
164161
format: Some("youtube_video".to_string()),
165-
content: Some(out),
162+
content: Some(content),
166163
..Default::default()
167164
})
168165
}
169166
}
170167

168+
/// Attempt to fetch transcript/captions via YouTube's timedtext XML API.
169+
/// Returns None if transcript is unavailable.
170+
async fn fetch_transcript(
171+
client: &reqwest::Client,
172+
ua: &HeaderValue,
173+
video_id: &str,
174+
) -> Option<String> {
175+
// Try the legacy timedtext API (auto-generated English captions)
176+
let timedtext_url = format!(
177+
"https://www.youtube.com/api/timedtext?v={}&lang=en&fmt=srv3",
178+
video_id
179+
);
180+
181+
let resp = client
182+
.get(&timedtext_url)
183+
.header(USER_AGENT, ua.clone())
184+
.send()
185+
.await
186+
.ok()?;
187+
188+
if !resp.status().is_success() {
189+
return None;
190+
}
191+
192+
let xml = resp.text().await.ok()?;
193+
if xml.is_empty() || !xml.contains("<text") {
194+
return None;
195+
}
196+
197+
let segments = parse_timedtext_xml(&xml);
198+
if segments.is_empty() {
199+
return None;
200+
}
201+
202+
let transcript: String = segments
203+
.iter()
204+
.map(|s| s.text.as_str())
205+
.collect::<Vec<_>>()
206+
.join(" ");
207+
208+
if transcript.is_empty() {
209+
None
210+
} else {
211+
Some(transcript)
212+
}
213+
}
214+
215+
/// Parse YouTube timedtext XML format into transcript segments
216+
fn parse_timedtext_xml(xml: &str) -> Vec<TranscriptSegment> {
217+
let mut segments = Vec::new();
218+
let mut search_from = 0;
219+
220+
while let Some(start) = xml[search_from..].find("<text") {
221+
let abs_start = search_from + start;
222+
let content_start = match xml[abs_start..].find('>') {
223+
Some(pos) => abs_start + pos + 1,
224+
None => break,
225+
};
226+
227+
let content_end = match xml[content_start..].find("</text>") {
228+
Some(pos) => content_start + pos,
229+
None => break,
230+
};
231+
232+
let text = decode_xml_entities(&xml[content_start..content_end]);
233+
let text = text.trim().to_string();
234+
if !text.is_empty() {
235+
segments.push(TranscriptSegment { text });
236+
}
237+
238+
search_from = content_end + 7; // "</text>".len()
239+
}
240+
241+
segments
242+
}
243+
244+
/// Replace the XML/HTML entities handled here (`&amp;`, `&lt;`, `&gt;`,
/// `&quot;`, `&#39;`, `&apos;`) with the characters they stand for.
///
/// Replacements are applied one after another in the order listed, starting
/// with `&amp;`. As a consequence an escaped entity such as `&amp;lt;` is
/// decoded twice and ends up as `<`.
/// NOTE(review): confirm this double decoding is intended.
fn decode_xml_entities(s: &str) -> String {
    // Order matters: each replacement runs on the output of the previous one.
    const REPLACEMENTS: [(&str, &str); 6] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#39;", "'"),
        ("&apos;", "'"),
    ];

    REPLACEMENTS
        .iter()
        .fold(s.to_string(), |decoded, &(entity, plain)| {
            decoded.replace(entity, plain)
        })
}
253+
254+
/// Render the Markdown document returned for a YouTube video.
///
/// The output has a `# {title}` heading, a `## Video Info` list (channel if
/// known, video ID, URL, thumbnail URL) and then either a `## Transcript`
/// section or a note that no transcript is available.
///
/// * `author` / `author_url` - channel name and link; the channel line is
///   omitted when `author` is `None`, and rendered without a link when only
///   `author_url` is `None`.
/// * `transcript` - transcript text; anything beyond 15000 bytes is cut off
///   and a `*[Transcript truncated]*` marker is appended.
fn format_youtube_response(
    title: &str,
    video_id: &str,
    canonical_url: &str,
    author: Option<&str>,
    author_url: Option<&str>,
    transcript: Option<&str>,
) -> String {
    // Upper bound, in bytes, on the transcript text copied into the output.
    const MAX_TRANSCRIPT_BYTES: usize = 15000;

    let mut out = String::new();
    out.push_str(&format!("# {}\n\n", title));

    out.push_str("## Video Info\n\n");
    if let Some(author) = author {
        if let Some(url) = author_url {
            out.push_str(&format!("- **Channel:** [{}]({})\n", author, url));
        } else {
            out.push_str(&format!("- **Channel:** {}\n", author));
        }
    }
    out.push_str(&format!("- **Video ID:** {}\n", video_id));
    out.push_str(&format!("- **URL:** {}\n", canonical_url));
    out.push_str(&format!(
        "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n",
        video_id
    ));

    if let Some(transcript) = transcript {
        out.push_str("\n## Transcript\n\n");
        if transcript.len() > MAX_TRANSCRIPT_BYTES {
            // Slicing a `str` at a byte offset inside a multi-byte character
            // panics, so back up to the closest character boundary first.
            // Offset 0 is always a boundary, so the loop terminates.
            let mut cut = MAX_TRANSCRIPT_BYTES;
            while !transcript.is_char_boundary(cut) {
                cut -= 1;
            }
            out.push_str(&transcript[..cut]);
            out.push_str("\n\n*[Transcript truncated]*\n");
        } else {
            out.push_str(transcript);
            out.push('\n');
        }
    } else {
        out.push_str("\n*No transcript available for this video.*\n");
    }

    out
}
296+
171297
#[cfg(test)]
172298
mod tests {
173299
use super::*;
@@ -199,6 +325,15 @@ mod tests {
199325
);
200326
}
201327

328+
#[test]
fn test_parse_youtube_mobile() {
    // A watch URL on the mobile host yields the value of its `v` parameter.
    let mobile = Url::parse("https://m.youtube.com/watch?v=abc123").unwrap();
    let parsed = YouTubeFetcher::parse_video_id(&mobile);
    assert_eq!(parsed, Some(String::from("abc123")));
}
336+
202337
#[test]
203338
fn test_rejects_non_watch() {
204339
let url = Url::parse("https://www.youtube.com/channel/UC123").unwrap();
@@ -217,6 +352,12 @@ mod tests {
217352
assert_eq!(YouTubeFetcher::parse_video_id(&url), None);
218353
}
219354

355+
#[test]
fn test_rejects_empty_v_param() {
    // `v=` with nothing after it must not be accepted as a video ID.
    let empty_id = Url::parse("https://www.youtube.com/watch?v=").unwrap();
    let parsed = YouTubeFetcher::parse_video_id(&empty_id);
    assert_eq!(parsed, None);
}
360+
220361
#[test]
221362
fn test_fetcher_matches() {
222363
let fetcher = YouTubeFetcher::new();
@@ -227,7 +368,90 @@ mod tests {
227368
let url = Url::parse("https://youtu.be/abc").unwrap();
228369
assert!(fetcher.matches(&url));
229370

371+
let url = Url::parse("https://m.youtube.com/watch?v=abc").unwrap();
372+
assert!(fetcher.matches(&url));
373+
230374
let url = Url::parse("https://example.com/watch?v=abc").unwrap();
231375
assert!(!fetcher.matches(&url));
232376
}
377+
378+
#[test]
fn test_format_youtube_response_with_all_fields() {
    let rendered = format_youtube_response(
        "Test Video",
        "abc123",
        "https://www.youtube.com/watch?v=abc123",
        Some("Test Channel"),
        Some("https://www.youtube.com/channel/UC123"),
        Some("Hello world this is a transcript."),
    );

    // Every piece of supplied metadata must appear in the rendered Markdown.
    let expected_fragments = [
        "# Test Video",
        "[Test Channel](https://www.youtube.com/channel/UC123)",
        "**Video ID:** abc123",
        "## Transcript",
        "Hello world this is a transcript.",
    ];
    for fragment in expected_fragments.iter() {
        assert!(rendered.contains(fragment));
    }
}
395+
396+
#[test]
fn test_format_youtube_response_no_transcript() {
    let (author, author_url, transcript) = (None, None, None);
    let rendered = format_youtube_response(
        "Test Video",
        "abc123",
        "https://www.youtube.com/watch?v=abc123",
        author,
        author_url,
        transcript,
    );

    assert!(rendered.contains("# Test Video"));
    // Without a transcript the section heading is replaced by a notice.
    let has_notice = rendered.contains("No transcript available");
    let has_heading = rendered.contains("## Transcript");
    assert!(has_notice);
    assert!(!has_heading);
}
411+
412+
#[test]
fn test_format_youtube_response_truncates_long_transcript() {
    let long_transcript = "a".repeat(20000);
    let output = format_youtube_response(
        "Long Video",
        "abc",
        "https://www.youtube.com/watch?v=abc",
        None,
        None,
        Some(&long_transcript),
    );

    assert!(output.contains("[Transcript truncated]"));
    // Pin the cut-off point: exactly the first 15000 characters are kept.
    assert!(output.contains(&"a".repeat(15000)));
    assert!(!output.contains(&"a".repeat(15001)));
    assert!(output.len() < 20000);
}
427+
428+
#[test]
fn test_parse_timedtext_xml() {
    let xml = r#"<?xml version="1.0" encoding="utf-8"?>
<transcript>
<text start="0.5" dur="1.2">Hello everyone</text>
<text start="1.7" dur="2.0">Welcome to this video</text>
<text start="3.7" dur="1.5">Let&apos;s get started</text>
</transcript>"#;

    // One segment per <text> element, in document order, with entities decoded.
    let segments = parse_timedtext_xml(xml);
    let texts: Vec<&str> = segments.iter().map(|s| s.text.as_str()).collect();
    assert_eq!(
        texts,
        ["Hello everyone", "Welcome to this video", "Let's get started"]
    );
}
443+
444+
#[test]
fn test_parse_timedtext_xml_empty() {
    // A transcript document without any <text> element yields no segments.
    let parsed =
        parse_timedtext_xml(r#"<?xml version="1.0" encoding="utf-8"?><transcript></transcript>"#);
    assert_eq!(parsed.len(), 0);
}
450+
451+
#[test]
fn test_decode_xml_entities() {
    // (input, expected) pairs covering every entity the decoder handles,
    // plus a pass-through case.
    let cases = [
        ("a &amp; b", "a & b"),
        ("&lt;tag&gt;", "<tag>"),
        ("it&#39;s", "it's"),
        ("it&apos;s", "it's"),
        ("&quot;quoted&quot;", "\"quoted\""),
        ("no entities", "no entities"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(decode_xml_entities(input), expected);
    }
}
233457
}

0 commit comments

Comments
 (0)