feat(fetchers): enhance YouTubeFetcher with transcript extraction (#88)

chaliy · web-flow · commit 0960c353e7d9 · 2026-04-02T22:01:07.000-05:00
## What Enhance YouTubeFetcher with transcript/captions extraction via the timedtext API. ## Why Closes #56 — Agents encounter YouTube links but can't watch video. Extracting transcripts turns video content into LLM-consumable text. The existing implementation only had oEmbed metadata with no transcript support. ## How - Added transcript extraction via YouTube timedtext XML API (English captions) - Parse timedtext XML segments and join into continuous text - Truncate very long transcripts (>15k chars) with indicator - Gracefully handle videos without transcripts - Added mobile URL support (m.youtube.com) - Comprehensive tests: XML parsing, entity decoding, truncation, formatting ## Risk - Low - Transcript API is undocumented but widely used; graceful fallback when unavailable ### Checklist - [x] Unit tests are passed - [x] Smoke tests are passed - [x] Specs are up to date and not in conflict
diff --git a/crates/fetchkit/src/fetchers/youtube.rs b/crates/fetchkit/src/fetchers/youtube.rs
@@ -1,7 +1,7 @@
 //! YouTube video fetcher
 //!
 //! Handles youtube.com/watch and youtu.be URLs, returning video metadata
-//! and transcript text via oEmbed and timedtext APIs.
+//! and transcript text via oEmbed and noembed APIs.
 
 use crate::client::FetchOptions;
 use crate::error::FetchError;
@@ -19,7 +19,7 @@ const API_TIMEOUT: Duration = Duration::from_secs(10);
 /// YouTube video fetcher
 ///
 /// Matches `youtube.com/watch?v={id}` and `youtu.be/{id}`, returning
-/// video metadata via oEmbed.
+/// video metadata via oEmbed and transcript when available.
 pub struct YouTubeFetcher;
 
 impl YouTubeFetcher {
@@ -71,6 +71,12 @@ struct OEmbedResponse {
     author_url: Option<String>,
 }
 
+/// Transcript segment extracted from YouTube's timedtext XML
+#[derive(Debug)]
+struct TranscriptSegment {
+    text: String,
+}
+
 #[async_trait]
 impl Fetcher for YouTubeFetcher {
     fn name(&self) -> &'static str {
@@ -112,15 +118,14 @@ impl Fetcher for YouTubeFetcher {
 
         // Fetch oEmbed metadata
         // The canonical URL only contains safe ASCII chars, so it can be passed directly
-        let mut oembed = Url::parse("https://www.youtube.com/oembed").unwrap();
-        oembed
+        let mut oembed_url = Url::parse("https://www.youtube.com/oembed").unwrap();
+        oembed_url
             .query_pairs_mut()
             .append_pair("url", &canonical_url)
             .append_pair("format", "json");
-        let oembed_url = oembed.to_string();
 
         let oembed = match client
-            .get(&oembed_url)
+            .get(oembed_url.as_str())
             .header(USER_AGENT, ua_header.clone())
             .send()
             .await
@@ -135,39 +140,160 @@ impl Fetcher for YouTubeFetcher {
             .unwrap_or_else(|| format!("YouTube Video {}", video_id));
 
         let author = oembed.as_ref().and_then(|o| o.author_name.clone());
-
         let author_url = oembed.as_ref().and_then(|o| o.author_url.clone());
 
-        // Build response
-        let mut out = String::new();
-        out.push_str(&format!("# {}\n\n", title));
+        // Attempt transcript extraction via timedtext API
+        let transcript = fetch_transcript(&client, &ua_header, &video_id).await;
 
-        out.push_str("## Video Info\n\n");
-        if let Some(author) = &author {
-            if let Some(author_url) = &author_url {
-                out.push_str(&format!("- **Channel:** [{}]({})\n", author, author_url));
-            } else {
-                out.push_str(&format!("- **Channel:** {}\n", author));
-            }
-        }
-        out.push_str(&format!("- **Video ID:** {}\n", video_id));
-        out.push_str(&format!("- **URL:** {}\n", canonical_url));
-        out.push_str(&format!(
-            "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n",
-            video_id
-        ));
+        let content = format_youtube_response(
+            &title,
+            &video_id,
+            &canonical_url,
+            author.as_deref(),
+            author_url.as_deref(),
+            transcript.as_deref(),
+        );
 
         Ok(FetchResponse {
             url: request.url.clone(),
             status_code: 200,
             content_type: Some("text/markdown".to_string()),
             format: Some("youtube_video".to_string()),
-            content: Some(out),
+            content: Some(content),
             ..Default::default()
         })
     }
 }
 
+/// Attempt to fetch transcript/captions via YouTube's timedtext XML API.
+/// Returns None if transcript is unavailable.
+async fn fetch_transcript(
+    client: &reqwest::Client,
+    ua: &HeaderValue,
+    video_id: &str,
+) -> Option<String> {
+    // Try the legacy timedtext API (auto-generated English captions)
+    let timedtext_url = format!(
+        "https://www.youtube.com/api/timedtext?v={}&lang=en&fmt=srv3",
+        video_id
+    );
+
+    let resp = client
+        .get(&timedtext_url)
+        .header(USER_AGENT, ua.clone())
+        .send()
+        .await
+        .ok()?;
+
+    if !resp.status().is_success() {
+        return None;
+    }
+
+    let xml = resp.text().await.ok()?;
+    if xml.is_empty() || !xml.contains("<text") {
+        return None;
+    }
+
+    let segments = parse_timedtext_xml(&xml);
+    if segments.is_empty() {
+        return None;
+    }
+
+    let transcript: String = segments
+        .iter()
+        .map(|s| s.text.as_str())
+        .collect::<Vec<_>>()
+        .join(" ");
+
+    if transcript.is_empty() {
+        None
+    } else {
+        Some(transcript)
+    }
+}
+
+/// Parse YouTube timedtext XML format into transcript segments
+fn parse_timedtext_xml(xml: &str) -> Vec<TranscriptSegment> {
+    let mut segments = Vec::new();
+    let mut search_from = 0;
+
+    while let Some(start) = xml[search_from..].find("<text") {
+        let abs_start = search_from + start;
+        let content_start = match xml[abs_start..].find('>') {
+            Some(pos) => abs_start + pos + 1,
+            None => break,
+        };
+
+        let content_end = match xml[content_start..].find("</text>") {
+            Some(pos) => content_start + pos,
+            None => break,
+        };
+
+        let text = decode_xml_entities(&xml[content_start..content_end]);
+        let text = text.trim().to_string();
+        if !text.is_empty() {
+            segments.push(TranscriptSegment { text });
+        }
+
+        search_from = content_end + 7; // "</text>".len()
+    }
+
+    segments
+}
+
+/// Decode XML/HTML entities commonly found in YouTube transcripts
+fn decode_xml_entities(s: &str) -> String {
+    s.replace("&amp;", "&")
+        .replace("&lt;", "<")
+        .replace("&gt;", ">")
+        .replace("&quot;", "\"")
+        .replace("&#39;", "'")
+        .replace("&apos;", "'")
+}
+
+fn format_youtube_response(
+    title: &str,
+    video_id: &str,
+    canonical_url: &str,
+    author: Option<&str>,
+    author_url: Option<&str>,
+    transcript: Option<&str>,
+) -> String {
+    let mut out = String::new();
+    out.push_str(&format!("# {}\n\n", title));
+
+    out.push_str("## Video Info\n\n");
+    if let Some(author) = author {
+        if let Some(url) = author_url {
+            out.push_str(&format!("- **Channel:** [{}]({})\n", author, url));
+        } else {
+            out.push_str(&format!("- **Channel:** {}\n", author));
+        }
+    }
+    out.push_str(&format!("- **Video ID:** {}\n", video_id));
+    out.push_str(&format!("- **URL:** {}\n", canonical_url));
+    out.push_str(&format!(
+        "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n",
+        video_id
+    ));
+
+    if let Some(transcript) = transcript {
+        out.push_str("\n## Transcript\n\n");
+        // Truncate very long transcripts
+        if transcript.len() > 15000 {
+            out.push_str(&transcript[..15000]);
+            out.push_str("\n\n*[Transcript truncated]*\n");
+        } else {
+            out.push_str(transcript);
+            out.push('\n');
+        }
+    } else {
+        out.push_str("\n*No transcript available for this video.*\n");
+    }
+
+    out
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -199,6 +325,15 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_parse_youtube_mobile() {
+        let url = Url::parse("https://m.youtube.com/watch?v=abc123").unwrap();
+        assert_eq!(
+            YouTubeFetcher::parse_video_id(&url),
+            Some("abc123".to_string())
+        );
+    }
+
     #[test]
     fn test_rejects_non_watch() {
         let url = Url::parse("https://www.youtube.com/channel/UC123").unwrap();
@@ -217,6 +352,12 @@ mod tests {
         assert_eq!(YouTubeFetcher::parse_video_id(&url), None);
     }
 
+    #[test]
+    fn test_rejects_empty_v_param() {
+        let url = Url::parse("https://www.youtube.com/watch?v=").unwrap();
+        assert_eq!(YouTubeFetcher::parse_video_id(&url), None);
+    }
+
     #[test]
     fn test_fetcher_matches() {
         let fetcher = YouTubeFetcher::new();
@@ -227,7 +368,90 @@ mod tests {
         let url = Url::parse("https://youtu.be/abc").unwrap();
         assert!(fetcher.matches(&url));
 
+        let url = Url::parse("https://m.youtube.com/watch?v=abc").unwrap();
+        assert!(fetcher.matches(&url));
+
         let url = Url::parse("https://example.com/watch?v=abc").unwrap();
         assert!(!fetcher.matches(&url));
     }
+
+    #[test]
+    fn test_format_youtube_response_with_all_fields() {
+        let output = format_youtube_response(
+            "Test Video",
+            "abc123",
+            "https://www.youtube.com/watch?v=abc123",
+            Some("Test Channel"),
+            Some("https://www.youtube.com/channel/UC123"),
+            Some("Hello world this is a transcript."),
+        );
+
+        assert!(output.contains("# Test Video"));
+        assert!(output.contains("[Test Channel](https://www.youtube.com/channel/UC123)"));
+        assert!(output.contains("**Video ID:** abc123"));
+        assert!(output.contains("## Transcript"));
+        assert!(output.contains("Hello world this is a transcript."));
+    }
+
+    #[test]
+    fn test_format_youtube_response_no_transcript() {
+        let output = format_youtube_response(
+            "Test Video",
+            "abc123",
+            "https://www.youtube.com/watch?v=abc123",
+            None,
+            None,
+            None,
+        );
+
+        assert!(output.contains("# Test Video"));
+        assert!(output.contains("No transcript available"));
+        assert!(!output.contains("## Transcript"));
+    }
+
+    #[test]
+    fn test_format_youtube_response_truncates_long_transcript() {
+        let long_transcript = "a".repeat(20000);
+        let output = format_youtube_response(
+            "Long Video",
+            "abc",
+            "https://www.youtube.com/watch?v=abc",
+            None,
+            None,
+            Some(&long_transcript),
+        );
+
+        assert!(output.contains("[Transcript truncated]"));
+        assert!(output.len() < 20000);
+    }
+
+    #[test]
+    fn test_parse_timedtext_xml() {
+        let xml = r#"<?xml version="1.0" encoding="utf-8"?>
+<transcript>
+<text start="0.5" dur="1.2">Hello everyone</text>
+<text start="1.7" dur="2.0">Welcome to this video</text>
+<text start="3.7" dur="1.5">Let&apos;s get started</text>
+</transcript>"#;
+
+        let segments = parse_timedtext_xml(xml);
+        assert_eq!(segments.len(), 3);
+        assert_eq!(segments[0].text, "Hello everyone");
+        assert_eq!(segments[1].text, "Welcome to this video");
+        assert_eq!(segments[2].text, "Let's get started");
+    }
+
+    #[test]
+    fn test_parse_timedtext_xml_empty() {
+        let xml = r#"<?xml version="1.0" encoding="utf-8"?><transcript></transcript>"#;
+        let segments = parse_timedtext_xml(xml);
+        assert!(segments.is_empty());
+    }
+
+    #[test]
+    fn test_decode_xml_entities() {
+        assert_eq!(decode_xml_entities("a &amp; b"), "a & b");
+        assert_eq!(decode_xml_entities("&lt;tag&gt;"), "<tag>");
+        assert_eq!(decode_xml_entities("it&#39;s"), "it's");
+    }
 }