11//! YouTube video fetcher
22//!
33//! Handles youtube.com/watch and youtu.be URLs, returning video metadata
4- //! and transcript text via oEmbed and timedtext APIs.
4+ //! and transcript text via oEmbed and timedtext APIs.
55
66use crate :: client:: FetchOptions ;
77use crate :: error:: FetchError ;
@@ -19,7 +19,7 @@ const API_TIMEOUT: Duration = Duration::from_secs(10);
1919/// YouTube video fetcher
2020///
2121/// Matches `youtube.com/watch?v={id}` and `youtu.be/{id}`, returning
22- /// video metadata via oEmbed.
22+ /// video metadata via oEmbed and transcript when available .
2323pub struct YouTubeFetcher ;
2424
2525impl YouTubeFetcher {
@@ -71,6 +71,12 @@ struct OEmbedResponse {
7171 author_url : Option < String > ,
7272}
7373
/// A single caption entry taken from a YouTube timedtext XML document.
#[derive(Debug)]
struct TranscriptSegment {
    /// Caption text of this entry.
    text: String,
}
79+
7480#[ async_trait]
7581impl Fetcher for YouTubeFetcher {
7682 fn name ( & self ) -> & ' static str {
@@ -112,15 +118,14 @@ impl Fetcher for YouTubeFetcher {
112118
113119 // Fetch oEmbed metadata
114120 // The canonical URL only contains safe ASCII chars, so it can be passed directly
115- let mut oembed = Url :: parse ( "https://www.youtube.com/oembed" ) . unwrap ( ) ;
116- oembed
121+ let mut oembed_url = Url :: parse ( "https://www.youtube.com/oembed" ) . unwrap ( ) ;
122+ oembed_url
117123 . query_pairs_mut ( )
118124 . append_pair ( "url" , & canonical_url)
119125 . append_pair ( "format" , "json" ) ;
120- let oembed_url = oembed. to_string ( ) ;
121126
122127 let oembed = match client
123- . get ( & oembed_url)
128+ . get ( oembed_url. as_str ( ) )
124129 . header ( USER_AGENT , ua_header. clone ( ) )
125130 . send ( )
126131 . await
@@ -135,39 +140,160 @@ impl Fetcher for YouTubeFetcher {
135140 . unwrap_or_else ( || format ! ( "YouTube Video {}" , video_id) ) ;
136141
137142 let author = oembed. as_ref ( ) . and_then ( |o| o. author_name . clone ( ) ) ;
138-
139143 let author_url = oembed. as_ref ( ) . and_then ( |o| o. author_url . clone ( ) ) ;
140144
141- // Build response
142- let mut out = String :: new ( ) ;
143- out. push_str ( & format ! ( "# {}\n \n " , title) ) ;
145+ // Attempt transcript extraction via timedtext API
146+ let transcript = fetch_transcript ( & client, & ua_header, & video_id) . await ;
144147
145- out. push_str ( "## Video Info\n \n " ) ;
146- if let Some ( author) = & author {
147- if let Some ( author_url) = & author_url {
148- out. push_str ( & format ! ( "- **Channel:** [{}]({})\n " , author, author_url) ) ;
149- } else {
150- out. push_str ( & format ! ( "- **Channel:** {}\n " , author) ) ;
151- }
152- }
153- out. push_str ( & format ! ( "- **Video ID:** {}\n " , video_id) ) ;
154- out. push_str ( & format ! ( "- **URL:** {}\n " , canonical_url) ) ;
155- out. push_str ( & format ! (
156- "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n " ,
157- video_id
158- ) ) ;
148+ let content = format_youtube_response (
149+ & title,
150+ & video_id,
151+ & canonical_url,
152+ author. as_deref ( ) ,
153+ author_url. as_deref ( ) ,
154+ transcript. as_deref ( ) ,
155+ ) ;
159156
160157 Ok ( FetchResponse {
161158 url : request. url . clone ( ) ,
162159 status_code : 200 ,
163160 content_type : Some ( "text/markdown" . to_string ( ) ) ,
164161 format : Some ( "youtube_video" . to_string ( ) ) ,
165- content : Some ( out ) ,
162+ content : Some ( content ) ,
166163 ..Default :: default ( )
167164 } )
168165 }
169166}
170167
168+ /// Attempt to fetch transcript/captions via YouTube's timedtext XML API.
169+ /// Returns None if transcript is unavailable.
170+ async fn fetch_transcript (
171+ client : & reqwest:: Client ,
172+ ua : & HeaderValue ,
173+ video_id : & str ,
174+ ) -> Option < String > {
175+ // Try the legacy timedtext API (auto-generated English captions)
176+ let timedtext_url = format ! (
177+ "https://www.youtube.com/api/timedtext?v={}&lang=en&fmt=srv3" ,
178+ video_id
179+ ) ;
180+
181+ let resp = client
182+ . get ( & timedtext_url)
183+ . header ( USER_AGENT , ua. clone ( ) )
184+ . send ( )
185+ . await
186+ . ok ( ) ?;
187+
188+ if !resp. status ( ) . is_success ( ) {
189+ return None ;
190+ }
191+
192+ let xml = resp. text ( ) . await . ok ( ) ?;
193+ if xml. is_empty ( ) || !xml. contains ( "<text" ) {
194+ return None ;
195+ }
196+
197+ let segments = parse_timedtext_xml ( & xml) ;
198+ if segments. is_empty ( ) {
199+ return None ;
200+ }
201+
202+ let transcript: String = segments
203+ . iter ( )
204+ . map ( |s| s. text . as_str ( ) )
205+ . collect :: < Vec < _ > > ( )
206+ . join ( " " ) ;
207+
208+ if transcript. is_empty ( ) {
209+ None
210+ } else {
211+ Some ( transcript)
212+ }
213+ }
214+
215+ /// Parse YouTube timedtext XML format into transcript segments
216+ fn parse_timedtext_xml ( xml : & str ) -> Vec < TranscriptSegment > {
217+ let mut segments = Vec :: new ( ) ;
218+ let mut search_from = 0 ;
219+
220+ while let Some ( start) = xml[ search_from..] . find ( "<text" ) {
221+ let abs_start = search_from + start;
222+ let content_start = match xml[ abs_start..] . find ( '>' ) {
223+ Some ( pos) => abs_start + pos + 1 ,
224+ None => break ,
225+ } ;
226+
227+ let content_end = match xml[ content_start..] . find ( "</text>" ) {
228+ Some ( pos) => content_start + pos,
229+ None => break ,
230+ } ;
231+
232+ let text = decode_xml_entities ( & xml[ content_start..content_end] ) ;
233+ let text = text. trim ( ) . to_string ( ) ;
234+ if !text. is_empty ( ) {
235+ segments. push ( TranscriptSegment { text } ) ;
236+ }
237+
238+ search_from = content_end + 7 ; // "</text>".len()
239+ }
240+
241+ segments
242+ }
243+
/// Decode the XML/HTML entities commonly found in YouTube transcripts.
///
/// Handles `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;` and `&apos;`; any other
/// entity or character reference is left untouched.
///
/// The replacements run in sequence with `&amp;` first, so a doubly escaped
/// sequence such as `&amp;#39;` is unescaped all the way to `'`.
fn decode_xml_entities(s: &str) -> String {
    s.replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
}
253+
/// Render the Markdown document returned for a YouTube video.
///
/// # Arguments
/// * `title` - used as the top-level heading.
/// * `video_id` - listed under "Video Info" and used to build the thumbnail URL.
/// * `canonical_url` - listed as the video URL.
/// * `author` - channel name; the channel line is omitted when `None`.
/// * `author_url` - when present together with `author`, the channel name is
///   rendered as a link to it.
/// * `transcript` - transcript text; when `None` a "no transcript" note is
///   emitted instead of a `## Transcript` section.
///
/// # Returns
/// The Markdown text. Transcripts longer than 15000 bytes are cut at the last
/// UTF-8 character boundary at or below that limit and followed by a
/// truncation marker.
fn format_youtube_response(
    title: &str,
    video_id: &str,
    canonical_url: &str,
    author: Option<&str>,
    author_url: Option<&str>,
    transcript: Option<&str>,
) -> String {
    /// Upper bound, in bytes, on the transcript text included in the output.
    const MAX_TRANSCRIPT_BYTES: usize = 15_000;

    let mut out = String::new();
    out.push_str(&format!("# {}\n\n", title));

    out.push_str("## Video Info\n\n");
    if let Some(author) = author {
        if let Some(url) = author_url {
            out.push_str(&format!("- **Channel:** [{}]({})\n", author, url));
        } else {
            out.push_str(&format!("- **Channel:** {}\n", author));
        }
    }
    out.push_str(&format!("- **Video ID:** {}\n", video_id));
    out.push_str(&format!("- **URL:** {}\n", canonical_url));
    out.push_str(&format!(
        "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n",
        video_id
    ));

    if let Some(transcript) = transcript {
        out.push_str("\n## Transcript\n\n");
        if transcript.len() > MAX_TRANSCRIPT_BYTES {
            // Slicing a `str` in the middle of a multi-byte character panics,
            // so back up to the nearest character boundary. Index 0 is always
            // a boundary, which guarantees the loop terminates.
            let mut cut = MAX_TRANSCRIPT_BYTES;
            while !transcript.is_char_boundary(cut) {
                cut -= 1;
            }
            out.push_str(&transcript[..cut]);
            out.push_str("\n\n*[Transcript truncated]*\n");
        } else {
            out.push_str(transcript);
            out.push('\n');
        }
    } else {
        out.push_str("\n*No transcript available for this video.*\n");
    }

    out
}
296+
171297#[ cfg( test) ]
172298mod tests {
173299 use super :: * ;
@@ -199,6 +325,15 @@ mod tests {
199325 ) ;
200326 }
201327
#[test]
fn test_parse_youtube_mobile() {
    // The mobile host must yield the video id from the `v` query parameter.
    let mobile = Url::parse("https://m.youtube.com/watch?v=abc123").unwrap();
    let parsed = YouTubeFetcher::parse_video_id(&mobile);
    assert_eq!(parsed, Some(String::from("abc123")));
}
336+
202337 #[ test]
203338 fn test_rejects_non_watch ( ) {
204339 let url = Url :: parse ( "https://www.youtube.com/channel/UC123" ) . unwrap ( ) ;
@@ -217,6 +352,12 @@ mod tests {
217352 assert_eq ! ( YouTubeFetcher :: parse_video_id( & url) , None ) ;
218353 }
219354
#[test]
fn test_rejects_empty_v_param() {
    // A `v` parameter that is present but empty must not count as a video id.
    let empty_id = Url::parse("https://www.youtube.com/watch?v=").unwrap();
    let parsed = YouTubeFetcher::parse_video_id(&empty_id);
    assert_eq!(parsed, None);
}
360+
220361 #[ test]
221362 fn test_fetcher_matches ( ) {
222363 let fetcher = YouTubeFetcher :: new ( ) ;
@@ -227,7 +368,90 @@ mod tests {
227368 let url = Url :: parse ( "https://youtu.be/abc" ) . unwrap ( ) ;
228369 assert ! ( fetcher. matches( & url) ) ;
229370
371+ let url = Url :: parse ( "https://m.youtube.com/watch?v=abc" ) . unwrap ( ) ;
372+ assert ! ( fetcher. matches( & url) ) ;
373+
230374 let url = Url :: parse ( "https://example.com/watch?v=abc" ) . unwrap ( ) ;
231375 assert ! ( !fetcher. matches( & url) ) ;
232376 }
377+
#[test]
fn test_format_youtube_response_with_all_fields() {
    // With every optional field supplied, each piece must show up in the output.
    let rendered = format_youtube_response(
        "Test Video",
        "abc123",
        "https://www.youtube.com/watch?v=abc123",
        Some("Test Channel"),
        Some("https://www.youtube.com/channel/UC123"),
        Some("Hello world this is a transcript."),
    );

    let expected_fragments = [
        "# Test Video",
        "[Test Channel](https://www.youtube.com/channel/UC123)",
        "**Video ID:** abc123",
        "## Transcript",
        "Hello world this is a transcript.",
    ];
    for fragment in expected_fragments.iter() {
        assert!(rendered.contains(fragment), "missing fragment: {:?}", fragment);
    }
}
395+
#[test]
fn test_format_youtube_response_no_transcript() {
    // Without a transcript the placeholder note replaces the transcript section.
    let url = "https://www.youtube.com/watch?v=abc123";
    let rendered = format_youtube_response("Test Video", "abc123", url, None, None, None);

    assert!(rendered.contains("# Test Video"));
    assert!(rendered.contains("No transcript available"));
    assert!(!rendered.contains("## Transcript"));
}
411+
#[test]
fn test_format_youtube_response_truncates_long_transcript() {
    // A 20000-byte transcript must be cut and flagged as truncated.
    let oversized = "a".repeat(20000);
    let url = "https://www.youtube.com/watch?v=abc";
    let rendered = format_youtube_response(
        "Long Video",
        "abc",
        url,
        None,
        None,
        Some(oversized.as_str()),
    );

    assert!(rendered.contains("[Transcript truncated]"));
    assert!(rendered.len() < 20000);
}
427+
#[test]
fn test_parse_timedtext_xml() {
    // Three well-formed caption elements must come back in document order.
    let xml = r#"<?xml version="1.0" encoding="utf-8"?>
<transcript>
<text start="0.5" dur="1.2">Hello everyone</text>
<text start="1.7" dur="2.0">Welcome to this video</text>
<text start="3.7" dur="1.5">Let's get started</text>
</transcript>"#;

    let parsed = parse_timedtext_xml(xml);
    let texts: Vec<&str> = parsed.iter().map(|segment| segment.text.as_str()).collect();
    assert_eq!(
        texts,
        vec!["Hello everyone", "Welcome to this video", "Let's get started"]
    );
}
443+
#[test]
fn test_parse_timedtext_xml_empty() {
    // A transcript document without any caption elements yields no segments.
    let parsed =
        parse_timedtext_xml(r#"<?xml version="1.0" encoding="utf-8"?><transcript></transcript>"#);
    assert!(parsed.is_empty());
}
450+
#[test]
fn test_decode_xml_entities() {
    // Inputs carry real entities so the test fails if decoding is a no-op.
    assert_eq!(decode_xml_entities("a &amp; b"), "a & b");
    assert_eq!(decode_xml_entities("&lt;tag&gt;"), "<tag>");
    assert_eq!(decode_xml_entities("it&#39;s"), "it's");
    assert_eq!(decode_xml_entities("it&apos;s"), "it's");
    assert_eq!(decode_xml_entities("&quot;quoted&quot;"), "\"quoted\"");
}
233457}
0 commit comments