|
| 1 | +//! YouTube video fetcher |
| 2 | +//! |
| 3 | +//! Handles youtube.com/watch and youtu.be URLs, returning video metadata |
| 4 | +//! (title, channel, canonical URL, thumbnail) via the oEmbed API. |
| 5 | +
|
| 6 | +use crate::client::FetchOptions; |
| 7 | +use crate::error::FetchError; |
| 8 | +use crate::fetchers::Fetcher; |
| 9 | +use crate::types::{FetchRequest, FetchResponse}; |
| 10 | +use crate::DEFAULT_USER_AGENT; |
| 11 | +use async_trait::async_trait; |
| 12 | +use reqwest::header::{HeaderValue, USER_AGENT}; |
| 13 | +use serde::Deserialize; |
| 14 | +use std::time::Duration; |
| 15 | +use url::Url; |
| 16 | + |
/// Upper bound applied both to establishing a connection and to each whole
/// request made against the YouTube endpoints.
const API_TIMEOUT: Duration = Duration::from_secs(10);
| 18 | + |
/// YouTube video fetcher
///
/// Matches `youtube.com/watch?v={id}` and `youtu.be/{id}`, returning
/// video metadata via oEmbed.
///
/// The type carries no state, so it is trivially `Copy` and can be freely
/// duplicated or logged with `{:?}`.
#[derive(Debug, Clone, Copy)]
pub struct YouTubeFetcher;
| 24 | + |
| 25 | +impl YouTubeFetcher { |
| 26 | + pub fn new() -> Self { |
| 27 | + Self |
| 28 | + } |
| 29 | + |
| 30 | + /// Extract video ID from YouTube URL |
| 31 | + fn parse_video_id(url: &Url) -> Option<String> { |
| 32 | + let host = url.host_str()?; |
| 33 | + |
| 34 | + match host { |
| 35 | + "youtube.com" | "www.youtube.com" | "m.youtube.com" => { |
| 36 | + // /watch?v={id} |
| 37 | + let segments: Vec<&str> = |
| 38 | + url.path_segments().map(|s| s.collect()).unwrap_or_default(); |
| 39 | + if segments.first() != Some(&"watch") { |
| 40 | + return None; |
| 41 | + } |
| 42 | + url.query_pairs() |
| 43 | + .find(|(k, _)| k == "v") |
| 44 | + .map(|(_, v)| v.to_string()) |
| 45 | + .filter(|v| !v.is_empty()) |
| 46 | + } |
| 47 | + "youtu.be" => { |
| 48 | + // /{id} |
| 49 | + let segments: Vec<&str> = |
| 50 | + url.path_segments().map(|s| s.collect()).unwrap_or_default(); |
| 51 | + segments |
| 52 | + .first() |
| 53 | + .filter(|s| !s.is_empty()) |
| 54 | + .map(|s| s.to_string()) |
| 55 | + } |
| 56 | + _ => None, |
| 57 | + } |
| 58 | + } |
| 59 | +} |
| 60 | + |
| 61 | +impl Default for YouTubeFetcher { |
| 62 | + fn default() -> Self { |
| 63 | + Self::new() |
| 64 | + } |
| 65 | +} |
| 66 | + |
| 67 | +#[derive(Debug, Deserialize)] |
| 68 | +struct OEmbedResponse { |
| 69 | + title: Option<String>, |
| 70 | + author_name: Option<String>, |
| 71 | + author_url: Option<String>, |
| 72 | +} |
| 73 | + |
| 74 | +#[async_trait] |
| 75 | +impl Fetcher for YouTubeFetcher { |
| 76 | + fn name(&self) -> &'static str { |
| 77 | + "youtube" |
| 78 | + } |
| 79 | + |
| 80 | + fn matches(&self, url: &Url) -> bool { |
| 81 | + Self::parse_video_id(url).is_some() |
| 82 | + } |
| 83 | + |
| 84 | + async fn fetch( |
| 85 | + &self, |
| 86 | + request: &FetchRequest, |
| 87 | + options: &FetchOptions, |
| 88 | + ) -> Result<FetchResponse, FetchError> { |
| 89 | + let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; |
| 90 | + |
| 91 | + let video_id = Self::parse_video_id(&url) |
| 92 | + .ok_or_else(|| FetchError::FetcherError("Not a valid YouTube URL".to_string()))?; |
| 93 | + |
| 94 | + let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); |
| 95 | + let mut client_builder = reqwest::Client::builder() |
| 96 | + .connect_timeout(API_TIMEOUT) |
| 97 | + .timeout(API_TIMEOUT) |
| 98 | + .redirect(reqwest::redirect::Policy::limited(3)); |
| 99 | + |
| 100 | + if !options.respect_proxy_env { |
| 101 | + client_builder = client_builder.no_proxy(); |
| 102 | + } |
| 103 | + |
| 104 | + let client = client_builder |
| 105 | + .build() |
| 106 | + .map_err(FetchError::ClientBuildError)?; |
| 107 | + |
| 108 | + let ua_header = HeaderValue::from_str(user_agent) |
| 109 | + .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT)); |
| 110 | + |
| 111 | + let canonical_url = format!("https://www.youtube.com/watch?v={}", video_id); |
| 112 | + |
| 113 | + // Fetch oEmbed metadata |
| 114 | + // The canonical URL only contains safe ASCII chars, so it can be passed directly |
| 115 | + let mut oembed = Url::parse("https://www.youtube.com/oembed").unwrap(); |
| 116 | + oembed |
| 117 | + .query_pairs_mut() |
| 118 | + .append_pair("url", &canonical_url) |
| 119 | + .append_pair("format", "json"); |
| 120 | + let oembed_url = oembed.to_string(); |
| 121 | + |
| 122 | + let oembed = match client |
| 123 | + .get(&oembed_url) |
| 124 | + .header(USER_AGENT, ua_header.clone()) |
| 125 | + .send() |
| 126 | + .await |
| 127 | + { |
| 128 | + Ok(resp) if resp.status().is_success() => resp.json::<OEmbedResponse>().await.ok(), |
| 129 | + _ => None, |
| 130 | + }; |
| 131 | + |
| 132 | + let title = oembed |
| 133 | + .as_ref() |
| 134 | + .and_then(|o| o.title.clone()) |
| 135 | + .unwrap_or_else(|| format!("YouTube Video {}", video_id)); |
| 136 | + |
| 137 | + let author = oembed.as_ref().and_then(|o| o.author_name.clone()); |
| 138 | + |
| 139 | + let author_url = oembed.as_ref().and_then(|o| o.author_url.clone()); |
| 140 | + |
| 141 | + // Build response |
| 142 | + let mut out = String::new(); |
| 143 | + out.push_str(&format!("# {}\n\n", title)); |
| 144 | + |
| 145 | + out.push_str("## Video Info\n\n"); |
| 146 | + if let Some(author) = &author { |
| 147 | + if let Some(author_url) = &author_url { |
| 148 | + out.push_str(&format!("- **Channel:** [{}]({})\n", author, author_url)); |
| 149 | + } else { |
| 150 | + out.push_str(&format!("- **Channel:** {}\n", author)); |
| 151 | + } |
| 152 | + } |
| 153 | + out.push_str(&format!("- **Video ID:** {}\n", video_id)); |
| 154 | + out.push_str(&format!("- **URL:** {}\n", canonical_url)); |
| 155 | + out.push_str(&format!( |
| 156 | + "- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n", |
| 157 | + video_id |
| 158 | + )); |
| 159 | + |
| 160 | + Ok(FetchResponse { |
| 161 | + url: request.url.clone(), |
| 162 | + status_code: 200, |
| 163 | + content_type: Some("text/markdown".to_string()), |
| 164 | + format: Some("youtube_video".to_string()), |
| 165 | + content: Some(out), |
| 166 | + ..Default::default() |
| 167 | + }) |
| 168 | + } |
| 169 | +} |
| 170 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `raw` as a URL and run it through the video ID extractor.
    fn video_id_of(raw: &str) -> Option<String> {
        let parsed = Url::parse(raw).unwrap();
        YouTubeFetcher::parse_video_id(&parsed)
    }

    #[test]
    fn test_parse_youtube_watch() {
        let id = video_id_of("https://www.youtube.com/watch?v=dQw4w9WgXcQ");
        assert_eq!(id, Some("dQw4w9WgXcQ".to_string()));
    }

    #[test]
    fn test_parse_youtu_be() {
        let id = video_id_of("https://youtu.be/dQw4w9WgXcQ");
        assert_eq!(id, Some("dQw4w9WgXcQ".to_string()));
    }

    #[test]
    fn test_parse_youtube_no_www() {
        let id = video_id_of("https://youtube.com/watch?v=abc123");
        assert_eq!(id, Some("abc123".to_string()));
    }

    #[test]
    fn test_rejects_non_watch() {
        assert_eq!(video_id_of("https://www.youtube.com/channel/UC123"), None);
    }

    #[test]
    fn test_rejects_no_v_param() {
        assert_eq!(
            video_id_of("https://www.youtube.com/watch?list=PL123"),
            None
        );
    }

    #[test]
    fn test_rejects_non_youtube() {
        assert_eq!(video_id_of("https://vimeo.com/123456"), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = YouTubeFetcher::new();

        // (URL, whether the fetcher should claim it)
        let cases = [
            ("https://www.youtube.com/watch?v=abc", true),
            ("https://youtu.be/abc", true),
            ("https://example.com/watch?v=abc", false),
        ];

        for &(raw, expected) in cases.iter() {
            let url = Url::parse(raw).unwrap();
            assert_eq!(fetcher.matches(&url), expected, "url: {}", raw);
        }
    }
}
0 commit comments