Skip to content

Commit 6ae9012

Browse files
authored
feat(fetchers): add YouTubeFetcher for video metadata extraction (#67)
## What
Adds a `YouTubeFetcher` for YouTube video URLs, returning metadata via oEmbed.

Closes #56

## Why
Agents can't watch video but frequently encounter YouTube links. Extracting metadata turns video references into LLM-consumable text.

## How
- Matches `youtube.com/watch?v={id}`, `youtu.be/{id}`, `m.youtube.com`
- Fetches metadata via YouTube oEmbed API
- Returns title, channel, video ID, URL, thumbnail
- Format field: `"youtube_video"`

## Risk
- Low

### Checklist
- [x] Unit tests passed
- [x] Clippy clean
- [x] Formatting applied
1 parent 5479f1a commit 6ae9012

3 files changed

Lines changed: 242 additions & 4 deletions

File tree

crates/fetchkit/src/fetchers/mod.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ mod package_registry;
1212
mod stackoverflow;
1313
mod twitter;
1414
mod wikipedia;
15+
mod youtube;
1516

1617
pub use default::DefaultFetcher;
1718
pub use docs_site::DocsSiteFetcher;
@@ -22,6 +23,7 @@ pub use package_registry::PackageRegistryFetcher;
2223
pub use stackoverflow::StackOverflowFetcher;
2324
pub use twitter::TwitterFetcher;
2425
pub use wikipedia::WikipediaFetcher;
26+
pub use youtube::YouTubeFetcher;
2527

2628
use crate::client::FetchOptions;
2729
use crate::error::FetchError;
@@ -141,6 +143,7 @@ impl FetcherRegistry {
141143
registry.register(Box::new(StackOverflowFetcher::new()));
142144
registry.register(Box::new(PackageRegistryFetcher::new()));
143145
registry.register(Box::new(WikipediaFetcher::new()));
146+
registry.register(Box::new(YouTubeFetcher::new()));
144147
// DocsSiteFetcher for docs sites and llms.txt
145148
registry.register(Box::new(DocsSiteFetcher::new()));
146149
// Default fetcher last (catches all remaining URLs)
@@ -306,9 +309,10 @@ mod tests {
306309
assert_eq!(registry.fetchers[4].name(), "stackoverflow");
307310
assert_eq!(registry.fetchers[5].name(), "package_registry");
308311
assert_eq!(registry.fetchers[6].name(), "wikipedia");
309-
assert_eq!(registry.fetchers[7].name(), "docs_site");
310-
assert_eq!(registry.fetchers[8].name(), "default");
311-
assert_eq!(registry.fetchers.len(), 9);
312+
assert_eq!(registry.fetchers[7].name(), "youtube");
313+
assert_eq!(registry.fetchers[8].name(), "docs_site");
314+
assert_eq!(registry.fetchers[9].name(), "default");
315+
assert_eq!(registry.fetchers.len(), 10);
312316
}
313317

314318
#[test]
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
//! YouTube video fetcher
2+
//!
3+
//! Handles youtube.com/watch and youtu.be URLs, returning video metadata
4+
//! via the YouTube oEmbed API.
5+
6+
use crate::client::FetchOptions;
7+
use crate::error::FetchError;
8+
use crate::fetchers::Fetcher;
9+
use crate::types::{FetchRequest, FetchResponse};
10+
use crate::DEFAULT_USER_AGENT;
11+
use async_trait::async_trait;
12+
use reqwest::header::{HeaderValue, USER_AGENT};
13+
use serde::Deserialize;
14+
use std::time::Duration;
15+
use url::Url;
16+
17+
/// Timeout passed to both `connect_timeout` and `timeout` on the HTTP client
/// that `fetch` builds for the oEmbed request.
const API_TIMEOUT: Duration = Duration::from_secs(10);
18+
19+
/// YouTube video fetcher
20+
///
21+
/// Matches `youtube.com/watch?v={id}` and `youtu.be/{id}`, returning
22+
/// video metadata via oEmbed.
23+
pub struct YouTubeFetcher;
24+
25+
impl YouTubeFetcher {
26+
pub fn new() -> Self {
27+
Self
28+
}
29+
30+
/// Extract video ID from YouTube URL
31+
fn parse_video_id(url: &Url) -> Option<String> {
32+
let host = url.host_str()?;
33+
34+
match host {
35+
"youtube.com" | "www.youtube.com" | "m.youtube.com" => {
36+
// /watch?v={id}
37+
let segments: Vec<&str> =
38+
url.path_segments().map(|s| s.collect()).unwrap_or_default();
39+
if segments.first() != Some(&"watch") {
40+
return None;
41+
}
42+
url.query_pairs()
43+
.find(|(k, _)| k == "v")
44+
.map(|(_, v)| v.to_string())
45+
.filter(|v| !v.is_empty())
46+
}
47+
"youtu.be" => {
48+
// /{id}
49+
let segments: Vec<&str> =
50+
url.path_segments().map(|s| s.collect()).unwrap_or_default();
51+
segments
52+
.first()
53+
.filter(|s| !s.is_empty())
54+
.map(|s| s.to_string())
55+
}
56+
_ => None,
57+
}
58+
}
59+
}
60+
61+
impl Default for YouTubeFetcher {
62+
fn default() -> Self {
63+
Self::new()
64+
}
65+
}
66+
67+
#[derive(Debug, Deserialize)]
68+
struct OEmbedResponse {
69+
title: Option<String>,
70+
author_name: Option<String>,
71+
author_url: Option<String>,
72+
}
73+
74+
#[async_trait]
75+
impl Fetcher for YouTubeFetcher {
76+
fn name(&self) -> &'static str {
77+
"youtube"
78+
}
79+
80+
fn matches(&self, url: &Url) -> bool {
81+
Self::parse_video_id(url).is_some()
82+
}
83+
84+
async fn fetch(
85+
&self,
86+
request: &FetchRequest,
87+
options: &FetchOptions,
88+
) -> Result<FetchResponse, FetchError> {
89+
let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
90+
91+
let video_id = Self::parse_video_id(&url)
92+
.ok_or_else(|| FetchError::FetcherError("Not a valid YouTube URL".to_string()))?;
93+
94+
let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
95+
let mut client_builder = reqwest::Client::builder()
96+
.connect_timeout(API_TIMEOUT)
97+
.timeout(API_TIMEOUT)
98+
.redirect(reqwest::redirect::Policy::limited(3));
99+
100+
if !options.respect_proxy_env {
101+
client_builder = client_builder.no_proxy();
102+
}
103+
104+
let client = client_builder
105+
.build()
106+
.map_err(FetchError::ClientBuildError)?;
107+
108+
let ua_header = HeaderValue::from_str(user_agent)
109+
.unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
110+
111+
let canonical_url = format!("https://www.youtube.com/watch?v={}", video_id);
112+
113+
// Fetch oEmbed metadata
114+
// The canonical URL only contains safe ASCII chars, so it can be passed directly
115+
let mut oembed = Url::parse("https://www.youtube.com/oembed").unwrap();
116+
oembed
117+
.query_pairs_mut()
118+
.append_pair("url", &canonical_url)
119+
.append_pair("format", "json");
120+
let oembed_url = oembed.to_string();
121+
122+
let oembed = match client
123+
.get(&oembed_url)
124+
.header(USER_AGENT, ua_header.clone())
125+
.send()
126+
.await
127+
{
128+
Ok(resp) if resp.status().is_success() => resp.json::<OEmbedResponse>().await.ok(),
129+
_ => None,
130+
};
131+
132+
let title = oembed
133+
.as_ref()
134+
.and_then(|o| o.title.clone())
135+
.unwrap_or_else(|| format!("YouTube Video {}", video_id));
136+
137+
let author = oembed.as_ref().and_then(|o| o.author_name.clone());
138+
139+
let author_url = oembed.as_ref().and_then(|o| o.author_url.clone());
140+
141+
// Build response
142+
let mut out = String::new();
143+
out.push_str(&format!("# {}\n\n", title));
144+
145+
out.push_str("## Video Info\n\n");
146+
if let Some(author) = &author {
147+
if let Some(author_url) = &author_url {
148+
out.push_str(&format!("- **Channel:** [{}]({})\n", author, author_url));
149+
} else {
150+
out.push_str(&format!("- **Channel:** {}\n", author));
151+
}
152+
}
153+
out.push_str(&format!("- **Video ID:** {}\n", video_id));
154+
out.push_str(&format!("- **URL:** {}\n", canonical_url));
155+
out.push_str(&format!(
156+
"- **Thumbnail:** https://img.youtube.com/vi/{}/maxresdefault.jpg\n",
157+
video_id
158+
));
159+
160+
Ok(FetchResponse {
161+
url: request.url.clone(),
162+
status_code: 200,
163+
content_type: Some("text/markdown".to_string()),
164+
format: Some("youtube_video".to_string()),
165+
content: Some(out),
166+
..Default::default()
167+
})
168+
}
169+
}
170+
171+
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `raw` as a URL and run it through the video-ID extractor.
    fn video_id(raw: &str) -> Option<String> {
        let parsed = Url::parse(raw).unwrap();
        YouTubeFetcher::parse_video_id(&parsed)
    }

    #[test]
    fn test_parse_youtube_watch() {
        let id = video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ");
        assert_eq!(id, Some("dQw4w9WgXcQ".to_string()));
    }

    #[test]
    fn test_parse_youtu_be() {
        let id = video_id("https://youtu.be/dQw4w9WgXcQ");
        assert_eq!(id, Some("dQw4w9WgXcQ".to_string()));
    }

    #[test]
    fn test_parse_youtube_no_www() {
        let id = video_id("https://youtube.com/watch?v=abc123");
        assert_eq!(id, Some("abc123".to_string()));
    }

    #[test]
    fn test_rejects_non_watch() {
        assert_eq!(video_id("https://www.youtube.com/channel/UC123"), None);
    }

    #[test]
    fn test_rejects_no_v_param() {
        assert_eq!(video_id("https://www.youtube.com/watch?list=PL123"), None);
    }

    #[test]
    fn test_rejects_non_youtube() {
        assert_eq!(video_id("https://vimeo.com/123456"), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = YouTubeFetcher::new();

        // (URL, whether the fetcher should claim it)
        let cases = [
            ("https://www.youtube.com/watch?v=abc", true),
            ("https://youtu.be/abc", true),
            ("https://example.com/watch?v=abc", false),
        ];

        for &(raw, expected) in cases.iter() {
            let url = Url::parse(raw).unwrap();
            assert_eq!(fetcher.matches(&url), expected, "{}", raw);
        }
    }
}

crates/fetchkit/src/lib.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
//! - [`StackOverflowFetcher`] - Stack Overflow Q&A content
6868
//! - [`TwitterFetcher`] - Twitter/X tweet content with article metadata
6969
//! - [`WikipediaFetcher`] - Wikipedia article content via MediaWiki API
70+
//! - [`YouTubeFetcher`] - YouTube video metadata via oEmbed
7071
7172
#[cfg(feature = "bot-auth")]
7273
pub mod bot_auth;
@@ -87,7 +88,7 @@ pub use error::{FetchError, ToolError};
8788
pub use fetchers::{
8889
DefaultFetcher, DocsSiteFetcher, Fetcher, FetcherRegistry, GitHubCodeFetcher,
8990
GitHubIssueFetcher, GitHubRepoFetcher, PackageRegistryFetcher, StackOverflowFetcher,
90-
TwitterFetcher, WikipediaFetcher,
91+
TwitterFetcher, WikipediaFetcher, YouTubeFetcher,
9192
};
9293
pub use file_saver::{FileSaveError, FileSaver, LocalFileSaver, SaveResult};
9394
pub use tool::{

0 commit comments

Comments
 (0)