From a2bcea95716670abc46a5b469199f320f61122e4 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 6 Feb 2026 16:24:13 +0000 Subject: [PATCH 1/2] Add TTS Provider interface and OpenAI provider (WIP) Begin refactoring TTS to support multiple providers (OpenAI, ElevenLabs) for custom voice cloning. Introduces Provider interface and extracts OpenAI-specific logic into its own provider implementation. https://claude.ai/code/session_01WZdrtu92JoNdG5MhQnPmie --- internal/tts/openai.go | 111 +++++++++++++++++++++++++++++++++++++++ internal/tts/provider.go | 27 ++++++++++ 2 files changed, 138 insertions(+) create mode 100644 internal/tts/openai.go create mode 100644 internal/tts/provider.go diff --git a/internal/tts/openai.go b/internal/tts/openai.go new file mode 100644 index 0000000..a2f983e --- /dev/null +++ b/internal/tts/openai.go @@ -0,0 +1,111 @@ +package tts + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" +) + +const ( + openAITTSURL = "https://api.openai.com/v1/audio/speech" + openAIMaxChunkSize = 4000 // OpenAI TTS max is 4096 chars, leave some margin +) + +// OpenAIProvider implements the Provider interface for OpenAI's TTS API +type OpenAIProvider struct { + apiKey string + model string + voice string + client *http.Client +} + +// NewOpenAIProvider creates a new OpenAI TTS provider +func NewOpenAIProvider(apiKey, model, voice string) *OpenAIProvider { + if model == "" { + model = "tts-1" + } + if voice == "" { + voice = "alloy" + } + return &OpenAIProvider{ + apiKey: apiKey, + model: model, + voice: voice, + client: &http.Client{}, + } +} + +// openAITTSRequest is the request body for OpenAI's TTS API +type openAITTSRequest struct { + Model string `json:"model"` + Input string `json:"input"` + Voice string `json:"voice"` +} + +func (p *OpenAIProvider) GenerateChunkAudio(ctx context.Context, text, voiceID string) ([]byte, error) { + if voiceID == "" { + voiceID = p.voice + } + + reqBody := openAITTSRequest{ + Model: p.model, + Input: text, + Voice: voiceID, + } + + jsonBody, err := json.Marshal(reqBody) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, openAITTSURL, bytes.NewReader(jsonBody)) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Authorization", "Bearer "+p.apiKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := p.client.Do(req) + if err != nil { + return nil, fmt.Errorf("TTS API request failed: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("TTS API returned status %d: %s", resp.StatusCode, string(body)) + } + + return body, nil +} + +func (p *OpenAIProvider) AvailableVoices() []Voice { + return []Voice{ + {ID: "alloy", Name: "Alloy"}, + {ID: "echo", Name: "Echo"}, + {ID: "fable", Name: "Fable"}, + {ID: "onyx", Name: "Onyx"}, + {ID: "nova", Name: "Nova"}, + {ID: "shimmer", Name: "Shimmer"}, + } +} + +func (p *OpenAIProvider) DefaultVoice() string { + return p.voice +} + +func (p *OpenAIProvider) MaxChunkSize() int { + return openAIMaxChunkSize +} + +func (p *OpenAIProvider) Name() string { + return "openai" +} diff --git a/internal/tts/provider.go b/internal/tts/provider.go new file mode 100644 index 0000000..d82d4e7 --- /dev/null +++ b/internal/tts/provider.go @@ -0,0 +1,27 @@ +package tts + +import "context" + +// Voice represents a TTS voice option +type Voice struct { + ID string // Provider-specific voice identifier + Name string // Human-readable display name +} + +// Provider defines the interface for TTS providers (OpenAI, ElevenLabs, etc.) +type Provider interface { + // GenerateChunkAudio converts a single chunk of text to audio bytes. + GenerateChunkAudio(ctx context.Context, text, voiceID string) ([]byte, error) + + // AvailableVoices returns the list of voices available from this provider. + AvailableVoices() []Voice + + // DefaultVoice returns the default voice ID for this provider. + DefaultVoice() string + + // MaxChunkSize returns the maximum characters per API call. + MaxChunkSize() int + + // Name returns the provider name (e.g., "openai", "elevenlabs"). + Name() string +} From 418bfd2db127e0055e73f227bab1be39c3603c27 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 6 Feb 2026 16:31:47 +0000 Subject: [PATCH 2/2] Add ElevenLabs custom voice cloning and Pocket Casts podcast support - Refactor TTS into Provider interface supporting multiple backends - Add ElevenLabs provider for custom voice cloning (Swedish via eleven_multilingual_v2 model) - Enhance podcast RSS feed with iTunes namespace tags for Pocket Casts compatibility (itunes:author, itunes:image, itunes:category, language) - Add configurable podcast language (default: sv) and cover image URL - Update voice selector UI to work with provider-specific voices - Config: TTS_PROVIDER, ELEVENLABS_API_KEY, ELEVENLABS_VOICE_ID, ELEVENLABS_MODEL, ELEVENLABS_VOICE_NAME, PODCAST_LANGUAGE, PODCAST_IMAGE_URL https://claude.ai/code/session_01WZdrtu92JoNdG5MhQnPmie --- .env.example | 23 ++++- cmd/kiln/main.go | 34 ++++++-- docker-compose.yml | 7 ++ internal/config/config.go | 30 +++++-- internal/server/rss.go | 134 ++++++++++++++++++++++------ internal/server/server.go | 33 +++++-- internal/server/templates.templ | 40 +++++---- internal/server/templates_templ.go | 136 ++++++++++++++++------------- internal/tts/elevenlabs.go | 124 ++++++++++++++++++++++++++ internal/tts/tts.go | 105 ++++++---------------- 10 files changed, 462 insertions(+), 204 deletions(-) create mode 100644 internal/tts/elevenlabs.go diff --git a/.env.example b/.env.example index 62cf8ea..6853656 100644 --- a/.env.example +++ b/.env.example @@ -15,11 +15,30 @@ FEED_LINK=http://localhost:8080 FEED_AUTHOR=Your Name # TTS (Text-to-Speech) Configuration +# Provider: "openai" or "elevenlabs" (default: openai) +TTS_PROVIDER=openai +# Directory for storing generated audio files +AUDIO_DIR=/data/audio + +# OpenAI TTS (when TTS_PROVIDER=openai) # Set your OpenAI API key to enable TTS. Without it, TTS features are disabled. OPENAI_API_KEY=your_openai_api_key # Available models: tts-1 (faster), tts-1-hd (higher quality) TTS_MODEL=tts-1 # Available voices: alloy, echo, fable, onyx, nova, shimmer TTS_VOICE=alloy -# Directory for storing generated audio files -AUDIO_DIR=/data/audio + +# ElevenLabs TTS (when TTS_PROVIDER=elevenlabs) +# For custom voice cloning - clone your voice at https://elevenlabs.io +# ELEVENLABS_API_KEY=your_elevenlabs_api_key +# ELEVENLABS_VOICE_ID=your_cloned_voice_id +# Model: eleven_multilingual_v2 recommended for Swedish +# ELEVENLABS_MODEL=eleven_multilingual_v2 +# Display name for your custom voice +# ELEVENLABS_VOICE_NAME=My Voice + +# Podcast Configuration +# Language code for podcast feed (default: sv for Swedish) +PODCAST_LANGUAGE=sv +# Cover image URL for podcast apps (optional) +# PODCAST_IMAGE_URL=https://example.com/podcast-cover.jpg diff --git a/cmd/kiln/main.go b/cmd/kiln/main.go index a10e1dc..7f54e0c 100644 --- a/cmd/kiln/main.go +++ b/cmd/kiln/main.go @@ -52,16 +52,38 @@ func run() error { defer scraper.Close() log.Println("Initialized scraper") - // Initialize TTS service (optional - only if API key is configured) + // Initialize TTS service (optional - requires provider API key) var ttsSvc *tts.Service - if cfg.OpenAIAPIKey != "" { - ttsSvc, err = tts.New(cfg.OpenAIAPIKey, cfg.TTSModel, cfg.TTSVoice, cfg.AudioDir) + var ttsProvider tts.Provider + + switch cfg.TTSProvider { + case "elevenlabs": + if cfg.ElevenLabsAPIKey != "" && cfg.ElevenLabsVoiceID != "" { + ttsProvider = tts.NewElevenLabsProvider( + cfg.ElevenLabsAPIKey, + cfg.ElevenLabsModel, + cfg.ElevenLabsVoiceID, + cfg.ElevenLabsVoiceName, + ) + log.Printf("Using ElevenLabs TTS provider (model=%s, voice=%s)", cfg.ElevenLabsModel, cfg.ElevenLabsVoiceName) + } else { + log.Println("TTS disabled (ELEVENLABS_API_KEY and ELEVENLABS_VOICE_ID required for elevenlabs provider)") + } + default: // "openai" + if cfg.OpenAIAPIKey != "" { + ttsProvider = tts.NewOpenAIProvider(cfg.OpenAIAPIKey, cfg.TTSModel, cfg.TTSVoice) + log.Printf("Using OpenAI TTS provider (model=%s, voice=%s)", cfg.TTSModel, cfg.TTSVoice) + } else { + log.Println("TTS disabled (OPENAI_API_KEY not set)") + } + } + + if ttsProvider != nil { + ttsSvc, err = tts.New(ttsProvider, cfg.AudioDir) if err != nil { return fmt.Errorf("failed to initialize TTS: %w", err) } - log.Printf("Initialized TTS service (model=%s, voice=%s, dir=%s)", cfg.TTSModel, cfg.TTSVoice, cfg.AudioDir) - } else { - log.Println("TTS disabled (OPENAI_API_KEY not set)") + log.Printf("Initialized TTS service (provider=%s, dir=%s)", ttsProvider.Name(), cfg.AudioDir) } // Create server diff --git a/docker-compose.yml b/docker-compose.yml index 337ea39..84b41ff 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -14,10 +14,17 @@ services: - FEED_DESCRIPTION=${FEED_DESCRIPTION:-Articles from Gasetten} - FEED_LINK=${FEED_LINK:-http://localhost:8080} - FEED_AUTHOR=${FEED_AUTHOR:-Kiln User} + - TTS_PROVIDER=${TTS_PROVIDER:-openai} - OPENAI_API_KEY=${OPENAI_API_KEY} - TTS_MODEL=${TTS_MODEL:-tts-1} - TTS_VOICE=${TTS_VOICE:-alloy} + - ELEVENLABS_API_KEY=${ELEVENLABS_API_KEY} + - ELEVENLABS_VOICE_ID=${ELEVENLABS_VOICE_ID} + - ELEVENLABS_MODEL=${ELEVENLABS_MODEL:-eleven_multilingual_v2} + - ELEVENLABS_VOICE_NAME=${ELEVENLABS_VOICE_NAME:-Custom Voice} - AUDIO_DIR=/data/audio + - PODCAST_LANGUAGE=${PODCAST_LANGUAGE:-sv} + - PODCAST_IMAGE_URL=${PODCAST_IMAGE_URL} depends_on: db: condition: service_healthy diff --git a/internal/config/config.go b/internal/config/config.go index 2b8111d..43e3c77 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -28,10 +28,23 @@ type Config struct { ScraperHeadless bool // TTS (Text-to-Speech) + TTSProvider string // "openai" or "elevenlabs" + AudioDir string + + // OpenAI TTS OpenAIAPIKey string TTSModel string TTSVoice string - AudioDir string + + // ElevenLabs TTS (for custom voice cloning) + ElevenLabsAPIKey string + ElevenLabsVoiceID string + ElevenLabsModel string + ElevenLabsVoiceName string + + // Podcast + PodcastLanguage string + PodcastImageURL string } // Load reads configuration from environment variables @@ -46,10 +59,17 @@ func Load() (*Config, error) { FeedLink: getEnv("FEED_LINK", "http://localhost:8080"), FeedAuthor: getEnv("FEED_AUTHOR", "Kiln User"), ScraperHeadless: getEnvAsBool("SCRAPER_HEADLESS", true), - OpenAIAPIKey: getEnv("OPENAI_API_KEY", ""), - TTSModel: getEnv("TTS_MODEL", "tts-1"), - TTSVoice: getEnv("TTS_VOICE", "alloy"), - AudioDir: getEnv("AUDIO_DIR", "/data/audio"), + TTSProvider: getEnv("TTS_PROVIDER", "openai"), + AudioDir: getEnv("AUDIO_DIR", "/data/audio"), + OpenAIAPIKey: getEnv("OPENAI_API_KEY", ""), + TTSModel: getEnv("TTS_MODEL", "tts-1"), + TTSVoice: getEnv("TTS_VOICE", "alloy"), + ElevenLabsAPIKey: getEnv("ELEVENLABS_API_KEY", ""), + ElevenLabsVoiceID: getEnv("ELEVENLABS_VOICE_ID", ""), + ElevenLabsModel: getEnv("ELEVENLABS_MODEL", "eleven_multilingual_v2"), + ElevenLabsVoiceName: getEnv("ELEVENLABS_VOICE_NAME", "Custom Voice"), + PodcastLanguage: getEnv("PODCAST_LANGUAGE", "sv"), + PodcastImageURL: getEnv("PODCAST_IMAGE_URL", ""), } // Validate required fields diff --git a/internal/server/rss.go b/internal/server/rss.go index f24ad34..da8d1f3 100644 --- a/internal/server/rss.go +++ b/internal/server/rss.go @@ -1,6 +1,7 @@ package server import ( + "encoding/xml" "fmt" "time" @@ -66,64 +67,141 @@ func GenerateRSSFeed(articles []*database.Article, cfg *config.Config) (string, return rss, nil } -// GeneratePodcastFeed creates a podcast-compatible RSS feed with audio enclosures +// Podcast RSS XML structures with iTunes namespace for Pocket Casts compatibility + +type podcastRSS struct { + XMLName xml.Name `xml:"rss"` + Version string `xml:"version,attr"` + ITunes string `xml:"xmlns:itunes,attr"` + Channel podcastChannel `xml:"channel"` +} + +type podcastChannel struct { + Title string `xml:"title"` + Link string `xml:"link"` + Description string `xml:"description"` + Language string `xml:"language"` + LastBuildDate string `xml:"lastBuildDate"` + ITunesAuthor string `xml:"itunes:author"` + ITunesSummary string `xml:"itunes:summary"` + ITunesExplicit string `xml:"itunes:explicit"` + ITunesType string `xml:"itunes:type"` + ITunesImage *podcastITunesImage `xml:"itunes:image,omitempty"` + ITunesCategory podcastCategory `xml:"itunes:category"` + Items []podcastItem `xml:"item"` +} + +type podcastITunesImage struct { + Href string `xml:"href,attr"` +} + +type podcastCategory struct { + Text string `xml:"text,attr"` +} + +type podcastItem struct { + Title string `xml:"title"` + Link string `xml:"link"` + GUID podcastGUID `xml:"guid"` + Description string `xml:"description"` + Author string `xml:"itunes:author,omitempty"` + PubDate string `xml:"pubDate"` + Enclosure podcastEnclosure `xml:"enclosure"` + ITunesDuration string `xml:"itunes:duration,omitempty"` + ITunesExplicit string `xml:"itunes:explicit"` +} + +type podcastGUID struct { + IsPermaLink string `xml:"isPermaLink,attr"` + Value string `xml:",chardata"` +} + +type podcastEnclosure struct { + URL string `xml:"url,attr"` + Length string `xml:"length,attr"` + Type string `xml:"type,attr"` +} + +// GeneratePodcastFeed creates a podcast-compatible RSS feed with iTunes namespace +// tags for Pocket Casts and other podcast apps. func GeneratePodcastFeed(articles []*database.Article, audioMap map[int]*database.AudioFile, cfg *config.Config) (string, error) { now := time.Now() - feed := &feeds.Feed{ - Title: cfg.FeedTitle + " (Podcast)", - Link: &feeds.Link{Href: cfg.FeedLink + "/podcast.xml"}, - Description: cfg.FeedDescription + " - Audio versions of articles", - Author: &feeds.Author{Name: cfg.FeedAuthor}, - Created: now, + channel := podcastChannel{ + Title: cfg.FeedTitle + " (Podcast)", + Link: cfg.FeedLink + "/podcast.xml", + Description: cfg.FeedDescription + " - Audio versions of articles", + Language: cfg.PodcastLanguage, + LastBuildDate: now.Format(time.RFC1123Z), + ITunesAuthor: cfg.FeedAuthor, + ITunesSummary: cfg.FeedDescription + " - Audio versions of articles", + ITunesExplicit: "false", + ITunesType: "episodic", + ITunesCategory: podcastCategory{Text: "News"}, + } + + if cfg.PodcastImageURL != "" { + channel.ITunesImage = &podcastITunesImage{Href: cfg.PodcastImageURL} } // Only include articles that have completed audio - feed.Items = make([]*feeds.Item, 0) for _, article := range articles { audio, hasAudio := audioMap[article.ID] if !hasAudio || audio.Status != "completed" { continue } - item := &feeds.Item{ - Title: getArticleTitle(article), - Link: &feeds.Link{Href: fmt.Sprintf("%s/articles/%d", cfg.FeedLink, article.ID)}, - Id: fmt.Sprintf("%s/articles/%d/audio", cfg.FeedLink, article.ID), - Enclosure: &feeds.Enclosure{ - Url: fmt.Sprintf("%s/articles/%d/audio?voice=%s", cfg.FeedLink, article.ID, audio.Voice), - Length: fmt.Sprintf("%d", audio.FileSize), - Type: "audio/mpeg", - }, - } - + description := "" if article.ContentText != nil { - description := *article.ContentText + description = *article.ContentText if len(description) > 500 { description = description[:500] + "..." } - item.Description = description } + author := cfg.FeedAuthor if article.Author != nil { - item.Author = &feeds.Author{Name: *article.Author} + author = *article.Author } + pubDate := article.CreatedAt if article.PublishedAt != nil { - item.Created = *article.PublishedAt - } else { - item.Created = article.CreatedAt + pubDate = *article.PublishedAt } - feed.Items = append(feed.Items, item) + item := podcastItem{ + Title: getArticleTitle(article), + Link: fmt.Sprintf("%s/articles/%d", cfg.FeedLink, article.ID), + Description: description, + Author: author, + PubDate: pubDate.Format(time.RFC1123Z), + GUID: podcastGUID{ + IsPermaLink: "false", + Value: fmt.Sprintf("%s/articles/%d/audio", cfg.FeedLink, article.ID), + }, + Enclosure: podcastEnclosure{ + URL: fmt.Sprintf("%s/articles/%d/audio?voice=%s", cfg.FeedLink, article.ID, audio.Voice), + Length: fmt.Sprintf("%d", audio.FileSize), + Type: "audio/mpeg", + }, + ITunesExplicit: "false", + } + + channel.Items = append(channel.Items, item) } - rss, err := feed.ToRss() + rss := podcastRSS{ + Version: "2.0", + ITunes: "http://www.itunes.com/dtds/podcast-1.0.dtd", + Channel: channel, + } + + output, err := xml.MarshalIndent(rss, "", " ") if err != nil { return "", fmt.Errorf("failed to generate podcast RSS: %w", err) } - return rss, nil + return xml.Header + string(output), nil } func getArticleTitle(article *database.Article) string { diff --git a/internal/server/server.go b/internal/server/server.go index 28e9503..f3e59c0 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -110,7 +110,7 @@ func (s *Server) handleArticleList(w http.ResponseWriter, r *http.Request) { audioMap, _ := s.db.GetCompletedAudioForArticles(ctx, articleIDs) // Render template - ArticleListPage(articles, audioMap, s.ttsEnabled()).Render(ctx, w) + ArticleListPage(articles, audioMap, s.ttsInfo()).Render(ctx, w) } // handleArticleDetail renders a single article @@ -134,7 +134,7 @@ func (s *Server) handleArticleDetail(w http.ResponseWriter, r *http.Request) { audioFiles, _ := s.db.GetAudioFilesByArticle(ctx, id) // Render template - ArticleDetailPage(article, audioFiles, s.ttsEnabled()).Render(ctx, w) + ArticleDetailPage(article, audioFiles, s.ttsInfo()).Render(ctx, w) } // handleScrape triggers a manual scrape operation @@ -380,6 +380,17 @@ func (s *Server) ttsEnabled() bool { return s.tts != nil } +// ttsInfo returns TTS information for templates +func (s *Server) ttsInfo() TTSInfo { + if s.tts == nil { + return TTSInfo{Enabled: false} + } + return TTSInfo{ + Enabled: true, + Voices: s.tts.AvailableVoices(), + } +} + // handleGenerateTTS triggers TTS generation for an article func (s *Server) handleGenerateTTS(w http.ResponseWriter, r *http.Request) { if !s.ttsEnabled() { @@ -399,7 +410,7 @@ func (s *Server) handleGenerateTTS(w http.ResponseWriter, r *http.Request) { // Get the voice from form data or query param voice := r.FormValue("voice") if voice == "" { - voice = s.config.TTSVoice + voice = s.tts.DefaultVoice() } // Check if article exists @@ -508,8 +519,11 @@ func (s *Server) handleServeAudio(w http.ResponseWriter, r *http.Request) { } voice := r.URL.Query().Get("voice") + if voice == "" && s.tts != nil { + voice = s.tts.DefaultVoice() + } if voice == "" { - voice = s.config.TTSVoice + voice = "alloy" // fallback } audio, err := s.db.GetAudioFileByArticle(ctx, id, voice) @@ -555,12 +569,17 @@ func (s *Server) handleServeAudio(w http.ResponseWriter, r *http.Request) { // handleTTSVoices returns the list of available TTS voices func (s *Server) handleTTSVoices(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json") - voices := tts.AvailableVoices() + if s.tts == nil { + fmt.Fprint(w, `{"voices":[],"default":""}`) + return + } + voices := s.tts.AvailableVoices() var items []string for _, v := range voices { - items = append(items, fmt.Sprintf(`"%s"`, v)) + items = append(items, fmt.Sprintf(`{"id":"%s","name":"%s"}`, v.ID, v.Name)) } - fmt.Fprintf(w, `{"voices":[%s],"default":"%s"}`, strings.Join(items, ","), s.config.TTSVoice) + fmt.Fprintf(w, `{"voices":[%s],"default":"%s","provider":"%s"}`, + strings.Join(items, ","), s.tts.DefaultVoice(), s.tts.ProviderName()) } // handleRSS generates and serves the RSS feed diff --git a/internal/server/templates.templ b/internal/server/templates.templ index 8a3472e..4a62818 100644 --- a/internal/server/templates.templ +++ b/internal/server/templates.templ @@ -6,6 +6,12 @@ import ( "fmt" ) +// TTSInfo holds TTS state passed to templates +type TTSInfo struct { + Enabled bool + Voices []tts.Voice +} + // Layout is the base HTML template templ Layout(title string) { @@ -45,7 +51,7 @@ templ Layout(title string) { } // ArticleListPage renders the list of articles -templ ArticleListPage(articles []*database.Article, audioMap map[int]*database.AudioFile, ttsEnabled bool) { +templ ArticleListPage(articles []*database.Article, audioMap map[int]*database.AudioFile, ttsInfo TTSInfo) { @Layout("Articles") {

Articles

@@ -89,7 +95,7 @@ templ ArticleListPage(articles []*database.Article, audioMap map[int]*database.A } else {
for _, article := range articles { - @ArticleCardWithAudio(article, audioMap[article.ID], ttsEnabled) + @ArticleCardWithAudio(article, audioMap[article.ID], ttsInfo.Enabled) }
} @@ -152,7 +158,7 @@ templ ArticleCard(article *database.Article) { } // ArticleDetailPage renders a single article in detail -templ ArticleDetailPage(article *database.Article, audioFiles []*database.AudioFile, ttsEnabled bool) { +templ ArticleDetailPage(article *database.Article, audioFiles []*database.AudioFile, ttsInfo TTSInfo) { @Layout(getTitle(article)) {
@@ -183,7 +189,7 @@ templ ArticleDetailPage(article *database.Article, audioFiles []*database.AudioF
// TTS Audio Section - if ttsEnabled { + if ttsInfo.Enabled {

Listen to this article

if hasCompletedAudio(audioFiles) { @@ -192,16 +198,18 @@ templ ArticleDetailPage(article *database.Article, audioFiles []*database.AudioF @AudioPlayer(article.ID, af) } } -
-
- Generate with a different voice -
- @VoiceSelector(article.ID) -
-
-
+ if len(ttsInfo.Voices) > 1 { +
+
+ Generate with a different voice +
+ @VoiceSelector(article.ID, ttsInfo.Voices) +
+
+
+ } } else { - @VoiceSelector(article.ID) + @VoiceSelector(article.ID, ttsInfo.Voices) }
} @@ -229,7 +237,7 @@ templ AudioPlayer(articleID int, audio *database.AudioFile) { } // VoiceSelector renders a voice selection form for TTS generation -templ VoiceSelector(articleID int) { +templ VoiceSelector(articleID int, voices []tts.Voice) {
@@ -238,8 +246,8 @@ templ VoiceSelector(articleID int) { id={ fmt.Sprintf("voice-select-%d", articleID) } class="text-sm border border-gray-300 rounded px-2 py-1 bg-white" > - for _, v := range tts.AvailableVoices() { - + for _, v := range voices { + }
") - if templ_7745c5c3_Err != nil { - return templ_7745c5c3_Err + if len(ttsInfo.Voices) > 1 { + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 42, "
Generate with a different voice
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = VoiceSelector(article.ID, ttsInfo.Voices).Render(ctx, templ_7745c5c3_Buffer) + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "
") + if templ_7745c5c3_Err != nil { + return templ_7745c5c3_Err + } } } else { - templ_7745c5c3_Err = VoiceSelector(article.ID).Render(ctx, templ_7745c5c3_Buffer) + templ_7745c5c3_Err = VoiceSelector(article.ID, ttsInfo.Voices).Render(ctx, templ_7745c5c3_Buffer) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 43, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 44, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -556,30 +568,30 @@ func ArticleDetailPage(article *database.Article, audioFiles []*database.AudioFi return templ_7745c5c3_Err } } else if article.ContentText != nil { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 45, "

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var24 string templ_7745c5c3_Var24, templ_7745c5c3_Err = templ.JoinStringErrs(*article.ContentText) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/server/templates.templ`, Line: 212, Col: 30} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/server/templates.templ`, Line: 220, Col: 30} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var24)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 46, "

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } else { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 47, "

No content available

") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "

No content available

") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 48, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "
") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -615,46 +627,46 @@ func AudioPlayer(articleID int, audio *database.AudioFile) templ.Component { templ_7745c5c3_Var25 = templ.NopComponent } ctx = templ.ClearChildren(ctx) - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 49, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } var templ_7745c5c3_Var27 string templ_7745c5c3_Var27, templ_7745c5c3_Err = templ.JoinStringErrs(audio.Voice) if templ_7745c5c3_Err != nil { - return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/server/templates.templ`, Line: 224, Col: 83} + return templ.Error{Err: templ_7745c5c3_Err, FileName: `internal/server/templates.templ`, Line: 232, Col: 83} } _, templ_7745c5c3_Err = templ_7745c5c3_Buffer.WriteString(templ.EscapeString(templ_7745c5c3_Var27)) if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 51, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 53, "\" type=\"audio/mpeg\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } @@ -663,7 +675,7 @@ func AudioPlayer(articleID int, audio *database.AudioFile) templ.Component { } // VoiceSelector renders a voice selection form for TTS generation -func VoiceSelector(articleID int) templ.Component { +func VoiceSelector(articleID int, voices []tts.Voice) templ.Component { return templruntime.GeneratedTemplate(func(templ_7745c5c3_Input templruntime.GeneratedComponentInput) (templ_7745c5c3_Err error) { templ_7745c5c3_W, ctx := templ_7745c5c3_Input.Writer, templ_7745c5c3_Input.Context if templ_7745c5c3_CtxErr := ctx.Err(); templ_7745c5c3_CtxErr != nil { @@ -684,95 +696,95 @@ func VoiceSelector(articleID int) templ.Component { templ_7745c5c3_Var29 = templ.NopComponent } ctx = templ.ClearChildren(ctx) - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 53, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 55, "\" class=\"text-sm border border-gray-300 rounded px-2 py-1 bg-white\">") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } - for _, v := range tts.AvailableVoices() { - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 55, "") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 58, "") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } } - templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 58, "
") + templ_7745c5c3_Err = templruntime.WriteString(templ_7745c5c3_Buffer, 62, "\" class=\"bg-green-600 hover:bg-green-700 text-white px-3 py-1 rounded text-sm font-medium\">Generate Audio") if templ_7745c5c3_Err != nil { return templ_7745c5c3_Err } diff --git a/internal/tts/elevenlabs.go b/internal/tts/elevenlabs.go new file mode 100644 index 0000000..0078a60 --- /dev/null +++ b/internal/tts/elevenlabs.go @@ -0,0 +1,124 @@ +package tts + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" +) + +const ( + elevenLabsBaseURL = "https://api.elevenlabs.io/v1/text-to-speech" + elevenLabsMaxChunk = 5000 // ElevenLabs supports up to 5000 chars per request + elevenLabsDefaultModel = "eleven_multilingual_v2" // Best for non-English (Swedish) +) + +// ElevenLabsProvider implements the Provider interface for ElevenLabs TTS API. +// Supports custom cloned voices via voice IDs. +type ElevenLabsProvider struct { + apiKey string + model string + voiceID string // The cloned voice ID from ElevenLabs + voiceName string // Display name for the custom voice + client *http.Client +} + +// NewElevenLabsProvider creates a new ElevenLabs TTS provider. +// voiceID is the ID of the cloned voice from the ElevenLabs dashboard. +// voiceName is an optional display name (defaults to "Custom Voice"). +func NewElevenLabsProvider(apiKey, model, voiceID, voiceName string) *ElevenLabsProvider { + if model == "" { + model = elevenLabsDefaultModel + } + if voiceName == "" { + voiceName = "Custom Voice" + } + return &ElevenLabsProvider{ + apiKey: apiKey, + model: model, + voiceID: voiceID, + voiceName: voiceName, + client: &http.Client{}, + } +} + +// elevenLabsRequest is the request body for the ElevenLabs TTS API +type elevenLabsRequest struct { + Text string `json:"text"` + ModelID string `json:"model_id"` + VoiceSettings elevenLabsVoiceSettings `json:"voice_settings"` +} + +type elevenLabsVoiceSettings struct { + Stability float64 `json:"stability"` + SimilarityBoost float64 `json:"similarity_boost"` + Style float64 `json:"style,omitempty"` +} + +func (p *ElevenLabsProvider) GenerateChunkAudio(ctx context.Context, text, voiceID string) ([]byte, error) { + if voiceID == "" || voiceID == "custom" { + voiceID = p.voiceID + } + + reqBody := elevenLabsRequest{ + Text: text, + ModelID: p.model, + VoiceSettings: elevenLabsVoiceSettings{ + Stability: 0.5, + SimilarityBoost: 0.75, + }, + } + + jsonBody, err := json.Marshal(reqBody) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + url := fmt.Sprintf("%s/%s?output_format=mp3_44100_128", elevenLabsBaseURL, voiceID) + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(jsonBody)) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("xi-api-key", p.apiKey) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Accept", "audio/mpeg") + + resp, err := p.client.Do(req) + if err != nil { + return nil, fmt.Errorf("ElevenLabs API request failed: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("ElevenLabs API returned status %d: %s", resp.StatusCode, string(body)) + } + + return body, nil +} + +func (p *ElevenLabsProvider) AvailableVoices() []Voice { + return []Voice{ + {ID: "custom", Name: p.voiceName}, + } +} + +func (p *ElevenLabsProvider) DefaultVoice() string { + return "custom" +} + +func (p *ElevenLabsProvider) MaxChunkSize() int { + return elevenLabsMaxChunk +} + +func (p *ElevenLabsProvider) Name() string { + return "elevenlabs" +} diff --git a/internal/tts/tts.go b/internal/tts/tts.go index d3037e4..b3da6b0 100644 --- a/internal/tts/tts.go +++ b/internal/tts/tts.go @@ -3,63 +3,37 @@ package tts import ( "bytes" "context" - "encoding/json" "fmt" - "io" "log" - "net/http" "os" "path/filepath" "strings" "unicode" ) -const ( - openAITTSURL = "https://api.openai.com/v1/audio/speech" - maxChunkSize = 4000 // OpenAI TTS max is 4096 chars, leave some margin -) - -// Service handles text-to-speech conversion using OpenAI's API +// Service handles text-to-speech conversion using a configurable provider. type Service struct { - apiKey string - model string - voice string + provider Provider audioDir string - client *http.Client } -// New creates a new TTS service -func New(apiKey, model, voice, audioDir string) (*Service, error) { - if apiKey == "" { - return nil, fmt.Errorf("OPENAI_API_KEY is required for TTS") - } - - // Ensure audio directory exists +// New creates a new TTS service with the given provider. +func New(provider Provider, audioDir string) (*Service, error) { if err := os.MkdirAll(audioDir, 0755); err != nil { return nil, fmt.Errorf("failed to create audio directory %s: %w", audioDir, err) } return &Service{ - apiKey: apiKey, - model: model, - voice: voice, + provider: provider, audioDir: audioDir, - client: &http.Client{}, }, nil } -// ttsRequest is the request body for OpenAI's TTS API -type ttsRequest struct { - Model string `json:"model"` - Input string `json:"input"` - Voice string `json:"voice"` -} - // GenerateAudio converts article text to an MP3 file. // Returns the file path and file size. func (s *Service) GenerateAudio(ctx context.Context, articleID int, text, voice string) (string, int64, error) { if voice == "" { - voice = s.voice + voice = s.provider.DefaultVoice() } // Clean up text for TTS @@ -68,16 +42,19 @@ func (s *Service) GenerateAudio(ctx context.Context, articleID int, text, voice return "", 0, fmt.Errorf("no text content to convert") } - // Chunk the text if it exceeds the max size - chunks := chunkText(text, maxChunkSize) - log.Printf("TTS: article %d - %d characters, %d chunk(s), voice=%s", articleID, len(text), len(chunks), voice) + // Chunk the text based on provider's limit + maxSize := s.provider.MaxChunkSize() + chunks := chunkText(text, maxSize) + log.Printf("TTS [%s]: article %d - %d characters, %d chunk(s), voice=%s", + s.provider.Name(), articleID, len(text), len(chunks), voice) // Generate audio for each chunk var audioData bytes.Buffer for i, chunk := range chunks { - log.Printf("TTS: article %d - generating chunk %d/%d", articleID, i+1, len(chunks)) + log.Printf("TTS [%s]: article %d - generating chunk %d/%d", + s.provider.Name(), articleID, i+1, len(chunks)) - data, err := s.callTTSAPI(ctx, chunk, voice) + data, err := s.provider.GenerateChunkAudio(ctx, chunk, voice) if err != nil { return "", 0, fmt.Errorf("failed to generate audio for chunk %d: %w", i+1, err) } @@ -93,7 +70,8 @@ func (s *Service) GenerateAudio(ctx context.Context, articleID int, text, voice } fileSize := int64(audioData.Len()) - log.Printf("TTS: article %d - audio saved to %s (%d bytes)", articleID, filePath, fileSize) + log.Printf("TTS [%s]: article %d - audio saved to %s (%d bytes)", + s.provider.Name(), articleID, filePath, fileSize) return filePath, fileSize, nil } @@ -101,7 +79,7 @@ func (s *Service) GenerateAudio(ctx context.Context, articleID int, text, voice // GetAudioPath returns the expected file path for an article's audio func (s *Service) GetAudioPath(articleID int, voice string) string { if voice == "" { - voice = s.voice + voice = s.provider.DefaultVoice() } filename := fmt.Sprintf("%d_%s.mp3", articleID, voice) return filepath.Join(s.audioDir, filename) @@ -112,48 +90,19 @@ func (s *Service) AudioDir() string { return s.audioDir } -// AvailableVoices returns the list of available OpenAI TTS voices -func AvailableVoices() []string { - return []string{"alloy", "echo", "fable", "onyx", "nova", "shimmer"} +// AvailableVoices returns the list of voices from the current provider +func (s *Service) AvailableVoices() []Voice { + return s.provider.AvailableVoices() } -// callTTSAPI makes a single request to OpenAI's TTS API -func (s *Service) callTTSAPI(ctx context.Context, text, voice string) ([]byte, error) { - reqBody := ttsRequest{ - Model: s.model, - Input: text, - Voice: voice, - } - - jsonBody, err := json.Marshal(reqBody) - if err != nil { - return nil, fmt.Errorf("failed to marshal request: %w", err) - } - - req, err := http.NewRequestWithContext(ctx, http.MethodPost, openAITTSURL, bytes.NewReader(jsonBody)) - if err != nil { - return nil, fmt.Errorf("failed to create request: %w", err) - } - - req.Header.Set("Authorization", "Bearer "+s.apiKey) - req.Header.Set("Content-Type", "application/json") - - resp, err := s.client.Do(req) - if err != nil { - return nil, fmt.Errorf("TTS API request failed: %w", err) - } - defer resp.Body.Close() - - body, err := io.ReadAll(resp.Body) - if err != nil { - return nil, fmt.Errorf("failed to read response body: %w", err) - } - - if resp.StatusCode != http.StatusOK { - return nil, fmt.Errorf("TTS API returned status %d: %s", resp.StatusCode, string(body)) - } +// DefaultVoice returns the default voice ID +func (s *Service) DefaultVoice() string { + return s.provider.DefaultVoice() +} - return body, nil +// ProviderName returns the name of the active TTS provider +func (s *Service) ProviderName() string { + return s.provider.Name() } // cleanTextForTTS prepares text for TTS conversion