diff --git a/go.mod b/go.mod index ced423d..928b6ab 100644 --- a/go.mod +++ b/go.mod @@ -3,17 +3,46 @@ module github.com/indaco/md2audio go 1.25.3 require ( + cloud.google.com/go/texttospeech v1.16.0 github.com/cenkalti/backoff/v5 v5.0.3 github.com/fatih/color v1.18.0 github.com/mattn/go-sqlite3 v1.14.32 github.com/schollz/progressbar/v3 v3.18.0 + google.golang.org/api v0.247.0 ) require ( + cloud.google.com/go v0.120.0 // indirect + cloud.google.com/go/auth v0.16.4 // indirect + cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect + cloud.google.com/go/compute/metadata v0.8.0 // indirect + cloud.google.com/go/longrunning v0.6.7 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/google/s2a-go v0.1.9 // indirect + github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect + github.com/googleapis/gax-go/v2 v2.15.0 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/rivo/uniseg v0.4.7 // indirect - golang.org/x/sys v0.30.0 // indirect - golang.org/x/term v0.28.0 // indirect + go.opentelemetry.io/auto/sdk v1.1.0 // indirect + go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 // indirect + go.opentelemetry.io/otel v1.36.0 // indirect + go.opentelemetry.io/otel/metric v1.36.0 // indirect + go.opentelemetry.io/otel/trace v1.36.0 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/term v0.34.0 // indirect + golang.org/x/text v0.28.0 // indirect + golang.org/x/time v0.12.0 // indirect + 
google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250811230008-5f3141c8851a // indirect + google.golang.org/grpc v1.74.2 // indirect + google.golang.org/protobuf v1.36.7 // indirect ) diff --git a/go.sum b/go.sum index b3e850a..3c8531e 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,15 @@ +cloud.google.com/go v0.120.0 h1:wc6bgG9DHyKqF5/vQvX1CiZrtHnxJjBlKUyF9nP6meA= +cloud.google.com/go v0.120.0/go.mod h1:/beW32s8/pGRuj4IILWQNd4uuebeT4dkOhKmkfit64Q= +cloud.google.com/go/auth v0.16.4 h1:fXOAIQmkApVvcIn7Pc2+5J8QTMVbUGLscnSVNl11su8= +cloud.google.com/go/auth v0.16.4/go.mod h1:j10ncYwjX/g3cdX7GpEzsdM+d+ZNsXAbb6qXA7p1Y5M= +cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= +cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= +cloud.google.com/go/compute/metadata v0.8.0 h1:HxMRIbao8w17ZX6wBnjhcDkW6lTFpgcaobyVfZWqRLA= +cloud.google.com/go/compute/metadata v0.8.0/go.mod h1:sYOGTp851OV9bOFJ9CH7elVvyzopvWQFNNghtDQ/Biw= +cloud.google.com/go/longrunning v0.6.7 h1:IGtfDWHhQCgCjwQjV9iiLnUta9LBCo8R9QmAFsS/PrE= +cloud.google.com/go/longrunning v0.6.7/go.mod h1:EAFV3IZAKmM56TyiE6VAP3VoTzhZzySwI/YI1s/nRsY= +cloud.google.com/go/texttospeech v1.16.0 h1:Ra4w+6qmaeb12ozlPBqGw8Jzdge1yfzhvZgcXWdXw30= +cloud.google.com/go/texttospeech v1.16.0/go.mod h1:AeSkoH3ziPvapsuyI07TWY4oGxluAjntX+pF4PJ2jy0= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7mk9/PwM= @@ -6,6 +18,25 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/fatih/color v1.18.0 
h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= +github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4= +github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= +github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo= +github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= 
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= @@ -22,12 +53,52 @@ github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA= github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= +github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 h1:q4XOmH/0opmeuJtPsbFNivyl7bCt7yRBbeEm2sC/XtQ= +go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0/go.mod h1:snMWehoOh2wsEwnvvwtDyFCxVeDAODenXHtn5vzrKjo= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0 h1:F7Jx+6hwnZ41NSFTO5q4LYDtJRXBf2PD0rNBkeB/lus= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.61.0/go.mod h1:UHB22Z8QsdRDrnAtX4PntOl36ajSxcdUMt1sF7Y6E7Q= +go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg= +go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E= +go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE= +go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs= +go.opentelemetry.io/otel/sdk v1.36.0 h1:b6SYIuLRs88ztox4EyrvRti80uXIFy+Sqzoh9kFULbs= +go.opentelemetry.io/otel/sdk v1.36.0/go.mod 
h1:+lC+mTgD+MUWfjJubi2vvXWcVxyr9rmlshZni72pXeY= +go.opentelemetry.io/otel/sdk/metric v1.36.0 h1:r0ntwwGosWGaa0CrSt8cuNuTcccMXERFwHX4dThiPis= +go.opentelemetry.io/otel/sdk/metric v1.36.0/go.mod h1:qTNOhFDfKRwX0yXOqJYegL5WRaW376QbB7P4Pb0qva4= +go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w= +go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.28.0 h1:/Ts8HFuMR2E6IP/jlo7QVLZHggjKQbhu/7H0LJFr3Gg= -golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= +golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= 
+golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +google.golang.org/api v0.247.0 h1:tSd/e0QrUlLsrwMKmkbQhYVa109qIintOls2Wh6bngc= +google.golang.org/api v0.247.0/go.mod h1:r1qZOPmxXffXg6xS5uhx16Fa/UFY8QU/K4bfKrnvovM= +google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4= +google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s= +google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c h1:AtEkQdl5b6zsybXcbz00j1LwNodDuH6hVifIaNqk7NQ= +google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c/go.mod h1:ea2MjsO70ssTfCjiwHgI0ZFqcw45Ksuk2ckf9G468GA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250811230008-5f3141c8851a h1:tPE/Kp+x9dMSwUm/uM0JKK0IfdiJkwAbSMSeZBXXJXc= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250811230008-5f3141c8851a/go.mod h1:gw1tLEfykwDz2ET4a12jcXt4couGAm7IwsVaTy0Sflo= +google.golang.org/grpc v1.74.2 h1:WoosgB65DlWVC9FqI82dGsZhWFNBSLjQ84bjROOpMu4= +google.golang.org/grpc v1.74.2/go.mod h1:CtQ+BGjaAIXHs/5YS3i473GqwBBa1zGQNevxdeBEXrM= +google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= +google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/cli/voices.go b/internal/cli/voices.go index 9b3033d..dac780d 100644 --- a/internal/cli/voices.go +++ b/internal/cli/voices.go @@ -20,6 +20,7 @@ import ( "github.com/indaco/md2audio/internal/tts" "github.com/indaco/md2audio/internal/tts/elevenlabs" "github.com/indaco/md2audio/internal/tts/espeak" + "github.com/indaco/md2audio/internal/tts/google" "github.com/indaco/md2audio/internal/tts/say" "github.com/indaco/md2audio/internal/utils" ) @@ -31,14 +32,22 @@ func 
HandleVoiceCommands(cfg config.Config, voiceCache *cache.VoiceCache, log lo return err } - // Set logger on provider if it supports it (ElevenLabs client) + // Set logger on provider if it supports it if elevenlabsClient, ok := provider.(*elevenlabs.Client); ok { elevenlabsClient.SetLogger(log) } + if googleClient, ok := provider.(*google.Client); ok { + googleClient.SetLogger(log) + } cachedProvider := cache.NewCachedProvider(provider, voiceCache) ctx := context.Background() + // Ensure Google client is closed when done + if googleClient, ok := provider.(*google.Client); ok { + defer func() { _ = googleClient.Close() }() + } + if cfg.Commands.ExportVoices != "" { return ExportVoices(ctx, cachedProvider, provider.Name(), cfg.Commands.ExportVoices, log) } @@ -52,6 +61,8 @@ func HandleVoiceCommands(cfg config.Config, voiceCache *cache.VoiceCache, log lo // CreateProvider creates a TTS provider based on configuration. func CreateProvider(cfg config.Config) (tts.Provider, error) { + ctx := context.Background() + // Handle empty provider (use platform default) provider := cfg.Provider if provider == "" { @@ -72,6 +83,14 @@ func CreateProvider(cfg config.Config) (tts.Provider, error) { UseSpeakerBoost: cfg.ElevenLabs.VoiceSettings.UseSpeakerBoost, Speed: cfg.ElevenLabs.VoiceSettings.Speed, }) + case "google": + return google.NewClient(ctx, google.Config{ + CredentialsFile: cfg.Google.CredentialsFile, + LanguageCode: cfg.Google.LanguageCode, + SpeakingRate: cfg.Google.SpeakingRate, + Pitch: cfg.Google.Pitch, + VolumeGainDb: cfg.Google.VolumeGainDb, + }) default: return nil, fmt.Errorf("unsupported provider: %s", provider) } diff --git a/internal/config/config.go b/internal/config/config.go index 85cf787..50228fb 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -68,6 +68,16 @@ type ElevenLabsConfig struct { VoiceSettings VoiceSettings // Voice generation settings (loaded from environment variables with defaults) } +// GoogleConfig holds 
configuration for the Google Cloud TTS provider +type GoogleConfig struct { + VoiceName string // Google Cloud TTS voice name (e.g., "en-US-Neural2-F") + LanguageCode string // Language code (e.g., "en-US", default: "en-US") + CredentialsFile string // Path to service account JSON file (optional, uses GOOGLE_APPLICATION_CREDENTIALS env var) + SpeakingRate float64 // Speaking speed multiplier (0.25-4.0, default: 1.0) + Pitch float64 // Pitch adjustment in semitones (-20.0 to 20.0, default: 0.0) + VolumeGainDb float64 // Volume gain in decibels (-96.0 to 16.0, default: 0.0) +} + // Config holds the application configuration type Config struct { // Input/Output Options @@ -83,9 +93,10 @@ type Config struct { Commands CommandFlags // TTS Provider Configuration - Provider string // TTS provider: "say" (macOS) or "elevenlabs" (default: "say") + Provider string // TTS provider: "say" (macOS), "elevenlabs", or "google" (default: "say") Say SayConfig // Say provider configuration ElevenLabs ElevenLabsConfig // ElevenLabs provider configuration + Google GoogleConfig // Google Cloud TTS provider configuration } // GetDefaultProvider returns the default TTS provider based on the platform. 
@@ -120,7 +131,7 @@ func Parse() Config { // TTS Provider - auto-detect based on platform defaultProvider := GetDefaultProvider() - flag.StringVar(&config.Provider, "provider", defaultProvider, "TTS provider: 'say' (macOS), 'espeak' (Linux), or 'elevenlabs'") + flag.StringVar(&config.Provider, "provider", defaultProvider, "TTS provider: 'say' (macOS), 'espeak' (Linux), 'elevenlabs', or 'google'") // Say provider options var preset string @@ -133,6 +144,14 @@ func Parse() Config { flag.StringVar(&config.ElevenLabs.Model, "elevenlabs-model", "eleven_multilingual_v2", "ElevenLabs model ID") flag.StringVar(&config.ElevenLabs.APIKey, "elevenlabs-api-key", "", "ElevenLabs API key (prefer ELEVENLABS_API_KEY env var)") + // Google Cloud TTS provider options + flag.StringVar(&config.Google.VoiceName, "google-voice", "", "Google Cloud TTS voice name (e.g., 'en-US-Neural2-F')") + flag.StringVar(&config.Google.LanguageCode, "google-language", "en-US", "Google Cloud TTS language code (e.g., 'en-US', 'en-GB')") + flag.StringVar(&config.Google.CredentialsFile, "google-credentials", "", "Path to Google Cloud service account JSON file (prefer GOOGLE_APPLICATION_CREDENTIALS env var)") + flag.Float64Var(&config.Google.SpeakingRate, "google-speed", 1.0, "Google Cloud TTS speaking rate (0.25-4.0, default: 1.0)") + flag.Float64Var(&config.Google.Pitch, "google-pitch", 0.0, "Google Cloud TTS pitch adjustment (-20.0 to 20.0, default: 0.0)") + flag.Float64Var(&config.Google.VolumeGainDb, "google-volume", 0.0, "Google Cloud TTS volume gain in dB (-96.0 to 16.0, default: 0.0)") + // Common options flag.StringVar(&config.Format, "format", "aiff", "Output audio format (aiff, m4a, mp3)") flag.StringVar(&config.Prefix, "prefix", "section", "Prefix for output filenames") @@ -178,6 +197,20 @@ func Parse() Config { log.Faint(" # List ElevenLabs voices") log.Faint(fmt.Sprintf(" %s -provider elevenlabs -list-voices", os.Args[0])) log.Blank() + log.Default("Examples (Google Cloud TTS provider):") + 
log.Faint(" # Use Google Cloud TTS with environment variable") + log.Faint(" export GOOGLE_APPLICATION_CREDENTIALS='/path/to/service-account.json'") + log.Faint(fmt.Sprintf(" %s -provider google -google-voice en-US-Neural2-F -f script.md", os.Args[0])) + log.Blank() + log.Faint(" # Use Google Cloud TTS with credentials file flag") + log.Faint(fmt.Sprintf(" %s -provider google -google-credentials /path/to/creds.json -google-voice en-GB-Neural2-A -d ./docs", os.Args[0])) + log.Blank() + log.Faint(" # List Google Cloud TTS voices") + log.Faint(fmt.Sprintf(" %s -provider google -list-voices", os.Args[0])) + log.Blank() + log.Faint(" # Generate MP3 files with Google TTS") + log.Faint(fmt.Sprintf(" %s -provider google -google-voice en-US-Neural2-F -format mp3 -d ./docs", os.Args[0])) + log.Blank() log.Default("Say Voice Presets:") log.Faint(" british-female, british-male, us-female, us-male,") log.Faint(" australian-female, indian-female") @@ -264,8 +297,8 @@ func (c Config) Validate() error { } // Validate provider - if c.Provider != "say" && c.Provider != "espeak" && c.Provider != "elevenlabs" { - return fmt.Errorf("invalid provider %q: must be 'say', 'espeak', or 'elevenlabs'", c.Provider) + if c.Provider != "" && c.Provider != "say" && c.Provider != "espeak" && c.Provider != "elevenlabs" && c.Provider != "google" { + return fmt.Errorf("invalid provider %q: must be 'say', 'espeak', 'elevenlabs', or 'google'", c.Provider) } // Validate provider-specific requirements @@ -275,6 +308,9 @@ func (c Config) Validate() error { } } + // Google Cloud TTS requires credentials (checked at runtime by client) + // No validation needed here since credentials can be provided via env var + return nil } @@ -319,6 +355,20 @@ func (c Config) Print() { if c.ElevenLabs.APIKey != "" { fmt.Printf(" API Key: %s\n", maskSecret(c.ElevenLabs.APIKey)) } + case "google": + if c.Google.VoiceName != "" { + fmt.Printf(" Voice: %s\n", c.Google.VoiceName) + } + fmt.Printf(" Language: %s\n", 
c.Google.LanguageCode) + if c.Google.SpeakingRate != 1.0 { + fmt.Printf(" Speaking Rate: %.2f\n", c.Google.SpeakingRate) + } + if c.Google.Pitch != 0.0 { + fmt.Printf(" Pitch: %.1f\n", c.Google.Pitch) + } + if c.Google.VolumeGainDb != 0.0 { + fmt.Printf(" Volume Gain: %.1f dB\n", c.Google.VolumeGainDb) + } } fmt.Printf(" Format: %s\n", c.Format) diff --git a/internal/processor/processor.go b/internal/processor/processor.go index 1f3a03c..bd21462 100644 --- a/internal/processor/processor.go +++ b/internal/processor/processor.go @@ -24,6 +24,7 @@ import ( "github.com/indaco/md2audio/internal/logger" "github.com/indaco/md2audio/internal/parser" "github.com/indaco/md2audio/internal/tts/elevenlabs" + "github.com/indaco/md2audio/internal/tts/google" ) // ProcessDirectory processes all markdown files in a directory recursively @@ -133,18 +134,26 @@ func processSingleFile(markdownFile, outputDir string, cfg config.Config, log lo return 0, 0, fmt.Errorf("error creating TTS provider: %w", err) } - // Set logger on provider if it supports it (ElevenLabs client) + // Set logger on provider if it supports it if elevenlabsClient, ok := provider.(*elevenlabs.Client); ok { elevenlabsClient.SetLogger(log) } + if googleClient, ok := provider.(*google.Client); ok { + googleClient.SetLogger(log) + // Ensure Google client is closed when done + defer func() { _ = googleClient.Close() }() + } log.Info("Using TTS provider:", provider.Name()) log.Blank() // Determine voice to use based on provider voice := cfg.Say.Voice - if cfg.Provider == "elevenlabs" { + switch cfg.Provider { + case "elevenlabs": voice = cfg.ElevenLabs.VoiceID + case "google": + voice = cfg.Google.VoiceName } // espeak uses cfg.Say.Voice (same as say provider) diff --git a/internal/tts/google/client.go b/internal/tts/google/client.go new file mode 100644 index 0000000..30b8552 --- /dev/null +++ b/internal/tts/google/client.go @@ -0,0 +1,387 @@ +package google + +import ( + "context" + "fmt" + "os" + "path/filepath" + + 
texttospeech "cloud.google.com/go/texttospeech/apiv1" + "cloud.google.com/go/texttospeech/apiv1/texttospeechpb" + "google.golang.org/api/option" + + "github.com/indaco/md2audio/internal/logger" + "github.com/indaco/md2audio/internal/tts" + "github.com/indaco/md2audio/internal/utils" +) + +const ( + // DefaultVoiceName is the default Google Cloud TTS voice + DefaultVoiceName = "en-US-Neural2-F" + + // DefaultLanguageCode is the default language + DefaultLanguageCode = "en-US" + + // EnvVarCredentials is the environment variable for service account credentials + EnvVarCredentials = "GOOGLE_APPLICATION_CREDENTIALS" +) + +// VoiceType represents the voice quality tier +type VoiceType string + +const ( + VoiceTypeStandard VoiceType = "Standard" + VoiceTypeWaveNet VoiceType = "WaveNet" + VoiceTypeNeural2 VoiceType = "Neural2" + VoiceTypeStudio VoiceType = "Studio" + VoiceTypePolyglot VoiceType = "Polyglot" +) + +// Client implements the TTS Provider interface for Google Cloud Text-to-Speech API. +type Client struct { + client *texttospeech.Client + log logger.LoggerInterface + languageCode string + speakingRate float64 // 0.25 to 4.0 + pitch float64 // -20.0 to 20.0 + volumeGainDb float64 // -96.0 to 16.0 +} + +// Config holds configuration for the Google Cloud TTS client. +type Config struct { + // CredentialsFile is the path to the service account JSON file. + // If empty, uses GOOGLE_APPLICATION_CREDENTIALS environment variable. + CredentialsFile string + + // LanguageCode is the voice language (e.g., "en-US", "en-GB"). + // Default: "en-US" + LanguageCode string + + // SpeakingRate is the speed multiplier (0.25 to 4.0). + // Default: 1.0 (normal speed) + SpeakingRate float64 + + // Pitch adjustment in semitones (-20.0 to 20.0). + // Default: 0.0 (no change) + Pitch float64 + + // VolumeGainDb is the volume gain in decibels (-96.0 to 16.0). + // Default: 0.0 (no change) + VolumeGainDb float64 +} + +// NewClient creates a new Google Cloud TTS client. 
+func NewClient(ctx context.Context, cfg Config) (*Client, error) { + // Prepare client options + var opts []option.ClientOption + + // Use credentials file if provided + if cfg.CredentialsFile != "" { + opts = append(opts, option.WithCredentialsFile(cfg.CredentialsFile)) + } else if credsPath := os.Getenv(EnvVarCredentials); credsPath != "" { + // Environment variable is set, client will use it automatically + opts = append(opts, option.WithCredentialsFile(credsPath)) + } else { + return nil, fmt.Errorf("credentials not found for Google Cloud: set %s environment variable or provide CredentialsFile", EnvVarCredentials) + } + + // Create the client + client, err := texttospeech.NewClient(ctx, opts...) + if err != nil { + return nil, fmt.Errorf("failed to create Google Cloud TTS client: %w", err) + } + + // Set defaults + languageCode := cfg.LanguageCode + if languageCode == "" { + languageCode = DefaultLanguageCode + } + + speakingRate := cfg.SpeakingRate + if speakingRate == 0 { + speakingRate = 1.0 // Default normal speed + } + + pitch := cfg.Pitch + // pitch defaults to 0.0 if not set + + volumeGainDb := cfg.VolumeGainDb + // volumeGainDb defaults to 0.0 if not set + + return &Client{ + client: client, + languageCode: languageCode, + speakingRate: speakingRate, + pitch: pitch, + volumeGainDb: volumeGainDb, + }, nil +} + +// Name returns the provider name. +func (c *Client) Name() string { + return "google" +} + +// SetLogger sets the logger for debug output. +func (c *Client) SetLogger(log logger.LoggerInterface) { + c.log = log +} + +// Close closes the client connection. +func (c *Client) Close() error { + return c.client.Close() +} + +// Generate creates audio from text using Google Cloud TTS. 
+func (c *Client) Generate(ctx context.Context, req tts.GenerateRequest) (string, error) { + // Determine voice name + voiceName := req.Voice + if voiceName == "" { + voiceName = DefaultVoiceName + } + + // Parse voice name to extract language code (e.g., "en-US-Neural2-F" -> "en-US") + languageCode := c.languageCode + if len(voiceName) >= 5 && voiceName[2] == '-' { + // Extract language code from voice name (first 5 chars: "en-US") + languageCode = voiceName[:5] + } + + // Determine speaking rate + speakingRate := c.speakingRate + if req.TargetDuration != nil && *req.TargetDuration > 0 { + // Calculate speed to match target duration + speakingRate = calculateSpeed(req.Text, *req.TargetDuration) + if c.log != nil { + c.log.Debug(fmt.Sprintf("Target duration: %.1fs, Calculated speed: %.2fx", *req.TargetDuration, speakingRate)) + } + } + + // Prepare the synthesis request + ttsReq := &texttospeechpb.SynthesizeSpeechRequest{ + Input: &texttospeechpb.SynthesisInput{ + InputSource: &texttospeechpb.SynthesisInput_Text{ + Text: req.Text, + }, + }, + Voice: &texttospeechpb.VoiceSelectionParams{ + LanguageCode: languageCode, + Name: voiceName, + }, + AudioConfig: &texttospeechpb.AudioConfig{ + AudioEncoding: getAudioEncoding(req.Format), + SpeakingRate: speakingRate, + Pitch: c.pitch, + VolumeGainDb: c.volumeGainDb, + SampleRateHertz: getSampleRate(req.Format), + }, + } + + // Log API request + if c.log != nil { + c.log.Debug(fmt.Sprintf("Google Cloud TTS API: Synthesize (voice: %s, lang: %s, rate: %.2f)", voiceName, languageCode, speakingRate)) + } + + // Execute request + resp, err := c.client.SynthesizeSpeech(ctx, ttsReq) + if err != nil { + return "", fmt.Errorf("failed to synthesize speech: %w", err) + } + + // Ensure output directory exists + outputDir := filepath.Dir(req.OutputPath) + if err := os.MkdirAll(outputDir, 0755); err != nil { + return "", fmt.Errorf("failed to create output directory: %w", err) + } + + // Determine output path with correct extension + 
outputPath := ensureCorrectExtension(req.OutputPath, req.Format) + + // Create output file + outFile, err := os.Create(outputPath) + if err != nil { + return "", fmt.Errorf("failed to create output file: %w", err) + } + defer func() { _ = outFile.Close() }() + + // Write audio data to file + if _, err := outFile.Write(resp.AudioContent); err != nil { + return "", fmt.Errorf("failed to write audio data: %w", err) + } + + return outputPath, nil +} + +// ListVoices retrieves available voices from Google Cloud TTS. +func (c *Client) ListVoices(ctx context.Context) ([]tts.Voice, error) { + // Request voice list + req := &texttospeechpb.ListVoicesRequest{ + // LanguageCode can be empty to list all voices + } + + if c.log != nil { + c.log.Debug("Google Cloud TTS API: ListVoices") + } + + resp, err := c.client.ListVoices(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to list voices: %w", err) + } + + // Convert to tts.Voice + voices := make([]tts.Voice, 0, len(resp.Voices)) + for _, v := range resp.Voices { + // Get first language code (voices can support multiple languages) + languageCode := "" + if len(v.LanguageCodes) > 0 { + languageCode = v.LanguageCodes[0] + } + + // Determine voice type from name + voiceType := determineVoiceType(v.Name) + + // Build description + description := fmt.Sprintf("%s voice", voiceType) + if v.NaturalSampleRateHertz > 0 { + description += fmt.Sprintf(" (%d Hz)", v.NaturalSampleRateHertz) + } + + // Determine gender + gender := "" + switch v.SsmlGender { + case texttospeechpb.SsmlVoiceGender_MALE: + gender = "male" + case texttospeechpb.SsmlVoiceGender_FEMALE: + gender = "female" + case texttospeechpb.SsmlVoiceGender_NEUTRAL: + gender = "neutral" + } + + voices = append(voices, tts.Voice{ + ID: v.Name, + Name: v.Name, + Description: description, + Language: languageCode, + Gender: gender, + }) + } + + return voices, nil +} + +// getAudioEncoding returns the audio encoding for the specified format. 
+func getAudioEncoding(format string) texttospeechpb.AudioEncoding { + switch format { + case "mp3": + return texttospeechpb.AudioEncoding_MP3 + case "wav": + return texttospeechpb.AudioEncoding_LINEAR16 + case "ogg": + return texttospeechpb.AudioEncoding_OGG_OPUS + default: + // Default to MP3 + return texttospeechpb.AudioEncoding_MP3 + } +} + +// getSampleRate returns the appropriate sample rate for the format. +func getSampleRate(format string) int32 { + switch format { + case "wav": + return 24000 // High quality for WAV + case "mp3", "ogg": + return 24000 // Standard quality for compressed formats + default: + return 24000 + } +} + +// ensureCorrectExtension ensures the output path has the correct file extension. +func ensureCorrectExtension(outputPath, format string) string { + expectedExt := "." + format + currentExt := filepath.Ext(outputPath) + + if currentExt != expectedExt { + return outputPath[:len(outputPath)-len(currentExt)] + expectedExt + } + return outputPath +} + +// determineVoiceType extracts the voice type from the voice name. +// Examples: "en-US-Neural2-F" -> "Neural2", "en-US-Wavenet-A" -> "WaveNet" +func determineVoiceType(voiceName string) string { + // Voice name format: {languageCode}-{voiceType}-{variant} + // Example: en-US-Neural2-F, en-GB-Wavenet-A + + // Look for voice type keywords + if contains(voiceName, "Neural2") { + return "Neural2" + } + if contains(voiceName, "Wavenet") || contains(voiceName, "WaveNet") { + return "WaveNet" + } + if contains(voiceName, "Studio") { + return "Studio" + } + if contains(voiceName, "Polyglot") { + return "Polyglot" + } + if contains(voiceName, "Standard") { + return "Standard" + } + + return "Standard" +} + +// contains checks if a string contains a substring (case-insensitive). 
+func contains(s, substr string) bool { + return len(s) >= len(substr) && findSubstring(s, substr) +} + +func findSubstring(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false +} + +// calculateSpeed determines the speed multiplier needed to match target duration. +// Google Cloud TTS speed ranges from 0.25 (slower) to 4.0 (faster), with 1.0 being normal. +// NOTE(review): targetDuration is used as a divisor without a guard in this function; the visible caller (Generate) only passes values > 0. Clamp warnings are written to os.Stderr, not to the client logger. +func calculateSpeed(text string, targetDuration float64) float64 { + const ( + naturalWPM = 150.0 // Assume natural speaking rate at speed 1.0 is ~150 words per minute + minSpeed = 0.25 // Google Cloud TTS minimum speed + maxSpeed = 4.0 // Google Cloud TTS maximum speed + defaultSpeed = 1.0 + ) + + wordCount := utils.CountWords(text) + if wordCount == 0 { + return defaultSpeed + } + + // Calculate natural duration at speed 1.0 + naturalDuration := utils.EstimateDuration(text, naturalWPM) + + // Calculate required speed: naturalDuration / targetDuration + // If target is shorter, we need faster speed (>1.0) + // If target is longer, we need slower speed (<1.0) + speed := naturalDuration / targetDuration + originalSpeed := speed + + // Clamp to Google Cloud TTS valid range + speed = utils.ClampFloat64(speed, minSpeed, maxSpeed) + + // Warn if we had to clamp + if speed != originalSpeed { + if originalSpeed < minSpeed { + fmt.Fprintf(os.Stderr, "Warning: Required speed (%.2f) is below minimum, clamping to %.2f (audio will be longer than target)\n", originalSpeed, minSpeed) + } else { + fmt.Fprintf(os.Stderr, "Warning: Required speed (%.2f) exceeds maximum, clamping to %.2f (audio will be shorter than target)\n", originalSpeed, maxSpeed) + } + } + + return speed +} diff --git a/internal/tts/google/client_test.go b/internal/tts/google/client_test.go new file mode 100644 index 0000000..bbbdcbd --- /dev/null +++ b/internal/tts/google/client_test.go @@ -0,0 +1,226 @@ +package google + +import ( + "testing" +) + +func 
TestDetermineVoiceType(t *testing.T) { + tests := []struct { + name string + voiceName string + want string + }{ + { + name: "Neural2 voice", + voiceName: "en-US-Neural2-F", + want: "Neural2", + }, + { + name: "WaveNet voice (lowercase)", + voiceName: "en-GB-Wavenet-A", + want: "WaveNet", + }, + { + name: "WaveNet voice (uppercase)", + voiceName: "en-US-WaveNet-B", + want: "WaveNet", + }, + { + name: "Studio voice", + voiceName: "en-US-Studio-M", + want: "Studio", + }, + { + name: "Polyglot voice", + voiceName: "en-US-Polyglot-1", + want: "Polyglot", + }, + { + name: "Standard voice", + voiceName: "en-US-Standard-A", + want: "Standard", + }, + { + name: "Unknown voice defaults to Standard", + voiceName: "en-US-Unknown-Voice", + want: "Standard", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := determineVoiceType(tt.voiceName) + if got != tt.want { + t.Errorf("determineVoiceType(%q) = %q, want %q", tt.voiceName, got, tt.want) + } + }) + } +} + +func TestCalculateSpeed(t *testing.T) { + tests := []struct { + name string + text string + targetDuration float64 + wantMin float64 + wantMax float64 + }{ + { + name: "Normal speed for balanced text", + text: "This is a test sentence with about ten words in it.", + targetDuration: 5.0, + wantMin: 0.5, + wantMax: 2.0, + }, + { + name: "Fast speed for short target", + text: "This is a long sentence with many words that should require faster speech to fit in a very short duration.", + targetDuration: 2.0, + wantMin: 1.0, + wantMax: 4.0, + }, + { + name: "Slow speed for long target", + text: "Short text.", + targetDuration: 10.0, + wantMin: 0.25, + wantMax: 1.0, + }, + { + name: "Clamping at maximum speed", + text: "This is a very long piece of text with many words that would require extremely fast speech to fit into an impossibly short duration of just one second which is not realistic.", + targetDuration: 0.5, + wantMin: 3.0, + wantMax: 4.0, // Should be clamped at 4.0 + }, + { + name: 
"Clamping at minimum speed", + text: "Word.", + targetDuration: 30.0, + wantMin: 0.25, // Should be clamped at 0.25 + wantMax: 0.3, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := calculateSpeed(tt.text, tt.targetDuration) + + // Check if within valid Google Cloud TTS range + if got < 0.25 || got > 4.0 { + t.Errorf("calculateSpeed() = %v, which is outside valid range [0.25, 4.0]", got) + } + + // Check if within expected range for this test + if got < tt.wantMin || got > tt.wantMax { + t.Errorf("calculateSpeed(%q, %.1f) = %v, want between %.2f and %.2f", + tt.text, tt.targetDuration, got, tt.wantMin, tt.wantMax) + } + }) + } +} + +func TestEnsureCorrectExtension(t *testing.T) { + tests := []struct { + name string + outputPath string + format string + want string + }{ + { + name: "Correct MP3 extension", + outputPath: "/path/to/file.mp3", + format: "mp3", + want: "/path/to/file.mp3", + }, + { + name: "Incorrect extension, should change to MP3", + outputPath: "/path/to/file.wav", + format: "mp3", + want: "/path/to/file.mp3", + }, + { + name: "Correct WAV extension", + outputPath: "/path/to/file.wav", + format: "wav", + want: "/path/to/file.wav", + }, + { + name: "Incorrect extension, should change to WAV", + outputPath: "/path/to/file.ogg", + format: "wav", + want: "/path/to/file.wav", + }, + { + name: "Correct OGG extension", + outputPath: "/path/to/file.ogg", + format: "ogg", + want: "/path/to/file.ogg", + }, + { + name: "No extension, should add format", + outputPath: "/path/to/file", + format: "mp3", + want: "/path/to/file.mp3", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ensureCorrectExtension(tt.outputPath, tt.format) + if got != tt.want { + t.Errorf("ensureCorrectExtension(%q, %q) = %q, want %q", + tt.outputPath, tt.format, got, tt.want) + } + }) + } +} + +func TestContains(t *testing.T) { + tests := []struct { + name string + s string + substr string + want bool + }{ + { + name: 
"Substring present", + s: "en-US-Neural2-F", + substr: "Neural2", + want: true, + }, + { + name: "Substring not present", + s: "en-US-WaveNet-A", + substr: "Neural2", + want: false, + }, + { + name: "Empty substring", + s: "test", + substr: "", + want: true, + }, + { + name: "Substring at start", + s: "Neural2Voice", + substr: "Neural2", + want: true, + }, + { + name: "Substring at end", + s: "VoiceNeural2", + substr: "Neural2", + want: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := contains(tt.s, tt.substr) + if got != tt.want { + t.Errorf("contains(%q, %q) = %v, want %v", tt.s, tt.substr, got, tt.want) + } + }) + } +}