From 827824b669d3eb59b4639e834dba34ac1409a2b8 Mon Sep 17 00:00:00 2001 From: hsinhoyeh Date: Sat, 7 Mar 2026 20:43:16 +0800 Subject: [PATCH] fix: fix flags used in llama.cpp and whisper.cpp --- README.md | 145 ++++++++++++++++++++++++---------- ggml/llamacpp/llamacpp.go | 3 +- ggml/whispercpp/whispercpp.go | 3 +- 3 files changed, 108 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index 23e2068..1f809a3 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,30 @@ Go bindings for C++ inference frameworks via CGO, with prebuilt static libraries for zero-dependency builds. +## Why go-nativeml? + +| Approach | Build complexity | Runtime dependency | `go mod vendor` | GPU support | +|----------|------------------|--------------------|-----------------|-------------| +| **go-nativeml (this project)** | `go build -tags llamacpp` — just works | None — static linking | Works via `go:embed` | Metal, CPU | +| HTTP/subprocess wrapper (e.g. ollama server) | Separate process to manage | Running server required | N/A | Depends on server | +| Dynamic linking (shared `.so`/`.dylib`) | Must install libs on every machine | Shared libs must exist at runtime | Cannot vendor native libs | Depends on build | +| Build from source at `go get` | Requires C++ toolchain + cmake on every machine | None | Fragile — source download at build | Depends on build | +| Pure Go reimplementation | Simple | None | Works | Limited/none | + +**Key advantages:** + +- **Zero build-time setup** — prebuilt `.a` files ship with the Go module. No cmake, no C++ toolchain, no downloads. +- **Vendoring works** — `embed.go` files use `//go:embed` to ensure `go mod vendor` captures headers and static libraries. Standard Go tooling just works. +- **No runtime dependencies** — everything is statically linked. No shared libraries to install, no server to run. +- **Stub fallback** — without build tags, all packages compile to stubs returning errors. 
CI, linters, and `go build ./...` work everywhere without CGO. +- **Type-safe Go API** — idiomatic option pattern, proper error handling, streaming callbacks. No shell-outs or HTTP round-trips. + ## Supported Frameworks | Framework | Version | Package | Build Tag | Capabilities | Status | |-----------|---------|---------|-----------|--------------|--------| | [llama.cpp](https://github.com/ggerganov/llama.cpp) | `b8220` | `ggml/llamacpp` | `llamacpp` | Text generation, embeddings, tokenization | Available | -| [whisper.cpp](https://github.com/ggerganov/whisper.cpp) | `v1.8.3` | `ggml/whispercpp` | `whispercpp` | Speech-to-text | Planned | +| [whisper.cpp](https://github.com/ggerganov/whisper.cpp) | `v1.8.3` | `ggml/whispercpp` | `whispercpp` | Speech-to-text transcription | Available | ## Quick Start @@ -15,6 +33,8 @@ Go bindings for C++ inference frameworks via CGO, with prebuilt static libraries go get github.com/footprintai/go-nativeml ``` +### llama.cpp — Text Generation + ```go import "github.com/footprintai/go-nativeml/ggml/llamacpp" @@ -34,12 +54,31 @@ ctx.GenerateStream("Hello, world", func(token string) bool { }, llamacpp.WithMaxTokens(256), llamacpp.WithTemperature(0.8)) ``` +### whisper.cpp — Speech-to-Text + +```go +import "github.com/footprintai/go-nativeml/ggml/whispercpp" + +model, _ := whispercpp.LoadModel("ggml-base.bin", whispercpp.WithGPU(true)) +defer model.Close() + +// pcmData: 16kHz mono float32 samples +segments, _ := model.Transcribe(pcmData, + whispercpp.WithLanguage("en"), + whispercpp.WithThreads(4), +) +for _, seg := range segments { + fmt.Printf("[%s -> %s] %s\n", seg.Start, seg.End, seg.Text) +} +``` + ## Build Tags | Tag | Behavior | |-----|----------| | _(none)_ | Stub implementations that return errors. Allows `go build` without CGO. | | `llamacpp` | Enables CGO bindings to prebuilt llama.cpp static libraries. | +| `whispercpp` | Enables CGO bindings to prebuilt whisper.cpp static libraries. 
| ```bash # Stub build (no CGO required) @@ -47,42 +86,37 @@ go build ./... # CGO build with llama.cpp CGO_ENABLED=1 go build -tags llamacpp ./... + +# CGO build with whisper.cpp +CGO_ENABLED=1 go build -tags whispercpp ./... + +# Both +CGO_ENABLED=1 go build -tags "llamacpp whispercpp" ./... ``` ## API -### Lifecycle +### llamacpp ```go -llamacpp.Init() // initialize backend -llamacpp.Shutdown() // cleanup -``` - -### Model +// Lifecycle +llamacpp.Init() +llamacpp.Shutdown() -```go -model, err := llamacpp.LoadModel(path, - llamacpp.WithGPULayers(n), // layers to offload to GPU -) +// Model +model, err := llamacpp.LoadModel(path, llamacpp.WithGPULayers(n)) model.Close() -model.EmbeddingSize() // returns embedding dimension -``` - -### Context +model.EmbeddingSize() -```go +// Context ctx, err := model.NewContext( llamacpp.WithContextSize(2048), llamacpp.WithThreads(4), - llamacpp.WithEmbeddings(), // enable embedding mode + llamacpp.WithEmbeddings(), ) ctx.Close() -``` -### Generation - -```go -// Blocking +// Generation (blocking) text, err := ctx.Generate(prompt, llamacpp.WithMaxTokens(256), llamacpp.WithTemperature(0.8), @@ -93,24 +127,45 @@ text, err := ctx.Generate(prompt, llamacpp.WithSeed(42), ) -// Streaming +// Generation (streaming) err := ctx.GenerateStream(prompt, func(token string) bool { fmt.Print(token) return true // return false to cancel }, llamacpp.WithMaxTokens(256)) -``` -### Embeddings +// Embeddings +embeddings, err := ctx.GetEmbeddings("some text") // []float32 -```go -ctx, _ := model.NewContext(llamacpp.WithContextSize(512), llamacpp.WithEmbeddings()) -embeddings, err := ctx.GetEmbeddings("some text") // []float32 +// Tokenization +tokens, err := ctx.Tokenize("some text") // []int ``` -### Tokenization +### whispercpp ```go -tokens, err := ctx.Tokenize("some text") // []int +// Model +model, err := whispercpp.LoadModel(path, + whispercpp.WithGPU(true), + whispercpp.WithFlashAttention(true), +) +model.Close() +model.IsMultilingual() + +// 
Transcription (pcmData: 16kHz mono float32) +segments, err := model.Transcribe(pcmData, + whispercpp.WithThreads(4), + whispercpp.WithLanguage("en"), + whispercpp.WithTranslate(false), + whispercpp.WithTimestamps(true), + whispercpp.WithTokenTimestamps(false), + whispercpp.WithSingleSegment(false), + whispercpp.WithTemperature(0.0), + whispercpp.WithMaxTokens(0), + whispercpp.WithPrompt(""), +) + +// Utilities +id := whispercpp.LangID("en") // language string -> ID ``` ## Examples @@ -151,21 +206,29 @@ make clean # Remove temp build dirs ## Adding New Platforms -1. Build llama.cpp static libraries for the target platform -2. Place `.a` files in `third_party/llama.cpp/prebuilt/<os>-<arch>/` -3. Add a `#cgo <os>,<arch> LDFLAGS` directive in `ggml/llamacpp/llamacpp.go` +1. Build static libraries for the target platform +2. Place `.a` files in `ggml/<framework>/third_party/prebuilt/<os>-<arch>/` +3. Add a `#cgo <os>,<arch> LDFLAGS` directive in the corresponding `.go` file ## Project Structure ``` -ggml/llamacpp/ Go bindings for llama.cpp - llamacpp.go CGO implementation (build tag: llamacpp) - llamacpp_stub.go Stub implementation (default) - options.go Option builders for model, context, generation - wrapper.h/.cpp C++ bridge to llama.cpp APIs - bridge.c CGO callback adapter -third_party/llama.cpp/ Upstream headers + prebuilt static libraries -examples/ Usage examples (generate, embeddings) +ggml/ + llamacpp/ Go bindings for llama.cpp + llamacpp.go CGO implementation (build tag: llamacpp) + llamacpp_stub.go Stub implementation (default) + options.go Option builders + wrapper.h/.cpp C++ bridge to llama.cpp APIs + bridge.c CGO callback adapter + embed.go go:embed for vendoring support + third_party/ Upstream headers + prebuilt .a files + whispercpp/ Go bindings for whisper.cpp + whispercpp.go CGO implementation (build tag: whispercpp) + whispercpp_stub.go Stub implementation (default) + options.go Option builders + embed.go go:embed for vendoring support + third_party/ Upstream headers + prebuilt .a files +examples/ 
Usage examples (generate, embeddings) ``` ## License diff --git a/ggml/llamacpp/llamacpp.go b/ggml/llamacpp/llamacpp.go index 854e5f8..8e40dcb 100644 --- a/ggml/llamacpp/llamacpp.go +++ b/ggml/llamacpp/llamacpp.go @@ -14,7 +14,8 @@ package llamacpp #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64 #cgo LDFLAGS: -lcommon -lllama -lggml-cpu -lggml-base -lggml -lstdc++ -lm -#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation +#cgo darwin LDFLAGS: -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation +#cgo linux LDFLAGS: -lpthread -ldl -lrt -lgomp #include <stdlib.h> #include <stdbool.h> #include "wrapper.h" diff --git a/ggml/whispercpp/whispercpp.go b/ggml/whispercpp/whispercpp.go index 783bbcb..477c143 100644 --- a/ggml/whispercpp/whispercpp.go +++ b/ggml/whispercpp/whispercpp.go @@ -14,7 +14,8 @@ package whispercpp #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64 #cgo LDFLAGS: -lwhisper -lcommon -lggml-cpu -lggml-base -lggml -lstdc++ -lm -#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation +#cgo darwin LDFLAGS: -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation +#cgo linux LDFLAGS: -lpthread -ldl -lrt -lgomp #include <stdlib.h> #include "whisper.h" */