diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b9c3489..7120cfa 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -39,6 +39,8 @@ jobs: go-version: "1.24" - name: Build with CGO (llamacpp tag) run: CGO_ENABLED=1 go build -tags llamacpp ./ggml/llamacpp/... + - name: Build with CGO (whispercpp tag) + run: CGO_ENABLED=1 go build -tags whispercpp ./ggml/whispercpp/... build-libs: name: Build Libraries from Source (${{ matrix.os }}) @@ -68,9 +70,16 @@ jobs: - name: Build static libraries from source run: make build-libs - name: Verify CGO build with fresh libraries - run: CGO_ENABLED=1 go build -tags llamacpp ./ggml/llamacpp/... - - name: Upload prebuilt artifacts + run: | + CGO_ENABLED=1 go build -tags llamacpp ./ggml/llamacpp/... + CGO_ENABLED=1 go build -tags whispercpp ./ggml/whispercpp/... + - name: Upload llama.cpp prebuilt artifacts + uses: actions/upload-artifact@v4 + with: + name: prebuilt-llamacpp-${{ matrix.platform }} + path: ggml/llamacpp/third_party/prebuilt/${{ matrix.platform }}/*.a + - name: Upload whisper.cpp prebuilt artifacts uses: actions/upload-artifact@v4 with: - name: prebuilt-${{ matrix.platform }} - path: third_party/llama.cpp/prebuilt/${{ matrix.platform }}/*.a + name: prebuilt-whispercpp-${{ matrix.platform }} + path: ggml/whispercpp/third_party/prebuilt/${{ matrix.platform }}/*.a diff --git a/.gitignore b/.gitignore index de68be1..9edbd7d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Build artifacts (source downloads, cmake dirs) -third_party/llama.cpp/src/ -third_party/whisper.cpp/src/ -third_party/**/build/ +ggml/llamacpp/third_party/src/ +ggml/whispercpp/third_party/src/ +**/build/ out/ # Prebuilt .a files and headers ARE committed — do not ignore them diff --git a/Dockerfile.libs b/Dockerfile.libs index eab9cee..7e7fbce 100644 --- a/Dockerfile.libs +++ b/Dockerfile.libs @@ -1,4 +1,4 @@ -# Dockerfile.libs — build linux-amd64 static libraries for llama.cpp +# Dockerfile.libs — 
build linux-amd64 static libraries for llama.cpp and whisper.cpp # # Usage: # docker build -f Dockerfile.libs -o ./out . @@ -18,7 +18,7 @@ COPY go.mod ./ COPY version.go ./ COPY cmd/versioncmd/ ./cmd/versioncmd/ -# Read versions and download sources +# Download llama.cpp RUN LLAMA_VERSION=$(go run ./cmd/versioncmd llama.cpp) && \ echo "Downloading llama.cpp ${LLAMA_VERSION}..." && \ wget -qO llama.cpp.tar.gz "https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_VERSION}.tar.gz" && \ @@ -26,12 +26,25 @@ RUN LLAMA_VERSION=$(go run ./cmd/versioncmd llama.cpp) && \ tar xzf llama.cpp.tar.gz --strip-components=1 -C llama-src && \ rm llama.cpp.tar.gz +# Download whisper.cpp +RUN WHISPER_VERSION=$(go run ./cmd/versioncmd whisper.cpp) && \ + echo "Downloading whisper.cpp ${WHISPER_VERSION}..." && \ + wget -qO whisper.cpp.tar.gz "https://github.com/ggerganov/whisper.cpp/archive/refs/tags/${WHISPER_VERSION}.tar.gz" && \ + mkdir -p whisper-src && \ + tar xzf whisper.cpp.tar.gz --strip-components=1 -C whisper-src && \ + rm whisper.cpp.tar.gz + # Build llama.cpp RUN cd llama-src && \ cmake -B build -DBUILD_SHARED_LIBS=OFF && \ cmake --build build --config Release -j$(nproc) -# Collect artifacts +# Build whisper.cpp +RUN cd whisper-src && \ + cmake -B build -DBUILD_SHARED_LIBS=OFF && \ + cmake --build build --config Release -j$(nproc) + +# Collect llama.cpp artifacts RUN mkdir -p /out/llama.cpp/linux-amd64 /out/llama.cpp/include /out/llama.cpp/ggml/include /out/llama.cpp/common && \ find llama-src/build -name "*.a" -exec cp {} /out/llama.cpp/linux-amd64/ \; && \ cp llama-src/include/*.h /out/llama.cpp/include/ && \ @@ -39,6 +52,12 @@ RUN mkdir -p /out/llama.cpp/linux-amd64 /out/llama.cpp/include /out/llama.cpp/gg cp llama-src/common/common.h /out/llama.cpp/common/ && \ cp llama-src/common/sampling.h /out/llama.cpp/common/ +# Collect whisper.cpp artifacts +RUN mkdir -p /out/whisper.cpp/linux-amd64 /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \ + find 
whisper-src/build -name "*.a" -exec cp {} /out/whisper.cpp/linux-amd64/ \; && \ + cp whisper-src/include/*.h /out/whisper.cpp/include/ && \ + cp whisper-src/ggml/include/*.h /out/whisper.cpp/ggml/include/ + # Output stage — docker build -o extracts from here FROM scratch COPY --from=builder /out/ / diff --git a/Makefile b/Makefile index 6c07128..4ff5f7c 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,13 @@ # # For consumers: # go get github.com/footprintai/go-nativeml -# go build -tags llamacpp ./... # just works — prebuilt .a files are in the module +# go build -tags llamacpp ./... # just works — prebuilt .a files are in the module +# go build -tags whispercpp ./... # whisper.cpp bindings # # For maintainers (rebuild .a files from source): # make build-libs # Build all libraries for current platform # make build-libs-llama # Build llama.cpp only +# make build-libs-whisper # Build whisper.cpp only # make build-libs-linux # Build linux-amd64 .a files via Docker # make build-libs-all # Build native + linux-amd64 # make clean # Remove temp build dirs (keeps prebuilt .a + headers) @@ -21,15 +23,18 @@ PLATFORM := $(shell go env GOOS)-$(shell go env GOARCH) # Parallel build cores NPROC := $(shell if which nproc > /dev/null 2>&1; then nproc; elif [ "$$(uname)" = "Darwin" ]; then sysctl -n hw.ncpu; else echo 4; fi) -# Paths -THIRD_PARTY := third_party -LLAMA_DIR := $(THIRD_PARTY)/llama.cpp -LLAMA_SRC := $(LLAMA_DIR)/src -LLAMA_PREBUILT := $(LLAMA_DIR)/prebuilt/$(PLATFORM) +# Paths — assets live inside the Go package directories +LLAMA_THIRD_PARTY := ggml/llamacpp/third_party +LLAMA_SRC := $(LLAMA_THIRD_PARTY)/src +LLAMA_PREBUILT := $(LLAMA_THIRD_PARTY)/prebuilt/$(PLATFORM) -.PHONY: build-libs build-libs-llama build-libs-linux build-libs-all clean verify +WHISPER_THIRD_PARTY := ggml/whispercpp/third_party +WHISPER_SRC := $(WHISPER_THIRD_PARTY)/src +WHISPER_PREBUILT := $(WHISPER_THIRD_PARTY)/prebuilt/$(PLATFORM) -build-libs: build-libs-llama +.PHONY: build-libs 
build-libs-llama build-libs-whisper build-libs-linux build-libs-all clean verify + +build-libs: build-libs-llama build-libs-whisper # Build both native platform and linux-amd64 (via Docker) build-libs-all: build-libs build-libs-linux @@ -46,13 +51,13 @@ $(LLAMA_PREBUILT): $(LLAMA_SRC) @mkdir -p $(LLAMA_PREBUILT) find $(LLAMA_SRC)/build -name "*.a" -exec cp {} $(LLAMA_PREBUILT)/ \; @echo "==> Copying llama.cpp headers..." - @mkdir -p $(LLAMA_DIR)/include - cp $(LLAMA_SRC)/include/*.h $(LLAMA_DIR)/include/ - @mkdir -p $(LLAMA_DIR)/ggml/include - cp $(LLAMA_SRC)/ggml/include/*.h $(LLAMA_DIR)/ggml/include/ - @mkdir -p $(LLAMA_DIR)/common - cp $(LLAMA_SRC)/common/common.h $(LLAMA_DIR)/common/ - cp $(LLAMA_SRC)/common/sampling.h $(LLAMA_DIR)/common/ + @mkdir -p $(LLAMA_THIRD_PARTY)/include + cp $(LLAMA_SRC)/include/*.h $(LLAMA_THIRD_PARTY)/include/ + @mkdir -p $(LLAMA_THIRD_PARTY)/ggml/include + cp $(LLAMA_SRC)/ggml/include/*.h $(LLAMA_THIRD_PARTY)/ggml/include/ + @mkdir -p $(LLAMA_THIRD_PARTY)/common + cp $(LLAMA_SRC)/common/common.h $(LLAMA_THIRD_PARTY)/common/ + cp $(LLAMA_SRC)/common/sampling.h $(LLAMA_THIRD_PARTY)/common/ @echo "==> llama.cpp $(LLAMA_VERSION) ready: $(LLAMA_PREBUILT)/" $(LLAMA_SRC): @@ -62,20 +67,51 @@ $(LLAMA_SRC): tar xzf llama.cpp.tar.gz --strip-components=1 -C $(LLAMA_SRC) rm llama.cpp.tar.gz +# ============================================================================ +# whisper.cpp +# ============================================================================ +build-libs-whisper: $(WHISPER_PREBUILT) + +$(WHISPER_PREBUILT): $(WHISPER_SRC) + @echo "==> Building whisper.cpp $(WHISPER_VERSION) for $(PLATFORM)..." + cd $(WHISPER_SRC) && cmake -B build -DBUILD_SHARED_LIBS=OFF && \ + cmake --build build --config Release -j$(NPROC) + @mkdir -p $(WHISPER_PREBUILT) + find $(WHISPER_SRC)/build -name "*.a" -exec cp {} $(WHISPER_PREBUILT)/ \; + @echo "==> Copying whisper.cpp headers..." 
+ @mkdir -p $(WHISPER_THIRD_PARTY)/include + cp $(WHISPER_SRC)/include/*.h $(WHISPER_THIRD_PARTY)/include/ + @mkdir -p $(WHISPER_THIRD_PARTY)/ggml/include + cp $(WHISPER_SRC)/ggml/include/*.h $(WHISPER_THIRD_PARTY)/ggml/include/ + @echo "==> whisper.cpp $(WHISPER_VERSION) ready: $(WHISPER_PREBUILT)/" + +$(WHISPER_SRC): + @echo "==> Downloading whisper.cpp $(WHISPER_VERSION)..." + wget -qO whisper.cpp.tar.gz https://github.com/ggerganov/whisper.cpp/archive/refs/tags/$(WHISPER_VERSION).tar.gz + mkdir -p $(WHISPER_SRC) + tar xzf whisper.cpp.tar.gz --strip-components=1 -C $(WHISPER_SRC) + rm whisper.cpp.tar.gz + # ============================================================================ # Docker build for linux-amd64 (cross-compile from macOS) # ============================================================================ build-libs-linux: @echo "==> Building linux-amd64 static libraries via Docker..." docker build -f Dockerfile.libs -o ./out . - @mkdir -p $(LLAMA_DIR)/prebuilt/linux-amd64 - cp out/llama.cpp/linux-amd64/*.a $(LLAMA_DIR)/prebuilt/linux-amd64/ - @# Copy headers if not already present - @mkdir -p $(LLAMA_DIR)/include $(LLAMA_DIR)/ggml/include $(LLAMA_DIR)/common - cp out/llama.cpp/include/*.h $(LLAMA_DIR)/include/ - cp out/llama.cpp/ggml/include/*.h $(LLAMA_DIR)/ggml/include/ - cp out/llama.cpp/common/common.h $(LLAMA_DIR)/common/ - cp out/llama.cpp/common/sampling.h $(LLAMA_DIR)/common/ + @# llama.cpp + @mkdir -p $(LLAMA_THIRD_PARTY)/prebuilt/linux-amd64 + cp out/llama.cpp/linux-amd64/*.a $(LLAMA_THIRD_PARTY)/prebuilt/linux-amd64/ + @mkdir -p $(LLAMA_THIRD_PARTY)/include $(LLAMA_THIRD_PARTY)/ggml/include $(LLAMA_THIRD_PARTY)/common + cp out/llama.cpp/include/*.h $(LLAMA_THIRD_PARTY)/include/ + cp out/llama.cpp/ggml/include/*.h $(LLAMA_THIRD_PARTY)/ggml/include/ + cp out/llama.cpp/common/common.h $(LLAMA_THIRD_PARTY)/common/ + cp out/llama.cpp/common/sampling.h $(LLAMA_THIRD_PARTY)/common/ + @# whisper.cpp + @mkdir -p 
$(WHISPER_THIRD_PARTY)/prebuilt/linux-amd64 + cp out/whisper.cpp/linux-amd64/*.a $(WHISPER_THIRD_PARTY)/prebuilt/linux-amd64/ + @mkdir -p $(WHISPER_THIRD_PARTY)/include $(WHISPER_THIRD_PARTY)/ggml/include + cp out/whisper.cpp/include/*.h $(WHISPER_THIRD_PARTY)/include/ + cp out/whisper.cpp/ggml/include/*.h $(WHISPER_THIRD_PARTY)/ggml/include/ rm -rf out @echo "==> linux-amd64 libraries ready" @@ -83,16 +119,19 @@ build-libs-linux: # Verification # ============================================================================ verify: - @echo "==> Verifying stub build (no tag)..." + @echo "==> Verifying stub builds (no tags)..." go build ./ggml/llamacpp/... - @echo "==> Verifying CGO build (with tag)..." + go build ./ggml/whispercpp/... + @echo "==> Verifying CGO builds (with tags)..." CGO_ENABLED=1 go build -tags llamacpp ./ggml/llamacpp/... + CGO_ENABLED=1 go build -tags whispercpp ./ggml/whispercpp/... @echo "==> Running stub tests..." go test ./ggml/llamacpp/... + go test ./ggml/whispercpp/... @echo "==> All checks passed" # ============================================================================ # Cleanup # ============================================================================ clean: - rm -rf $(LLAMA_SRC) out + rm -rf $(LLAMA_SRC) $(WHISPER_SRC) out diff --git a/README.md b/README.md new file mode 100644 index 0000000..23e2068 --- /dev/null +++ b/README.md @@ -0,0 +1,173 @@ +# go-nativeml + +Go bindings for C++ inference frameworks via CGO, with prebuilt static libraries for zero-dependency builds. 
+ +## Supported Frameworks + +| Framework | Version | Package | Build Tag | Capabilities | Status | +|-----------|---------|---------|-----------|--------------|--------| +| [llama.cpp](https://github.com/ggerganov/llama.cpp) | `b8220` | `ggml/llamacpp` | `llamacpp` | Text generation, embeddings, tokenization | Available | +| [whisper.cpp](https://github.com/ggerganov/whisper.cpp) | `v1.8.3` | `ggml/whispercpp` | `whispercpp` | Speech-to-text | Planned | + +## Quick Start + +```bash +go get github.com/footprintai/go-nativeml +``` + +```go +import "github.com/footprintai/go-nativeml/ggml/llamacpp" + +llamacpp.Init() +defer llamacpp.Shutdown() + +model, _ := llamacpp.LoadModel("model.gguf", llamacpp.WithGPULayers(999)) +defer model.Close() + +ctx, _ := model.NewContext(llamacpp.WithContextSize(2048), llamacpp.WithThreads(4)) +defer ctx.Close() + +// Streaming generation +ctx.GenerateStream("Hello, world", func(token string) bool { + fmt.Print(token) + return true // return false to stop early +}, llamacpp.WithMaxTokens(256), llamacpp.WithTemperature(0.8)) +``` + +## Build Tags + +| Tag | Behavior | +|-----|----------| +| _(none)_ | Stub implementations that return errors. Allows `go build` without CGO. | +| `llamacpp` | Enables CGO bindings to prebuilt llama.cpp static libraries. | + +```bash +# Stub build (no CGO required) +go build ./... + +# CGO build with llama.cpp +CGO_ENABLED=1 go build -tags llamacpp ./... 
+``` + +## API + +### Lifecycle + +```go +llamacpp.Init() // initialize backend +llamacpp.Shutdown() // cleanup +``` + +### Model + +```go +model, err := llamacpp.LoadModel(path, + llamacpp.WithGPULayers(n), // layers to offload to GPU +) +model.Close() +model.EmbeddingSize() // returns embedding dimension +``` + +### Context + +```go +ctx, err := model.NewContext( + llamacpp.WithContextSize(2048), + llamacpp.WithThreads(4), + llamacpp.WithEmbeddings(), // enable embedding mode +) +ctx.Close() +``` + +### Generation + +```go +// Blocking +text, err := ctx.Generate(prompt, + llamacpp.WithMaxTokens(256), + llamacpp.WithTemperature(0.8), + llamacpp.WithTopP(0.95), + llamacpp.WithTopK(40), + llamacpp.WithMinP(0.05), + llamacpp.WithRepeatPenalty(1.1), + llamacpp.WithSeed(42), +) + +// Streaming +err := ctx.GenerateStream(prompt, func(token string) bool { + fmt.Print(token) + return true // return false to cancel +}, llamacpp.WithMaxTokens(256)) +``` + +### Embeddings + +```go +ctx, _ := model.NewContext(llamacpp.WithContextSize(512), llamacpp.WithEmbeddings()) +embeddings, err := ctx.GetEmbeddings("some text") // []float32 +``` + +### Tokenization + +```go +tokens, err := ctx.Tokenize("some text") // []int +``` + +## Examples + +```bash +# Text generation +CGO_ENABLED=1 go run -tags llamacpp ./examples/generate \ + -model /path/to/model.gguf \ + -prompt "Hello, world" \ + -max-tokens 256 \ + -temperature 0.8 + +# Embeddings +CGO_ENABLED=1 go run -tags llamacpp ./examples/embeddings \ + -model /path/to/model.gguf \ + -text "Hello, world" +``` + +## Supported Platforms + +| Platform | Status | +|----------|--------| +| darwin-amd64 (macOS Intel) | Prebuilt libraries included | +| darwin-arm64 (macOS Apple Silicon) | Prebuilt libraries not yet available | +| linux-amd64 | Prebuilt libraries included | + +## Building Libraries from Source + +For maintainers who need to rebuild the static libraries: + +```bash +make build-libs # Build for current platform +make 
build-libs-linux # Build linux-amd64 via Docker +make build-libs-all # Build native + linux-amd64 +make verify # Run stub + CGO build checks +make clean # Remove temp build dirs +``` + +## Adding New Platforms + +1. Build llama.cpp static libraries for the target platform +2. Place `.a` files in `ggml/llamacpp/third_party/prebuilt/<os>-<arch>/` +3. Add a `#cgo <os>,<arch> LDFLAGS` directive in `ggml/llamacpp/llamacpp.go` + +## Project Structure + +``` +ggml/llamacpp/ Go bindings for llama.cpp + llamacpp.go CGO implementation (build tag: llamacpp) + llamacpp_stub.go Stub implementation (default) + options.go Option builders for model, context, generation + wrapper.h/.cpp C++ bridge to llama.cpp APIs + bridge.c CGO callback adapter +ggml/llamacpp/third_party/ Upstream headers + prebuilt static libraries +examples/ Usage examples (generate, embeddings) +``` + +## License + +Apache-2.0 diff --git a/claude.md b/claude.md index ae50c9a..0245a32 100644 --- a/claude.md +++ b/claude.md @@ -5,16 +5,19 @@ This project provides CGO wrappers for C++ inference frameworks for Go. ## Structure - `ggml/llamacpp/` — Go bindings for llama.cpp (build tag: `llamacpp`) -- `ggml/whispercpp/` — (future) Go bindings for whisper.cpp -- `third_party/llama.cpp/` — Upstream headers + prebuilt static libraries (keep upstream layout untouched) + - `third_party/` — Upstream headers + prebuilt static libraries +- `ggml/whispercpp/` — Go bindings for whisper.cpp (build tag: `whispercpp`) + - `third_party/` — Upstream headers + prebuilt static libraries +- `embed.go` files use `//go:embed` to ensure `go mod vendor` includes headers and `.a` files ## Build Tags - Default (no tag): stub implementations that return errors - `llamacpp`: enables CGO bindings to prebuilt llama.cpp libraries +- `whispercpp`: enables CGO bindings to prebuilt whisper.cpp libraries ## Adding New Platforms -1. Build llama.cpp static libraries for the target platform -2. Place `.a` files in `third_party/llama.cpp/prebuilt/<os>-<arch>/` -3. 
Add CGO LDFLAGS directive in `llamacpp.go` +1. Build static libraries for the target platform +2. Place `.a` files in `ggml/<package>/third_party/prebuilt/<os>-<arch>/` +3. Add CGO LDFLAGS directive in the corresponding `.go` file diff --git a/ggml/llamacpp/embed.go b/ggml/llamacpp/embed.go new file mode 100644 index 0000000..91dae55 --- /dev/null +++ b/ggml/llamacpp/embed.go @@ -0,0 +1,16 @@ +// Copyright 2025 FootprintAI +// SPDX-License-Identifier: Apache-2.0 + +package llamacpp + +import "embed" + +// Embed directives ensure go mod vendor includes headers and prebuilt libraries. +// The embedded filesystem is not used at runtime — CGO links directly via ${SRCDIR} paths. + +//go:embed third_party/include/*.h +//go:embed third_party/ggml/include/*.h +//go:embed third_party/common/*.h +//go:embed third_party/prebuilt/darwin-amd64/*.a +//go:embed third_party/prebuilt/linux-amd64/*.a +var _ embed.FS diff --git a/ggml/llamacpp/llamacpp.go b/ggml/llamacpp/llamacpp.go index 6d75260..854e5f8 100644 --- a/ggml/llamacpp/llamacpp.go +++ b/ggml/llamacpp/llamacpp.go @@ -8,11 +8,11 @@ package llamacpp /* -#cgo CFLAGS: -I${SRCDIR}/../../third_party/llama.cpp/include -I${SRCDIR}/../../third_party/llama.cpp/ggml/include -#cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/../../third_party/llama.cpp/include -I${SRCDIR}/../../third_party/llama.cpp/ggml/include -I${SRCDIR}/../../third_party/llama.cpp/common -#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/../../third_party/llama.cpp/prebuilt/darwin-arm64 -#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/../../third_party/llama.cpp/prebuilt/darwin-amd64 -#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/../../third_party/llama.cpp/prebuilt/linux-amd64 +#cgo CFLAGS: -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include +#cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include -I${SRCDIR}/third_party/common +#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64 +#cgo darwin,amd64 LDFLAGS: 
-L${SRCDIR}/third_party/prebuilt/darwin-amd64 +#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64 #cgo LDFLAGS: -lcommon -lllama -lggml-cpu -lggml-base -lggml -lstdc++ -lm #cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation #include diff --git a/third_party/llama.cpp/LICENSE b/ggml/llamacpp/third_party/LICENSE similarity index 100% rename from third_party/llama.cpp/LICENSE rename to ggml/llamacpp/third_party/LICENSE diff --git a/third_party/llama.cpp/common/common.h b/ggml/llamacpp/third_party/common/common.h similarity index 100% rename from third_party/llama.cpp/common/common.h rename to ggml/llamacpp/third_party/common/common.h diff --git a/third_party/llama.cpp/common/sampling.h b/ggml/llamacpp/third_party/common/sampling.h similarity index 100% rename from third_party/llama.cpp/common/sampling.h rename to ggml/llamacpp/third_party/common/sampling.h diff --git a/third_party/llama.cpp/ggml/include/ggml-alloc.h b/ggml/llamacpp/third_party/ggml/include/ggml-alloc.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-alloc.h rename to ggml/llamacpp/third_party/ggml/include/ggml-alloc.h diff --git a/third_party/llama.cpp/ggml/include/ggml-backend.h b/ggml/llamacpp/third_party/ggml/include/ggml-backend.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-backend.h rename to ggml/llamacpp/third_party/ggml/include/ggml-backend.h diff --git a/third_party/llama.cpp/ggml/include/ggml-blas.h b/ggml/llamacpp/third_party/ggml/include/ggml-blas.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-blas.h rename to ggml/llamacpp/third_party/ggml/include/ggml-blas.h diff --git a/third_party/llama.cpp/ggml/include/ggml-cann.h b/ggml/llamacpp/third_party/ggml/include/ggml-cann.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-cann.h rename to ggml/llamacpp/third_party/ggml/include/ggml-cann.h diff --git 
a/third_party/llama.cpp/ggml/include/ggml-cpp.h b/ggml/llamacpp/third_party/ggml/include/ggml-cpp.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-cpp.h rename to ggml/llamacpp/third_party/ggml/include/ggml-cpp.h diff --git a/third_party/llama.cpp/ggml/include/ggml-cpu.h b/ggml/llamacpp/third_party/ggml/include/ggml-cpu.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-cpu.h rename to ggml/llamacpp/third_party/ggml/include/ggml-cpu.h diff --git a/third_party/llama.cpp/ggml/include/ggml-cuda.h b/ggml/llamacpp/third_party/ggml/include/ggml-cuda.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-cuda.h rename to ggml/llamacpp/third_party/ggml/include/ggml-cuda.h diff --git a/third_party/llama.cpp/ggml/include/ggml-hexagon.h b/ggml/llamacpp/third_party/ggml/include/ggml-hexagon.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-hexagon.h rename to ggml/llamacpp/third_party/ggml/include/ggml-hexagon.h diff --git a/third_party/llama.cpp/ggml/include/ggml-metal.h b/ggml/llamacpp/third_party/ggml/include/ggml-metal.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-metal.h rename to ggml/llamacpp/third_party/ggml/include/ggml-metal.h diff --git a/third_party/llama.cpp/ggml/include/ggml-opencl.h b/ggml/llamacpp/third_party/ggml/include/ggml-opencl.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-opencl.h rename to ggml/llamacpp/third_party/ggml/include/ggml-opencl.h diff --git a/third_party/llama.cpp/ggml/include/ggml-opt.h b/ggml/llamacpp/third_party/ggml/include/ggml-opt.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-opt.h rename to ggml/llamacpp/third_party/ggml/include/ggml-opt.h diff --git a/third_party/llama.cpp/ggml/include/ggml-rpc.h b/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-rpc.h rename to 
ggml/llamacpp/third_party/ggml/include/ggml-rpc.h diff --git a/third_party/llama.cpp/ggml/include/ggml-sycl.h b/ggml/llamacpp/third_party/ggml/include/ggml-sycl.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-sycl.h rename to ggml/llamacpp/third_party/ggml/include/ggml-sycl.h diff --git a/third_party/llama.cpp/ggml/include/ggml-virtgpu.h b/ggml/llamacpp/third_party/ggml/include/ggml-virtgpu.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-virtgpu.h rename to ggml/llamacpp/third_party/ggml/include/ggml-virtgpu.h diff --git a/third_party/llama.cpp/ggml/include/ggml-vulkan.h b/ggml/llamacpp/third_party/ggml/include/ggml-vulkan.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-vulkan.h rename to ggml/llamacpp/third_party/ggml/include/ggml-vulkan.h diff --git a/third_party/llama.cpp/ggml/include/ggml-webgpu.h b/ggml/llamacpp/third_party/ggml/include/ggml-webgpu.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-webgpu.h rename to ggml/llamacpp/third_party/ggml/include/ggml-webgpu.h diff --git a/third_party/llama.cpp/ggml/include/ggml-zdnn.h b/ggml/llamacpp/third_party/ggml/include/ggml-zdnn.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-zdnn.h rename to ggml/llamacpp/third_party/ggml/include/ggml-zdnn.h diff --git a/third_party/llama.cpp/ggml/include/ggml-zendnn.h b/ggml/llamacpp/third_party/ggml/include/ggml-zendnn.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml-zendnn.h rename to ggml/llamacpp/third_party/ggml/include/ggml-zendnn.h diff --git a/third_party/llama.cpp/ggml/include/ggml.h b/ggml/llamacpp/third_party/ggml/include/ggml.h similarity index 100% rename from third_party/llama.cpp/ggml/include/ggml.h rename to ggml/llamacpp/third_party/ggml/include/ggml.h diff --git a/third_party/llama.cpp/ggml/include/gguf.h b/ggml/llamacpp/third_party/ggml/include/gguf.h similarity index 100% rename from 
third_party/llama.cpp/ggml/include/gguf.h rename to ggml/llamacpp/third_party/ggml/include/gguf.h diff --git a/third_party/llama.cpp/include/llama-cpp.h b/ggml/llamacpp/third_party/include/llama-cpp.h similarity index 100% rename from third_party/llama.cpp/include/llama-cpp.h rename to ggml/llamacpp/third_party/include/llama-cpp.h diff --git a/third_party/llama.cpp/include/llama.h b/ggml/llamacpp/third_party/include/llama.h similarity index 100% rename from third_party/llama.cpp/include/llama.h rename to ggml/llamacpp/third_party/include/llama.h diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libcpp-httplib.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libggml-base.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libggml-blas.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libggml-blas.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libggml-cpu.a rename to 
ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libggml-metal.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libggml-metal.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libggml.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libggml.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libgguf-model-data.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libgguf-model-data.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libllama.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libllama.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libmtmd.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a diff --git a/third_party/llama.cpp/prebuilt/darwin-amd64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a similarity index 100% rename from third_party/llama.cpp/prebuilt/darwin-amd64/libserver-context.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a diff --git a/third_party/llama.cpp/prebuilt/linux-amd64/libcommon.a 
b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a diff --git a/third_party/llama.cpp/prebuilt/linux-amd64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libcpp-httplib.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a diff --git a/third_party/llama.cpp/prebuilt/linux-amd64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libggml-base.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a diff --git a/third_party/llama.cpp/prebuilt/linux-amd64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libggml-cpu.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a diff --git a/third_party/llama.cpp/prebuilt/linux-amd64/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libggml.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a diff --git a/third_party/llama.cpp/prebuilt/linux-amd64/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libllama.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a diff --git a/third_party/llama.cpp/prebuilt/linux-amd64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libmtmd.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a diff --git 
a/third_party/llama.cpp/prebuilt/linux-amd64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a similarity index 100% rename from third_party/llama.cpp/prebuilt/linux-amd64/libserver-context.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a diff --git a/ggml/whispercpp/embed.go b/ggml/whispercpp/embed.go new file mode 100644 index 0000000..fb4fda8 --- /dev/null +++ b/ggml/whispercpp/embed.go @@ -0,0 +1,15 @@ +// Copyright 2025 FootprintAI +// SPDX-License-Identifier: Apache-2.0 + +package whispercpp + +import "embed" + +// Embed directives ensure go mod vendor includes headers and prebuilt libraries. +// The embedded filesystem is not used at runtime — CGO links directly via ${SRCDIR} paths. + +//go:embed third_party/include/*.h +//go:embed third_party/ggml/include/*.h +//go:embed third_party/prebuilt/darwin-amd64/*.a +//go:embed third_party/prebuilt/linux-amd64/*.a +var _ embed.FS diff --git a/ggml/whispercpp/options.go b/ggml/whispercpp/options.go new file mode 100644 index 0000000..52798a2 --- /dev/null +++ b/ggml/whispercpp/options.go @@ -0,0 +1,105 @@ +//go:build whispercpp + +// Copyright 2025 FootprintAI +// SPDX-License-Identifier: Apache-2.0 + +package whispercpp + +// ModelOption is a functional option that adjusts the modelConfig used for model loading. +type ModelOption func(*modelConfig) + +type modelConfig struct { + useGPU bool + flashAttn bool +} + +func defaultModelConfig() modelConfig { + return modelConfig{ + useGPU: true, + flashAttn: false, + } +} + +// WithGPU enables or disables GPU acceleration (defaultModelConfig sets it to true). +func WithGPU(enabled bool) ModelOption { + return func(c *modelConfig) { c.useGPU = enabled } +} + +// WithFlashAttention enables or disables flash attention (defaultModelConfig sets it to false). +func WithFlashAttention(enabled bool) ModelOption { + return func(c *modelConfig) { c.flashAttn = enabled } +} + +// TranscribeOption is a functional option that adjusts the transcribeConfig used for transcription.
+type TranscribeOption func(*transcribeConfig) + +type transcribeConfig struct { + threads int + language string + translate bool + timestamps bool + tokenTimestamps bool + singleSegment bool + temperature float32 + maxTokens int + prompt string +} + +func defaultTranscribeConfig() transcribeConfig { + return transcribeConfig{ + threads: 4, + language: "auto", + translate: false, + timestamps: true, + tokenTimestamps: false, + singleSegment: false, + temperature: 0.0, + maxTokens: 0, + prompt: "", + } +} + +// WithThreads sets the number of CPU threads for inference; n is stored as given, without validation (defaultTranscribeConfig uses 4). +func WithThreads(n int) TranscribeOption { + return func(c *transcribeConfig) { c.threads = n } +} + +// WithLanguage sets the language for transcription (e.g. "en", "de", "auto"); defaultTranscribeConfig uses "auto". +func WithLanguage(lang string) TranscribeOption { + return func(c *transcribeConfig) { c.language = lang } +} + +// WithTranslate enables or disables translation to English (false in defaultTranscribeConfig). +func WithTranslate(enabled bool) TranscribeOption { + return func(c *transcribeConfig) { c.translate = enabled } +} + +// WithTimestamps enables or disables timestamps in output (true in defaultTranscribeConfig). +func WithTimestamps(enabled bool) TranscribeOption { + return func(c *transcribeConfig) { c.timestamps = enabled } +} + +// WithTokenTimestamps enables or disables token-level timestamps (false in defaultTranscribeConfig). +func WithTokenTimestamps(enabled bool) TranscribeOption { + return func(c *transcribeConfig) { c.tokenTimestamps = enabled } +} + +// WithSingleSegment enables or disables forcing output into a single segment (false in defaultTranscribeConfig). +func WithSingleSegment(enabled bool) TranscribeOption { + return func(c *transcribeConfig) { c.singleSegment = enabled } +} + +// WithTemperature sets the sampling temperature; t is stored as given (0.0 in defaultTranscribeConfig). +func WithTemperature(t float32) TranscribeOption { + return func(c *transcribeConfig) { c.temperature = t } +} + +// WithMaxTokens sets the maximum tokens per segment (0 = no limit, which is also the defaultTranscribeConfig value). +func WithMaxTokens(n int) TranscribeOption { + return func(c *transcribeConfig) { c.maxTokens = n } +} + +// WithPrompt sets the initial prompt for the decoder (empty in defaultTranscribeConfig).
+func WithPrompt(prompt string) TranscribeOption { + return func(c *transcribeConfig) { c.prompt = prompt } +} diff --git a/ggml/whispercpp/third_party/LICENSE b/ggml/whispercpp/third_party/LICENSE new file mode 100644 index 0000000..acb96ce --- /dev/null +++ b/ggml/whispercpp/third_party/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023-2024 The ggml authors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.
diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-alloc.h b/ggml/whispercpp/third_party/ggml/include/ggml-alloc.h new file mode 100644 index 0000000..78aa059 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-alloc.h @@ -0,0 +1,85 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; + +// Tensor allocator +struct ggml_tallocr { + ggml_backend_buffer_t buffer; + void * base; + size_t alignment; + size_t offset; +}; + +GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer); +GGML_API enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor); + +// Graph allocator +/* + Example usage: + ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); + + // optional: create a worst-case graph and reserve the buffers to avoid reallocations + ggml_gallocr_reserve(galloc, build_graph(max_batch)); + + // allocate the graph + struct ggml_cgraph * graph = build_graph(batch); + ggml_gallocr_alloc_graph(galloc, graph); + + printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0)); + + // evaluate the graph + ggml_backend_graph_compute(backend, graph); +*/ + +// special tensor flags for use with the graph allocator: +// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses +// ggml_set_output(): output tensors are never freed and never overwritten + +typedef struct ggml_gallocr * ggml_gallocr_t; + +GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft); +GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs); +GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); + +// pre-allocate buffers from a measure graph - does not allocate or modify the 
graph +// call with a worst-case graph to avoid buffer reallocations +// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed +// returns false if the buffer allocation failed +// ggml_gallocr_reserve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes +GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +GGML_API void ggml_gallocr_reserve_n_size( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + const int * node_buffer_ids, + const int * leaf_buffer_ids, + size_t * sizes); +GGML_API bool ggml_gallocr_reserve_n( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + const int * node_buffer_ids, + const int * leaf_buffer_ids); + +// automatic reallocation if the topology changes when using a single buffer +// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers) +GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph); + +GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id); + +// Utils +// Create a buffer and allocate all the tensors in a ggml_context +// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft +GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); +GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-backend.h b/ggml/whispercpp/third_party/ggml/include/ggml-backend.h new file mode 100644 index
0000000..a9d1778 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-backend.h @@ -0,0 +1,373 @@ +#pragma once + +#include "ggml.h" +#include "ggml-alloc.h" + +#ifdef GGML_BACKEND_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BACKEND_BUILD +# define GGML_BACKEND_API __declspec(dllexport) extern +# else +# define GGML_BACKEND_API __declspec(dllimport) extern +# endif +# else +# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern +# endif +#else +# define GGML_BACKEND_API extern +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; + typedef struct ggml_backend_buffer * ggml_backend_buffer_t; + typedef struct ggml_backend_event * ggml_backend_event_t; + typedef struct ggml_backend * ggml_backend_t; + typedef void * ggml_backend_graph_plan_t; + typedef struct ggml_backend_reg * ggml_backend_reg_t; + typedef struct ggml_backend_device * ggml_backend_dev_t; + + + // + // Backend buffer type + // + + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft); + + // + // Backend buffer + // + + enum ggml_backend_buffer_usage { + GGML_BACKEND_BUFFER_USAGE_ANY = 0, + GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, + GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, + }; + + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void 
ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API enum ggml_status ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + + // + // Backend (stream) + // + + GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); + GGML_API const char * ggml_backend_name(ggml_backend_t backend); + GGML_API void ggml_backend_free(ggml_backend_t backend); + + GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend); + GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); + GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); + GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend); + + GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * 
data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + // "offset" refers to the offset in tensor->data for setting/getting data + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); + + GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // NOTE: will be removed, use device version instead + GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); + GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); + + // asynchronous copy + // the copy is performed after all the currently queued operations in backend_src + // backend_dst will wait for the copy to complete before performing other operations + // automatic fallback to sync copy if async is not supported + GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct 
ggml_tensor * dst); + + GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); + + // + // Events + // + + GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device); + GGML_API void ggml_backend_event_free(ggml_backend_event_t event); + GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend); + GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); + GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); + + // + // Backend device + // + + enum ggml_backend_dev_type { + // CPU device using system memory + GGML_BACKEND_DEVICE_TYPE_CPU, + // GPU device using dedicated memory + GGML_BACKEND_DEVICE_TYPE_GPU, + // integrated GPU device using host memory + GGML_BACKEND_DEVICE_TYPE_IGPU, + // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX) + GGML_BACKEND_DEVICE_TYPE_ACCEL + }; + + // functionality supported by the device + struct ggml_backend_dev_caps { + // asynchronous operations + bool async; + // pinned host buffer + bool host_buffer; + // creating buffers from host ptr + bool buffer_from_host_ptr; + // event synchronization + bool events; + }; + + // all the device properties + struct ggml_backend_dev_props { + // device name + const char * name; + // device description + const char * description; + // device free memory in bytes + size_t memory_free; + // device total memory in bytes + size_t memory_total; + // device type + enum ggml_backend_dev_type type; + // device id + // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. 
"0000:01:00.0") + // if the id is unknown, this should be NULL + const char * device_id; + // device capabilities + struct ggml_backend_dev_caps caps; + }; + + GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); + GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device); + GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total); + GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device); + GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props); + GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device); + GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params); + GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device); + GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size); + + GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); + GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); + GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); + + // + // Backend (reg) + // + + GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg); + GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg); + GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index); + GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name); + + // Common functions that may be obtained using ggml_backend_reg_get_proc_address + + // Split buffer type for tensor parallelism + typedef ggml_backend_buffer_type_t 
(*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split); + // Set the number of threads for the backend + typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); + // Get additional buffer types provided by the device (returns a NULL-terminated array) + typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); + // Set the abort callback for the backend + typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); + // Get a list of feature flags supported by the backend (returns a NULL-terminated array) + struct ggml_backend_feature { + const char * name; + const char * value; + }; + typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); + + // + // Backend registry + // + + GGML_API void ggml_backend_register(ggml_backend_reg_t reg); + + GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); + + // Backend (reg) enumeration + GGML_API size_t ggml_backend_reg_count(void); + GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index); + GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name); + + // Device enumeration + GGML_API size_t ggml_backend_dev_count(void); + GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); + GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); + GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); + + // Direct backend (stream) initialization + // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params) + GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params); + // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params) + GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params); + // = 
ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) + GGML_API ggml_backend_t ggml_backend_init_best(void); + + // Load a backend from a dynamic library and register it + GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); + // Unload a backend if loaded dynamically and unregister it + GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); + // Load all known backends from dynamic libraries + GGML_API void ggml_backend_load_all(void); + GGML_API void ggml_backend_load_all_from_path(const char * dir_path); + + // + // Backend scheduler + // + + // The backend scheduler allows for multiple backend devices to be used together + // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends + // The backends are selected based on: + // - the backend that supports the operation + // - the location of the pre-allocated tensors (e.g. the weights) + /* + Example usage: + + // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned + // preferably to run on the same backend as the buffer + ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + + sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true); + + // initialize buffers from a max size graph (optional) + reserve_graph = build_graph(sched, max_batch_size); + + // manually assign nodes to a backend (optional, should not be needed in most cases) + struct ggml_tensor * node = ggml_mul_mat(ctx, ...); + ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu); + + ggml_backend_sched_reserve(sched, reserve_graph); + + // compute + graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation + for (int i = 0; i < 10; ++i) { + ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is
allocated automatically + } + + // if there are graph inputs: + graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called) + ggml_backend_sched_reset(sched); // clear the allocation of the previous graph + ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it + ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors + ggml_backend_sched_graph_compute(sched, graph); // execute the graph + + // as an alternative to the above it is also possible to assign the inputs to a dedicated context and + // allocate them statically via ggml_backend_alloc_ctx_tensors + } + */ + + typedef struct ggml_backend_sched * ggml_backend_sched_t; + + // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback) + // when ask == true, the scheduler wants to know if the user wants to observe this node + // this allows the scheduler to batch nodes together in order to evaluate them in a single call + // + // when ask == false, the scheduler is passing the node tensor to the user for observation + // if the user returns false, the scheduler will cancel the graph compute + // + typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); + + // Initialize a backend scheduler, backends with low index are given priority over backends with high index + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); + + // Initialize backend buffers from a measure graph + GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); + GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct 
ggml_cgraph * measure_graph); // returns success + + GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); + GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i); + + // Get the number of splits of the last graph + GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); + GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched); + + GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend); + GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend); + + GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); + + // Split graph without allocating it + GGML_API void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + + // Allocate and compute graph on the backend scheduler + GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success + GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); + + // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph. + // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers. + // The correct way to use this API is to discard the deallocated tensors and create new ones. 
+ GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); + + // Set a callback to be called for each resulting node during graph compute + GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + + // + // Utils + // + + struct ggml_backend_graph_copy { + ggml_backend_buffer_t buffer; + struct ggml_context * ctx_allocated; + struct ggml_context * ctx_unallocated; + struct ggml_cgraph * graph; + }; + + // Copy a graph to a different backend + GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); + GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); + + typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); + + // Compare the output of two backends + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data, struct ggml_tensor const * const * test_nodes, size_t num_test_nodes); + + // Tensor initialization + GGML_API enum ggml_status ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); + GGML_API enum ggml_status ggml_backend_view_init(struct ggml_tensor * tensor); + + // CPU buffer types are always available + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size); + GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-blas.h b/ggml/whispercpp/third_party/ggml/include/ggml-blas.h new file mode 100644 index 0000000..87a81b3 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-blas.h @@ -0,0 +1,25 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus 
+extern "C" { +#endif + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void); + +GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend); + +// number of threads used for conversion to float +// for openblas and blis, this will also set the number of threads used for blas operations +GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-cann.h b/ggml/whispercpp/third_party/ggml/include/ggml-cann.h new file mode 100644 index 0000000..b469e22 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-cann.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Maximum number of CANN devices supported. + */ +#define GGML_CANN_MAX_DEVICES 16 + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void); + +/** + * @brief Initializes the CANN backend for a specified device. + * + * This function initializes the CANN backend for the given device. + * It verifies the device index, allocates a context, and creates a backend + * instance. + * + * @param device The index of the device to initialize. + * @return A pointer to the initialized backend instance, or nullptr on failure. + */ +GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device); + +/** + * @brief Checks if a given backend is a CANN backend. + * + * This function verifies if the provided backend is a CANN backend by comparing + * its GUID with the CANN backend's GUID. + * + * @param backend The backend instance to check. + * @return True if the backend is a CANN backend, false otherwise. + */ +GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend); + +/** + * @brief Retrieves the CANN buffer type for a specified device. + * + * This function initializes and returns the buffer type interface associated + * with the given device. It ensures thread-safe access using a mutex. + * + * @param device The device index for which to retrieve the buffer type. + * @return A pointer to the buffer type interface for the specified device, or + * nullptr if the device index is out of range. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device); + +/** + * @brief Retrieves the number of CANN devices available. + * + * This function returns the number of CANN devices available based on + * information obtained from `ggml_cann_info()`. + * + * @return The number of CANN devices available. 
+ */ +GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void); + +/** + * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. + * + * @return A pointer to the host buffer type interface. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); + +/** + * @brief Retrieves the description of a specific CANN device. + * + * This function sets the specified device, retrieves the SoC name, + * and writes it into the provided description buffer. + * + * @param device The device index to retrieve the description for. + * @param description Pointer to a buffer where the description will be written. + * @param description_size Size of the description buffer. + */ +GGML_BACKEND_API void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size); + +/** + * @brief Retrieves the memory information of a specific CANN device. + * + * This function sets the specified device, retrieves the free and total + * memory information of the specified type (ACL_HBM_MEM), and stores them + * in the provided pointers. + * + * @param device The device index to retrieve memory information for. + * @param free Pointer to a variable where the free memory size will be stored. + * @param total Pointer to a variable where the total memory size will be + * stored. 
+ */ +GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-cpp.h b/ggml/whispercpp/third_party/ggml/include/ggml-cpp.h new file mode 100644 index 0000000..48aa796 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-cpp.h @@ -0,0 +1,39 @@ +#pragma once + +#ifndef __cplusplus +#error "This header is for C++ only" +#endif + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "gguf.h" +#include + +// Smart pointers for ggml types + +// ggml + +struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } }; +struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } }; + +typedef std::unique_ptr ggml_context_ptr; +typedef std::unique_ptr gguf_context_ptr; + +// ggml-alloc + +struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } }; + +typedef std::unique_ptr ggml_gallocr_ptr; + +// ggml-backend + +struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } }; +struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } }; +struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } }; +struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } }; + +typedef std::unique_ptr ggml_backend_ptr; +typedef std::unique_ptr ggml_backend_buffer_ptr; +typedef std::unique_ptr ggml_backend_event_ptr; +typedef std::unique_ptr ggml_backend_sched_ptr; diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-cpu.h b/ggml/whispercpp/third_party/ggml/include/ggml-cpu.h new file mode 100644 index 0000000..4f3b99c --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-cpu.h @@ -0,0 
+1,146 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggml-org/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` + + int n_threads; + struct ggml_threadpool * threadpool; + + // abort ggml_graph_compute when true + ggml_abort_callback abort_callback; + void * abort_callback_data; + }; + + // numa strategies + enum ggml_numa_strategy { + GGML_NUMA_STRATEGY_DISABLED = 0, + GGML_NUMA_STRATEGY_DISTRIBUTE = 1, + GGML_NUMA_STRATEGY_ISOLATE = 2, + GGML_NUMA_STRATEGY_NUMACTL = 3, + GGML_NUMA_STRATEGY_MIRROR = 4, + GGML_NUMA_STRATEGY_COUNT + }; + + GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems + GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + + GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); + GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); + + GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); + GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); + + GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); + GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); + + GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value); + + GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int 
i); + GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); + + GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); + GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); + + GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_BACKEND_API struct ggml_cplan ggml_graph_plan( + const struct ggml_cgraph * cgraph, + int n_threads, /* = GGML_DEFAULT_N_THREADS */ + struct ggml_threadpool * threadpool /* = NULL */ ); + GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + + // + // system info + // + + // x86 + GGML_BACKEND_API int ggml_cpu_has_sse3 (void); + GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); + GGML_BACKEND_API int ggml_cpu_has_avx (void); + GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); + GGML_BACKEND_API int ggml_cpu_has_avx2 (void); + GGML_BACKEND_API int ggml_cpu_has_bmi2 (void); + GGML_BACKEND_API int ggml_cpu_has_f16c (void); + GGML_BACKEND_API int 
ggml_cpu_has_fma (void); + GGML_BACKEND_API int ggml_cpu_has_avx512 (void); + GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); + GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); + GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void); + GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void); + // ARM + GGML_BACKEND_API int ggml_cpu_has_neon (void); + GGML_BACKEND_API int ggml_cpu_has_arm_fma (void); + GGML_BACKEND_API int ggml_cpu_has_fp16_va (void); + GGML_BACKEND_API int ggml_cpu_has_dotprod (void); + GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void); + GGML_BACKEND_API int ggml_cpu_has_sve (void); + GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes + GGML_BACKEND_API int ggml_cpu_has_sme (void); + // other + GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); + GGML_BACKEND_API int ggml_cpu_get_rvv_vlen (void); // risc-v vector length in bytes + GGML_BACKEND_API int ggml_cpu_has_vsx (void); + GGML_BACKEND_API int ggml_cpu_has_vxe (void); + GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); + GGML_BACKEND_API int ggml_cpu_has_llamafile (void); + + // Internal types and functions exposed for tests and benchmarks + + typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); + + struct ggml_type_traits_cpu { + ggml_from_float_t from_float; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously + }; + + GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type); + + GGML_BACKEND_API void ggml_cpu_init(void); + + // + // CPU backend + // + + GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void); + + GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend); + GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads); + GGML_BACKEND_API void ggml_backend_cpu_set_threadpool 
(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data); + + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + + GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-cuda.h b/ggml/whispercpp/third_party/ggml/include/ggml-cuda.h new file mode 100644 index 0000000..22ad2c0 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-cuda.h @@ -0,0 +1,47 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef GGML_USE_HIP +#define GGML_CUDA_NAME "ROCm" +#define GGML_CUBLAS_NAME "hipBLAS" +#elif defined(GGML_USE_MUSA) +#define GGML_CUDA_NAME "MUSA" +#define GGML_CUBLAS_NAME "muBLAS" +#else +#define GGML_CUDA_NAME "CUDA" +#define GGML_CUBLAS_NAME "cuBLAS" +#endif +#define GGML_CUDA_MAX_DEVICES 16 + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device); + +GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend); + +// device buffer +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split); + +// pinned host buffer for use with the 
CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); + +GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void); +GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); + +GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); +GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-hexagon.h b/ggml/whispercpp/third_party/ggml/include/ggml-hexagon.h new file mode 100644 index 0000000..6e07900 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-hexagon.h @@ -0,0 +1,19 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(void); + +GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-metal.h b/ggml/whispercpp/third_party/ggml/include/ggml-metal.h new file mode 100644 index 0000000..433838f --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-metal.h @@ -0,0 +1,61 @@ +// Note: this description is outdated +// +// An interface allowing to compute ggml_cgraph with Metal +// +// This is a fully functional interface that extends ggml with GPU support for Apple devices. +// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.) +// +// How it works? 
+// +// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this +// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you +// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.) +// +// You only need to make sure that all memory buffers that you used during the graph creation +// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is +// used during the graph evaluation to determine the arguments of the compute kernels. +// +// Synchronization between device and host memory (for example for input and output tensors) +// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions. +// + +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#include <stddef.h> +#include <stdbool.h> + +struct ggml_tensor; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +// +// backend API +// user-code should use only these functions +// + +// TODO: remove in the future +GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void); + +GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend); + +GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); + +// helper to check if the device supports a specific family +// ideally, the user code should be doing these checks +// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf +GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family); + +// capture all command buffers committed the next time `ggml_backend_graph_compute` is called +GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-opencl.h b/ggml/whispercpp/third_party/ggml/include/ggml-opencl.h new
file mode 100644 index 0000000..6b61771 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-opencl.h @@ -0,0 +1,26 @@ +#ifndef GGML_OPENCL_H +#define GGML_OPENCL_H + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// +// backend API +// +GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void); +GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void); + +#ifdef __cplusplus +} +#endif + +#endif // GGML_OPENCL_H diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-opt.h b/ggml/whispercpp/third_party/ggml/include/ggml-opt.h new file mode 100644 index 0000000..4703a05 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-opt.h @@ -0,0 +1,256 @@ +// This file contains functionality for training models using GGML. +// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets. +// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code. +// +// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de) + +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + + struct ggml_opt_dataset; + struct ggml_opt_context; + struct ggml_opt_result; + + typedef struct ggml_opt_dataset * ggml_opt_dataset_t; + typedef struct ggml_opt_context * ggml_opt_context_t; + typedef struct ggml_opt_result * ggml_opt_result_t; + + // ====== Loss ====== + + // built-in loss types, i.e.
the built-in quantities minimized by the optimizer + // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value + enum ggml_opt_loss_type { + GGML_OPT_LOSS_TYPE_MEAN, + GGML_OPT_LOSS_TYPE_SUM, + GGML_OPT_LOSS_TYPE_CROSS_ENTROPY, + GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR, + }; + + // ====== Dataset ====== + + GGML_API ggml_opt_dataset_t ggml_opt_dataset_init( + enum ggml_type type_data, // the type for the internal data tensor + enum ggml_type type_label, // the type for the internal labels tensor + int64_t ne_datapoint, // number of elements per datapoint + int64_t ne_label, // number of elements per label + int64_t ndata, // total number of datapoints/labels + int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied) + GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset); + + // get underlying tensors that store the data + GGML_API int64_t ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset); + GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata] + GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label, ndata] + + // shuffle the first idata datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative + GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata); + + // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch + GGML_API void ggml_opt_dataset_get_batch( + ggml_opt_dataset_t dataset, + struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch] + struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch] + int64_t ibatch); + GGML_API void ggml_opt_dataset_get_batch_host( + ggml_opt_dataset_t dataset, + void * data_batch, + size_t nb_data_batch, + void * labels_batch, + int64_t ibatch); + +
// ====== Model / Context ====== + + enum ggml_opt_build_type { + GGML_OPT_BUILD_TYPE_FORWARD = 10, + GGML_OPT_BUILD_TYPE_GRAD = 20, + GGML_OPT_BUILD_TYPE_OPT = 30, + }; + + enum ggml_opt_optimizer_type { + GGML_OPT_OPTIMIZER_TYPE_ADAMW, + GGML_OPT_OPTIMIZER_TYPE_SGD, + + GGML_OPT_OPTIMIZER_TYPE_COUNT + }; + + // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss + struct ggml_opt_optimizer_params { + struct { + float alpha; // learning rate + float beta1; // first AdamW momentum + float beta2; // second AdamW momentum + float eps; // epsilon for numerical stability + float wd; // weight decay - 0.0f to disable + } adamw; + struct { + float alpha; // learning rate + float wd; // weight decay + } sgd; + }; + + // callback to calculate optimizer parameters prior to a backward pass + // userdata can be used to pass arbitrary data + typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata); + + // returns the default optimizer params (constant, hard-coded values) + // userdata is not used + GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata); + + // casts userdata to ggml_opt_optimizer_params and returns it + GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata); + + // parameters for initializing a new optimization context + struct ggml_opt_params { + ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs + + // by default the forward graph needs to be reconstructed for each eval + // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically + struct ggml_context * ctx_compute; + struct ggml_tensor * inputs; + struct ggml_tensor * outputs; + + enum ggml_opt_loss_type loss_type; + enum ggml_opt_build_type build_type; + + int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done + + 
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters + void * get_opt_pars_ud; // userdata for calculating optimizer parameters + + // only GGML_OPT_OPTIMIZER_TYPE_ADAMW needs m, v momenta per parameter tensor + enum ggml_opt_optimizer_type optimizer; + }; + + // get parameters for an optimization context with defaults set where possible + // parameters for which no sensible defaults exist are supplied as arguments to this function + GGML_API struct ggml_opt_params ggml_opt_default_params( + ggml_backend_sched_t backend_sched, + enum ggml_opt_loss_type loss_type); + + GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params); + GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx); + + // set gradients to zero, initialize loss, and optionally reset the optimizer + GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer); + + GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated statically + + // get underlying tensors that store data + // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc + GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor + GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor + GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against + GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss + GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs + GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels + + // get the gradient accumulator for a node from the forward graph + GGML_API struct ggml_tensor *
ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node); + + GGML_API enum ggml_opt_optimizer_type ggml_opt_context_optimizer_type(ggml_opt_context_t); //TODO consistent naming scheme + + GGML_API const char * ggml_opt_optimizer_name(enum ggml_opt_optimizer_type); + + // ====== Optimization Result ====== + + GGML_API ggml_opt_result_t ggml_opt_result_init(void); + GGML_API void ggml_opt_result_free(ggml_opt_result_t result); + GGML_API void ggml_opt_result_reset(ggml_opt_result_t result); + + // get data from result, uncertainties are optional and can be ignored by passing NULL + GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints + GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value + GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values + GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value + + // ====== Computation ====== + + // if not using static graphs, this function must be called prior to ggml_opt_alloc + GGML_API void ggml_opt_prepare_alloc( + ggml_opt_context_t opt_ctx, + struct ggml_context * ctx_compute, + struct ggml_cgraph * gf, + struct ggml_tensor * inputs, + struct ggml_tensor * outputs); + + // allocate the next graph for evaluation, either forward or forward + backward + // must be called exactly once prior to calling ggml_opt_eval + GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward); + + // do forward pass, increment result if not NULL, do backward pass if allocated + GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result); + + // ############################################################################ + // ## The high-level functions start here. 
They do not depend on any private ## + // ## functions or structs and can be copied to and adapted for user code. ## + // ############################################################################ + + // ====== Intended Usage ====== + // + // 1. Select the appropriate loss for your problem. + // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them. + // Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster). + // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors. + // The first context should contain the model parameters and inputs and be allocated statically in user code. + // The second context should contain all other tensors and will be (re)allocated automatically. + // Due to this automated allocation the data of the second context is not defined when accessed in user code. + // Note that the second dimension of the inputs/outputs is interpreted as the number of datapoints in those tensors. + // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
+ + // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation + typedef void (*ggml_opt_epoch_callback)( + bool train, // true after training evaluation, false after validation evaluation + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, // result associated with the dataset subsection + int64_t ibatch, // number of batches that have been evaluated so far + int64_t ibatch_max, // total number of batches in this dataset subsection + int64_t t_start_us); // time at which the evaluation on the dataset subsection was started + + // do training on front of dataset, do evaluation only on back of dataset + GGML_API void ggml_opt_epoch( + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result_train, // result to increment during training, ignored if NULL + ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL + int64_t idata_split, // data index at which to split training and evaluation + ggml_opt_epoch_callback callback_train, + ggml_opt_epoch_callback callback_eval); + + // callback that prints a progress bar on stderr + GGML_API void ggml_opt_epoch_callback_progress_bar( + bool train, + ggml_opt_context_t opt_ctx, + ggml_opt_dataset_t dataset, + ggml_opt_result_t result, + int64_t ibatch, + int64_t ibatch_max, + int64_t t_start_us); + + // fit model defined by inputs and outputs to dataset + GGML_API void ggml_opt_fit( + ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs + struct ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs + struct ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch] + struct ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used + ggml_opt_dataset_t dataset, // dataset with data and optionally also labels + enum ggml_opt_loss_type loss_type, // loss to 
minimize + enum ggml_opt_optimizer_type optimizer, // sgd or adamw + ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t) + int64_t nepoch, // how many times the dataset should be iterated over + int64_t nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs + float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f) + bool silent); // whether or not info prints to stderr should be suppressed + + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-rpc.h b/ggml/whispercpp/third_party/ggml/include/ggml-rpc.h new file mode 100644 index 0000000..df1ad2a --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-rpc.h @@ -0,0 +1,30 @@ +#pragma once + +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define RPC_PROTO_MAJOR_VERSION 3 +#define RPC_PROTO_MINOR_VERSION 6 +#define RPC_PROTO_PATCH_VERSION 0 +#define GGML_RPC_MAX_SERVERS 16 + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint, uint32_t device); +GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint, uint32_t device); + +GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, uint32_t device, size_t * free, size_t * total); + +GGML_BACKEND_API void ggml_backend_rpc_start_server(const char * endpoint, const char * cache_dir, + size_t n_threads, size_t n_devices, ggml_backend_dev_t * devices); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_add_server(const char * endpoint); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-sycl.h b/ggml/whispercpp/third_party/ggml/include/ggml-sycl.h new file mode 100644 index
0000000..5ce349a --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-sycl.h @@ -0,0 +1,49 @@ +// +// MIT license +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: MIT +// + +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#define GGML_SYCL_NAME "SYCL" +#define GGML_SYCL_MAX_DEVICES 48 + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device); + +GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend); + +// device buffer +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); + +// split tensor buffer that splits matrices by rows across multiple devices +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); + +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); + +GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void); +GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len); +GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device, + char *description, + size_t description_size); +GGML_BACKEND_API int ggml_backend_sycl_get_device_count(void); +GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); + +// SYCL doesn't support registering host memory, keep here for reference +// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); +// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-vulkan.h b/ggml/whispercpp/third_party/ggml/include/ggml-vulkan.h new file mode 100644 index 0000000..ed5ea5f --- /dev/null +++
b/ggml/whispercpp/third_party/ggml/include/ggml-vulkan.h @@ -0,0 +1,29 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_VK_NAME "Vulkan" +#define GGML_VK_MAX_DEVICES 16 + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); + +GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend); +GGML_BACKEND_API int ggml_backend_vk_get_device_count(void); +GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); +GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); + +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-webgpu.h b/ggml/whispercpp/third_party/ggml/include/ggml-webgpu.h new file mode 100644 index 0000000..65b8ed9 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-webgpu.h @@ -0,0 +1,19 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_WEBGPU_NAME "WebGPU" + +// Needed for examples in ggml +GGML_BACKEND_API ggml_backend_t ggml_backend_webgpu_init(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_webgpu_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-zdnn.h b/ggml/whispercpp/third_party/ggml/include/ggml-zdnn.h new file mode 100644 index 0000000..fbf45b6 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-zdnn.h @@ -0,0 +1,17 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus 
+extern "C" { +#endif + +// device buffer +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_zdnn_buffer_type(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml-zendnn.h b/ggml/whispercpp/third_party/ggml/include/ggml-zendnn.h new file mode 100644 index 0000000..a30a3a9 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml-zendnn.h @@ -0,0 +1,22 @@ +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// backend API +GGML_BACKEND_API ggml_backend_t ggml_backend_zendnn_init(void); + +GGML_BACKEND_API bool ggml_backend_is_zendnn(ggml_backend_t backend); + +// number of threads used for zendnn operations +GGML_BACKEND_API void ggml_backend_zendnn_set_n_threads(ggml_backend_t backend_zendnn, int n_threads); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zendnn_reg(void); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/ggml.h b/ggml/whispercpp/third_party/ggml/include/ggml.h new file mode 100644 index 0000000..b69583d --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/ggml.h @@ -0,0 +1,2724 @@ +#pragma once + +// +// GGML Tensor Library +// +// This documentation is still a work in progress. +// If you wish some specific topics to be covered, feel free to drop a comment: +// +// https://github.com/ggerganov/whisper.cpp/issues/40 +// +// ## Overview +// +// This library implements: +// +// - a set of tensor operations +// - automatic differentiation +// - basic optimization algorithms +// +// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, +// but is not limited to, the following: +// +// - linear regression +// - support vector machines +// - neural networks +// +// The library allows the user to define a certain function using the available tensor operations. 
This function +// definition is represented internally via a computation graph. Each tensor operation in the function definition +// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the +// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized +// using one of the available optimization algorithms. +// +// For example, here we define the function: f(x) = a*x^2 + b +// +// { +// struct ggml_init_params params = { +// .mem_size = 16*1024*1024, +// .mem_buffer = NULL, +// }; +// +// // memory allocation happens here +// struct ggml_context * ctx = ggml_init(params); +// +// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// +// ggml_set_param(x); // x is an input variable +// +// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); +// struct ggml_tensor * x2 = ggml_mul(ctx, x, x); +// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); +// +// ... +// } +// +// Notice that the function definition above does not involve any actual computation. The computation is performed only +// when the user explicitly requests it. For example, to compute the function's value at x = 2.0: +// +// { +// ... +// +// struct ggml_cgraph * gf = ggml_new_graph(ctx); +// ggml_build_forward_expand(gf, f); +// +// // set the input variable and parameter values +// ggml_set_f32(x, 2.0f); +// ggml_set_f32(a, 3.0f); +// ggml_set_f32(b, 4.0f); +// +// ggml_graph_compute_with_ctx(ctx, gf, n_threads); +// +// printf("f = %f\n", ggml_get_f32_1d(f, 0)); +// +// ... +// } +// +// The actual computation is performed in the ggml_graph_compute() function. +// +// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the +// ggml_init() function. You have to be careful not to exceed the memory buffer size. 
Therefore, you have to know +// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory +// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was +// actually needed. +// +// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic +// differentiation and optimization algorithms. +// +// The described approach allows to define the function graph once and then compute its forward or backward graphs +// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way +// the user can avoid the memory allocation overhead at runtime. +// +// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class +// citizens, but in theory the library can be extended to support FP8 and integer data types. +// +// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary +// and binary operations. Most of the available operations fall into one of these two categories. With time, it became +// clear that the library needs to support more complex operations. The way to support these operations is not clear +// yet, but a few examples are demonstrated in the following operations: +// +// - ggml_permute() +// - ggml_conv_1d_1s() +// - ggml_conv_1d_2s() +// +// For each tensor operator, the library implements a forward and backward computation function. The forward function +// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the +// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a +// calculus class, or watch the following video: +// +// What is Automatic Differentiation? 
+// https://www.youtube.com/watch?v=wG_nF1awSSY +// +// +// ## Tensor data (struct ggml_tensor) +// +// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of +// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains +// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: +// +// { +// struct ggml_tensor * c = ggml_add(ctx, a, b); +// +// assert(c->src[0] == a); +// assert(c->src[1] == b); +// } +// +// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the +// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows +// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and +// permutation. All tensor operations have to take the stride into account and not assume that the tensor is +// contiguous in memory. +// +// The data of the tensor is accessed via the "data" pointer. For example: +// +// { +// const int nx = 2; +// const int ny = 3; +// +// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny); +// +// for (int y = 0; y < ny; y++) { +// for (int x = 0; x < nx; x++) { +// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y; +// } +// } +// +// ... +// } +// +// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
+// +// ## The matrix multiplication operator (ggml_mul_mat) +// +// TODO +// +// +// ## Multi-threading +// +// TODO +// +// +// ## Overview of ggml.c +// +// TODO +// +// +// ## SIMD optimizations +// +// TODO +// +// +// ## Debugging ggml +// +// TODO +// +// + +#ifdef GGML_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef GGML_BUILD +# define GGML_API __declspec(dllexport) extern +# else +# define GGML_API __declspec(dllimport) extern +# endif +# else +# define GGML_API __attribute__ ((visibility ("default"))) extern +# endif +#else +# define GGML_API extern +#endif + +// TODO: support for clang +#ifdef __GNUC__ +# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define GGML_DEPRECATED(func, hint) func +#endif + +#ifndef __GNUC__ +# define GGML_ATTRIBUTE_FORMAT(...) +#elif defined(__MINGW32__) && !defined(__clang__) +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif + +#if defined(_WIN32) && !defined(_WIN32_WINNT) +# define _WIN32_WINNT 0x0A00 +#endif + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> + +#define GGML_FILE_MAGIC 0x67676d6c // "ggml" +#define GGML_FILE_VERSION 2 + +#define GGML_QNT_VERSION 2 // bump this on quantization format changes +#define GGML_QNT_VERSION_FACTOR 1000 // do not change this + +#define GGML_MAX_DIMS 4 +#define GGML_MAX_PARAMS 2048 +#define GGML_MAX_SRC 10 +#define GGML_MAX_N_THREADS 512 +#define GGML_MAX_OP_PARAMS 64 + +#ifndef GGML_MAX_NAME +# define GGML_MAX_NAME 64 +#endif + +#define GGML_DEFAULT_N_THREADS 4 +#define GGML_DEFAULT_GRAPH_SIZE 2048 + +#if UINTPTR_MAX == 0xFFFFFFFF + #define GGML_MEM_ALIGN 4 +#elif defined(__EMSCRIPTEN__) +// emscripten uses max_align_t == 8, so we need GGML_MEM_ALIGN == 8 for 64-bit wasm. 
+// (for 32-bit wasm, the first conditional is true and GGML_MEM_ALIGN stays 4.) +// ref: https://github.com/ggml-org/llama.cpp/pull/18628 + #define GGML_MEM_ALIGN 8 +#else + #define GGML_MEM_ALIGN 16 +#endif + +#define GGML_EXIT_SUCCESS 0 +#define GGML_EXIT_ABORTED 1 + +// TODO: convert to enum https://github.com/ggml-org/llama.cpp/pull/16187#discussion_r2388538726 +#define GGML_ROPE_TYPE_NORMAL 0 +#define GGML_ROPE_TYPE_NEOX 2 +#define GGML_ROPE_TYPE_MROPE 8 +#define GGML_ROPE_TYPE_VISION 24 +#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000 + +#define GGML_MROPE_SECTIONS 4 + +#define GGML_UNUSED(x) (void)(x) +#ifdef __CUDACC__ +template<typename... Args> +__host__ __device__ constexpr inline void ggml_unused_vars_impl(Args&&...) noexcept {} +#define GGML_UNUSED_VARS(...) ggml_unused_vars_impl(__VA_ARGS__) +#else +#define GGML_UNUSED_VARS(...) do { (void)sizeof((__VA_ARGS__, 0)); } while(0) +#endif // __CUDACC__ + +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#ifndef NDEBUG +# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) +#elif defined(__GNUC__) +# define GGML_UNREACHABLE() __builtin_unreachable() +#elif defined(_MSC_VER) +# define GGML_UNREACHABLE() __assume(0) +#else +# define GGML_UNREACHABLE() ((void) 0) +#endif + +#ifdef __cplusplus +# define GGML_NORETURN [[noreturn]] +#elif defined(_MSC_VER) +# define GGML_NORETURN __declspec(noreturn) +#else +# define GGML_NORETURN _Noreturn +#endif + +#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) +#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) + +// used to copy the number of elements and stride in bytes of tensors into local variables. +// main purpose is to reduce code duplication and improve readability. 
+// +// example: +// +// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); +// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb); +// +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer) ? (pointer)->array[0] : 0; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer) ? (pointer)->array[1] : 0; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer) ? (pointer)->array[2] : 0; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer) ? (pointer)->array[3] : 0; \ + GGML_UNUSED(prefix##3); + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_TERNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne2, src2, ne) \ + GGML_TENSOR_LOCALS(size_t, nb2, src2, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS01 \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + 
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + +#ifdef __cplusplus +extern "C" { +#endif + + // Function type used in fatal error callbacks + typedef void (*ggml_abort_callback_t)(const char * error_message); + + // Set the abort callback (passing null will restore original abort functionality: printing a message to stdout) + // Returns the old callback for chaining + GGML_API ggml_abort_callback_t ggml_set_abort_callback(ggml_abort_callback_t callback); + + GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) + GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); + + enum ggml_status { + GGML_STATUS_ALLOC_FAILED = -2, + GGML_STATUS_FAILED = -1, + GGML_STATUS_SUCCESS = 0, + GGML_STATUS_ABORTED = 1, + }; + + // get ggml_status name string + GGML_API const char * ggml_status_to_string(enum ggml_status status); + + // ieee 754-2008 half-precision float16 + // todo: make this not an integral type + typedef uint16_t ggml_fp16_t; + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t); + GGML_API ggml_fp16_t ggml_fp32_to_fp16(float); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t); + GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t); + + // google brain half-precision bfloat16 + typedef struct { uint16_t bits; } ggml_bf16_t; + GGML_API ggml_bf16_t ggml_fp32_to_bf16(float); + GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16 + GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t); + GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t); + GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t); + + struct ggml_object; + struct ggml_context; + struct ggml_cgraph; + + // NOTE: always add types at the end of the enum to keep backward compatibility + enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support 
has been removed + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, + GGML_TYPE_IQ3_XXS = 18, + GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, + GGML_TYPE_IQ2_S = 22, + GGML_TYPE_IQ4_XS = 23, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, + GGML_TYPE_BF16 = 30, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + // GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_8 = 37, + // GGML_TYPE_IQ4_NL_8_8 = 38, + GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) + GGML_TYPE_COUNT = 40, + }; + + // precision + enum ggml_prec { + GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default + GGML_PREC_F32 = 10, + }; + + // model file types + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors + GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors + GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors + 
GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors + GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors + GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors + }; + + // available tensor operations: + enum ggml_op { + GGML_OP_NONE = 0, + + GGML_OP_DUP, + GGML_OP_ADD, + GGML_OP_ADD_ID, + GGML_OP_ADD1, + GGML_OP_ACC, + GGML_OP_SUB, + GGML_OP_MUL, + GGML_OP_DIV, + GGML_OP_SQR, + GGML_OP_SQRT, + GGML_OP_LOG, + GGML_OP_SIN, + GGML_OP_COS, + GGML_OP_SUM, + GGML_OP_SUM_ROWS, + GGML_OP_CUMSUM, + GGML_OP_MEAN, + GGML_OP_ARGMAX, + GGML_OP_COUNT_EQUAL, + GGML_OP_REPEAT, + GGML_OP_REPEAT_BACK, + GGML_OP_CONCAT, + GGML_OP_SILU_BACK, + GGML_OP_NORM, // normalize + GGML_OP_RMS_NORM, + GGML_OP_RMS_NORM_BACK, + GGML_OP_GROUP_NORM, + GGML_OP_L2_NORM, + + GGML_OP_MUL_MAT, + GGML_OP_MUL_MAT_ID, + GGML_OP_OUT_PROD, + + GGML_OP_SCALE, + GGML_OP_SET, + GGML_OP_CPY, + GGML_OP_CONT, + GGML_OP_RESHAPE, + GGML_OP_VIEW, + GGML_OP_PERMUTE, + GGML_OP_TRANSPOSE, + GGML_OP_GET_ROWS, + GGML_OP_GET_ROWS_BACK, + GGML_OP_SET_ROWS, + GGML_OP_DIAG, + GGML_OP_DIAG_MASK_INF, + GGML_OP_DIAG_MASK_ZERO, + GGML_OP_SOFT_MAX, + GGML_OP_SOFT_MAX_BACK, + GGML_OP_ROPE, + GGML_OP_ROPE_BACK, + GGML_OP_CLAMP, + GGML_OP_CONV_TRANSPOSE_1D, + GGML_OP_IM2COL, + GGML_OP_IM2COL_BACK, + GGML_OP_IM2COL_3D, + GGML_OP_CONV_2D, + GGML_OP_CONV_3D, + GGML_OP_CONV_2D_DW, + GGML_OP_CONV_TRANSPOSE_2D, + GGML_OP_POOL_1D, + GGML_OP_POOL_2D, + GGML_OP_POOL_2D_BACK, + GGML_OP_UPSCALE, + GGML_OP_PAD, + GGML_OP_PAD_REFLECT_1D, + GGML_OP_ROLL, + GGML_OP_ARANGE, + GGML_OP_TIMESTEP_EMBEDDING, + GGML_OP_ARGSORT, + GGML_OP_TOP_K, + GGML_OP_LEAKY_RELU, + GGML_OP_TRI, + GGML_OP_FILL, + + GGML_OP_FLASH_ATTN_EXT, + 
GGML_OP_FLASH_ATTN_BACK, + GGML_OP_SSM_CONV, + GGML_OP_SSM_SCAN, + GGML_OP_WIN_PART, + GGML_OP_WIN_UNPART, + GGML_OP_GET_REL_POS, + GGML_OP_ADD_REL_POS, + GGML_OP_RWKV_WKV6, + GGML_OP_GATED_LINEAR_ATTN, + GGML_OP_RWKV_WKV7, + GGML_OP_SOLVE_TRI, + + GGML_OP_UNARY, + + GGML_OP_MAP_CUSTOM1, + GGML_OP_MAP_CUSTOM2, + GGML_OP_MAP_CUSTOM3, + + GGML_OP_CUSTOM, + + GGML_OP_CROSS_ENTROPY_LOSS, + GGML_OP_CROSS_ENTROPY_LOSS_BACK, + GGML_OP_OPT_STEP_ADAMW, + GGML_OP_OPT_STEP_SGD, + + GGML_OP_GLU, + + GGML_OP_COUNT, + }; + + enum ggml_unary_op { + GGML_UNARY_OP_ABS, + GGML_UNARY_OP_SGN, + GGML_UNARY_OP_NEG, + GGML_UNARY_OP_STEP, + GGML_UNARY_OP_TANH, + GGML_UNARY_OP_ELU, + GGML_UNARY_OP_RELU, + GGML_UNARY_OP_SIGMOID, + GGML_UNARY_OP_GELU, + GGML_UNARY_OP_GELU_QUICK, + GGML_UNARY_OP_SILU, + GGML_UNARY_OP_HARDSWISH, + GGML_UNARY_OP_HARDSIGMOID, + GGML_UNARY_OP_EXP, + GGML_UNARY_OP_EXPM1, + GGML_UNARY_OP_SOFTPLUS, + GGML_UNARY_OP_GELU_ERF, + GGML_UNARY_OP_XIELU, + GGML_UNARY_OP_FLOOR, + GGML_UNARY_OP_CEIL, + GGML_UNARY_OP_ROUND, + GGML_UNARY_OP_TRUNC, + + GGML_UNARY_OP_COUNT, + }; + + enum ggml_glu_op { + GGML_GLU_OP_REGLU, + GGML_GLU_OP_GEGLU, + GGML_GLU_OP_SWIGLU, + GGML_GLU_OP_SWIGLU_OAI, + GGML_GLU_OP_GEGLU_ERF, + GGML_GLU_OP_GEGLU_QUICK, + + GGML_GLU_OP_COUNT, + }; + + enum ggml_object_type { + GGML_OBJECT_TYPE_TENSOR, + GGML_OBJECT_TYPE_GRAPH, + GGML_OBJECT_TYPE_WORK_BUFFER + }; + + enum ggml_log_level { + GGML_LOG_LEVEL_NONE = 0, + GGML_LOG_LEVEL_DEBUG = 1, + GGML_LOG_LEVEL_INFO = 2, + GGML_LOG_LEVEL_WARN = 3, + GGML_LOG_LEVEL_ERROR = 4, + GGML_LOG_LEVEL_CONT = 5, // continue previous log + }; + + // this tensor... 
+ enum ggml_tensor_flag { + GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph + GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph + GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters + GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up) + }; + + enum ggml_tri_type { + GGML_TRI_TYPE_UPPER_DIAG = 0, + GGML_TRI_TYPE_UPPER = 1, + GGML_TRI_TYPE_LOWER_DIAG = 2, + GGML_TRI_TYPE_LOWER = 3 + }; + + struct ggml_init_params { + // memory pool + size_t mem_size; // bytes + void * mem_buffer; // if NULL, memory will be allocated internally + bool no_alloc; // don't allocate memory for the tensor data + }; + + // n-dimensional tensor + struct ggml_tensor { + enum ggml_type type; + + struct ggml_backend_buffer * buffer; + + int64_t ne[GGML_MAX_DIMS]; // number of elements + size_t nb[GGML_MAX_DIMS]; // stride in bytes: + // nb[0] = ggml_type_size(type) + // nb[1] = nb[0] * (ne[0] / ggml_blck_size(type)) + padding + // nb[i] = nb[i-1] * ne[i-1] + + // compute data + enum ggml_op op; + + // op params - allocated as int32_t for alignment + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; + + int32_t flags; + + struct ggml_tensor * src[GGML_MAX_SRC]; + + // source tensor and offset for views + struct ggml_tensor * view_src; + size_t view_offs; + + void * data; + + char name[GGML_MAX_NAME]; + + void * extra; // extra things e.g. 
for ggml-cuda.cu + + char padding[8]; + }; + + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + + // Abort callback + // If not NULL, called before ggml computation + // If it returns true, the computation is aborted + typedef bool (*ggml_abort_callback)(void * data); + + + // + // GUID + // + + // GUID types + typedef uint8_t ggml_guid[16]; + typedef ggml_guid * ggml_guid_t; + + GGML_API bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b); + + // misc + + GGML_API const char * ggml_version(void); + GGML_API const char * ggml_commit(void); + + GGML_API void ggml_time_init(void); // call this once at the beginning of the program + GGML_API int64_t ggml_time_ms(void); + GGML_API int64_t ggml_time_us(void); + GGML_API int64_t ggml_cycles(void); + GGML_API int64_t ggml_cycles_per_ms(void); + + // accepts a UTF-8 path, even on Windows + GGML_API FILE * ggml_fopen(const char * fname, const char * mode); + + GGML_API void ggml_print_object (const struct ggml_object * obj); + GGML_API void ggml_print_objects(const struct ggml_context * ctx); + + GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); + GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); + GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN + + GGML_API int64_t ggml_blck_size(enum ggml_type type); + GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block + GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row + + GGML_DEPRECATED( + GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float + "use ggml_row_size() instead"); + + GGML_API const char * ggml_type_name(enum ggml_type type); + GGML_API const char * ggml_op_name (enum ggml_op op); + GGML_API const char * ggml_op_symbol(enum 
ggml_op op); + + GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); + GGML_API const char * ggml_glu_op_name(enum ggml_glu_op op); + GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name + + GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); + + GGML_API bool ggml_is_quantized(enum ggml_type type); + + // TODO: temporary until model loading of ggml examples is refactored + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + + GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); + GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); + GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars + + // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation) + GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); + GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() + GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 + GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + + // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok) + GGML_API bool ggml_is_contiguously_allocated(const struct ggml_tensor * tensor); + + // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN + GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); + + // true if the elements in dimension 0 are 
contiguous, or there is just 1 block of elements + GGML_API bool ggml_is_contiguous_rows(const struct ggml_tensor * tensor); + + GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); + GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + + GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + + // use this to compute the memory overhead of a tensor + GGML_API size_t ggml_tensor_overhead(void); + + GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes); + + // main + + GGML_API struct ggml_context * ggml_init (struct ggml_init_params params); + GGML_API void ggml_reset(struct ggml_context * ctx); + GGML_API void ggml_free (struct ggml_context * ctx); + + GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); + + GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx); + GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + + GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx); + GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); + + GGML_API struct ggml_tensor * ggml_new_tensor( + struct ggml_context * ctx, + enum ggml_type type, + int n_dims, + const int64_t *ne); + + GGML_API struct ggml_tensor * ggml_new_tensor_1d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_new_tensor_2d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_new_tensor_3d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_new_tensor_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + GGML_API void * 
ggml_new_buffer(struct ggml_context * ctx, size_t nbytes); + + GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src); + + // Context tensor enumeration and lookup + GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + + // Converts a flat index into coordinates + GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + + GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); + GGML_API enum ggml_glu_op ggml_get_glu_op(const struct ggml_tensor * tensor); + + GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); + GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); + GGML_ATTRIBUTE_FORMAT(2, 3) + GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...); + + // Tensor flags + GGML_API void ggml_set_input(struct ggml_tensor * tensor); + GGML_API void ggml_set_output(struct ggml_tensor * tensor); + GGML_API void ggml_set_param(struct ggml_tensor * tensor); + GGML_API void ggml_set_loss(struct ggml_tensor * tensor); + + // + // operations on tensors with backpropagation + // + + GGML_API struct ggml_tensor * ggml_dup( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_dup_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API 
struct ggml_tensor * ggml_add( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_type type); + + // dst[i0, i1, i2] = a[i0, i1, i2] + b[i0, ids[i1, i2]] + GGML_API struct ggml_tensor * ggml_add_id( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * ids); + + GGML_API struct ggml_tensor * ggml_add1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_add1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // dst = a + // view(dst, nb1, nb2, nb3, offset) += b + // return dst + GGML_API struct ggml_tensor * ggml_acc( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_acc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_sub( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_div( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct 
ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_sqr( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_log_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_expm1( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_expm1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_softplus( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_softplus_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sin( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sin_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_cos( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_cos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // return scalar + GGML_API struct ggml_tensor * ggml_sum( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d] + GGML_API struct ggml_tensor * ggml_sum_rows( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_cumsum( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // mean 
along rows + GGML_API struct ggml_tensor * ggml_mean( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // argmax along rows + GGML_API struct ggml_tensor * ggml_argmax( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // count number of equal elements in a and b + GGML_API struct ggml_tensor * ggml_count_equal( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // if a is the same shape as b, and a is not parameter, return a + // otherwise, return a new tensor: repeat(a) to fit in b + GGML_API struct ggml_tensor * ggml_repeat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // repeat a to the specified shape + GGML_API struct ggml_tensor * ggml_repeat_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // sums repetitions in a into shape of b + GGML_API struct ggml_tensor * ggml_repeat_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride + + // concat a and b along dim + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_concat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int dim); + + GGML_API struct ggml_tensor * ggml_abs( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * 
ggml_step( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_tanh_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_elu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_relu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_leaky_relu( + struct ggml_context * ctx, + struct ggml_tensor * a, float negative_slope, bool inplace); + + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_sigmoid_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // GELU using erf (error function) when possible + // some backends may fallback to approximation based on Abramowitz and Stegun formula + GGML_API struct ggml_tensor * ggml_gelu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_erf_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu( + 
struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_silu_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // hardswish(x) = x * relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardswish( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // hardsigmoid(x) = relu6(x + 3) / 6 + GGML_API struct ggml_tensor * ggml_hardsigmoid( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_exp( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_exp_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_floor( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_floor_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_ceil( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_ceil_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_round( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_round_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + /** + * Truncates the fractional part of each element in the tensor (towards zero). + * For example: trunc(3.7) = 3.0, trunc(-2.9) = -2.0 + * Similar to std::trunc in C/C++. 
+ */ + + GGML_API struct ggml_tensor * ggml_trunc( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_trunc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + + + // xIELU activation function + // x = x * (c_a(alpha_n) + c_b(alpha_p, beta) * sigmoid(beta * x)) + eps * (x > 0) + // where c_a = softplus and c_b(a, b) = softplus(a) + b are constraining functions + // that constrain the positive and negative source alpha values respectively + GGML_API struct ggml_tensor * ggml_xielu( + struct ggml_context * ctx, + struct ggml_tensor * a, + float alpha_n, + float alpha_p, + float beta, + float eps); + + // gated linear unit ops + // A: n columns, r rows, + // result is n / 2 columns, r rows, + // expects gate in second half of row, unless swapped is true + GGML_API struct ggml_tensor * ggml_glu( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_glu_op op, + bool swapped); + + GGML_API struct ggml_tensor * ggml_reglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_reglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_swiglu( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_swiglu_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_erf( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_erf_swapped( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_geglu_quick_swapped( + struct 
ggml_context * ctx, + struct ggml_tensor * a); + + // A: n columns, r rows, + // B: n columns, r rows, + GGML_API struct ggml_tensor * ggml_glu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + enum ggml_glu_op op); + + GGML_API struct ggml_tensor * ggml_reglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_geglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_swiglu_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_geglu_erf_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_geglu_quick_split( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + GGML_API struct ggml_tensor * ggml_swiglu_oai( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float alpha, + float limit); + + // normalize along rows + GGML_API struct ggml_tensor * ggml_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + // group normalize along ne0*ne1*n_groups + // used in stable-diffusion + GGML_API struct ggml_tensor * ggml_group_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps); + + GGML_API struct ggml_tensor * ggml_group_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_groups, + float eps); + + // l2 normalize along rows + // 
used in rwkv v7 + GGML_API struct ggml_tensor * ggml_l2_norm( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + GGML_API struct ggml_tensor * ggml_l2_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float eps); + + // a - x + // b - dy + GGML_API struct ggml_tensor * ggml_rms_norm_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float eps); + + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] + GGML_API struct ggml_tensor * ggml_mul_mat( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // change the precision of a matrix multiplication + // set to GGML_PREC_F32 for higher precision (useful for phi-2) + GGML_API void ggml_mul_mat_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec); + + // indirect matrix multiplication + GGML_API struct ggml_tensor * ggml_mul_mat_id( + struct ggml_context * ctx, + struct ggml_tensor * as, + struct ggml_tensor * b, + struct ggml_tensor * ids); + + // A: m columns, n rows, + // B: p columns, n rows, + // result is m columns, p rows + GGML_API struct ggml_tensor * ggml_out_prod( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // + // operations on tensors without backpropagation + // + + GGML_API struct ggml_tensor * ggml_scale( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_scale_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s); + + // x = s * a + b + GGML_API struct ggml_tensor * ggml_scale_bias( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s, + float b); + + GGML_API struct ggml_tensor * ggml_scale_bias_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float s, + float b); + 
+ // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); // in bytes + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t nb2, + size_t nb3, + size_t offset); // in bytes + + GGML_API struct ggml_tensor * ggml_set_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); // in bytes + + GGML_API struct ggml_tensor * ggml_set_1d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t offset); // in bytes + + // b -> view(a,offset,nb1,nb2,3), return modified a + GGML_API struct ggml_tensor * ggml_set_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); // in bytes + + // b -> view(a,offset,nb1,nb2,3), return view(a) + GGML_API struct ggml_tensor * ggml_set_2d_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + size_t nb1, + size_t offset); // in bytes + + // a -> b, return view(b) + GGML_API struct ggml_tensor * ggml_cpy( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // note: casting from f32 to i32 will discard the fractional part + GGML_API struct ggml_tensor * ggml_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_type type); + + // make contiguous + GGML_API struct ggml_tensor * ggml_cont( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // make contiguous, with new shape + GGML_API struct ggml_tensor * ggml_cont_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_cont_2d( + struct ggml_context * ctx, + struct 
ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + GGML_API struct ggml_tensor * ggml_cont_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_cont_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // return view(a), b specifies the new shape + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0); + + GGML_API struct ggml_tensor * ggml_reshape_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1); + + // return view(a) + // TODO: when we start computing gradient, make a copy instead of view + GGML_API struct ggml_tensor * ggml_reshape_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2); + + GGML_API struct ggml_tensor * ggml_reshape_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3); + + // offset in bytes + GGML_API struct ggml_tensor * ggml_view_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * ggml_view_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t offset); + + GGML_API struct ggml_tensor * 
ggml_view_4d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + size_t nb1, // row stride in bytes + size_t nb2, // slice stride in bytes + size_t nb3, + size_t offset); + + GGML_API struct ggml_tensor * ggml_permute( + struct ggml_context * ctx, + struct ggml_tensor * a, + int axis0, + int axis1, + int axis2, + int axis3); + + // alias for ggml_permute(ctx, a, 1, 0, 2, 3) + GGML_API struct ggml_tensor * ggml_transpose( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // supports 4D a: + // a [n_embd, ne1, ne2, ne3] + // b I32 [n_rows, ne2, ne3, 1] + // + // return [n_embd, n_rows, ne2, ne3] + GGML_API struct ggml_tensor * ggml_get_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, // data + struct ggml_tensor * b); // row indices + + GGML_API struct ggml_tensor * ggml_get_rows_back( + struct ggml_context * ctx, + struct ggml_tensor * a, // gradients of ggml_get_rows result + struct ggml_tensor * b, // row indices + struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape + + // a TD [n_embd, ne1, ne2, ne3] + // b TS [n_embd, n_rows, ne02, ne03] | ne02 == ne2, ne03 == ne3 + // c I64 [n_rows, ne11, ne12, 1] | c[i] in [0, ne1) + // + // undefined behavior if destination rows overlap + // + // broadcast: + // ne2 % ne11 == 0 + // ne3 % ne12 == 0 + // + // return view(a) + GGML_API struct ggml_tensor * ggml_set_rows( + struct ggml_context * ctx, + struct ggml_tensor * a, // destination + struct ggml_tensor * b, // source + struct ggml_tensor * c); // row indices + + GGML_API struct ggml_tensor * ggml_diag( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // set elements above the diagonal to -INF + GGML_API struct ggml_tensor * ggml_diag_mask_inf( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace( + struct ggml_context * ctx, + struct 
ggml_tensor * a, + int n_past); + + // set elements above the diagonal to 0 + GGML_API struct ggml_tensor * ggml_diag_mask_zero( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past); + + GGML_API struct ggml_tensor * ggml_soft_max( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // a [ne0, ne01, ne02, ne03] + // mask [ne0, ne11, ne12, ne13] | ne11 >= ne01, F16 or F32, optional + // + // broadcast: + // ne02 % ne12 == 0 + // ne03 % ne13 == 0 + // + // fused soft_max(a*scale + mask*(ALiBi slope)) + // max_bias = 0.0f for no ALiBi + GGML_API struct ggml_tensor * ggml_soft_max_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * mask, + float scale, + float max_bias); + + GGML_API struct ggml_tensor * ggml_soft_max_ext_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * mask, + float scale, + float max_bias); + + GGML_API void ggml_soft_max_add_sinks( + struct ggml_tensor * a, + struct ggml_tensor * sinks); + + GGML_API struct ggml_tensor * ggml_soft_max_ext_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float scale, + float max_bias); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_max_ext_back_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + float scale, + float max_bias); + + // rotary position embedding + // if (mode & 1) - skip n_past elements (NOT SUPPORTED) + // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style + // + // b is an int32 vector with size a->ne[2], it contains the positions + GGML_API struct ggml_tensor * ggml_rope( + struct ggml_context * ctx, + 
struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode); + + // custom RoPE + // c is freq factors (e.g. phi3-128k), (optional) + GGML_API struct ggml_tensor * ggml_rope_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + GGML_API struct ggml_tensor * ggml_rope_multi( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int sections[GGML_MROPE_SECTIONS], + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_rope_ext_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + GGML_API struct ggml_tensor * ggml_rope_multi_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int sections[GGML_MROPE_SECTIONS], + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float 
attn_factor, + float beta_fast, + float beta_slow), + "use ggml_rope_ext instead"); + + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow), + "use ggml_rope_ext_inplace instead"); + + // compute correction dims for YaRN RoPE scaling + GGML_API void ggml_rope_yarn_corr_dims( + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); + + // rotary position embedding backward, i.e compute dx from dy + // a - dy + GGML_API struct ggml_tensor * ggml_rope_ext_back( + struct ggml_context * ctx, + struct ggml_tensor * a, // gradients of ggml_rope result + struct ggml_tensor * b, // positions + struct ggml_tensor * c, // freq factors + int n_dims, + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + GGML_API struct ggml_tensor * ggml_rope_multi_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + int n_dims, + int sections[4], + int mode, + int n_ctx_orig, + float freq_base, + float freq_scale, + float ext_factor, + float attn_factor, + float beta_fast, + float beta_slow); + + + // clamp + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_clamp( + struct ggml_context * ctx, + struct ggml_tensor * a, + float min, + float max); + + // im2col + // converts data into a format that effectively results in a convolution when combined with matrix multiplication + GGML_API struct ggml_tensor * ggml_im2col( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + 
int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1, // dilation dimension 1 + bool is_2D, + enum ggml_type dst_type); + + GGML_API struct ggml_tensor * ggml_im2col_back( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // gradient of im2col output + int64_t * ne, // shape of im2col input + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1, // dilation dimension 1 + bool is_2D); + + GGML_API struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0); // dilation + + // conv_1d with padding = half + // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) + GGML_API struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s, // stride + int d); // dilation + + // depthwise + // TODO: this is very likely wrong for some cases! 
- needs more testing + GGML_API struct ggml_tensor * ggml_conv_1d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + + GGML_API struct ggml_tensor * ggml_im2col_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t IC, + int s0, // stride width + int s1, // stride height + int s2, // stride depth + int p0, // padding width + int p1, // padding height + int p2, // padding depth + int d0, // dilation width + int d1, // dilation height + int d2, // dilation depth + enum ggml_type dst_type); + + // a: [OC*IC, KD, KH, KW] + // b: [N*IC, ID, IH, IW] + // result: [N*OC, OD, OH, OW] + GGML_API struct ggml_tensor * ggml_conv_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t IC, + int s0, // stride width + int s1, // stride height + int s2, // stride depth + int p0, // padding width + int p1, // padding height + int p2, // padding depth + int d0, // dilation width + int d1, // dilation height + int d2 // dilation depth + ); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 
equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is 1 + // padding is half + // example: + // a: 3 3 256 256 + // b: 64 64 256 1 + // res: 64 64 256 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // depthwise (via im2col and mul_mat) + GGML_API struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + + // Depthwise 2D convolution + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride); + + GGML_API struct ggml_tensor * ggml_conv_2d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel [KW, KH, IC, OC] + struct ggml_tensor * b, // input data [W, H, C, N] + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + + GGML_API struct ggml_tensor * 
ggml_conv_3d_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC] + struct ggml_tensor * b, // input [W, H, D, C * N] + int s0, // stride + int s1, + int s2, + int p0, // padding + int p1, + int p2, + int d0, // dilation + int d1, + int d2, + int n_channels, + int n_batch, + int n_channels_out); + + enum ggml_op_pool { + GGML_OP_POOL_MAX, + GGML_OP_POOL_AVG, + GGML_OP_POOL_COUNT, + }; + + GGML_API struct ggml_tensor * ggml_pool_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, // kernel size + int s0, // stride + int p0); // padding + + // the result will have 2*p0 padding for the first dimension + // and 2*p1 padding for the second dimension + GGML_API struct ggml_tensor * ggml_pool_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + float p0, + float p1); + + GGML_API struct ggml_tensor * ggml_pool_2d_back( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * af, // "a"/input used in forward pass + enum ggml_op_pool op, + int k0, + int k1, + int s0, + int s1, + float p0, + float p1); + + enum ggml_scale_mode { + GGML_SCALE_MODE_NEAREST = 0, + GGML_SCALE_MODE_BILINEAR = 1, + GGML_SCALE_MODE_BICUBIC = 2, + + GGML_SCALE_MODE_COUNT + }; + + enum ggml_scale_flag { + GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8), + GGML_SCALE_FLAG_ANTIALIAS = (1 << 9), + }; + + // interpolate + // multiplies ne0 and ne1 by scale factor + GGML_API struct ggml_tensor * ggml_upscale( + struct ggml_context * ctx, + struct ggml_tensor * a, + int scale_factor, + enum ggml_scale_mode mode); + + // interpolate + // interpolate scale to specified dimensions + GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_upscale_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + int ne0, + int ne1, + int ne2, + int ne3, + enum ggml_scale_mode mode), + "use ggml_interpolate instead"); + + // Up- or downsamples the input 
to the specified size. + // 2D scale modes (eg. bilinear) are applied to the first two dimensions. + GGML_API struct ggml_tensor * ggml_interpolate( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + uint32_t mode); // ggml_scale_mode [ | ggml_scale_flag...] + + // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0] + GGML_API struct ggml_tensor * ggml_pad( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3); + + // pad each dimension with values on the other side of the torus (looping around) + GGML_API struct ggml_tensor * ggml_pad_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1, + int p2, + int p3); + + GGML_API struct ggml_tensor * ggml_pad_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ); + + // pad each dimension with values on the other side of the torus (looping around) + GGML_API struct ggml_tensor * ggml_pad_ext_circular( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3); + + // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c] + GGML_API struct ggml_tensor * ggml_pad_reflect_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int p0, + int p1); + + // Move tensor elements by an offset given for each dimension. Elements that + // are shifted beyond the last position are wrapped around to the beginning. 
+ GGML_API struct ggml_tensor * ggml_roll( + struct ggml_context * ctx, + struct ggml_tensor * a, + int shift0, + int shift1, + int shift2, + int shift3); + + // Convert matrix into a triangular one (upper, strict upper, lower or strict lower) by writing + // zeroes everywhere outside the masked area + GGML_API struct ggml_tensor * ggml_tri( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_tri_type type); + + // Fill tensor a with constant c + GGML_API struct ggml_tensor * ggml_fill( + struct ggml_context * ctx, + struct ggml_tensor * a, + float c); + + GGML_API struct ggml_tensor * ggml_fill_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + float c); + + // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151 + // timesteps: [N,] + // return: [N, dim] + GGML_API struct ggml_tensor * ggml_timestep_embedding( + struct ggml_context * ctx, + struct ggml_tensor * timesteps, + int dim, + int max_period); + + // sort rows + enum ggml_sort_order { + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, + }; + + GGML_API struct ggml_tensor * ggml_argsort( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_sort_order order); + + // similar to ggml_top_k but implemented as `argsort` + `view` + GGML_API struct ggml_tensor * ggml_argsort_top_k( + struct ggml_context * ctx, + struct ggml_tensor * a, + int k); + + // top k elements per row + // note: the resulting top k indices are in no particular order + GGML_API struct ggml_tensor * ggml_top_k( + struct ggml_context * ctx, + struct ggml_tensor * a, + int k); + + GGML_API struct ggml_tensor * ggml_arange( + struct ggml_context * ctx, + float start, + float stop, + float step); + + // q: [n_embd_k, n_batch, n_head, ne3 ] + // k: [n_embd_k, n_kv, n_head_kv, ne3 ] + // v: [n_embd_v, n_kv, n_head_kv, ne3 ] !! not transposed !! + // mask: [n_kv, n_batch, ne32, ne33] + // res: [n_embd_v, n_head, n_batch, ne3 ] !! permuted !! 
+ // + // broadcast: + // n_head % n_head_kv == 0 + // n_head % ne32 == 0 + // ne3 % ne33 == 0 + // + GGML_API struct ggml_tensor * ggml_flash_attn_ext( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * mask, + float scale, + float max_bias, + float logit_softcap); + + GGML_API void ggml_flash_attn_ext_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec); + + GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec( + const struct ggml_tensor * a); + + GGML_API void ggml_flash_attn_ext_add_sinks( + struct ggml_tensor * a, + struct ggml_tensor * sinks); + + // TODO: needs to be adapted to ggml_flash_attn_ext + GGML_API struct ggml_tensor * ggml_flash_attn_back( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * d, + bool masked); + + GGML_API struct ggml_tensor * ggml_ssm_conv( + struct ggml_context * ctx, + struct ggml_tensor * sx, + struct ggml_tensor * c); + + GGML_API struct ggml_tensor * ggml_ssm_scan( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * x, + struct ggml_tensor * dt, + struct ggml_tensor * A, + struct ggml_tensor * B, + struct ggml_tensor * C, + struct ggml_tensor * ids); + + // partition into non-overlapping windows with padding if needed + // example: + // a: 768 64 64 1 + // w: 14 + // res: 768 14 14 25 + // used in sam + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w); + + // reverse of ggml_win_part + // used in sam + GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w); + + GGML_API struct ggml_tensor * ggml_unary( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_unary_op op); + + GGML_API struct ggml_tensor * ggml_unary_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum 
ggml_unary_op op); + + // used in sam + GGML_API struct ggml_tensor * ggml_get_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + int qh, + int kh); + + // used in sam + GGML_API struct ggml_tensor * ggml_add_rel_pos( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * pw, + struct ggml_tensor * ph); + + GGML_API struct ggml_tensor * ggml_rwkv_wkv6( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * r, + struct ggml_tensor * tf, + struct ggml_tensor * td, + struct ggml_tensor * state); + + GGML_API struct ggml_tensor * ggml_gated_linear_attn( + struct ggml_context * ctx, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * q, + struct ggml_tensor * g, + struct ggml_tensor * state, + float scale); + + GGML_API struct ggml_tensor * ggml_rwkv_wkv7( + struct ggml_context * ctx, + struct ggml_tensor * r, + struct ggml_tensor * w, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * state); + + /* Solves a specific equation of the form Ax=B, where A is a triangular matrix + * without zeroes on the diagonal (i.e. invertible). + * B can have any number of columns, but must have the same number of rows as A + * If A is [n, n] and B is [n, m], then the result will be [n, m] as well + * Has O(n^3) complexity (unlike most matrix ops out there), so use on cases + * where n > 100 sparingly, pre-chunk if necessary. 
+ * + * If left = false, solves xA=B instead + * If lower = false, assumes upper triangular instead + * If uni = true, assumes diagonal of A to be all ones (will override actual values) + * + * TODO: currently only lower, right, non-unitriangular variant is implemented + */ + GGML_API struct ggml_tensor * ggml_solve_tri( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + bool left, + bool lower, + bool uni); + + // custom operators + + typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); + typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); + typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); + +#define GGML_N_TASKS_MAX (-1) + // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks + + GGML_API struct ggml_tensor * ggml_map_custom1( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom1_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + ggml_custom1_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom2_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + ggml_custom2_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * 
userdata); + + GGML_API struct ggml_tensor * ggml_map_custom3_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + struct ggml_tensor * c, + ggml_custom3_op_t fun, + int n_tasks, + void * userdata); + + typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata); + + GGML_API struct ggml_tensor * ggml_custom_4d( + struct ggml_context * ctx, + enum ggml_type type, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int64_t ne3, + struct ggml_tensor ** args, + int n_args, + ggml_custom_op_t fun, + int n_tasks, + void * userdata); + + GGML_API struct ggml_tensor * ggml_custom_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor ** args, + int n_args, + ggml_custom_op_t fun, + int n_tasks, + void * userdata); + + // loss function + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss( + struct ggml_context * ctx, + struct ggml_tensor * a, // logits + struct ggml_tensor * b); // labels + + GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( + struct ggml_context * ctx, + struct ggml_tensor * a, // logits + struct ggml_tensor * b, // labels + struct ggml_tensor * c); // gradients of cross_entropy_loss result + + // AdamW optimizer step + // Paper: https://arxiv.org/pdf/1711.05101v3.pdf + // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html + GGML_API struct ggml_tensor * ggml_opt_step_adamw( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * grad, + struct ggml_tensor * m, + struct ggml_tensor * v, + struct ggml_tensor * adamw_params); // parameters such as the learning rate + + // stochastic gradient descent step (with weight decay) + GGML_API struct ggml_tensor * ggml_opt_step_sgd( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * grad, + struct ggml_tensor * sgd_params); // alpha, weight decay + + // + // automatic differentiation + // + + GGML_API void 
ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + GGML_API void ggml_build_backward_expand( + struct ggml_context * ctx, // context for gradient computation + struct ggml_cgraph * cgraph, + struct ggml_tensor ** grad_accs); + + // graph allocation in a context + GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false + GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); + GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads); + GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 + GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); + + GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph); + GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i] + GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph); + GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph); + + GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); + + GGML_API size_t ggml_graph_overhead(void); + GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); + + GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name); + GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); + GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node); + + // print info and performance information for the graph + GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); + + // dump the graph into a 
file using the dot format + GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); + + // TODO these functions were sandwiched in the old optimization interface, is there a better place for them? + typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); + + // Set callback for all future logging events. + // If this is not called, or NULL is supplied, everything is output on stderr. + GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data); + GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); + + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); + + // + // quantization + // + + // - ggml_quantize_init can be called multiple times with the same type + // it will only initialize the quantization tables for the first call or after ggml_quantize_free + // automatically called by ggml_quantize_chunk for convenience + // + // - ggml_quantize_free will free any memory allocated by ggml_quantize_init + // call this at the end of the program to avoid memory leaks + // + // note: these are thread-safe + // + GGML_API void ggml_quantize_init(enum ggml_type type); + GGML_API void ggml_quantize_free(void); + + // some quantization types cannot be used without an importance matrix + GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type); + + // calls ggml_quantize_init internally (i.e.
can allocate memory) + GGML_API size_t ggml_quantize_chunk( + enum ggml_type type, + const float * src, + void * dst, + int64_t start, + int64_t nrows, + int64_t n_per_row, + const float * imatrix); + +#ifdef __cplusplus + // restrict not standard in C++ +# if defined(__GNUC__) +# define GGML_RESTRICT __restrict__ +# elif defined(__clang__) +# define GGML_RESTRICT __restrict +# elif defined(_MSC_VER) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT +# endif +#else +# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT restrict +# endif +#endif + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + + struct ggml_type_traits { + const char * type_name; + int64_t blck_size; + int64_t blck_size_interleave; // interleave elements in blocks + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float_ref; + }; + + GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); + + // ggml threadpool + // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend + // the goal should be to create an API that other backends can use and move everything to the ggml base + + // scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_LOW = -1, + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + + // threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // polling level (0
- no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state + }; + + struct ggml_threadpool; // forward declaration, see ggml.c + + typedef struct ggml_threadpool * ggml_threadpool_t; + + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/ggml/include/gguf.h b/ggml/whispercpp/third_party/ggml/include/gguf.h new file mode 100644 index 0000000..79ee202 --- /dev/null +++ b/ggml/whispercpp/third_party/ggml/include/gguf.h @@ -0,0 +1,202 @@ +// This file contains functionality related to "GGUF" files, the binary file format used by ggml. +// GGUF files have the following structure: +// +// 1. File magic "GGUF" (4 bytes). +// 2. File version (uint32_t). +// 3. Number of ggml tensors in file (int64_t). +// 4. Number of key-value-pairs in file (int64_t). +// 5. For each KV pair: +// 1. The key (string). +// 2. The value type (gguf_type). +// 3a. If the value type is GGUF_TYPE_ARRAY: +// 1. The type of the array (gguf_type). +// 2. The number of elements in the array (uint64_t). +// 3. The binary representation of each element in the array. +// 3b. Otherwise: +// 1. The binary representation of the value. +// 6. For each ggml tensor: +// 1. The tensor name (string). +// 2. The number of dimensions of the tensor (uint32_t). +// 3. For each dimension: +// 1. The size of the tensor in the dimension (int64_t). +// 4. The tensor data type (ggml_type). +// 5. The tensor data offset in the tensor data binary blob (uint64_t). +// 7. The tensor data binary blob (optional, aligned). 
+// +// Strings are serialized as the string length (uint64_t) followed by the C string without the null terminator. +// All enums are stored as int32_t. +// All bool values are stored as int8_t. +// If the special key "general.alignment" (uint32_t) is defined it is used for alignment, +// otherwise GGUF_DEFAULT_ALIGNMENT is used. +// +// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de) + +#pragma once + +#include "ggml.h" + +#include <stdbool.h> +#include <stdint.h> + +#define GGUF_MAGIC "GGUF" +#define GGUF_VERSION 3 + +#define GGUF_KEY_GENERAL_ALIGNMENT "general.alignment" + +#define GGUF_DEFAULT_ALIGNMENT 32 + +#ifdef __cplusplus +extern "C" { +#endif + + // types that can be stored as GGUF KV data + enum gguf_type { + GGUF_TYPE_UINT8 = 0, + GGUF_TYPE_INT8 = 1, + GGUF_TYPE_UINT16 = 2, + GGUF_TYPE_INT16 = 3, + GGUF_TYPE_UINT32 = 4, + GGUF_TYPE_INT32 = 5, + GGUF_TYPE_FLOAT32 = 6, + GGUF_TYPE_BOOL = 7, + GGUF_TYPE_STRING = 8, + GGUF_TYPE_ARRAY = 9, + GGUF_TYPE_UINT64 = 10, + GGUF_TYPE_INT64 = 11, + GGUF_TYPE_FLOAT64 = 12, + GGUF_TYPE_COUNT, // marks the end of the enum + }; + + struct gguf_context; + + struct gguf_init_params { + bool no_alloc; + + // if not NULL, create a ggml_context and allocate the tensor data in it + struct ggml_context ** ctx; + }; + + GGML_API struct gguf_context * gguf_init_empty(void); + GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); + //GGML_API struct gguf_context * gguf_init_from_buffer(..); + + GGML_API void gguf_free(struct gguf_context * ctx); + + GGML_API const char * gguf_type_name(enum gguf_type type); + + GGML_API uint32_t gguf_get_version (const struct gguf_context * ctx); + GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx); + GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx); + + GGML_API int64_t gguf_get_n_kv(const struct gguf_context * ctx); + GGML_API int64_t gguf_find_key(const struct gguf_context * ctx, const char * key); //
returns -1 if key is not found + GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int64_t key_id); + + GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int64_t key_id); + GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int64_t key_id); + + // will abort if the wrong type is used for the key + GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int64_t key_id); + GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int64_t key_id); + GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int64_t key_id); + GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int64_t key_id); + GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int64_t key_id); + GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int64_t key_id); + GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int64_t key_id); + GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int64_t key_id); + GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int64_t key_id); + GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int64_t key_id); + GGML_API size_t gguf_get_arr_n (const struct gguf_context * ctx, int64_t key_id); + + // get raw pointer to the first element of the array with the given key_id + // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference) + GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int64_t key_id); + + // get ith C string from array with given key_id + GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, 
int64_t key_id, size_t i); + + GGML_API int64_t gguf_get_n_tensors (const struct gguf_context * ctx); + GGML_API int64_t gguf_find_tensor (const struct gguf_context * ctx, const char * name); // returns -1 if the tensor is not found + GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int64_t tensor_id); + GGML_API const char * gguf_get_tensor_name (const struct gguf_context * ctx, int64_t tensor_id); + GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int64_t tensor_id); + GGML_API size_t gguf_get_tensor_size (const struct gguf_context * ctx, int64_t tensor_id); + + // removes key if it exists, returns id that the key had prior to removal (-1 if it didn't exist) + GGML_API int64_t gguf_remove_key(struct gguf_context * ctx, const char * key); + + // overrides an existing KV pair or adds a new one, the new KV pair is always at the back + GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); + GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); + GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val); + GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val); + GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val); + GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val); + GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val); + GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val); + GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val); + GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val); + GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val); + GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const 
char * val); + + // creates a new array with n elements of the given type and copies the corresponding number of bytes from data + GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, size_t n); + + // creates a new array with n strings and copies the corresponding strings from data + GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, size_t n); + + // set or add KV pairs from another context + GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src); + + // add tensor to GGUF context, tensor name must be unique + GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor); + + // after changing a tensor's type, the offsets of all tensors with higher indices are immediately recalculated + // in such a way that the tensor data remains as one contiguous block (except for padding) + GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type); + + // assumes that at least gguf_get_tensor_size bytes can be read from data + GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data); + + // writing gguf files can be done in 3 ways: + // + // - write the entire gguf_context to a binary file in a single pass: + // + // gguf_write_to_file(ctx, fname, /*only_meta =*/ false); + // + // - write only the meta data to a file, then re-open the file and append the tensor data: + // + // gguf_write_to_file(ctx, fname, /*only_meta =*/ true); + // FILE * f = fopen(fname, "ab"); + // fwrite(f, ...); // write tensor data + // fclose(f); + // + // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data: + // + // FILE * f = fopen(fname, "wb"); + // const size_t size_meta = gguf_get_meta_size(ctx); + // fseek(f, size_meta, SEEK_SET); + // fwrite(f, ...); // write tensor data + // void * 
data = malloc(size_meta); + // gguf_get_meta_data(ctx, data); + // rewind(f); + // fwrite(data, 1, size_meta, f); + // free(data); + // fclose(f); + // + + // write the entire context to a binary file + GGML_API bool gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta); + + // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding + GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx); + + // writes the meta data to pointer "data" + GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data); + +#ifdef __cplusplus +} +#endif diff --git a/ggml/whispercpp/third_party/include/whisper.h b/ggml/whispercpp/third_party/include/whisper.h new file mode 100644 index 0000000..f4cc6bf --- /dev/null +++ b/ggml/whispercpp/third_party/include/whisper.h @@ -0,0 +1,741 @@ +#ifndef WHISPER_H +#define WHISPER_H + +#include "ggml.h" +#include "ggml-cpu.h" + +#include <stddef.h> +#include <stdint.h> +#include <stdbool.h> + +#ifdef __GNUC__ +# define WHISPER_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define WHISPER_DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define WHISPER_DEPRECATED(func, hint) func +#endif + +#ifdef WHISPER_SHARED +# ifdef _WIN32 +# ifdef WHISPER_BUILD +# define WHISPER_API __declspec(dllexport) +# else +# define WHISPER_API __declspec(dllimport) +# endif +# else +# define WHISPER_API __attribute__ ((visibility ("default"))) +# endif +#else +# define WHISPER_API +#endif + +#define WHISPER_SAMPLE_RATE 16000 +#define WHISPER_N_FFT 400 +#define WHISPER_HOP_LENGTH 160 +#define WHISPER_CHUNK_SIZE 30 + +#ifdef __cplusplus +extern "C" { +#endif + + // + // C interface + // + // The following interface is thread-safe as long as the same whisper_context is not used by multiple threads + // concurrently. + // + // Basic usage: + // + // #include "whisper.h" + // + // ...
+ // + // whisper_context_params cparams = whisper_context_default_params(); + // + // struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams); + // + // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { + // fprintf(stderr, "failed to process audio\n"); + // return 7; + // } + // + // const int n_segments = whisper_full_n_segments(ctx); + // for (int i = 0; i < n_segments; ++i) { + // const char * text = whisper_full_get_segment_text(ctx, i); + // printf("%s", text); + // } + // + // whisper_free(ctx); + // + // ... + // + // This is a demonstration of the most straightforward usage of the library. + // "pcmf32" contains the RAW audio data in 32-bit floating point format. + // + // The interface also allows for more fine-grained control over the computation, but it requires a deeper + // understanding of how the model works. + // + + struct whisper_context; + struct whisper_state; + struct whisper_full_params; + + typedef int32_t whisper_pos; + typedef int32_t whisper_token; + typedef int32_t whisper_seq_id; + + enum whisper_alignment_heads_preset { + WHISPER_AHEADS_NONE, + WHISPER_AHEADS_N_TOP_MOST, // All heads from the N-top-most text-layers + WHISPER_AHEADS_CUSTOM, + WHISPER_AHEADS_TINY_EN, + WHISPER_AHEADS_TINY, + WHISPER_AHEADS_BASE_EN, + WHISPER_AHEADS_BASE, + WHISPER_AHEADS_SMALL_EN, + WHISPER_AHEADS_SMALL, + WHISPER_AHEADS_MEDIUM_EN, + WHISPER_AHEADS_MEDIUM, + WHISPER_AHEADS_LARGE_V1, + WHISPER_AHEADS_LARGE_V2, + WHISPER_AHEADS_LARGE_V3, + WHISPER_AHEADS_LARGE_V3_TURBO, + }; + + typedef struct whisper_ahead { + int n_text_layer; + int n_head; + } whisper_ahead; + + typedef struct whisper_aheads { + size_t n_heads; + const whisper_ahead * heads; + } whisper_aheads; + + struct whisper_context_params { + bool use_gpu; + bool flash_attn; + int gpu_device; // CUDA device + + // [EXPERIMENTAL] Token-level timestamps with DTW + bool dtw_token_timestamps; + enum whisper_alignment_heads_preset 
dtw_aheads_preset; + + int dtw_n_top; + struct whisper_aheads dtw_aheads; + + size_t dtw_mem_size; // TODO: remove + }; + + typedef struct whisper_token_data { + whisper_token id; // token id + whisper_token tid; // forced timestamp token id + + float p; // probability of the token + float plog; // log probability of the token + float pt; // probability of the timestamp token + float ptsum; // sum of probabilities of all timestamp tokens + + // token-level timestamp data + // do not use if you haven't computed token-level timestamps + int64_t t0; // start time of the token + int64_t t1; // end time of the token + + // [EXPERIMENTAL] Token-level timestamps with DTW + // do not use if you haven't computed token-level timestamps with dtw + // Roughly corresponds to the moment in audio in which the token was output + int64_t t_dtw; + + float vlen; // voice length of the token + } whisper_token_data; + + typedef struct whisper_model_loader { + void * context; + + size_t (*read)(void * ctx, void * output, size_t read_size); + bool (*eof)(void * ctx); + void (*close)(void * ctx); + } whisper_model_loader; + + // grammar element type + enum whisper_gretype { + // end of rule definition + WHISPER_GRETYPE_END = 0, + + // start of alternate definition for rule + WHISPER_GRETYPE_ALT = 1, + + // non-terminal element: reference to rule + WHISPER_GRETYPE_RULE_REF = 2, + + // terminal element: character (code point) + WHISPER_GRETYPE_CHAR = 3, + + // inverse char(s) ([^a], [^a-b] [^abc]) + WHISPER_GRETYPE_CHAR_NOT = 4, + + // modifies a preceding WHISPER_GRETYPE_CHAR or WHISPER_GRETYPE_CHAR_ALT to + // be an inclusive range ([a-z]) + WHISPER_GRETYPE_CHAR_RNG_UPPER = 5, + + // modifies a preceding WHISPER_GRETYPE_CHAR or + // WHISPER_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) + WHISPER_GRETYPE_CHAR_ALT = 6, + }; + + typedef struct whisper_grammar_element { + enum whisper_gretype type; + uint32_t value; // Unicode code point or rule ID + }
whisper_grammar_element; + + typedef struct whisper_vad_params { + float threshold; // Probability threshold to consider as speech. + int min_speech_duration_ms; // Min duration for a valid speech segment. + int min_silence_duration_ms; // Min silence duration to consider speech as ended. + float max_speech_duration_s; // Max duration of a speech segment before forcing a new segment. + int speech_pad_ms; // Padding added before and after speech segments. + float samples_overlap; // Overlap in seconds when copying audio samples from speech segment. + } whisper_vad_params; + + WHISPER_API const char * whisper_version(void); + + // Various functions for loading a ggml whisper model. + // Allocate (almost) all memory needed for the model. + // Return NULL on failure + WHISPER_API struct whisper_context * whisper_init_from_file_with_params (const char * path_model, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params(void * buffer, size_t buffer_size, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_with_params (struct whisper_model_loader * loader, struct whisper_context_params params); + + // These are the same as the above, but the internal state of the context is not allocated automatically + // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523) + WHISPER_API struct whisper_context * whisper_init_from_file_with_params_no_state (const char * path_model, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_from_buffer_with_params_no_state(void * buffer, size_t buffer_size, struct whisper_context_params params); + WHISPER_API struct whisper_context * whisper_init_with_params_no_state (struct whisper_model_loader * loader, struct whisper_context_params params); + + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_file(const char * path_model), + "use 
whisper_init_from_file_with_params instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size), + "use whisper_init_from_buffer_with_params instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader), + "use whisper_init_with_params instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model), + "use whisper_init_from_file_with_params_no_state instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size), + "use whisper_init_from_buffer_with_params_no_state instead" + ); + WHISPER_DEPRECATED( + WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader), + "use whisper_init_with_params_no_state instead" + ); + + WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx); + + // Given a context, enable use of OpenVINO for encode inference. + // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr, + // the path will be generated from the ggml model path that was passed + // in to whisper_init_from_file. For example, if 'path_model' was + // "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be + // assumed to be "/path/to/ggml-base.en-encoder-openvino.xml". + // device: OpenVINO device to run inference on ("CPU", "GPU", etc.) + // cache_dir: Optional cache directory that can speed up init time, especially for + // GPU, by caching compiled 'blobs' there. + // Set to nullptr if not used. + // Returns 0 on success. If OpenVINO is not enabled in build, this simply returns 1. 
+ WHISPER_API int whisper_ctx_init_openvino_encoder_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const char * model_path, + const char * device, + const char * cache_dir); + + WHISPER_API int whisper_ctx_init_openvino_encoder( + struct whisper_context * ctx, + const char * model_path, + const char * device, + const char * cache_dir); + + // Frees all allocated memory + WHISPER_API void whisper_free (struct whisper_context * ctx); + WHISPER_API void whisper_free_state(struct whisper_state * state); + WHISPER_API void whisper_free_params(struct whisper_full_params * params); + WHISPER_API void whisper_free_context_params(struct whisper_context_params * params); + + // Convert RAW PCM audio to log mel spectrogram. + // The resulting spectrogram is stored inside the default state of the provided whisper context. + // Returns 0 on success + WHISPER_API int whisper_pcm_to_mel( + struct whisper_context * ctx, + const float * samples, + int n_samples, + int n_threads); + + WHISPER_API int whisper_pcm_to_mel_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const float * samples, + int n_samples, + int n_threads); + + // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context. + // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram. + // n_mel must be 80 + // Returns 0 on success + WHISPER_API int whisper_set_mel( + struct whisper_context * ctx, + const float * data, + int n_len, + int n_mel); + + WHISPER_API int whisper_set_mel_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const float * data, + int n_len, + int n_mel); + + // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context. + // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first. 
+ // offset can be used to specify the offset of the first frame in the spectrogram. + // Returns 0 on success + WHISPER_API int whisper_encode( + struct whisper_context * ctx, + int offset, + int n_threads); + + WHISPER_API int whisper_encode_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + int offset, + int n_threads); + + // Run the Whisper decoder to obtain the logits and probabilities for the next token. + // Make sure to call whisper_encode() first. + // tokens + n_tokens is the provided context for the decoder. + // n_past is the number of tokens to use from previous decoder calls. + // Returns 0 on success + // TODO: add support for multiple decoders + WHISPER_API int whisper_decode( + struct whisper_context * ctx, + const whisper_token * tokens, + int n_tokens, + int n_past, + int n_threads); + + WHISPER_API int whisper_decode_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + const whisper_token * tokens, + int n_tokens, + int n_past, + int n_threads); + + // Convert the provided text into tokens. + // The tokens pointer must be large enough to hold the resulting tokens. + // Returns the number of tokens on success, no more than n_max_tokens + // Returns a negative number on failure - the number of tokens that would have been returned + // TODO: not sure if correct + WHISPER_API int whisper_tokenize( + struct whisper_context * ctx, + const char * text, + whisper_token * tokens, + int n_max_tokens); + + // Return the number of tokens in the provided text + // Equivalent to: -whisper_tokenize(ctx, text, NULL, 0) + int whisper_token_count(struct whisper_context * ctx, const char * text); + + // Largest language id (i.e. 
number of available languages - 1) + WHISPER_API int whisper_lang_max_id(void); + + // Return the id of the specified language, returns -1 if not found + // Examples: + // "de" -> 2 + // "german" -> 2 + WHISPER_API int whisper_lang_id(const char * lang); + + // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found + WHISPER_API const char * whisper_lang_str(int id); + + // Return the full name of the specified language id (e.g. 2 -> "german"), returns nullptr if not found + WHISPER_API const char * whisper_lang_str_full(int id); + + // Use mel data at offset_ms to try and auto-detect the spoken language + // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first + // Returns the top language id or negative on failure + // If not null, fills the lang_probs array with the probabilities of all languages + // The array must be whisper_lang_max_id() + 1 in size + // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69 + WHISPER_API int whisper_lang_auto_detect( + struct whisper_context * ctx, + int offset_ms, + int n_threads, + float * lang_probs); + + WHISPER_API int whisper_lang_auto_detect_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + int offset_ms, + int n_threads, + float * lang_probs); + + WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length + WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length + WHISPER_API int whisper_n_vocab (struct whisper_context * ctx); + WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx); + + WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx); +
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx); + WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx); + WHISPER_API int whisper_model_ftype (struct whisper_context * ctx); + WHISPER_API int whisper_model_type (struct whisper_context * ctx); + + // Token logits obtained from the last call to whisper_decode() + // The logits for the last token are stored in the last row + // Rows: n_tokens + // Cols: n_vocab + WHISPER_API float * whisper_get_logits (struct whisper_context * ctx); + WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state); + + // Token Id -> String. 
Uses the vocabulary in the provided context + WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token); + WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx); + + + // Special tokens + WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_solm(struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_nosp(struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_lang(struct whisper_context * ctx, int lang_id); + + // Task tokens + WHISPER_API whisper_token whisper_token_translate (struct whisper_context * ctx); + WHISPER_API whisper_token whisper_token_transcribe(struct whisper_context * ctx); + + // Performance information from the default state. 
+ struct whisper_timings { + float sample_ms; + float encode_ms; + float decode_ms; + float batchd_ms; + float prompt_ms; + }; + WHISPER_API struct whisper_timings * whisper_get_timings(struct whisper_context * ctx); + WHISPER_API void whisper_print_timings(struct whisper_context * ctx); + WHISPER_API void whisper_reset_timings(struct whisper_context * ctx); + + // Print system information + WHISPER_API const char * whisper_print_system_info(void); + + //////////////////////////////////////////////////////////////////////////// + + // Available sampling strategies + enum whisper_sampling_strategy { + WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder + WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder + }; + + // Text segment callback + // Called on every newly generated text segment + // Use the whisper_full_...() functions to obtain the text segments + typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data); + + // Progress callback + typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data); + + // Encoder begin callback + // If not NULL, called before the encoder starts + // If it returns false, the computation is aborted + typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data); + + // Logits filter callback + // Can be used to modify the logits before sampling + // If not NULL, called after applying temperature to logits + typedef void (*whisper_logits_filter_callback)( + struct whisper_context * ctx, + struct whisper_state * state, + const whisper_token_data * tokens, + int n_tokens, + float * logits, + void * user_data); + + // Parameters for the whisper_full() function + // If you change the order or add new parameters, make sure to update the default values in whisper.cpp: + // 
whisper_full_default_params() + struct whisper_full_params { + enum whisper_sampling_strategy strategy; + + int n_threads; + int n_max_text_ctx; // max tokens to use from past text as prompt for the decoder + int offset_ms; // start offset in ms + int duration_ms; // audio duration to process in ms + + bool translate; + bool no_context; // do not use past transcription (if any) as initial prompt for the decoder + bool no_timestamps; // do not generate timestamps + bool single_segment; // force single segment output (useful for streaming) + bool print_special; // print special tokens (e.g. <SOT>, <EOT>, <BEG>, etc.) + bool print_progress; // print progress information + bool print_realtime; // print results from within whisper.cpp (avoid it, use callback instead) + bool print_timestamps; // print timestamps for each text segment when printing realtime + + // [EXPERIMENTAL] token-level timestamps + bool token_timestamps; // enable token-level timestamps + float thold_pt; // timestamp token probability threshold (~0.01) + float thold_ptsum; // timestamp token sum probability threshold (~0.01) + int max_len; // max segment length in characters + bool split_on_word; // split on word rather than on token (when used with max_len) + int max_tokens; // max tokens per segment (0 = no limit) + + // [EXPERIMENTAL] speed-up techniques + // note: these can significantly reduce the quality of the output + bool debug_mode; // enable debug_mode provides extra info (eg.
Dump log_mel) + int audio_ctx; // overwrite the audio context size (0 = use default) + + // [EXPERIMENTAL] [TDRZ] tinydiarize + bool tdrz_enable; // enable tinydiarize speaker turn detection + + // A regular expression that matches tokens to suppress + const char * suppress_regex; + + // tokens to provide to the whisper decoder as initial prompt + // these are prepended to any existing text context from a previous call + // use whisper_tokenize() to convert text to tokens + // maximum of whisper_n_text_ctx()/2 tokens are used (typically 224) + const char * initial_prompt; + bool carry_initial_prompt; // if true, always prepend initial_prompt to every decode window (may reduce conditioning on previous text) + const whisper_token * prompt_tokens; + int prompt_n_tokens; + + // for auto-detection, set to nullptr, "" or "auto" + const char * language; + bool detect_language; + + // common decoding parameters: + bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89 + bool suppress_nst; // non-speech tokens, ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253 + + float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478 + float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97 + float length_penalty; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L267 + + // fallback parameters + // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L274-L278 + float temperature_inc; + float entropy_thold; // similar to OpenAI's "compression_ratio_threshold" + float logprob_thold; + float no_speech_thold; + + struct { + int best_of; // ref: 
https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L264 + } greedy; + + struct { + int beam_size; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/transcribe.py#L265 + + float patience; // TODO: not implemented, ref: https://arxiv.org/pdf/2204.05424.pdf + } beam_search; + + // called for every newly generated text segment + whisper_new_segment_callback new_segment_callback; + void * new_segment_callback_user_data; + + // called on each progress update + whisper_progress_callback progress_callback; + void * progress_callback_user_data; + + // called each time before the encoder starts + whisper_encoder_begin_callback encoder_begin_callback; + void * encoder_begin_callback_user_data; + + // called each time before ggml computation starts + ggml_abort_callback abort_callback; + void * abort_callback_user_data; + + // called by each decoder to filter obtained logits + whisper_logits_filter_callback logits_filter_callback; + void * logits_filter_callback_user_data; + + const whisper_grammar_element ** grammar_rules; + size_t n_grammar_rules; + size_t i_start_rule; + float grammar_penalty; + + // Voice Activity Detection (VAD) params + bool vad; // Enable VAD + const char * vad_model_path; // Path to VAD model + + whisper_vad_params vad_params; + }; + + // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params() + WHISPER_API struct whisper_context_params * whisper_context_default_params_by_ref(void); + WHISPER_API struct whisper_context_params whisper_context_default_params (void); + + WHISPER_API struct whisper_full_params * whisper_full_default_params_by_ref(enum whisper_sampling_strategy strategy); + WHISPER_API struct whisper_full_params whisper_full_default_params (enum whisper_sampling_strategy strategy); + + // Run the entire model: PCM -> log mel spectrogram -> 
encoder -> decoder -> text + // Not thread safe for same context + // Uses the specified decoding strategy to obtain the text. + WHISPER_API int whisper_full( + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples); + + WHISPER_API int whisper_full_with_state( + struct whisper_context * ctx, + struct whisper_state * state, + struct whisper_full_params params, + const float * samples, + int n_samples); + + // Split the input audio in chunks and process each chunk separately using whisper_full_with_state() + // Result is stored in the default state of the context + // Not thread safe if executed in parallel on the same context. + // It seems this approach can offer some speedup in some cases. + // However, the transcription accuracy can be worse at the beginning and end of each chunk. + WHISPER_API int whisper_full_parallel( + struct whisper_context * ctx, + struct whisper_full_params params, + const float * samples, + int n_samples, + int n_processors); + + // Number of generated text segments + // A segment can be a few words, a sentence, or even a paragraph. 
+ WHISPER_API int whisper_full_n_segments (struct whisper_context * ctx); + WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state); + + // Language id associated with the context's default state + WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx); + + // Language id associated with the provided state + WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state); + + // Get the start and end time of the specified segment + WHISPER_API int64_t whisper_full_get_segment_t0 (struct whisper_context * ctx, int i_segment); + WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment); + + WHISPER_API int64_t whisper_full_get_segment_t1 (struct whisper_context * ctx, int i_segment); + WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment); + + // Get whether the next segment is predicted as a speaker turn + WHISPER_API bool whisper_full_get_segment_speaker_turn_next(struct whisper_context * ctx, int i_segment); + WHISPER_API bool whisper_full_get_segment_speaker_turn_next_from_state(struct whisper_state * state, int i_segment); + + // Get the text of the specified segment + WHISPER_API const char * whisper_full_get_segment_text (struct whisper_context * ctx, int i_segment); + WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment); + + // Get number of tokens in the specified segment + WHISPER_API int whisper_full_n_tokens (struct whisper_context * ctx, int i_segment); + WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment); + + // Get the token text of the specified token in the specified segment + WHISPER_API const char * whisper_full_get_token_text (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * 
state, int i_segment, int i_token); + + WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token); + + // Get token data for the specified token in the specified segment + // This contains probabilities, timestamps, etc. + WHISPER_API whisper_token_data whisper_full_get_token_data (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token); + + // Get the probability of the specified token in the specified segment + WHISPER_API float whisper_full_get_token_p (struct whisper_context * ctx, int i_segment, int i_token); + WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token); + + // + // Voice Activity Detection (VAD) + // + + struct whisper_vad_context; + + WHISPER_API struct whisper_vad_params whisper_vad_default_params(void); + + struct whisper_vad_context_params { + int n_threads; // The number of threads to use for processing. 
+ bool use_gpu; + int gpu_device; // CUDA device + }; + + WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void); + + WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params(const char * path_model, struct whisper_vad_context_params params); + WHISPER_API struct whisper_vad_context * whisper_vad_init_with_params (struct whisper_model_loader * loader, struct whisper_vad_context_params params); + + WHISPER_API bool whisper_vad_detect_speech( + struct whisper_vad_context * vctx, + const float * samples, + int n_samples); + + WHISPER_API int whisper_vad_n_probs(struct whisper_vad_context * vctx); + WHISPER_API float * whisper_vad_probs (struct whisper_vad_context * vctx); + + struct whisper_vad_segments; + + WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_probs( + struct whisper_vad_context * vctx, + struct whisper_vad_params params); + + WHISPER_API struct whisper_vad_segments * whisper_vad_segments_from_samples( + struct whisper_vad_context * vctx, + struct whisper_vad_params params, + const float * samples, + int n_samples); + + WHISPER_API int whisper_vad_segments_n_segments(struct whisper_vad_segments * segments); + + WHISPER_API float whisper_vad_segments_get_segment_t0(struct whisper_vad_segments * segments, int i_segment); + WHISPER_API float whisper_vad_segments_get_segment_t1(struct whisper_vad_segments * segments, int i_segment); + + WHISPER_API void whisper_vad_free_segments(struct whisper_vad_segments * segments); + WHISPER_API void whisper_vad_free (struct whisper_vad_context * ctx); + + //////////////////////////////////////////////////////////////////////////// + + // Temporary helpers needed for exposing ggml interface + + WHISPER_API int whisper_bench_memcpy (int n_threads); + WHISPER_API const char * whisper_bench_memcpy_str (int n_threads); + WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads); + WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads); 
+ + // Control logging output; default behavior is to print to stderr + + WHISPER_API void whisper_log_set(ggml_log_callback log_callback, void * user_data); + + // Get the no_speech probability for the specified segment + WHISPER_API float whisper_full_get_segment_no_speech_prob (struct whisper_context * ctx, int i_segment); + WHISPER_API float whisper_full_get_segment_no_speech_prob_from_state(struct whisper_state * state, int i_segment); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a new file mode 100644 index 0000000..c58565f Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a new file mode 100644 index 0000000..3a51b19 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a new file mode 100644 index 0000000..a1a690e Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a new file mode 100644 index 0000000..b78e00f Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a new file mode 100644 index 0000000..f2434b6 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a 
b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a new file mode 100644 index 0000000..ed939f4 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a new file mode 100644 index 0000000..4ff1a6b Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libcommon.a new file mode 100644 index 0000000..94ed338 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libcommon.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-base.a new file mode 100644 index 0000000..38cf53b Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-base.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a new file mode 100644 index 0000000..a37112a Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml.a new file mode 100644 index 0000000..39a9565 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libwhisper.a new file mode 100644 index 0000000..453f978 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libwhisper.a differ diff --git a/ggml/whispercpp/whispercpp.go b/ggml/whispercpp/whispercpp.go new file mode 100644 index 
0000000..783bbcb --- /dev/null +++ b/ggml/whispercpp/whispercpp.go @@ -0,0 +1,160 @@ +//go:build whispercpp + +// Copyright 2025 FootprintAI +// SPDX-License-Identifier: Apache-2.0 + +// Package whispercpp provides Go bindings for whisper.cpp via CGO. +// Build with -tags whispercpp to enable; without the tag, stub implementations are used. +package whispercpp + +/* +#cgo CFLAGS: -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include +#cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include +#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64 +#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64 +#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64 +#cgo LDFLAGS: -lwhisper -lcommon -lggml-cpu -lggml-base -lggml -lstdc++ -lm +#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation +#include +#include "whisper.h" +*/ +import "C" + +import ( + "errors" + "fmt" + "sync" + "time" + "unsafe" +) + +// Segment represents a transcribed audio segment with timing information. +type Segment struct { + Start time.Duration + End time.Duration + Text string +} + +// Model wraps a loaded whisper.cpp model context. +type Model struct { + c *C.struct_whisper_context + mu sync.Mutex +} + +// LoadModel loads a whisper GGML model from path. +func LoadModel(path string, opts ...ModelOption) (*Model, error) { + cfg := defaultModelConfig() + for _, o := range opts { + o(&cfg) + } + + params := C.whisper_context_default_params() + params.use_gpu = C.bool(cfg.useGPU) + params.flash_attn = C.bool(cfg.flashAttn) + + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + + ctx := C.whisper_init_from_file_with_params(cpath, params) + if ctx == nil { + return nil, fmt.Errorf("failed to load whisper model: %s", path) + } + return &Model{c: ctx}, nil +} + +// Close frees the model resources. 
+func (m *Model) Close() { + m.mu.Lock() + defer m.mu.Unlock() + if m.c != nil { + C.whisper_free(m.c) + m.c = nil + } +} + +// IsMultilingual returns true if the model supports multiple languages. +func (m *Model) IsMultilingual() bool { + m.mu.Lock() + defer m.mu.Unlock() + if m.c == nil { + return false + } + return C.whisper_is_multilingual(m.c) != 0 +} + +// Transcribe runs the full whisper pipeline on PCM audio data. +// pcmData must be 16kHz mono float32 samples. +func (m *Model) Transcribe(pcmData []float32, opts ...TranscribeOption) ([]Segment, error) { + m.mu.Lock() + defer m.mu.Unlock() + + if m.c == nil { + return nil, errors.New("model is closed") + } + if len(pcmData) == 0 { + return nil, errors.New("empty audio data") + } + + cfg := defaultTranscribeConfig() + for _, o := range opts { + o(&cfg) + } + + params := C.whisper_full_default_params(C.WHISPER_SAMPLING_GREEDY) + params.n_threads = C.int(cfg.threads) + params.translate = C.bool(cfg.translate) + params.no_timestamps = C.bool(!cfg.timestamps) + params.single_segment = C.bool(cfg.singleSegment) + params.print_special = C.bool(false) + params.print_progress = C.bool(false) + params.print_realtime = C.bool(false) + params.print_timestamps = C.bool(false) + params.token_timestamps = C.bool(cfg.tokenTimestamps) + params.temperature = C.float(cfg.temperature) + params.max_tokens = C.int(cfg.maxTokens) + + if cfg.language != "" { + clang := C.CString(cfg.language) + defer C.free(unsafe.Pointer(clang)) + params.language = clang + } + + if cfg.prompt != "" { + cprompt := C.CString(cfg.prompt) + defer C.free(unsafe.Pointer(cprompt)) + params.initial_prompt = cprompt + } + + rc := C.whisper_full( + m.c, + params, + (*C.float)(unsafe.Pointer(&pcmData[0])), + C.int(len(pcmData)), + ) + if rc != 0 { + return nil, fmt.Errorf("whisper_full failed with code %d", rc) + } + + nSegments := int(C.whisper_full_n_segments(m.c)) + segments := make([]Segment, 0, nSegments) + for i := 0; i < nSegments; i++ { + t0 := 
int64(C.whisper_full_get_segment_t0(m.c, C.int(i))) + t1 := int64(C.whisper_full_get_segment_t1(m.c, C.int(i))) + text := C.GoString(C.whisper_full_get_segment_text(m.c, C.int(i))) + segments = append(segments, Segment{ + Start: time.Duration(t0) * 10 * time.Millisecond, + End: time.Duration(t1) * 10 * time.Millisecond, + Text: text, + }) + } + + return segments, nil +} + +// LangID returns the language ID for the given language string. +// Returns -1 if not found. +func LangID(lang string) int { + clang := C.CString(lang) + defer C.free(unsafe.Pointer(clang)) + return int(C.whisper_lang_id(clang)) +} diff --git a/ggml/whispercpp/whispercpp_stub.go b/ggml/whispercpp/whispercpp_stub.go new file mode 100644 index 0000000..740863a --- /dev/null +++ b/ggml/whispercpp/whispercpp_stub.go @@ -0,0 +1,65 @@ +//go:build !whispercpp + +// Copyright 2025 FootprintAI +// SPDX-License-Identifier: Apache-2.0 + +// Package whispercpp provides stub implementations when whisper.cpp is not available. +// Build with -tags whispercpp to enable the real implementation. +package whispercpp + +import ( + "errors" + "time" +) + +var errNotAvailable = errors.New("whispercpp: not available, build with -tags whispercpp") + +// Segment represents a transcribed audio segment with timing information. +type Segment struct { + Start time.Duration + End time.Duration + Text string +} + +// Model is a stub. +type Model struct{} + +// LoadModel returns an error without the whispercpp build tag. +func LoadModel(path string, opts ...ModelOption) (*Model, error) { + return nil, errNotAvailable +} + +// Close is a no-op. +func (m *Model) Close() {} + +// IsMultilingual returns false. +func (m *Model) IsMultilingual() bool { return false } + +// Transcribe returns an error without the whispercpp build tag. +func (m *Model) Transcribe(pcmData []float32, opts ...TranscribeOption) ([]Segment, error) { + return nil, errNotAvailable +} + +// LangID returns -1. 
+func LangID(lang string) int { return -1 } + +// ModelOption configures model loading (stub). +type ModelOption func(*modelConfig) +type modelConfig struct{} + +func WithGPU(enabled bool) ModelOption { return func(*modelConfig) {} } +func WithFlashAttention(enabled bool) ModelOption { return func(*modelConfig) {} } + +// TranscribeOption configures transcription (stub). +type TranscribeOption func(*transcribeConfig) +type transcribeConfig struct{} + +func WithThreads(n int) TranscribeOption { return func(*transcribeConfig) {} } +func WithLanguage(lang string) TranscribeOption { return func(*transcribeConfig) {} } +func WithTranslate(enabled bool) TranscribeOption { return func(*transcribeConfig) {} } +func WithTimestamps(enabled bool) TranscribeOption { return func(*transcribeConfig) {} } +func WithTokenTimestamps(enabled bool) TranscribeOption { return func(*transcribeConfig) {} } +func WithSingleSegment(enabled bool) TranscribeOption { return func(*transcribeConfig) {} } +func WithTemperature(t float32) TranscribeOption { return func(*transcribeConfig) {} } +func WithMaxTokens(n int) TranscribeOption { return func(*transcribeConfig) {} } +func WithPrompt(prompt string) TranscribeOption { return func(*transcribeConfig) {} } diff --git a/ggml/whispercpp/whispercpp_stub_test.go b/ggml/whispercpp/whispercpp_stub_test.go new file mode 100644 index 0000000..68d6703 --- /dev/null +++ b/ggml/whispercpp/whispercpp_stub_test.go @@ -0,0 +1,57 @@ +//go:build !whispercpp + +// Copyright 2025 FootprintAI +// SPDX-License-Identifier: Apache-2.0 + +package whispercpp + +import ( + "testing" +) + +func TestStubLoadModel(t *testing.T) { + _, err := LoadModel("/nonexistent.bin") + if err == nil { + t.Fatal("expected error from stub LoadModel") + } + if err.Error() != "whispercpp: not available, build with -tags whispercpp" { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestStubTranscribe(t *testing.T) { + var m Model + + _, err := m.Transcribe([]float32{0.1, 0.2, 0.3}) + 
if err == nil { + t.Fatal("expected error from stub Transcribe") + } + + if m.IsMultilingual() { + t.Fatal("expected false from stub IsMultilingual") + } + + // Close should be a no-op + m.Close() +} + +func TestStubLangID(t *testing.T) { + if LangID("en") != -1 { + t.Fatal("expected -1 from stub LangID") + } +} + +func TestStubOptions(t *testing.T) { + // Options should be constructable without error + _ = WithGPU(true) + _ = WithFlashAttention(false) + _ = WithThreads(4) + _ = WithLanguage("en") + _ = WithTranslate(true) + _ = WithTimestamps(true) + _ = WithTokenTimestamps(false) + _ = WithSingleSegment(false) + _ = WithTemperature(0.0) + _ = WithMaxTokens(100) + _ = WithPrompt("test") +}