
Commit 320779a

Add libpsijent fallback
1 parent b5eeb1e commit 320779a

10 files changed: +151 −73 lines changed


.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
 [submodule "libpsirngclient"]
 	path = libpsirngclient
 	url = https://github.com/nullspook/libpsirngclient.git
+[submodule "libpsijent"]
+	path = libpsijent
+	url = https://github.com/nullspook/libpsijent.git

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -256,6 +256,7 @@ endif()
 #
 
 add_subdirectory(libpsirngclient)
+add_subdirectory(libpsijent)
 
 #
 # install

README.md

Lines changed: 10 additions & 4 deletions
@@ -31,6 +31,12 @@ cd build/bin
 
 **Note:** quantum-llama.cpp currently does not support `-DLLAMA_CURL=ON`.
 
+#### Fallback RNG option
+
+quantum-llama.cpp includes the [libpsijent](https://github.com/nullspook/libpsijent.git)
+hardware timing jitter RNG as a fallback when a psirng server is not available.
+Enable it by setting `PSIJENT_FALLBACK=ON` before running the `llama-*` programs.
+
 ---
 
 # llama.cpp
@@ -68,7 +74,7 @@ LLM inference in C/C++
 
 Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
 
-- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Install `llama.cpp` using [brew, nix, or winget](docs/install.md)
 - Run with Docker - see our [Docker documentation](docs/docker.md)
 - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
 - Build from source by cloning this repository - check out [our build guide](docs/build.md)
@@ -94,7 +100,7 @@ The main goal of `llama.cpp` is to enable LLM inference with minimal setup and s
 range of hardware - locally and in the cloud.
 
 - Plain C/C++ implementation without any dependencies
-- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
+- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate, and Metal frameworks
 - AVX, AVX2, AVX512 and AMX support for x86 architectures
 - RVV, ZVFH, ZFH, ZICBOP and ZIHINTPAUSE support for RISC-V architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
@@ -341,7 +347,7 @@ After downloading a model, use the CLI tools to run it locally - see below.
 
 `llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
 
-The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
+The Hugging Face platform provides a variety of online tools for converting, quantizing, and hosting models with `llama.cpp`:
 
 - Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
 - Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
@@ -539,7 +545,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
 - Contributors can open PRs
 - Collaborators will be invited based on contributions
 - Maintainers can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
-- Any help with managing issues, PRs and projects is very appreciated!
+- Any help with managing issues, PRs, and projects is very appreciated!
 - See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
 - Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)

libpsijent

Submodule libpsijent added at 16ffd40

src/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
@@ -139,8 +139,8 @@ add_library(llama
     models/xverse.cpp
     models/mistral3.cpp
     models/graph-context-mamba.cpp
-    psirngclient-manager.cpp
-    psirngclient-manager.h
+    psirng-wrapper.cpp
+    psirng-wrapper.h
     )
 
 set_target_properties(llama PROPERTIES
@@ -150,10 +150,10 @@ set_target_properties(llama PROPERTIES
     )
 
 target_include_directories(llama PRIVATE .)
-target_include_directories(llama PUBLIC ../include ../libpsirngclient/src)
+target_include_directories(llama PUBLIC ../include ../libpsirngclient/src ../libpsijent/src)
 target_compile_features (llama PRIVATE cxx_std_17) # don't bump
 
-target_link_libraries(llama PUBLIC ggml psirngclient)
+target_link_libraries(llama PUBLIC ggml psirngclient psijent)
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)

src/llama-sampling.cpp

Lines changed: 4 additions & 17 deletions
@@ -3,7 +3,7 @@
 #include "llama-impl.h"
 #include "llama-vocab.h"
 #include "llama-grammar.h"
-#include "psirngclient-manager.h"
+#include "psirng-wrapper.h"
 
 #include "ggml-cpp.h"
 
@@ -216,11 +216,7 @@ static void llama_token_data_array_partial_sort_inplace(llama_token_data_array *
 }
 
 static int llama_sample_dist(llama_token_data_array * cur_p, std::mt19937 & rng) {
-    double chance;
-    int rand_result = psirngclient_randuniform(psirngclient_manager::get_psirngclient(), &chance, 1, 0.0, 1.0);
-    if (rand_result != PSIRNGCLIENT_RESULT_OK) {
-        GGML_ABORT("%s: psirngclient_randuniform error: %d", __func__, rand_result);
-    }
+    const double chance = psirng_wrapper::uniform01();
 
     double cumulative = 0.0;
     for (size_t i = 0; i < cur_p->size; ++i) {
@@ -1060,11 +1056,7 @@ static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_da
     // sample from the obtained probabilities and normalize the probs in a single pass
     // this is ~3x faster on Mac with full gpt-oss vocab than the version below
     //
-    double rnd;
-    int rand_result = psirngclient_randuniform(psirngclient_manager::get_psirngclient(), &rnd, 1, 0.0, 1.0);
-    if (rand_result != PSIRNGCLIENT_RESULT_OK) {
-        GGML_ABORT("%s: psirngclient_randuniform error: %d", __func__, rand_result);
-    }
+    const double rnd = psirng_wrapper::uniform01();
     double sum_run = 0.0f;
     const double sum_tgt = sum_cum*rnd;
 
@@ -2140,12 +2132,7 @@ static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data
         return;
     }
 
-    double chance;
-    int rand_result = psirngclient_randuniform(psirngclient_manager::get_psirngclient(), &chance, 1, 0.0, 1.0);
-    if (rand_result != PSIRNGCLIENT_RESULT_OK) {
-        GGML_ABORT("%s: psirngclient_randuniform error: %d", __func__, rand_result);
-    }
-    if (chance > ctx->probability) {
+    if (const double chance = psirng_wrapper::uniform01(); chance > ctx->probability) {
         return;
     }
 

src/psirng-wrapper.cpp

Lines changed: 111 additions & 0 deletions
#include "psirng-wrapper.h"

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <stdexcept>
#include <string>

psirng_wrapper & psirng_wrapper::instance() {
    static psirng_wrapper instance;
    return instance;
}

psirng_wrapper::psirng_wrapper() {
    const char * psirng_host      = std::getenv("PSIRNG_HOST");
    const char * psirng_grpc_port = std::getenv("PSIRNG_GRPC_PORT");
    const char * psirng_cert_path = std::getenv("PSIRNG_CERT_PATH");

    // PSIJENT_FALLBACK accepts "yes", "on", "true" or "1", case-insensitively
    bool bool_psijent_fallback = false;
    if (const char * psijent_fallback = std::getenv("PSIJENT_FALLBACK")) {
        std::string str_psijent_fallback(psijent_fallback);
        std::transform(
            str_psijent_fallback.begin(),
            str_psijent_fallback.end(),
            str_psijent_fallback.begin(),
            [](const unsigned char c) {
                return static_cast<char>(std::tolower(c));
            }
        );
        bool_psijent_fallback = str_psijent_fallback == "yes"  ||
                                str_psijent_fallback == "on"   ||
                                str_psijent_fallback == "true" ||
                                str_psijent_fallback == "1";
    }

    int  result;
    bool should_init_psijent = false;

    if (psirng_host && psirng_grpc_port && psirng_cert_path) {
        result = psirngclient_init(&psirngclient_ptr, psirng_host, std::atoi(psirng_grpc_port), psirng_cert_path);
        if (result != PSIRNGCLIENT_RESULT_OK) {
            if (bool_psijent_fallback) {
                should_init_psijent = true;
            } else {
                throw std::runtime_error("failed to initialize psirng client: " + std::to_string(result));
            }
        } else if (!psirngclient_ishealthy(psirngclient_ptr)) {
            // drop the unhealthy client and reset the pointer so the
            // destructor and uniform01() never touch a freed handle
            psirngclient_free(psirngclient_ptr);
            psirngclient_ptr = nullptr;
            if (bool_psijent_fallback) {
                should_init_psijent = true;
            } else {
                throw std::runtime_error("psirng is not healthy");
            }
        }
    } else if (bool_psijent_fallback) {
        should_init_psijent = true;
    } else {
        throw std::runtime_error("psirng is not configured");
    }

    if (should_init_psijent) {
        result = psijent_init(&psijent_ptr);
        if (result != PSIJENT_RESULT_OK) {
            throw std::runtime_error("failed to initialize psijent");
        }

        result = psijent_start(psijent_ptr);
        if (result != PSIJENT_RESULT_OK) {
            psijent_free(psijent_ptr);
            psijent_ptr = nullptr;
            throw std::runtime_error("failed to start psijent");
        }

        if (const char * mantissa_length = std::getenv("PSIJENT_MANTISSA_LENGTH")) {
            psijent_mantissa_length = std::atoi(mantissa_length);
        }
    }
}

psirng_wrapper::~psirng_wrapper() {
    if (psirngclient_ptr) {
        psirngclient_free(psirngclient_ptr);
    }

    if (psijent_ptr) {
        psijent_free(psijent_ptr);
    }
}

double psirng_wrapper::uniform01() {
    const psirng_wrapper & instance = psirng_wrapper::instance();

    int    result;
    double value = 0.0;

    if (instance.psijent_ptr) {
        result = psijent_randuniform(instance.psijent_ptr, &value, 1, instance.psijent_mantissa_length);
        if (result != PSIJENT_RESULT_OK) {
            throw std::runtime_error("psijent_randuniform failed: " + std::to_string(result));
        }
    } else {
        result = psirngclient_randuniform(instance.psirngclient_ptr, &value, 1, 0.0, 1.0);
        if (result != PSIRNGCLIENT_RESULT_OK) {
            throw std::runtime_error("psirngclient_randuniform failed: " + std::to_string(result));
        }
    }

    return value;
}

src/psirng-wrapper.h

Lines changed: 17 additions & 0 deletions
#pragma once

#include "psirngclient.h"
#include "psijent.h"

class psirng_wrapper {
public:
    ~psirng_wrapper();
    static double uniform01();

private:
    psirng_wrapper();
    static psirng_wrapper & instance();

    psirngclient * psirngclient_ptr = nullptr;
    psijent      * psijent_ptr      = nullptr;
    int            psijent_mantissa_length = 52;
};

src/psirngclient-manager.cpp

Lines changed: 0 additions & 35 deletions
This file was deleted.

src/psirngclient-manager.h

Lines changed: 0 additions & 13 deletions
This file was deleted.
