diff --git a/.github/workflows/test-platforms.yml b/.github/workflows/test-platforms.yml
new file mode 100644
index 0000000..d7166de
--- /dev/null
+++ b/.github/workflows/test-platforms.yml
@@ -0,0 +1,68 @@
+# CI: build and test the JNA bindings with each native-library Maven profile
+# (system / standard / full) on every supported OS and CPU architecture.
+name: Build and test on multiple platforms
+
+on:
+  push:
+  pull_request:
+    branches: [main, master, develop]
+
+# The job only reads the repository, so keep the workflow token read-only.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    strategy:
+      # Run every matrix entry to completion so one failure does not hide others.
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu-24.04
+            variant: system
+          - os: ubuntu-24.04-arm
+            variant: system
+          - os: ubuntu-24.04
+            variant: standard
+          - os: ubuntu-24.04-arm
+            variant: standard
+          - os: macos-15-intel
+            variant: standard
+          - os: macos-latest
+            variant: standard
+          - os: ubuntu-24.04
+            variant: full
+          - os: ubuntu-24.04-arm
+            variant: full
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install libffi (Linux)
+        if: runner.os == 'Linux'
+        run: sudo apt-get update && sudo apt-get install -y libffi-dev
+
+      - name: Install libffi (macOS)
+        if: runner.os == 'macOS'
+        run: brew install libffi
+
+      # Uses the package index already refreshed by the Linux libffi step.
+      - name: Install system CLD2 library (Linux system profile)
+        if: runner.os == 'Linux' && matrix.variant == 'system'
+        run: sudo apt-get install -y libcld2-0 libcld2-dev
+
+      - name: Install build tools (Linux standard/full profiles)
+        if: runner.os == 'Linux' && matrix.variant != 'system'
+        run: sudo apt-get install -y build-essential
+
+      - name: Set up JDK
+        uses: actions/setup-java@v4
+        with:
+          distribution: 'temurin'
+          java-version: '17'
+          cache: 'maven'
+
+      - name: Build and test
+        run: mvn clean verify -P${{ matrix.variant }} -q
diff --git a/.gitignore b/.gitignore
index de262cd..ea915b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,8 @@ hs_err_pid*
.classpath
.settings/
+.idea
+
+# Native libraries (generated by the `standard`/`full` Maven build profiles)
+src/main/resources/linux-*/
+src/main/resources/darwin-*/
diff --git a/README.md b/README.md
index 0f7aac7..ddd4cbe 100644
--- a/README.md
+++ b/README.md
@@ -5,27 +5,62 @@ The [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) is a nativ
## Installation
-### Native Library
+This project supports four build profiles:
-First, the library libcld2.so (or a .dll on Windows) needs to be installed.
+| Profile | Description | Platforms |
+| ------------- | ----------------------------------------------------------------------- | ----------------|
+| *(default)* | No native library bundled. Requires system library or `-Djava.library.path` | Any |
+| `system` | Use system-installed libcld2 (Debian package) | Linux (Debian) |
+| `standard` | Clone and build CLD2 from source, bundle into JAR | Linux, macOS |
+| `full` | Build from source with full language support (160+) | Linux |
-- on Debian-based systems the easiest way is to install the package [libcld-0](https://packages.debian.org/stretch/libcld2-0):
+### System Library (Linux/Debian)
+
+Install the native library via apt:
```
apt-get install libcld2-0 libcld2-dev
```
-- to compile the CLD2 library from source:
+
+Then build with the `system` profile:
+```
+mvn clean verify -Psystem
+```
+
+### Build from Source
+
+For Linux or macOS, use the `standard` profile to clone and build CLD2 from source:
+```
+mvn clean verify -Pstandard
+```
+
+This clones [lfoppiano/CLD2](https://github.com/lfoppiano/CLD2) and builds `libcld2`, then bundles it into the JAR.
+
+**Prerequisites:**
+- Linux: `build-essential`, `git`
+- macOS: Xcode Command Line Tools (includes git, clang++)
+
+### Full Language Support (160+ languages)
+
+The `full` profile is **Linux only**. It builds both `libcld2` and `libcld2_full` from source and uses `LD_PRELOAD` to load the full language tables during testing:
+
```
-git clone https://github.com/CLD2Owners/cld2.git
-cd cld2/internal/
-export CFLAGS="-Wno-narrowing -O3"
-./compile_and_test_all.sh
+mvn clean verify -Pfull
```
-If you only want the libraries, `./compile_libs.sh` is sufficient. You may use different compiler flags, the flag `-Wno-narrowing` is required for compilers which follow the C++11 standard.
+The `libcld2_full` library only contains the classifier tables for 160+ languages — it is not a standalone library. At runtime, use `LD_PRELOAD=libcld2_full.so` to override the standard tables in `libcld2`. For Hadoop Map-Reduce jobs, pass `-Dmapreduce.reduce.env=LD_PRELOAD=libcld2_full.so`.
+
+**Why Linux only?** The macOS equivalent (`DYLD_INSERT_LIBRARIES`) does not work because System Integrity Protection (SIP) strips all `DYLD_*` environment variables from child processes, including the JVM forked by Maven Surefire.
-#### Using the CLD2 Full Version (160+ languages)
+### Using Without Maven Profiles
-Both the Debian package and the source build provide two native libraries: `libcld2.so` and `libcld2_full.so`. The former supports 80+, the latter 160+ languages. However, the `libcld2_full.so` from the Debian package isn't a complete shared library - it only contains the tables used by the classifier. To use the larger tables for 160+ language instead of those for 80+ languages, you must use the [LD_PRELOAD trick](https://stackoverflow.com/questions/426230/what-is-the-ld-preload-trick) and set the environment variable `LD_PRELOAD=libcld2_full.so` (on Linux). In case, the language detector is used in Hadoop Map-Reduce jobs, this can be achieved by setting the Hadoop configuration property `mapreduce.reduce.env`, e.g., by passing `-Dmapreduce.reduce.env=LD_PRELOAD=libcld2_full.so` as command-line argument.
+If not using a profile, you must provide the native library yourself:
+
+1. **Install system library** (see above), then:
+ ```
+ mvn clean verify -Djava.library.path=/usr/lib/x86_64-linux-gnu
+ ```
+
+2. **Or use JNA's classpath loading**: Place `libcld2.so`/`libcld2.dylib` on the classpath inside JNA's platform-specific resource directory (e.g. `linux-x86-64/libcld2.so`) and JNA will find it.
### Java Bindings
@@ -43,29 +78,20 @@ and can then be used as dependency
```
-To link the Java code with the native libraries, you need to make sure that Java can find the share object:
+To link the Java code with the native libraries when using the default build (without profiles), you need to make sure that Java can find the shared object:
- either install the native library on a standard library path (already done when the Debian package is used)
- add the directory where your libcld2.so installed to the environment variable `LD_LIBRARY_PATH`
- use the Java option `-Djava.library.path=...`
#### Java Native Access (JNA) and libffi
-The CLD2 native functions are accessed via the [Java Native Access (JNA)](https://github.com/java-native-access/jna) which uses the [Foreign Function Interface Library (libffi)](https://sourceware.org/libffi/). JNA is a project dependency but the libffi needs to be present on your system. If not install it, e.g.
-```
-apt-get install libffi6
-```
-
-#### Potential Issues on Other Platforms (Non-Linux)
-
-So far, the bindings have only been tested on Linux.
+The CLD2 native functions are accessed via the [Java Native Access (JNA)](https://github.com/java-native-access/jna) which uses the [Foreign Function Interface Library (libffi)](https://sourceware.org/libffi/). JNA is a project dependency but libffi needs to be present on your system:
+- Linux (Debian/Ubuntu): `apt-get install libffi-dev`
+- macOS: `brew install libffi`
-One potential issue for ports to other platforms is the [mangling of C++ function names](https://en.wikipedia.org/wiki/Name_mangling). Function names called in the native library are registered in [Cld2Library](../blob/master/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java) and [Cld2](../blob/master/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java) using the mangled names, e.g., `_ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb`. The mangling may work differently on a different platform or when another C++-compiler is used.
+#### Platform Support
-To adopt the Java bindings, you first need to get the mangled names from the shared object. On Linux this could be done by calling
-```
-% nm -D .../libcld2.so.0.0.197
-```
-The mangled function names in the two Java classes need to be replaced by the ones exposed by your native library. Please also see the notes in [Cld2](../blob/master/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java) regarding the creation of the bindings.
+The bindings have been tested on Linux (x86-64, ARM64) and macOS (Intel, Apple Silicon).
## History
diff --git a/pom.xml b/pom.xml
index 21a0ee5..6c19b1d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,6 +23,13 @@
+
+
+ kr.motd.maven
+ os-maven-plugin
+ 1.7.0
+
+
${basedir}/src/test/java
@@ -77,6 +84,176 @@
+
+
+ system
+
+
+ standard
+
+
+
+ org.apache.maven.plugins
+ maven-antrun-plugin
+ 3.1.0
+
+
+ build-cld2
+ generate-resources
+
+ run
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ full
+
+
+
+ org.apache.maven.plugins
+ maven-antrun-plugin
+ 3.1.0
+
+
+ build-cld2
+ generate-resources
+
+ run
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ maven-surefire-plugin
+ 3.5.4
+
+
+ ${project.build.directory}/classes/${os.detected.classifier}/libcld2_full.so
+
+
+
+
+
+
+
+
diff --git a/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java b/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java
index accbd18..536d3ae 100644
--- a/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java
+++ b/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java
@@ -57,10 +57,16 @@ public class CLDHints extends Structure {
public int encoding_hint = Encoding.UNKNOWN_ENCODING.value();
/** ITALIAN boosts it */
- public int language_hint = Language.UNKNOWN_LANGUAGE.value();
+ public int language_hint;
- protected static CLDHints NO_HINTS = new CLDHints(null, "",
- Encoding.UNKNOWN_ENCODING.value(), Language.UNKNOWN_LANGUAGE.value());
+    /** Lazily created shared "no hints" instance; access only via {@link #getNoHints()}. */
+    private static CLDHints noHints;
+
+    /**
+     * Returns the shared hints object used when the caller supplies no hints,
+     * creating it on first use. It is built with
+     * {@link Encoding#UNKNOWN_ENCODING} and {@link Language#UNKNOWN_LANGUAGE}.
+     * <p>
+     * The method is synchronized so that concurrent first calls create only
+     * one instance and every thread sees it fully constructed. The same
+     * object is returned on every call, so callers must not modify it.
+     *
+     * @return the shared no-hints instance
+     */
+    public static synchronized CLDHints getNoHints() {
+        if (noHints == null) {
+            noHints = new CLDHints(null, "", Encoding.UNKNOWN_ENCODING.value(),
+                    Language.UNKNOWN_LANGUAGE.value());
+        }
+        return noHints;
+    }
private static final Pattern DOTPATTERN = Pattern.compile("\\.");
diff --git a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java
index f47a89d..78d67dc 100644
--- a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java
+++ b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java
@@ -18,6 +18,8 @@
import java.nio.charset.StandardCharsets;
+import com.sun.jna.ptr.PointerByReference;
+
/**
* Public interface for the CLD2 library.
*/
@@ -79,7 +81,7 @@ public static Result detect(String text) {
* @return detection result
*/
public static Result detect(byte[] bytes) {
- return detect(bytes, CLDHints.NO_HINTS, 0, true);
+ return detect(bytes, CLDHints.getNoHints(), 0, true);
}
public static Result detect(String text, CLDHints hints) {
@@ -87,7 +89,7 @@ public static Result detect(String text, CLDHints hints) {
}
public static Result detect(String text, boolean isPlainText) {
- return detect(encodeNative(text), CLDHints.NO_HINTS, 0, isPlainText);
+ return detect(encodeNative(text), CLDHints.getNoHints(), 0, isPlainText);
}
public static Result detect(byte[] bytes, CLDHints hints) {
@@ -127,7 +129,7 @@ public static Result detect(String text, CLDHints hints, int flags,
public static Result detect(byte[] bytes, CLDHints hints, int flags,
boolean isPlainText) {
Result res = new Result();
- int language = Cld2Library.INSTANCE._ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb(
+ int language = invokeExtDetectLanguageSummary(
bytes,
bytes.length,
isPlainText,
@@ -143,4 +145,20 @@ public static Result detect(byte[] bytes, CLDHints hints, int flags,
return res;
}
+
+    /**
+     * Set once the libstdc++-mangled symbol has been found missing, so that
+     * later calls skip the failing attempt and its exception handling.
+     * Volatile: written and read by whichever threads call detect().
+     */
+    private static volatile boolean libstdcxxSymbolMissing;
+
+    /**
+     * Calls the native ExtDetectLanguageSummary function through whichever
+     * mangled name the loaded library exports: the libstdc++ name (Linux) is
+     * tried first, the libc++ name (macOS) is used if that symbol cannot be
+     * linked. Arguments are passed through unchanged to the native binding.
+     *
+     * @return the value returned by the native function
+     * @throws UnsatisfiedLinkError if the libc++ symbol cannot be linked either
+     */
+    private static int invokeExtDetectLanguageSummary(
+            byte[] buffer, int bufferLength, boolean isPlainText, CLDHints hints, int flags,
+            int[] language3, int[] percent3, double[] normalizedScore3,
+            PointerByReference resultchunkvector, int[] textBytes, boolean[] isReliable) {
+        if (!libstdcxxSymbolMissing) {
+            try {
+                return Cld2Library.INSTANCE._ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb(
+                        buffer, bufferLength, isPlainText, hints, flags,
+                        language3, percent3, normalizedScore3, resultchunkvector, textBytes, isReliable);
+            } catch (UnsatisfiedLinkError e) {
+                // The library was built against libc++: remember that, so the
+                // failing attempt is not repeated on every detection call.
+                libstdcxxSymbolMissing = true;
+            }
+        }
+        return Cld2Library.INSTANCE._ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPNSt3__16vectorINS_11ResultChunkENS9_9allocatorISB_EEEES7_Pb(
+                buffer, bufferLength, isPlainText, hints, flags,
+                language3, percent3, normalizedScore3, resultchunkvector, textBytes, isReliable);
+    }
}
diff --git a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java
index a356f12..ba90510 100644
--- a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java
+++ b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java
@@ -18,7 +18,7 @@
import com.sun.jna.Library;
import com.sun.jna.Native;
-import com.sun.jna.NativeLibrary;
+import com.sun.jna.Platform;
import com.sun.jna.ptr.PointerByReference;
@@ -29,11 +29,21 @@ public interface Cld2Library extends Library {
String JNA_LIBRARY_NAME = "cld2";
- NativeLibrary JNA_NATIVE_LIB = NativeLibrary
- .getInstance(Cld2Library.JNA_LIBRARY_NAME);
+ Cld2Library INSTANCE = loadLibrary();
- Cld2Library INSTANCE = (Cld2Library) Native.load(Cld2Library.JNA_LIBRARY_NAME,
- Cld2Library.class);
+    /**
+     * Loads the native CLD2 library and binds it to this interface.
+     * <p>
+     * The library is first requested by its short name
+     * ({@value #JNA_LIBRARY_NAME}). If that fails, the copy bundled on the
+     * classpath is requested explicitly by its resource path, i.e.
+     * <code>libcld2.so</code> (<code>libcld2.dylib</code> on macOS) below the
+     * directory named by {@link Platform#RESOURCE_PREFIX}.
+     *
+     * @return the loaded library proxy
+     * @throws UnsatisfiedLinkError the error of the first attempt if the
+     *         library cannot be loaded either way; the error of the fallback
+     *         attempt is attached to it as a suppressed exception
+     */
+    static Cld2Library loadLibrary() {
+        try {
+            return (Cld2Library) Native.load(JNA_LIBRARY_NAME, Cld2Library.class);
+        } catch (UnsatisfiedLinkError e) {
+            // The bundled file carries the usual "lib" prefix (libcld2.so /
+            // libcld2.dylib); without the prefix the resource cannot match.
+            String resourcePath = "/" + Platform.RESOURCE_PREFIX + "/lib" + JNA_LIBRARY_NAME
+                    + (Platform.isMac() ? ".dylib" : ".so");
+            try {
+                return (Cld2Library) Native.load(resourcePath, Cld2Library.class);
+            } catch (UnsatisfiedLinkError e2) {
+                // Repeating the first load would run the same lookup again;
+                // report the original error and keep the fallback failure
+                // attached for diagnosis.
+                e.addSuppressed(e2);
+                throw e;
+            }
+        }
+    }
//String LanguageName(int lang);
String _ZN4CLD212LanguageNameENS_8LanguageE(int lang);
@@ -45,6 +55,13 @@ public interface Cld2Library extends Library {
int _ZN4CLD219GetLanguageFromNameEPKc(String src);
//int ExtDetectLanguageSummary(String buffer, int buffer_length, byte is_plain_text, CLDHints cld_hints, int flags, IntBuffer language3, IntBuffer percent3, DoubleBuffer normalized_score3, PointerByReference resultchunkvector, IntBuffer text_bytes, ByteBuffer is_reliable);
+ // ExtDetectLanguageSummary as exported by libraries built against libc++ (macOS):
+ // the std::vector parameter is mangled as NSt3__16vector (std::__1::vector)
+ // instead of St6vector. Takes the same parameters as the libstdc++ binding below.
+ int _ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPNSt3__16vectorINS_11ResultChunkENS9_9allocatorISB_EEEES7_Pb(
+ byte[] buffer, int bufferLength, boolean isPlainText, CLDHints cldHints, int flags,
+ int[] language3, int[] percent3, double[] normalizedScore3,
+ PointerByReference resultchunkvector, int[] textBytes, boolean[] isReliable);
+
+ // libstdc++ (Linux) uses St6vector
int _ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb(
byte[] buffer, int bufferLength, boolean isPlainText, CLDHints cldHints, int flags,
int[] language3, int[] percent3, double[] normalizedScore3,
@@ -52,5 +69,4 @@ int _ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPS
//String DetectLanguageVersion();
String _ZN4CLD221DetectLanguageVersionEv();
-
}