diff --git a/.github/workflows/test-platforms.yml b/.github/workflows/test-platforms.yml new file mode 100644 index 0000000..d7166de --- /dev/null +++ b/.github/workflows/test-platforms.yml @@ -0,0 +1,59 @@ +name: Build and test on multiple platforms + +on: + push: + pull_request: + branches: [main, master, develop] + +jobs: + test: + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-24.04 + variant: system + - os: ubuntu-24.04-arm + variant: system + - os: ubuntu-24.04 + variant: standard + - os: ubuntu-24.04-arm + variant: standard + - os: macos-15-intel + variant: standard + - os: macos-latest + variant: standard + - os: ubuntu-24.04 + variant: full + - os: ubuntu-24.04-arm + variant: full + + runs-on: ${{ matrix.os }} + + steps: + - uses: actions/checkout@v4 + + - name: Install libffi (Linux) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y libffi-dev + + - name: Install libffi (macOS) + if: runner.os == 'macOS' + run: brew install libffi + + - name: Install system CLD2 library (Linux system profile) + if: matrix.variant == 'system' + run: sudo apt-get install -y libcld2-0 libcld2-dev + + - name: Install build tools (Linux standard/full profiles) + if: runner.os == 'Linux' && matrix.variant != 'system' + run: sudo apt-get install -y build-essential + + - name: Set up JDK + uses: actions/setup-java@v4 + with: + distribution: 'temurin' + java-version: '17' + + - name: Build and test + run: mvn clean verify -P${{ matrix.variant }} -q diff --git a/.gitignore b/.gitignore index de262cd..ea915b1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,8 @@ hs_err_pid* .classpath .settings/ +.idea + +# Native libraries (use build-native profile to build) +src/main/resources/linux-*/ +src/main/resources/darwin-*/ diff --git a/README.md b/README.md index 0f7aac7..ddd4cbe 100644 --- a/README.md +++ b/README.md @@ -5,27 +5,62 @@ The [Compact Language Detector 2](https://github.com/CLD2Owners/cld2) is a nativ ## Installation -### 
Native Library +This project supports four build profiles: -First, the library libcld2.so (or a .dll on Windows) needs to be installed. +| Profile | Description | Platforms | +| ------------- | ----------------------------------------------------------------------- | ----------------| +| *(default)* | No native library bundled. Requires system library or `-Djava.library.path` | Any | +| `system` | Use system-installed libcld2 (Debian package) | Linux (Debian) | +| `standard` | Clone and build CLD2 from source, bundle into JAR | Linux, macOS | +| `full` | Build from source with full language support (160+) | Linux | -- on Debian-based systems the easiest way is to install the package [libcld-0](https://packages.debian.org/stretch/libcld2-0): +### System Library (Linux/Debian) + +Install the native library via apt: ``` apt-get install libcld2-0 libcld2-dev ``` -- to compile the CLD2 library from source: + +Then build with the `system` profile: +``` +mvn clean verify -Psystem +``` + +### Build from Source + +For Linux or macOS, use the `standard` profile to clone and build CLD2 from source: +``` +mvn clean verify -Pstandard +``` + +This clones [lfoppiano/CLD2](https://github.com/lfoppiano/CLD2) and builds `libcld2`, then bundles it into the JAR. + +**Prerequisites:** +- Linux: `build-essential`, `git` +- macOS: Xcode Command Line Tools (includes git, clang++) + +### Full Language Support (160+ languages) + +The `full` profile is **Linux only**. It builds both `libcld2` and `libcld2_full` from source and uses `LD_PRELOAD` to load the full language tables during testing: + ``` -git clone https://github.com/CLD2Owners/cld2.git -cd cld2/internal/ -export CFLAGS="-Wno-narrowing -O3" -./compile_and_test_all.sh +mvn clean verify -Pfull ``` -If you only want the libraries, `./compile_libs.sh` is sufficient. You may use different compiler flags, the flag `-Wno-narrowing` is required for compilers which follow the C++11 standard. 
+The `libcld2_full` library only contains the classifier tables for 160+ languages — it is not a standalone library. At runtime, use `LD_PRELOAD=libcld2_full.so` to override the standard tables in `libcld2`. For Hadoop Map-Reduce jobs, pass `-Dmapreduce.reduce.env=LD_PRELOAD=libcld2_full.so`. + +**Why Linux only?** The macOS equivalent (`DYLD_INSERT_LIBRARIES`) does not work because System Integrity Protection (SIP) strips `DYLD_*` environment variables whenever a protected system binary such as `/bin/sh` is launched, and Maven Surefire forks the test JVM through such a shell. -#### Using the CLD2 Full Version (160+ languages) +### Using Without Maven Profiles -Both the Debian package and the source build provide two native libraries: `libcld2.so` and `libcld2_full.so`. The former supports 80+, the latter 160+ languages. However, the `libcld2_full.so` from the Debian package isn't a complete shared library - it only contains the tables used by the classifier. To use the larger tables for 160+ language instead of those for 80+ languages, you must use the [LD_PRELOAD trick](https://stackoverflow.com/questions/426230/what-is-the-ld-preload-trick) and set the environment variable `LD_PRELOAD=libcld2_full.so` (on Linux). In case, the language detector is used in Hadoop Map-Reduce jobs, this can be achieved by setting the Hadoop configuration property `mapreduce.reduce.env`, e.g., by passing `-Dmapreduce.reduce.env=LD_PRELOAD=libcld2_full.so` as command-line argument. +If not using a profile, you must provide the native library yourself: + +1. **Install system library** (see above), then: + ``` + mvn clean verify -Djna.library.path=/usr/lib/x86_64-linux-gnu + ``` + +2. **Or use JNA's classpath loading**: Place `libcld2.so`/`libcld2.dylib` on the classpath inside JNA's platform directory (e.g. `linux-x86-64/libcld2.so`, `darwin-aarch64/libcld2.dylib`) and JNA will extract and load it. 
### Java Bindings @@ -43,29 +78,20 @@ and can then be used as dependency ``` -To link the Java code with the native libraries, you need to make sure that Java can find the share object: +To link the Java code with the native libraries when using the default build (without profiles), you need to make sure that Java can find the shared object: - either install the native library on a standard library path (already done when the Debian package is used) - add the directory where your libcld2.so installed to the environment variable `LD_LIBRARY_PATH` - use the Java option `-Djava.library.path=...` #### Java Native Access (JNA) and libffi -The CLD2 native functions are accessed via the [Java Native Access (JNA)](https://github.com/java-native-access/jna) which uses the [Foreign Function Interface Library (libffi)](https://sourceware.org/libffi/). JNA is a project dependency but the libffi needs to be present on your system. If not install it, e.g. -``` -apt-get install libffi6 -``` - -#### Potential Issues on Other Platforms (Non-Linux) - -So far, the bindings have only been tested on Linux. +The CLD2 native functions are accessed via the [Java Native Access (JNA)](https://github.com/java-native-access/jna) which uses the [Foreign Function Interface Library (libffi)](https://sourceware.org/libffi/). JNA is a project dependency but libffi needs to be present on your system: +- Linux (Debian/Ubuntu): `apt-get install libffi-dev` +- macOS: `brew install libffi` -One potential issue for ports to other platforms is the [mangling of C++ function names](https://en.wikipedia.org/wiki/Name_mangling). 
Function names called in the native library are registered in [Cld2Library](../blob/master/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java) and [Cld2](../blob/master/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java) using the mangled names, e.g., `_ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb`. The mangling may work differently on a different platform or when another C++-compiler is used. +#### Platform Support -To adopt the Java bindings, you first need to get the mangled names from the shared object. On Linux this could be done by calling -``` -% nm -D .../libcld2.so.0.0.197 -``` -The mangled function names in the two Java classes need to be replaced by the ones exposed by your native library. Please also see the notes in [Cld2](../blob/master/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java) regarding the creation of the bindings. +The bindings have been tested on Linux (x86-64, ARM64) and macOS (Intel, Apple Silicon). 
## History diff --git a/pom.xml b/pom.xml index 21a0ee5..6c19b1d 100644 --- a/pom.xml +++ b/pom.xml @@ -23,6 +23,13 @@ + + + kr.motd.maven + os-maven-plugin + 1.7.0 + + ${basedir}/src/test/java @@ -77,6 +84,176 @@ + + + system + + + standard + + + + org.apache.maven.plugins + maven-antrun-plugin + 3.1.0 + + + build-cld2 + generate-resources + + run + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + full + + + + org.apache.maven.plugins + maven-antrun-plugin + 3.1.0 + + + build-cld2 + generate-resources + + run + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + maven-surefire-plugin + 3.5.4 + + + ${project.build.directory}/classes/${os.detected.classifier}/libcld2_full.so + + + + + + + +
diff --git a/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java b/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java
index accbd18..536d3ae 100644
--- a/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java
+++ b/src/main/java/org/commoncrawl/langdetect/cld2/CLDHints.java
@@ -57,10 +57,29 @@ public class CLDHints extends Structure {
 	public int encoding_hint = Encoding.UNKNOWN_ENCODING.value();
 
 	/** ITALIAN boosts it */
-	public int language_hint = Language.UNKNOWN_LANGUAGE.value();
+	// NOTE(review): the explicit UNKNOWN_LANGUAGE initializer was dropped, so
+	// this starts at 0 unless a constructor assigns it - confirm that 0 is
+	// not the code of a real language.
+	public int language_hint;
 
-	protected static CLDHints NO_HINTS = new CLDHints(null, "",
-			Encoding.UNKNOWN_ENCODING.value(), Language.UNKNOWN_LANGUAGE.value());
+	/** Shared instance carrying no hints, created on first use; guarded by the class lock. */
+	private static CLDHints noHints;
+
+	/**
+	 * Returns the shared hints object that carries no hints. The instance is
+	 * created on the first call and reused afterwards; the method is
+	 * synchronized so that concurrent first calls cannot create two instances
+	 * or observe one that is only partially constructed.
+	 *
+	 * @return the shared "no hints" instance
+	 */
+	public static synchronized CLDHints getNoHints() {
+		if (noHints == null) {
+			noHints = new CLDHints(null, "",
+					Encoding.UNKNOWN_ENCODING.value(), Language.UNKNOWN_LANGUAGE.value());
+		}
+		return noHints;
+	}
 
 	private static final Pattern DOTPATTERN = Pattern.compile("\\.");
 
diff --git a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java
index f47a89d..78d67dc 100644
---
a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java
+++ b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2.java
@@ -18,6 +18,8 @@
 
 import java.nio.charset.StandardCharsets;
 
+import com.sun.jna.ptr.PointerByReference;
+
 /**
  * Public interface for the CLD2 library.
  */
@@ -79,7 +81,7 @@ public static Result detect(String text) {
 	 * @return detection result
 	 */
 	public static Result detect(byte[] bytes) {
-		return detect(bytes, CLDHints.NO_HINTS, 0, true);
+		return detect(bytes, CLDHints.getNoHints(), 0, true);
 	}
 
 	public static Result detect(String text, CLDHints hints) {
@@ -87,7 +89,7 @@ public static Result detect(String text, CLDHints hints) {
 	}
 
 	public static Result detect(String text, boolean isPlainText) {
-		return detect(encodeNative(text), CLDHints.NO_HINTS, 0, isPlainText);
+		return detect(encodeNative(text), CLDHints.getNoHints(), 0, isPlainText);
 	}
 
 	public static Result detect(byte[] bytes, CLDHints hints) {
@@ -127,7 +129,7 @@ public static Result detect(String text, CLDHints hints, int flags,
 	public static Result detect(byte[] bytes, CLDHints hints, int flags,
 			boolean isPlainText) {
 		Result res = new Result();
-		int language = Cld2Library.INSTANCE._ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb(
+		int language = invokeExtDetectLanguageSummary(
 				bytes,
 				bytes.length,
 				isPlainText,
@@ -143,4 +145,41 @@ public static Result detect(byte[] bytes, CLDHints hints, int flags,
 
 		return res;
 	}
+
+	/**
+	 * Set once the libstdc++ symbol could not be resolved, so that later calls
+	 * go straight to the libc++ symbol instead of repeating a failed lookup
+	 * and raising an exception on every detection.
+	 */
+	private static volatile boolean libstdcxxSymbolMissing;
+
+	/**
+	 * Calls the native {@code ExtDetectLanguageSummary} through whichever
+	 * mangled symbol the loaded library exports. The two bindings differ only
+	 * in how the {@code vector} parameter is mangled: {@code St6vector}
+	 * (libstdc++) versus {@code NSt3__16vector} (libc++).
+	 *
+	 * @return the value returned by the native function
+	 * @throws UnsatisfiedLinkError
+	 *             if the library exports neither symbol
+	 */
+	private static int invokeExtDetectLanguageSummary(
+			byte[] buffer, int bufferLength, boolean isPlainText, CLDHints hints, int flags,
+			int[] language3, int[] percent3, double[] normalizedScore3,
+			PointerByReference resultchunkvector, int[] textBytes, boolean[] isReliable) {
+		if (!libstdcxxSymbolMissing) {
+			try {
+				return Cld2Library.INSTANCE._ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb(
+						buffer, bufferLength, isPlainText, hints, flags,
+						language3, percent3, normalizedScore3, resultchunkvector, textBytes, isReliable);
+			} catch (UnsatisfiedLinkError e) {
+				// The libstdc++ symbol is not available: remember that and
+				// fall through to the libc++ variant.
+				libstdcxxSymbolMissing = true;
+			}
+		}
+		return Cld2Library.INSTANCE._ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPNSt3__16vectorINS_11ResultChunkENS9_9allocatorISB_EEEES7_Pb(
+				buffer, bufferLength, isPlainText, hints, flags,
+				language3, percent3, normalizedScore3, resultchunkvector, textBytes, isReliable);
+	}
 }
diff --git a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java
index a356f12..ba90510 100644
--- a/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java
+++ b/src/main/java/org/commoncrawl/langdetect/cld2/Cld2Library.java
@@ -18,7 +18,7 @@
 
 import com.sun.jna.Library;
 import com.sun.jna.Native;
-import com.sun.jna.NativeLibrary;
+import com.sun.jna.Platform;
 import com.sun.jna.ptr.PointerByReference;
 
 
@@ -29,11 +29,35 @@ public interface Cld2Library extends Library {
 
 	String JNA_LIBRARY_NAME = "cld2";
 
-	NativeLibrary JNA_NATIVE_LIB = NativeLibrary
-			.getInstance(Cld2Library.JNA_LIBRARY_NAME);
+	Cld2Library INSTANCE = loadLibrary();
 
-	Cld2Library INSTANCE = (Cld2Library) Native.load(Cld2Library.JNA_LIBRARY_NAME,
-			Cld2Library.class);
+	/**
+	 * Loads the native CLD2 library, first by its plain name and, if that
+	 * fails, from the platform-specific resource directory on the classpath.
+	 *
+	 * @return the JNA proxy bound to the loaded library
+	 * @throws UnsatisfiedLinkError
+	 *             if both attempts fail; the error of the first attempt is
+	 *             thrown with the second one attached as suppressed
+	 */
+	static Cld2Library loadLibrary() {
+		try {
+			return (Cld2Library) Native.load(JNA_LIBRARY_NAME, Cld2Library.class);
+		} catch (UnsatisfiedLinkError e) {
+			// A name starting with "/" is used by JNA as a resource path
+			// without any name mapping, so the platform file name
+			// (e.g. libcld2.so, libcld2.dylib) has to be spelled out here.
+			String resourcePath = "/" + Platform.RESOURCE_PREFIX + "/"
+					+ System.mapLibraryName(JNA_LIBRARY_NAME);
+			try {
+				return (Cld2Library) Native.load(resourcePath, Cld2Library.class);
+			} catch (UnsatisfiedLinkError e2) {
+				// Retrying the plain name would only fail the same way again.
+				e.addSuppressed(e2);
+				throw e;
+			}
+		}
+	}
 
 	//String LanguageName(int lang);
 	String _ZN4CLD212LanguageNameENS_8LanguageE(int lang);
@@ -45,6 +69,13 @@ public interface Cld2Library extends Library {
 	int _ZN4CLD219GetLanguageFromNameEPKc(String src);
 
 	//int ExtDetectLanguageSummary(String buffer, int buffer_length, byte is_plain_text, CLDHints cld_hints, int flags, IntBuffer language3, IntBuffer percent3, DoubleBuffer normalized_score3, PointerByReference resultchunkvector, IntBuffer text_bytes, ByteBuffer is_reliable);
+	// libc++ (macOS) uses NSt3__16vector
+	int _ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPNSt3__16vectorINS_11ResultChunkENS9_9allocatorISB_EEEES7_Pb(
+			byte[] buffer, int bufferLength, boolean isPlainText, CLDHints cldHints, int flags,
+			int[] language3, int[] percent3, double[] normalizedScore3,
+			PointerByReference resultchunkvector, int[] textBytes, boolean[] isReliable);
+
+	// libstdc++ (Linux) uses St6vector
 	int _ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPSt6vectorINS_11ResultChunkESaISA_EES7_Pb(
 			byte[] buffer, int bufferLength, boolean isPlainText, CLDHints cldHints, int flags,
 			int[] language3, int[] percent3, double[] normalizedScore3,
@@ -52,5 +83,4 @@ int _ZN4CLD224ExtDetectLanguageSummaryEPKcibPKNS_8CLDHintsEiPNS_8LanguageEPiPdPS
 
 	//String DetectLanguageVersion();
 	String _ZN4CLD221DetectLanguageVersionEv();
-
 }