diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 2542b73..8ec3b6c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -17,19 +17,9 @@ jobs: steps: - name: Checkout Code uses: actions/checkout@v4 - - name: Download datasets and models from release - uses: robinraju/release-downloader@v1 - with: - tag: 'v0.1.0' - fileName: 'release.zip' - tarBall: true - zipBall: true - out-file-path: 'release-download' - - name: Extract files and place them in the correct place - run: cd release-download && unzip release.zip && mv en/{us_gold.json,us_silver.json,model.fst} ../src/en/data - name: Install newest toolchain run: rustup update stable && rustup default stable - name: Build - run: cargo build --verbose + run: cargo build --verbose -F download-data - name: Run tests - run: cargo test --verbose + run: cargo test --verbose -F download-data diff --git a/Cargo.lock b/Cargo.lock index 3e1818c..43d84c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,23 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -47,7 +64,7 @@ version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -58,7 +75,7 @@ checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", "once_cell", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -67,6 +84,15 @@ version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" +[[package]] +name = "arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "autocfg" version = "1.4.0" @@ -79,6 +105,12 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "beef" version = "0.5.2" @@ -124,18 +156,60 @@ version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array 0.14.7", +] + [[package]] name = "bumpalo" version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "bzip2" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +dependencies = [ + "bzip2-sys", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cc" version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ + "jobserver", + "libc", "shlex", ] @@ -145,6 +219,16 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" 
+dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clap" version = "4.5.36" @@ -207,7 +291,46 @@ dependencies = [ "libc", "once_cell", "unicode-width", - "windows-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", ] [[package]] @@ -235,6 +358,16 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array 0.14.7", + "typenum", +] + [[package]] name = "darling" version = "0.20.11" @@ -270,6 +403,32 @@ dependencies = [ "syn", ] +[[package]] +name = "deflate64" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "da692b8d1080ea3045efaab14434d40468c3d8657e42abddfffca87b428f4c1b" + +[[package]] +name = "deranged" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "derive_arbitrary" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "derive_builder" version = "0.20.2" @@ -301,6 +460,17 @@ dependencies = [ "syn", ] +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + [[package]] name = "either" version = "1.15.0" @@ -313,6 +483,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + [[package]] name = "esaxx-rs" version = "0.1.10" @@ -322,6 +498,16 @@ dependencies = [ "cc", ] +[[package]] +name = "flate2" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -337,6 +523,16 @@ dependencies = [ "typenum", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -346,22 +542,78 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", "wasm-bindgen", ] +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + [[package]] name = "ident_case" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" +[[package]] +name = "indexmap" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +dependencies = [ + "equivalent", + "hashbrown", +] + [[package]] name = "indicatif" version = "0.17.11" @@ -375,6 +627,15 @@ dependencies = [ "web-time", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "generic-array 0.14.7", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -414,6 +675,16 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" +[[package]] +name = "jobserver" +version = "0.1.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38f262f097c174adebe41eb73d66ae9c06b2844fb0da69969647bbddd9b0538a" +dependencies = [ + "getrandom 0.3.2", + "libc", +] + [[package]] name = "js-sys" version = "0.3.77" @@ -436,6 +707,12 @@ version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" +[[package]] +name = "lockfree-object-pool" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9374ef4228402d4b7e403e5838cb880d9ee663314b0a900d5a6aabf0c213552e" + [[package]] name = "log" version = "0.4.27" @@ -476,6 +753,27 @@ dependencies = [ "logos-codegen", ] +[[package]] +name = "lzma-rs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "297e814c836ae64db86b36cf2a557ba54368d03f6afcd7d947c266692f71115e" +dependencies = [ + "byteorder", + "crc", +] + +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + 
[[package]] name = "macro_rules_attribute" version = "0.2.0" @@ -504,6 +802,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" +[[package]] +name = "miniz_oxide" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +dependencies = [ + "adler2", +] + [[package]] name = "monostate" version = "0.1.14" @@ -535,6 +842,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-traits" version = "0.2.19" @@ -604,6 +917,22 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "phf" version = "0.11.3" @@ -658,7 +987,7 @@ dependencies = [ [[package]] name = "phonemoro" -version = "0.2.0" +version = "0.3.0" dependencies = [ "anyhow", "logos", @@ -668,6 +997,9 @@ dependencies = [ "phonetisaurus-g2p", "serde", "tokenizers", + "ureq", + "uuid", + "zip-extract", ] [[package]] @@ -721,6 +1053,12 @@ version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" +[[package]] +name = "powerfmt" 
+version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -748,6 +1086,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + [[package]] name = "rand" version = "0.8.5" @@ -775,7 +1119,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.15", ] [[package]] @@ -838,6 +1182,20 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.15", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc_version" version = "0.4.1" @@ -857,8 +1215,8 @@ dependencies = [ "bimap", "binary-heap-plus", "bitflags 2.9.0", - "generic-array", - "getrandom", + "generic-array 0.12.4", + "getrandom 0.2.15", "itertools 0.12.1", "nom", "num-traits", @@ -872,6 +1230,47 @@ dependencies = [ "unsafe_unwrap", ] +[[package]] +name = "rustls" +version = "0.23.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df51b5869f3a441595eac5e8ff14d486ff285f7b8c0df8770e49c3b56351f0f0" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" + +[[package]] +name = "rustls-webpki" +version = "0.103.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fef8b8769aaccf73098557a87cd1816b4f9c7c16811c9c77142aa695c16f2c03" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "ryu" version = "1.0.20" @@ -916,12 +1315,29 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +[[package]] +name = "simd-adler32" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" + [[package]] name = "siphasher" version = "1.0.1" @@ -940,7 +1356,7 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" dependencies = [ - "base64", + "base64 0.13.1", "nom", "serde", "unicode-segmentation", @@ -962,6 +1378,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = "subtle" +version = "2.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "superslice" version = "1.0.0" @@ -999,6 +1421,25 @@ dependencies = [ "syn", ] +[[package]] +name = "time" +version = "0.3.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" +dependencies = [ + "deranged", + "num-conv", + "powerfmt", + "serde", + "time-core", +] + +[[package]] +name = "time-core" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" + [[package]] name = "tokenizers" version = "0.21.1" @@ -1008,7 +1449,7 @@ dependencies = [ "aho-corasick", "derive_builder", "esaxx-rs", - "getrandom", + "getrandom 0.2.15", "indicatif", "itertools 0.13.0", "lazy_static", @@ -1076,24 +1517,90 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1230ec65f13e0f9b28d789da20d2d419511893ea9dac2c1f4ef67b8b14e5da80" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b0351ca625c7b41a8e4f9bb6c5d9755f67f62c2187ebedecacd9974674b271d" +dependencies = [ + "base64 0.22.1", + "flate2", + "log", + "percent-encoding", + "rustls", + "rustls-pemfile", + "rustls-pki-types", + "ureq-proto", + "utf-8", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae239d0a3341aebc94259414d1dc67cfce87d41cbebc816772c91b77902fafa4" +dependencies = [ + "base64 0.22.1", + "http", + "httparse", + "log", +] + +[[package]] +name = 
"utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "uuid" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9" +dependencies = [ + "getrandom 0.3.2", +] + [[package]] name = "vec_map" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -1161,6 +1668,24 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-roots" +version = "0.26.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2210b291f7ea53617fbafcc4939f10914214ec15aace5ba62293a668f322c5c9" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -1234,6 +1759,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags 2.9.0", +] + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "zerocopy" version = "0.8.24" @@ -1253,3 +1796,104 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zip" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dcb24d0152526ae49b9b96c1dcf71850ca1e0b882e4e28ed898a93c41334744" +dependencies = [ + "aes", + "arbitrary", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "deflate64", + "flate2", + "getrandom 0.3.2", + "hmac", + "indexmap", + "lzma-rs", + "memchr", + "pbkdf2", + "sha1", + "time", + "xz2", + "zeroize", + "zopfli", + "zstd", +] + +[[package]] +name = "zip-extract" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3938d2b7d7ffd0fb7d4a86baeade9189535487d05d175401daf92306c531c0" +dependencies = [ + "log", + "thiserror", + "zip", +] + +[[package]] +name = "zopfli" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5019f391bac5cf252e93bbcc53d039ffd62c7bfb7c150414d61369afe57e946" +dependencies = [ + "bumpalo", + "crc32fast", + "lockfree-object-pool", + "log", + "once_cell", + "simd-adler32", +] + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.15+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb81183ddd97d0c74cedf1d50d85c8d08c1b8b68ee863bdee9e706eedba1a237" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index 28c7e54..72825d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,21 @@ [workspace] -members = [ "phonemoro-macros", "phonemoro-common", "phonemoro-cli"] +members = [".", "phonemoro-macros", "phonemoro-common", "phonemoro-cli"] +default-members = ["."] resolver = "2" [package] name = "phonemoro" -version = "0.2.0" +version = "0.3.0" edition = "2024" authors = ["lastleon "] +[features] +default = [] + +# This enables automatic download of data necessary to build this project from +# the GitHub releases page. Disabled by default. 
+download-data = ["dep:anyhow", "dep:ureq", "dep:uuid", "dep:zip-extract"] + [dependencies] phonemoro-common = { version = "0.2.0", path = "./phonemoro-common" } phonemoro-macros = { version = "0.2.0", path = "./phonemoro-macros" } @@ -17,3 +25,9 @@ phf = { version = "0.11.3", features = ["macros"] } serde = { version = "1.0.217", features = ["derive"] } tokenizers = "0.21.0" phonetisaurus-g2p = "0.1.1" + +[build-dependencies] +anyhow = { version = "1.0.95", optional = true } +ureq = { version = "3.0.10", optional = true } +uuid = { version = "1.16.0", features = ["v4"], optional = true } +zip-extract = { version = "0.2.2", optional = true } diff --git a/README.md b/README.md index 1efcdd0..78cf87f 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,12 @@ Created for the use with [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M), bu Suitable for edge devices. Easy deployment, since all data is statically included in the binary, so no dependencies or other files needed. Currently only support for US english. +

🚨 WIP, so a lot can still change 🚨

-> ⚠️ This project was renamed from phonemizer-rs to phonemoro. See https://github.com/lastleon/phonemoro/pull/1#issue-2992845609 for further information. +> ⚠️ This project was renamed from phonemizer-rs to phonemoro. See <https://github.com/lastleon/phonemoro/pull/1#issue-2992845609> for further information. ## Overview @@ -23,7 +24,7 @@ This project started because I needed a phonemizer for use with [Kokoro](https:/ - produce IPA phonemes that are compatible with Kokoro, i.e. do not sound weird - be easy to use and cross compile -With that in mind, this is how the works: +With that in mind, this is how the phonemizer works: 1. **Tokenization**: First, the input text is tokenized using [Logos](https://github.com/maciejhirsz/logos) for easier preprocessing and phonemization logic. 2. **Lookup**: Then, the relevant words are looked up in the grapheme-to-phoneme datasets used by [Misaki](https://github.com/hexgrad/Misaki), the phonemizer behind Kokoro. The datasets are preprocessed and then statically embedded in the binary as a `phf_map` from the [phf](https://github.com/rust-phf/rust-phf) crate. @@ -31,24 +32,24 @@ With that in mind, this is how the works: ## Usage (lib) -1. Add the repository as a submodule to your crate: +This library requires data that needs to be prepared. You can either do that manually, or you can enable a feature and automatically download the prepared data from the releases page. -```shell -$ git submodule add https://github.com/lastleon/phonemoro -``` +By default, automatically downloading the data is disabled. -2. Prepare the data. Currently, only US english is supported, so the instructions focus on that. You have two options: +### Easy Way _(Recommended)_ - - **Build the data yourself**. For that, go to the `data-preparation` directory, and follow the instructions there. Then, copy the artifacts (`model.fst`, `us_gold.json` and `us_silver.json`) to `src/en/data`. Note that this requires additional dependencies, and is currently only supported on Linux and maybe MacOS. 
- **Download the data from the _Releases_ page** (_Recommended_). Copy the files `model.fst`, `us_gold.json` and `us_silver.json` within the `en/` folder from the release into `src/en/data`. - -3. Now, back in your crate, add `phonemoro` as a dependency: +1. Add this library to your crate, with the `download-data` feature enabled: ```shell -$ cargo add --path ./phonemoro +$ cargo add --git https://github.com/lastleon/phonemoro phonemoro -F download-data ``` -4. Use the library like so: +> ⚠️ **Warning**: +> This downloads the `release.zip` file from the releases page on GitHub, unzips it, and moves the contents to the appropriate directory. +> +> This only works from **version 0.3.0 onwards**. You should only ever use the latest version of the library anyway, for now. + +2. Use the library like so: ```rust use phonemoro::en::phonemizer::EnPhonemizer; @@ -61,23 +62,48 @@ fn main() { } ``` -## Usage (cli) +### Harder Way + +Use this only if you're uncomfortable downloading from the internet, or you want to use your own data. -1. Clone the repository: +1. Clone this repository: ```shell $ git clone https://github.com/lastleon/phonemoro ``` -2. Prepare data the same way as for library usage, so step 2 of the previous section. +2. Prepare the data. Currently, only US English is supported, so the instructions focus on that. For that, go to the `data-preparation` directory, and follow the instructions there. Then, copy the artifacts (`model.fst`, `us_gold.json` and `us_silver.json`) to `src/en/data`. Note that this requires additional dependencies, and is currently only supported on Linux and maybe macOS. + +3. Now, go to your own crate, and add `phonemoro` as a dependency: + +```shell +$ cargo add --path <path-to-phonemoro> +``` + +4. Use the library as shown in the previous section. -3. Build the cli: +## Usage (cli) + +1. Clone this repository: ```shell -$ cargo build -p phonemoro-cli --release +$ git clone https://github.com/lastleon/phonemoro ``` -4. 
Use the binary, no other files needed: +2. Build the cli tool: + +- **Easy Way**: Build the cli tool with the `download-data` feature enabled: + + ```shell + $ cargo build -p phonemoro-cli --release -F download-data + ``` + + > ⚠️ **Warning**: + > The same warnings as in [Usage (lib) > Easy Way](#easy-way) apply here. + +- **Harder Way**: Follow step 2 of [Usage (lib) > Harder Way](#harder-way) + +3. Use the tool: ```shell $ ./target/release/phonemoro-cli --help diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..4f8128a --- /dev/null +++ b/build.rs @@ -0,0 +1,75 @@ +use std::path::PathBuf; + +fn main() { + if !en_data_available() { + #[cfg(feature = "download-data")] + download::en_download_data(); + + #[cfg(not(feature = "download-data"))] + panic!( + "Necessary data for language 'en' not found. You can manually add it (see README on GitHub), or enable the 'download-data' feature to automatically download it." + ); + } +} + +/// Check if all artifacts for 'en' are available. If at least one is missing, this returns false. 
+fn en_data_available() -> bool { + let data_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/en/data"); + let data_files = ["us_gold.json", "us_silver.json", "model.fst"]; + + data_files + .iter() + .all(|&fname| data_path.join(fname).exists()) +} + +#[cfg(feature = "download-data")] +mod download { + use anyhow::{Context, Result}; + use std::path::PathBuf; + use uuid::Uuid; + + /// Download artifacts for en from GitHub releases + pub fn en_download_data() { + let download_url = + "https://github.com/lastleon/phonemoro/releases/download/v0.1.0/release.zip"; + let data_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("src/en/data"); + + // Download and unzip to temporary directory + let tmp_dir = std::env::temp_dir().join(Uuid::new_v4().to_string()); + + let downloaded_file = + download(download_url).expect("Downloading data from GitHub release page failed."); + + zip_extract::extract(std::io::Cursor::new(downloaded_file), &tmp_dir, false) + .expect("Unzipping release.zip failed."); + + // Move files to correct location + let tmp_en_data_dir = tmp_dir.join("en"); + for entry in tmp_en_data_dir + .read_dir() + .expect("Reading the contents of the unzipped directory failed.") + { + let fpath = entry.expect("Reading a in the unzipped directory failed."); + std::fs::rename(fpath.path(), data_path.join(fpath.file_name())) + .expect("Moving file from unzipped directory to data directory failed."); + } + } + + /// Download file from url to memory + fn download>(url: S) -> Result> { + let mut resp = ureq::get(url.as_ref()) + .call() + .with_context(|| "Sending download request failed.")?; + + if resp.status() != ureq::http::StatusCode::OK { + anyhow::bail!("Request failed with status: {}", resp.status()); + } + + // Note: Limit raised to 50 MiB (only ~21 MB should be necessary) + Ok(resp + .body_mut() + .with_config() + .limit(50 * 1024 * 1024) + .read_to_vec()?) 
+ } +} diff --git a/phonemoro-cli/Cargo.toml b/phonemoro-cli/Cargo.toml index 7bfe365..b45ad7a 100644 --- a/phonemoro-cli/Cargo.toml +++ b/phonemoro-cli/Cargo.toml @@ -8,7 +8,14 @@ authors = ["lastleon "] description = "CLI for phonemoro" repository = "https://github.com/lastleon/phonemoro/tree/main/phonemoro-cli" +[features] +default = [] + +# This enables automatic download of data necessary to build this project from +# the GitHub releases page. Disabled by default. +download-data = ["phonemoro/download-data"] + [dependencies] anyhow = "1.0.97" clap = { version = "4.5.32", features = ["derive"] } -phonemoro = { version = "0.2.0", path = ".." } +phonemoro = { version = "0.3.0", path = ".." } diff --git a/phonemoro-macros/src/dataset_parsing.rs b/phonemoro-macros/src/dataset_parsing.rs index 1363dee..905af5a 100644 --- a/phonemoro-macros/src/dataset_parsing.rs +++ b/phonemoro-macros/src/dataset_parsing.rs @@ -1,5 +1,3 @@ -use anyhow; -use phf_codegen; use serde::Deserialize; use std::{collections::HashMap, fs::File, io::BufReader, path::PathBuf}; diff --git a/phonemoro-macros/src/lib.rs b/phonemoro-macros/src/lib.rs index 590acf0..af9dbc1 100644 --- a/phonemoro-macros/src/lib.rs +++ b/phonemoro-macros/src/lib.rs @@ -26,15 +26,14 @@ pub fn phm_from_json(tokens: TokenStream) -> TokenStream { passed_data_path }; - let data_path = passed_data_path.canonicalize().expect( - format!( + let data_path = passed_data_path.canonicalize().unwrap_or_else(|_| { + panic!( "Canonicalization failed. Absolute path before trying to resolve it: '{:?}'", - ::std::path::absolute(passed_data_path).expect( + std::path::absolute(passed_data_path).expect( "Getting absolute path of argument path failed, something is really wrong." ) ) - .as_str(), - ); + }); // create map from that let phf_map_builder = create_phf_map(&data_path).expect("Creating phf map failed.");