From 63b7f4e2ebcac41778ef018274af47461f45ca88 Mon Sep 17 00:00:00 2001 From: pkufool Date: Mon, 11 Aug 2025 08:13:23 +0800 Subject: [PATCH] support output all initials/finals/pinyins; add valid_pinyin function --- cppinyin/csrc/cppinyin.cc | 101 ++++++++++++++++++++-- cppinyin/csrc/cppinyin.h | 11 +++ cppinyin/csrc/cppinyin_test.cc | 123 +++++++++++++++++++++++++++ cppinyin/csrc/utils.cc | 8 ++ cppinyin/csrc/utils.h | 2 + cppinyin/python/cppinyin/cppinyin.py | 12 +++ cppinyin/python/csrc/cppinyin.cc | 36 +++++++- 7 files changed, 287 insertions(+), 6 deletions(-) diff --git a/cppinyin/csrc/cppinyin.cc b/cppinyin/csrc/cppinyin.cc index 7a9865c..63f7d5f 100644 --- a/cppinyin/csrc/cppinyin.cc +++ b/cppinyin/csrc/cppinyin.cc @@ -26,9 +26,12 @@ #include #include #include +#include #include #include #include +#include +#include #include namespace cppinyin { @@ -41,6 +44,98 @@ void PinyinEncoder::Init(int32_t num_threads) { tone_to_normal_.reserve(NORMAL_TO_TONE.size()); for (const auto &item : NORMAL_TO_TONE) { tone_to_normal_[item.second] = item.first; + auto value = RemoveNumberTone(item.second); + no_tone_set_.insert(value); + } +} + +std::vector +PinyinEncoder::AllPinyin(const std::string &tone /*=number*/, + bool partial /*=false*/) const { + + std::vector pinyins; + if (partial) { + auto initials = AllInitials(); + for (const auto &initial : initials) { + pinyins.push_back(initial); + } + auto finals = AllFinals(tone); + for (const auto &final : finals) { + pinyins.push_back(final); + } + return pinyins; + } + if (tone == "none") { + for (const auto &value : no_tone_set_) { + pinyins.push_back(value); + } + } else if (tone == "normal") { + for (const auto &item : NORMAL_TO_TONE) { + pinyins.push_back(item.first); + } + } else if (tone == "number") { + for (const auto &item : NORMAL_TO_TONE) { + pinyins.push_back(item.second); + } + } else { + std::cerr << "PinyinEncoder: Invalid tone type: " << tone << std::endl; + std::abort(); + } + std::sort(pinyins.begin(), pinyins.end()); + return pinyins; +} + +std::vector PinyinEncoder::AllInitials() const { + std::set initial_set; + for (const auto &item : NORMAL_TO_TONE) { + auto initial = GetInitial(item.first); + if (!initial.empty()) { + initial_set.insert(initial); + } + } + std::vector initials; + initials.reserve(initial_set.size()); + for (const auto &initial : initial_set) { + initials.push_back(initial); + } + return initials; +} + +std::vector +PinyinEncoder::AllFinals(const std::string &tone /*=number*/) const { + auto pinyins = AllPinyin(tone, false); + + std::set finals_set; + for (const auto &value : pinyins) { + auto initial = GetInitial(value); + auto final_t = value.substr(initial.size()); + if (!final_t.empty()) { + finals_set.insert(final_t); + } + } + + std::vector finals; + finals.reserve(finals_set.size()); + for (const auto &value : finals_set) { + finals.push_back(value); + } + return finals; +} + +bool PinyinEncoder::ValidPinyin(const std::string &s, + const std::string &tone /*=number*/) const { + if (tone == "none") { + return no_tone_set_.find(s) != no_tone_set_.end(); + } else if (tone == "normal") { + return NORMAL_TO_TONE.find(s) != NORMAL_TO_TONE.end(); + } else if (tone == "number") { + return tone_to_normal_.find(s) != tone_to_normal_.end(); + } else { + CPY_ASSERT(tone.empty(), + "tone should be empty of one of 'number', 'none' and 'normal'"); + return no_tone_set_.find(s) != no_tone_set_.end() || + NORMAL_TO_TONE.find(s) != NORMAL_TO_TONE.end() || + tone_to_normal_.find(s) != tone_to_normal_.end(); } } @@ -307,11 +402,7 @@ std::string PinyinEncoder::GetInitial(const std::string &s) const { } std::string PinyinEncoder::RemoveTone(const std::string &s) const { - if (std::isdigit(s.back())) { - return s.substr(0, s.size() - 1); - } else { - return s; - } + return RemoveNumberTone(s); std::string phonetic; std::size_t len; diff --git a/cppinyin/csrc/cppinyin.h b/cppinyin/csrc/cppinyin.h index f39ae9a..a350d4e 100644 --- a/cppinyin/csrc/cppinyin.h +++ b/cppinyin/csrc/cppinyin.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,15 @@ class PinyinEncoder { ~PinyinEncoder() {} + std::vector AllPinyin(const std::string &tone = "number", + bool partial = false) const; + + std::vector AllInitials() const; + + std::vector AllFinals(const std::string &tone = "number") const; + + bool ValidPinyin(const std::string &s, const std::string &tone = "") const; + void Encode(const std::string &str, std::vector *ostrs, const std::string &tone = "number", bool partial = false, std::vector *segs = nullptr) const; @@ -114,6 +124,7 @@ class PinyinEncoder { size_t LoadValues(std::istream &ifile); std::unordered_map tone_to_normal_; + std::unordered_set no_tone_set_; std::vector tokens_; std::vector scores_; std::vector> values_; diff --git a/cppinyin/csrc/cppinyin_test.cc b/cppinyin/csrc/cppinyin_test.cc index ce87495..bcd94fb 100644 --- a/cppinyin/csrc/cppinyin_test.cc +++ b/cppinyin/csrc/cppinyin_test.cc @@ -293,4 +293,127 @@ TEST(PinyinEncoder, TestSaveLoad) { "wo3 shi4 zhong1 guo2 ren2 wo3 ai4 wo3 de love you zu3 guo2 "); } +TEST(PinyinEncoder, TestAllPinyin) { + PinyinEncoder processor; + std::ostringstream oss; + auto pinyins = processor.AllPinyin("number", false); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All pinyins in number tone: " << oss.str() << std::endl; + + pinyins = processor.AllPinyin("normal", false); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All pinyins in normal tone: " << oss.str() << std::endl; + + pinyins = processor.AllPinyin("none", false); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All pinyins in none tone: " << oss.str() << std::endl; + + pinyins = processor.AllPinyin("number", true); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All partial pinyins in number tone: " << oss.str() << std::endl; + + pinyins = processor.AllPinyin("normal", true); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All partial pinyins in normal tone: " << oss.str() << std::endl; + + pinyins = processor.AllPinyin("none", true); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All partial pinyins in none tone: " << oss.str() << std::endl; +} + +TEST(PinyinEncoder, TestAllInitialFinals) { + PinyinEncoder processor; + std::ostringstream oss; + auto pinyins = processor.AllInitials(); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All pinyin initials: " << oss.str() << std::endl; + + pinyins = processor.AllFinals("normal"); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All pinyin finals in normal tone: " << oss.str() << std::endl; + + pinyins = processor.AllFinals("none"); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All pinyin finals in none tone: " << oss.str() << std::endl; + + pinyins = processor.AllFinals("number"); + oss.str(""); + for (const auto &pinyin : pinyins) { + oss << pinyin << " "; + } + std::cerr << "All partial pinyin finals in number tone: " << oss.str() + << std::endl; +} + +TEST(PinyinEncoder, TestValidPinyin) { + PinyinEncoder processor; + std::vector pinyins = { + "wǒ", "shì", "zhōng", "guó", "rén", "wǒ", "ài", "wǒ", "de", + "love", "you", "zǔ", "guó", "wo3", "shi4", "zhong1", "guo2", "ren2", + "wo3", "ai4", "wo3", "de", "zu3", "guo2", "wo", "shi", "zhong", + "guo", "ren", "wo", "ai", "wo", "de", "zu", "guo"}; + std::vector valids = { + true, true, true, true, true, true, true, true, true, false, true, true, + true, true, true, true, true, true, true, true, true, true, true, true, + true, true, true, true, true, true, true, true, true, true, true}; + + for (int32_t i = 0; i < pinyins.size(); ++i) { + EXPECT_EQ(processor.ValidPinyin(pinyins[i]), valids[i]); + } + + pinyins = {"wǒ", "shì", "zhōng", "guó", "rén", "wǒ", "ài", + "wǒ", "de", "love", "you", "zǔ", "guó"}; + valids = {true, true, true, true, true, true, true, + true, true, false, false, true, true}; + + for (int32_t i = 0; i < pinyins.size(); ++i) { + EXPECT_EQ(processor.ValidPinyin(pinyins[i], "normal"), valids[i]); + } + + pinyins = {"wo3", "shi4", "zhong1", "guo2", "ren2", "wo3", "ai4", + "wo3", "de", "love", "you", "zu3", "guo2"}; + valids = {true, true, true, true, true, true, true, + true, true, false, false, true, true}; + + for (int32_t i = 0; i < pinyins.size(); ++i) { + EXPECT_EQ(processor.ValidPinyin(pinyins[i], "number"), valids[i]); + } + + pinyins = {"wo", "shi", "zhong", "guo", "ren", "wo", "ai", + "wo", "de", "love", "you", "zu", "guo"}; + valids = {true, true, true, true, true, true, true, + true, true, false, true, true, true}; + + for (int32_t i = 0; i < pinyins.size(); ++i) { + EXPECT_EQ(processor.ValidPinyin(pinyins[i], "none"), valids[i]); + } +} + } // namespace cppinyin diff --git a/cppinyin/csrc/utils.cc b/cppinyin/csrc/utils.cc index d5c8ccd..1ade02f 100644 --- a/cppinyin/csrc/utils.cc +++ b/cppinyin/csrc/utils.cc @@ -7,6 +7,14 @@ namespace cppinyin { +std::string RemoveNumberTone(const std::string &s) { + if (std::isdigit(s.back())) { + return s.substr(0, s.size() - 1); + } else { + return s; + } +} + size_t ReadUint32(std::istream &ifile, uint32_t *data) { ifile.read(reinterpret_cast(data), sizeof(uint32_t)); return sizeof(uint32_t); diff --git a/cppinyin/csrc/utils.h b/cppinyin/csrc/utils.h index e3fec2a..1e9cfc6 100644 --- a/cppinyin/csrc/utils.h +++ b/cppinyin/csrc/utils.h @@ -38,6 +38,8 @@ size_t ReadHeader(std::istream &ifile, std::string *data); size_t WriteHeader(std::ofstream &ofile); +std::string RemoveNumberTone(const std::string &s); + } // namespace cppinyin #endif // CPPINYIN_CSRC_UTILS_H_ diff --git a/cppinyin/python/cppinyin/cppinyin.py b/cppinyin/python/cppinyin/cppinyin.py index 41e1d17..b027278 100644 --- a/cppinyin/python/cppinyin/cppinyin.py +++ b/cppinyin/python/cppinyin/cppinyin.py @@ -56,6 +56,18 @@ def to_finals(self, data: Union[str, List[str]], tone: str = "number"): """ return self.encoder.to_finals(data, tone) + def all_pinyins(self, tone: str = "number"): + return self.encoder.all_pinyins(tone) + + def all_initials(self): + return self.encoder.all_initials() + + def all_finals(self, tone: str = "number"): + return self.encoder.all_finals(tone) + + def valid_pinyin(self, pinyin: str, tone: str = ""): + return self.encoder.valid_pinyin(pinyin, tone) + def load(self, path: str): self.encoder.load(path) diff --git a/cppinyin/python/csrc/cppinyin.cc b/cppinyin/python/csrc/cppinyin.cc index c3ff6b7..f97b987 100644 --- a/cppinyin/python/csrc/cppinyin.cc +++ b/cppinyin/python/csrc/cppinyin.cc @@ -126,7 +126,41 @@ void PybindCppinyin(py::module &m) { self.ToFinals(strs, &ostrs, tone); return ostrs; }, - py::arg("strs"), py::arg("tone") = "number"); + py::arg("strs"), py::arg("tone") = "number") + .def( + "valid_pinyin", + [](PyClass &self, const std::string &str, + const std::string &tone = "number") -> bool { + py::gil_scoped_release release; + bool res = self.ValidPinyin(str, tone); + return res; + }, + py::arg("str"), py::arg("tone") = "number") + + .def( + "all_pinyins", + [](PyClass &self, const std::string &tone = "number", + bool partial = false) -> std::vector { + py::gil_scoped_release release; + std::vector ostrs = self.AllPinyin(tone, partial); + return ostrs; + }, + py::arg("tone") = "number", py::arg("partial") = false) + .def("all_initials", + [](PyClass &self) -> std::vector { + py::gil_scoped_release release; + std::vector ostrs = self.AllInitials(); + return ostrs; + }) + .def( + "all_finals", + [](PyClass &self, + const std::string &tone = "number") -> std::vector { + py::gil_scoped_release release; + std::vector ostrs = self.AllFinals(tone); + return ostrs; + }, + py::arg("tone") = "number"); } PYBIND11_MODULE(_cppinyin, m) {