Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 96 additions & 5 deletions cppinyin/csrc/cppinyin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@
#include <iostream>
#include <limits>
#include <numeric>
#include <set>
#include <sstream>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>

namespace cppinyin {
Expand All @@ -41,6 +44,98 @@ void PinyinEncoder::Init(int32_t num_threads) {
tone_to_normal_.reserve(NORMAL_TO_TONE.size());
for (const auto &item : NORMAL_TO_TONE) {
tone_to_normal_[item.second] = item.first;
auto value = RemoveNumberTone(item.second);
no_tone_set_.insert(value);
}
}

// Lists the pinyins known to the encoder in the requested tone style.
//
// tone:    "none" reads no_tone_set_, "normal" reads the keys of
//          NORMAL_TO_TONE and "number" reads its values. Any other value
//          is reported on stderr and the process aborts.
// partial: when true, the result is every initial followed by every final
//          (for `tone`) instead of whole syllables; that list is returned
//          as built, without the final sort.
std::vector<std::string>
PinyinEncoder::AllPinyin(const std::string &tone /*=number*/,
                         bool partial /*=false*/) const {
  std::vector<std::string> result;

  if (partial) {
    // Initials first, then finals, each appended in the order produced.
    const auto initials = AllInitials();
    result.insert(result.end(), initials.begin(), initials.end());
    const auto finals = AllFinals(tone);
    result.insert(result.end(), finals.begin(), finals.end());
    return result;
  }

  if (tone == "number") {
    for (const auto &entry : NORMAL_TO_TONE) {
      result.push_back(entry.second);
    }
  } else if (tone == "normal") {
    for (const auto &entry : NORMAL_TO_TONE) {
      result.push_back(entry.first);
    }
  } else if (tone == "none") {
    result.insert(result.end(), no_tone_set_.begin(), no_tone_set_.end());
  } else {
    std::cerr << "PinyinEncoder: Invalid tone type: " << tone << std::endl;
    std::abort();
  }

  // The sources above have no guaranteed order; sort for a stable result.
  std::sort(result.begin(), result.end());
  return result;
}

std::vector<std::string> PinyinEncoder::AllInitials() const {
std::set<std::string> initial_set;
for (const auto &item : NORMAL_TO_TONE) {
auto initial = GetInitial(item.first);
if (!initial.empty()) {
initial_set.insert(initial);
}
}
std::vector<std::string> initials;
initials.reserve(initial_set.size());
for (const auto &initial : initial_set) {
initials.push_back(initial);
}
return initials;
}

std::vector<std::string>
PinyinEncoder::AllFinals(const std::string &tone /*=number*/) const {
auto pinyins = AllPinyin(tone, false);

std::set<std::string> finals_set;
for (const auto &value : pinyins) {
auto initial = GetInitial(value);
auto final_t = value.substr(initial.size());
if (!final_t.empty()) {
finals_set.insert(final_t);
}
}

std::vector<std::string> finals;
finals.reserve(finals_set.size());
for (const auto &value : finals_set) {
finals.push_back(value);
}
return finals;
}

// Reports whether `s` is a known pinyin.
//
// tone selects which table is consulted:
//   "none"   -> no_tone_set_
//   "normal" -> keys of NORMAL_TO_TONE
//   "number" -> keys of tone_to_normal_
//   ""       -> `s` is accepted if it appears in any of the three.
// Any other value trips CPY_ASSERT.
bool PinyinEncoder::ValidPinyin(const std::string &s,
                                const std::string &tone /*=""*/) const {
  if (tone == "none") {
    return no_tone_set_.find(s) != no_tone_set_.end();
  }
  if (tone == "normal") {
    return NORMAL_TO_TONE.find(s) != NORMAL_TO_TONE.end();
  }
  if (tone == "number") {
    return tone_to_normal_.find(s) != tone_to_normal_.end();
  }

  CPY_ASSERT(tone.empty(),
             "tone should be empty or one of 'number', 'none' and 'normal'");
  // Unspecified tone: accept a match in any representation.
  return no_tone_set_.find(s) != no_tone_set_.end() ||
         NORMAL_TO_TONE.find(s) != NORMAL_TO_TONE.end() ||
         tone_to_normal_.find(s) != tone_to_normal_.end();
}

Expand Down Expand Up @@ -307,11 +402,7 @@ std::string PinyinEncoder::GetInitial(const std::string &s) const {
}

std::string PinyinEncoder::RemoveTone(const std::string &s) const {
if (std::isdigit(s.back())) {
return s.substr(0, s.size() - 1);
} else {
return s;
}
return RemoveNumberTone(s);

std::string phonetic;
std::size_t len;
Expand Down
11 changes: 11 additions & 0 deletions cppinyin/csrc/cppinyin.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -59,6 +60,15 @@ class PinyinEncoder {

~PinyinEncoder() {}

// Returns the known pinyins in the given tone style ("number", "normal" or
// "none"). With partial = true the result is all initials followed by all
// finals instead of whole syllables.
std::vector<std::string> AllPinyin(const std::string &tone = "number",
bool partial = false) const;

// Returns the distinct, non-empty pinyin initials in sorted order.
std::vector<std::string> AllInitials() const;

// Returns the distinct, non-empty pinyin finals for the given tone style
// ("number", "normal" or "none") in sorted order.
std::vector<std::string> AllFinals(const std::string &tone = "number") const;

// Returns true if `s` is a known pinyin in the given tone style ("number",
// "normal" or "none"). An empty tone accepts a match in any of the three.
bool ValidPinyin(const std::string &s, const std::string &tone = "") const;

void Encode(const std::string &str, std::vector<std::string> *ostrs,
const std::string &tone = "number", bool partial = false,
std::vector<std::string> *segs = nullptr) const;
Expand Down Expand Up @@ -114,6 +124,7 @@ class PinyinEncoder {
size_t LoadValues(std::istream &ifile);

std::unordered_map<std::string, std::string> tone_to_normal_;
std::unordered_set<std::string> no_tone_set_;
std::vector<std::string> tokens_;
std::vector<float> scores_;
std::vector<std::vector<std::string>> values_;
Expand Down
123 changes: 123 additions & 0 deletions cppinyin/csrc/cppinyin_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -293,4 +293,127 @@ TEST(PinyinEncoder, TestSaveLoad) {
"wo3 shi4 zhong1 guo2 ren2 wo3 ai4 wo3 de love you zu3 guo2 ");
}

// Smoke test: prints the result of AllPinyin for every tone style, both as
// whole syllables and as partial (initial/final) lists. Makes no assertions.
TEST(PinyinEncoder, TestAllPinyin) {
  PinyinEncoder processor;

  // Writes `label` followed by the space-separated entries as one stderr line.
  const auto dump = [](const std::string &label,
                       const std::vector<std::string> &entries) {
    std::ostringstream joined;
    for (const auto &entry : entries) {
      joined << entry << " ";
    }
    std::cerr << label << joined.str() << std::endl;
  };

  dump("All pinyins in number tone: ", processor.AllPinyin("number", false));
  dump("All pinyins in normal tone: ", processor.AllPinyin("normal", false));
  dump("All pinyins in none tone: ", processor.AllPinyin("none", false));

  dump("All partial pinyins in number tone: ",
       processor.AllPinyin("number", true));
  dump("All partial pinyins in normal tone: ",
       processor.AllPinyin("normal", true));
  dump("All partial pinyins in none tone: ",
       processor.AllPinyin("none", true));
}

// Smoke test: prints the result of AllInitials and of AllFinals for every
// tone style. Makes no assertions.
TEST(PinyinEncoder, TestAllInitialFinals) {
  PinyinEncoder processor;

  // Writes `label` followed by the space-separated entries as one stderr line.
  const auto dump = [](const std::string &label,
                       const std::vector<std::string> &entries) {
    std::ostringstream joined;
    for (const auto &entry : entries) {
      joined << entry << " ";
    }
    std::cerr << label << joined.str() << std::endl;
  };

  dump("All pinyin initials: ", processor.AllInitials());
  dump("All pinyin finals in normal tone: ", processor.AllFinals("normal"));
  dump("All pinyin finals in none tone: ", processor.AllFinals("none"));
  dump("All partial pinyin finals in number tone: ",
       processor.AllFinals("number"));
}

// Checks ValidPinyin against hand-written expectations: first with the
// default tone argument, then with "normal", "number" and "none".
TEST(PinyinEncoder, TestValidPinyin) {
  PinyinEncoder processor;
  std::vector<std::string> pinyins = {
      "wǒ",   "shì", "zhōng", "guó", "rén",  "wǒ",     "ài",   "wǒ",   "de",
      "love", "you", "zǔ",    "guó", "wo3",  "shi4",   "zhong1", "guo2", "ren2",
      "wo3",  "ai4", "wo3",   "de",  "zu3",  "guo2",   "wo",   "shi",  "zhong",
      "guo",  "ren", "wo",    "ai",  "wo",   "de",     "zu",   "guo"};
  std::vector<bool> valids = {
      true, true, true, true, true, true, true, true, true, false, true, true,
      true, true, true, true, true, true, true, true, true, true,  true, true,
      true, true, true, true, true, true, true, true, true, true,  true};

  // Guard the parallel arrays so a bad edit fails instead of reading past
  // the end of `valids`.
  ASSERT_EQ(pinyins.size(), valids.size());
  for (size_t i = 0; i < pinyins.size(); ++i) {
    const bool expected = valids[i];
    EXPECT_EQ(processor.ValidPinyin(pinyins[i]), expected)
        << "pinyin: " << pinyins[i];
  }

  pinyins = {"wǒ", "shì", "zhōng", "guó", "rén", "wǒ", "ài",
             "wǒ", "de",  "love",  "you", "zǔ",  "guó"};
  valids = {true, true, true,  true,  true, true, true,
            true, true, false, false, true, true};

  ASSERT_EQ(pinyins.size(), valids.size());
  for (size_t i = 0; i < pinyins.size(); ++i) {
    const bool expected = valids[i];
    EXPECT_EQ(processor.ValidPinyin(pinyins[i], "normal"), expected)
        << "pinyin: " << pinyins[i];
  }

  pinyins = {"wo3", "shi4", "zhong1", "guo2", "ren2", "wo3", "ai4",
             "wo3", "de",   "love",   "you",  "zu3",  "guo2"};
  valids = {true, true, true,  true,  true, true, true,
            true, true, false, false, true, true};

  ASSERT_EQ(pinyins.size(), valids.size());
  for (size_t i = 0; i < pinyins.size(); ++i) {
    const bool expected = valids[i];
    EXPECT_EQ(processor.ValidPinyin(pinyins[i], "number"), expected)
        << "pinyin: " << pinyins[i];
  }

  pinyins = {"wo", "shi", "zhong", "guo", "ren", "wo", "ai",
             "wo", "de",  "love",  "you", "zu",  "guo"};
  valids = {true, true, true,  true, true, true, true,
            true, true, false, true, true, true};

  ASSERT_EQ(pinyins.size(), valids.size());
  for (size_t i = 0; i < pinyins.size(); ++i) {
    const bool expected = valids[i];
    EXPECT_EQ(processor.ValidPinyin(pinyins[i], "none"), expected)
        << "pinyin: " << pinyins[i];
  }
}

} // namespace cppinyin
8 changes: 8 additions & 0 deletions cppinyin/csrc/utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@

namespace cppinyin {

// Returns `s` with one trailing ASCII digit removed, or `s` unchanged when
// it is empty or does not end in a digit.
std::string RemoveNumberTone(const std::string &s) {
  // back() on an empty string is undefined, so check emptiness first. The
  // cast is needed because std::isdigit is undefined for negative values,
  // which plain char can hold for non-ASCII (e.g. UTF-8) bytes.
  if (!s.empty() && std::isdigit(static_cast<unsigned char>(s.back()))) {
    return s.substr(0, s.size() - 1);
  }
  return s;
}

size_t ReadUint32(std::istream &ifile, uint32_t *data) {
ifile.read(reinterpret_cast<char *>(data), sizeof(uint32_t));
return sizeof(uint32_t);
Expand Down
2 changes: 2 additions & 0 deletions cppinyin/csrc/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ size_t ReadHeader(std::istream &ifile, std::string *data);

size_t WriteHeader(std::ofstream &ofile);

// Returns `s` without its trailing digit if it ends in one; otherwise
// returns `s` unchanged.
std::string RemoveNumberTone(const std::string &s);

} // namespace cppinyin

#endif // CPPINYIN_CSRC_UTILS_H_
12 changes: 12 additions & 0 deletions cppinyin/python/cppinyin/cppinyin.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,18 @@ def to_finals(self, data: Union[str, List[str]], tone: str = "number"):
"""
return self.encoder.to_finals(data, tone)

def all_pinyins(self, tone: str = "number", partial: bool = False):
    """
    Return the pinyins known to the encoder.

    Args:
      tone:
        Tone style of the returned pinyins: "number", "normal" or "none".
      partial:
        If True, return all initials followed by all finals instead of
        whole syllables.
    Returns:
      The list of pinyin strings produced by the underlying encoder.
    """
    return self.encoder.all_pinyins(tone, partial)

def all_initials(self):
    """Return the list of pinyin initials reported by the underlying encoder."""
    initials = self.encoder.all_initials()
    return initials

def all_finals(self, tone: str = "number"):
    """
    Return the list of pinyin finals reported by the underlying encoder.

    Args:
      tone:
        Tone style of the returned finals: "number", "normal" or "none".
    """
    finals = self.encoder.all_finals(tone)
    return finals

def valid_pinyin(self, pinyin: str, tone: str = ""):
    """
    Check whether ``pinyin`` is a known pinyin.

    Args:
      pinyin:
        The string to check.
      tone:
        Tone style to check against: "number", "normal" or "none". The
        default empty string accepts a match in any of the three styles.
    Returns:
      The boolean result reported by the underlying encoder.
    """
    is_valid = self.encoder.valid_pinyin(pinyin, tone)
    return is_valid

def load(self, path: str):
self.encoder.load(path)

Expand Down
36 changes: 35 additions & 1 deletion cppinyin/python/csrc/cppinyin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,41 @@ void PybindCppinyin(py::module &m) {
self.ToFinals(strs, &ostrs, tone);
return ostrs;
},
py::arg("strs"), py::arg("tone") = "number");
py::arg("strs"), py::arg("tone") = "number")
.def(
"valid_pinyin",
[](PyClass &self, const std::string &str,
const std::string &tone = "number") -> bool {
py::gil_scoped_release release;
bool res = self.ValidPinyin(str, tone);
return res;
},
py::arg("str"), py::arg("tone") = "number")

.def(
"all_pinyins",
[](PyClass &self, const std::string &tone = "number",
bool partial = false) -> std::vector<std::string> {
py::gil_scoped_release release;
std::vector<std::string> ostrs = self.AllPinyin(tone, partial);
return ostrs;
},
py::arg("tone") = "number", py::arg("partial") = false)
.def("all_initials",
[](PyClass &self) -> std::vector<std::string> {
py::gil_scoped_release release;
std::vector<std::string> ostrs = self.AllInitials();
return ostrs;
})
.def(
"all_finals",
[](PyClass &self,
const std::string &tone = "number") -> std::vector<std::string> {
py::gil_scoped_release release;
std::vector<std::string> ostrs = self.AllFinals(tone);
return ostrs;
},
py::arg("tone") = "number");
}

PYBIND11_MODULE(_cppinyin, m) {
Expand Down
Loading