12 changes: 7 additions & 5 deletions decoder/ff_klm.cc
@@ -373,15 +373,17 @@ boost::shared_ptr<FeatureFunction> KLanguageModelFactory::Create(std::string par
  if (!RecognizeBinary(filename.c_str(), m)) m = HASH_PROBING;

  switch (m) {
-    case HASH_PROBING:
+    case PROBING:
      return CreateModel<ProbingModel>(param);
-    case TRIE_SORTED:
+    case REST_PROBING:
+      return CreateModel<RestProbingModel>(param);
+    case TRIE:
      return CreateModel<TrieModel>(param);
-    case ARRAY_TRIE_SORTED:
+    case ARRAY_TRIE:
      return CreateModel<ArrayTrieModel>(param);
-    case QUANT_TRIE_SORTED:
+    case QUANT_TRIE:
      return CreateModel<QuantTrieModel>(param);
-    case QUANT_ARRAY_TRIE_SORTED:
+    case QUANT_ARRAY_TRIE:
      return CreateModel<QuantArrayTrieModel>(param);
    default:
      UTIL_THROW(util::Exception, "Unrecognized kenlm binary file type " << (unsigned)m);
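Note (not part of the patch): the switch above is the usual KenLM pattern of recognizing a binary file's type and then instantiating the matching model template. Below is a minimal stand-alone sketch of the same idea, assuming the KenLM headers are on the include path; only the two probing variants touched by this change are handled and the trie types are omitted.

// Illustrative sketch only: detect a KenLM file's type and open it with the
// matching model class, mirroring the dispatch in ff_klm.cc above.
#include <iostream>
#include "lm/binary_format.hh"
#include "lm/model.hh"

int main(int argc, char *argv[]) {
  if (argc != 2) { std::cerr << "Usage: " << argv[0] << " lm_file\n"; return 1; }
  lm::ngram::ModelType m;
  // ARPA input is not recognized as a binary, so fall back to the probing format.
  if (!lm::ngram::RecognizeBinary(argv[1], m)) m = lm::ngram::PROBING;
  switch (m) {
    case lm::ngram::PROBING: {
      lm::ngram::ProbingModel model(argv[1]);
      std::cout << "probing model, order " << (unsigned)model.Order() << "\n";
      break;
    }
    case lm::ngram::REST_PROBING: {
      lm::ngram::RestProbingModel model(argv[1]);
      std::cout << "rest-cost probing model, order " << (unsigned)model.Order() << "\n";
      break;
    }
    default:
      std::cerr << "Trie variants omitted from this sketch.\n";
      return 1;
  }
  return 0;
}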
2 changes: 1 addition & 1 deletion klm/lm/Jamfile
@@ -1,4 +1,4 @@
-lib kenlm : bhiksha.cc binary_format.cc config.cc lm_exception.cc model.cc quantize.cc read_arpa.cc search_hashed.cc search_trie.cc trie.cc trie_sort.cc virtual_interface.cc vocab.cc ../util//kenutil : <include>.. : : <include>.. <library>../util//kenutil ;
+lib kenlm : bhiksha.cc binary_format.cc config.cc lm_exception.cc model.cc quantize.cc read_arpa.cc search_hashed.cc search_trie.cc trie.cc trie_sort.cc value_build.cc virtual_interface.cc vocab.cc ../util//kenutil : <include>.. : : <include>.. <library>../util//kenutil ;

import testing ;

1 change: 1 addition & 0 deletions klm/lm/Makefile.am
@@ -24,6 +24,7 @@ libklm_a_SOURCES = \
  search_trie.cc \
  trie.cc \
  trie_sort.cc \
+  value_build.cc \
  virtual_interface.cc \
  vocab.cc

2 changes: 1 addition & 1 deletion klm/lm/binary_format.cc
@@ -57,7 +57,7 @@ struct Sanity {
}
};

-const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
+const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};

std::size_t TotalHeaderSize(unsigned char order) {
return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
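Aside (an assumption about surrounding code not shown in this diff): these strings are the human-readable names KenLM uses when the type recorded in a binary file does not match the model class trying to load it, so their order has to track the ModelType enum (PROBING, REST_PROBING, TRIE, ...). A hedged sketch of how that surfaces; the file path is a placeholder and the exact message wording may differ between versions.

// Illustrative only: loading a rest-cost binary with the plain probing class
// should fail with a FormatLoadException whose message uses the names above.
#include <iostream>
#include "lm/model.hh"

int main() {
  try {
    lm::ngram::ProbingModel wrong("rest_probing.binary");  // placeholder path
  } catch (const lm::FormatLoadException &e) {
    std::cerr << e.what() << std::endl;  // expected to mention "probing hash tables with rest costs"
  }
  return 0;
}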
97 changes: 64 additions & 33 deletions klm/lm/build_binary.cc
@@ -66,16 +66,28 @@ uint8_t ParseBitCount(const char *from) {
  return val;
}

+void ParseFileList(const char *from, std::vector<std::string> &to) {
+  to.clear();
+  while (true) {
+    const char *i;
+    for (i = from; *i && *i != ' '; ++i) {}
+    to.push_back(std::string(from, i - from));
+    if (!*i) break;
+    from = i + 1;
+  }
+}
+
void ShowSizes(const char *file, const lm::ngram::Config &config) {
  std::vector<uint64_t> counts;
  util::FilePiece f(file);
  lm::ReadARPACounts(f, counts);
-  std::size_t sizes[5];
+  std::size_t sizes[6];
  sizes[0] = ProbingModel::Size(counts, config);
-  sizes[1] = TrieModel::Size(counts, config);
-  sizes[2] = QuantTrieModel::Size(counts, config);
-  sizes[3] = ArrayTrieModel::Size(counts, config);
-  sizes[4] = QuantArrayTrieModel::Size(counts, config);
+  sizes[1] = RestProbingModel::Size(counts, config);
+  sizes[2] = TrieModel::Size(counts, config);
+  sizes[3] = QuantTrieModel::Size(counts, config);
+  sizes[4] = ArrayTrieModel::Size(counts, config);
+  sizes[5] = QuantArrayTrieModel::Size(counts, config);
  std::size_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
  std::size_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(size_t));
  std::size_t divide;
@@ -99,10 +111,11 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) {
  for (long int i = 0; i < length - 2; ++i) std::cout << ' ';
  std::cout << prefix << "B\n"
    "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n"
-    "trie " << std::setw(length) << (sizes[1] / divide) << " without quantization\n"
-    "trie " << std::setw(length) << (sizes[2] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
-    "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
-    "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
+    "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r -p " << config.probing_multiplier << "\n"
+    "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n"
+    "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n"
+    "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n"
+    "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n";
}

void ProbingQuantizationUnsupported() {
@@ -118,10 +131,10 @@ int main(int argc, char *argv[]) {
  using namespace lm::ngram;

  try {
-    bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false;
+    bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
    lm::ngram::Config config;
    int opt;
-    while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) {
+    while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:sir:")) != -1) {
      switch(opt) {
        case 'q':
          config.prob_bits = ParseBitCount(optarg);
@@ -164,6 +177,11 @@
        case 'i':
          config.positive_log_probability = lm::SILENT;
          break;
+        case 'r':
+          rest = true;
+          ParseFileList(optarg, config.rest_lower_files);
+          config.rest_function = Config::REST_LOWER;
+          break;
        default:
          Usage(argv[0]);
      }
@@ -174,35 +192,48 @@
    }
    if (optind + 1 == argc) {
      ShowSizes(argv[optind], config);
-    } else if (optind + 2 == argc) {
+      return 0;
+    }
+    const char *model_type;
+    const char *from_file;
+
+    if (optind + 2 == argc) {
+      model_type = "probing";
+      from_file = argv[optind];
      config.write_mmap = argv[optind + 1];
-      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
-      ProbingModel(argv[optind], config);
    } else if (optind + 3 == argc) {
-      const char *model_type = argv[optind];
-      const char *from_file = argv[optind + 1];
+      model_type = argv[optind];
+      from_file = argv[optind + 1];
      config.write_mmap = argv[optind + 2];
-      if (!strcmp(model_type, "probing")) {
-        if (!set_write_method) config.write_method = Config::WRITE_AFTER;
-        if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+    } else {
+      Usage(argv[0]);
+    }
+    if (!strcmp(model_type, "probing")) {
+      if (!set_write_method) config.write_method = Config::WRITE_AFTER;
+      if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
+      if (rest) {
+        RestProbingModel(from_file, config);
+      } else {
        ProbingModel(from_file, config);
-      } else if (!strcmp(model_type, "trie")) {
-        if (!set_write_method) config.write_method = Config::WRITE_MMAP;
-        if (quantize) {
-          if (bhiksha) {
-            QuantArrayTrieModel(from_file, config);
-          } else {
-            QuantTrieModel(from_file, config);
-          }
+      }
+    } else if (!strcmp(model_type, "trie")) {
+      if (rest) {
+        std::cerr << "Rest + trie is not supported yet." << std::endl;
+        return 1;
+      }
+      if (!set_write_method) config.write_method = Config::WRITE_MMAP;
+      if (quantize) {
+        if (bhiksha) {
+          QuantArrayTrieModel(from_file, config);
        } else {
-          if (bhiksha) {
-            ArrayTrieModel(from_file, config);
-          } else {
-            TrieModel(from_file, config);
-          }
+          QuantTrieModel(from_file, config);
        }
      } else {
-        Usage(argv[0]);
+        if (bhiksha) {
+          ArrayTrieModel(from_file, config);
+        } else {
+          TrieModel(from_file, config);
+        }
      }
    } else {
      Usage(argv[0]);
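For context (not part of the patch): the new command-line form is something like build_binary -r "1.arpa 2.arpa" probing 3gram.arpa out.binary, where the -r argument is the space-separated list that ParseFileList copies into config.rest_lower_files. Programmatically, the probing-with-rest branch above amounts to the sketch below; the file names and the number of lower-order models are placeholders, and build_binary's other option handling is omitted.

// Rough programmatic equivalent of the -r probing branch above; paths are
// placeholders and the remaining build_binary options are left at their defaults.
#include "lm/model.hh"

int main() {
  lm::ngram::Config config;
  config.write_mmap = "out.binary";                       // emit the binary as a side effect
  config.write_method = lm::ngram::Config::WRITE_AFTER;   // what the probing branch defaults to
  config.rest_function = lm::ngram::Config::REST_LOWER;
  config.rest_lower_files.push_back("1.arpa");            // lower-order models, as collected
  config.rest_lower_files.push_back("2.arpa");            // from the -r argument
  lm::ngram::RestProbingModel model("3gram.arpa", config);
  return 0;
}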
1 change: 1 addition & 0 deletions klm/lm/config.cc
@@ -19,6 +19,7 @@ Config::Config() :
  write_mmap(NULL),
  write_method(WRITE_AFTER),
  include_vocab(true),
+  rest_function(REST_MAX),
  prob_bits(8),
  backoff_bits(8),
  pointer_bhiksha_bits(22),
22 changes: 17 additions & 5 deletions klm/lm/config.hh
@@ -1,11 +1,13 @@
#ifndef LM_CONFIG__
#define LM_CONFIG__

-#include <iosfwd>
-
#include "lm/lm_exception.hh"
#include "util/mmap.hh"

+#include <iosfwd>
+#include <string>
+#include <vector>
+
/* Configuration for ngram model. Separate header to reduce pollution. */

namespace lm {
@@ -63,23 +65,33 @@ struct Config {
  const char *temporary_directory_prefix;

  // Level of complaining to do when loading from ARPA instead of binary format.
-  typedef enum {ALL, EXPENSIVE, NONE} ARPALoadComplain;
+  enum ARPALoadComplain {ALL, EXPENSIVE, NONE};
  ARPALoadComplain arpa_complain;

  // While loading an ARPA file, also write out this binary format file. Set
  // to NULL to disable.
  const char *write_mmap;

-  typedef enum {
+  enum WriteMethod {
    WRITE_MMAP, // Map the file directly.
    WRITE_AFTER // Write after we're done.
-  } WriteMethod;
+  };
  WriteMethod write_method;

  // Include the vocab in the binary file? Only effective if write_mmap != NULL.
  bool include_vocab;


+  // Left rest options. Only used when the model includes rest costs.
+  enum RestFunction {
+    REST_MAX, // Maximum of any score to the left
+    REST_LOWER, // Use lower-order files given below.
+  };
+  RestFunction rest_function;
+  // Only used for REST_LOWER.
+  std::vector<std::string> rest_lower_files;



  // Quantization options. Only effective for QuantTrieModel. One value is
  // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used