diff --git a/data/text/download_data.sh b/data/text/download_data.sh
new file mode 100755
index 0000000..fff9409
--- /dev/null
+++ b/data/text/download_data.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
+tar xf simple-examples.tgz
+mv simple-examples/data/ptb.train.txt ./
+mv simple-examples/data/ptb.test.txt ./
+rm -rf simple-examples{,.tgz}
diff --git a/primitiv-cpp/rnnlm.cc b/primitiv-cpp/rnnlm.cc
new file mode 100644
index 0000000..aff4b13
--- /dev/null
+++ b/primitiv-cpp/rnnlm.cc
@@ -0,0 +1,272 @@
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+#include <random>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <primitiv/primitiv.h>
+#include <primitiv/primitiv_cuda.h>
+
+using primitiv::initializers::Constant;
+using primitiv::initializers::XavierUniform;
+using primitiv::optimizers::Adam;
+namespace F = primitiv::operators;
+using namespace primitiv;
+using namespace std;
+using namespace std::chrono;
+
+const string TRAIN_FILE = "../data/text/ptb.train.txt";
+const string TEST_FILE = "../data/text/ptb.test.txt";
+const int MAX_EPOCH = 100;
+
+template<typename Var>
+class LSTM : public Model {
+  unsigned out_size;
+  Parameter pwxh, pwhh, pbh;
+  Var wxh, whh, bh, h, c;
+
+public:
+  LSTM(unsigned in_size_, unsigned out_size_)
+    : out_size(out_size_)
+    , pwxh({4 * out_size_, in_size_}, XavierUniform())
+    , pwhh({4 * out_size_, out_size_}, XavierUniform())
+    , pbh({4 * out_size_}, Constant(0)) {
+    add_parameter("pwxh", pwxh);
+    add_parameter("pwhh", pwhh);
+    add_parameter("pbh", pbh);
+  }
+
+  void init() {
+    wxh = F::parameter<Var>(pwxh);
+    whh = F::parameter<Var>(pwhh);
+    bh = F::parameter<Var>(pbh);
+    h = c = F::zeros<Var>({out_size});
+  }
+
+  Var forward(const Var &x) {
+    const Var u = F::matmul(wxh, x) + F::matmul(whh, h) + bh;
+    const Var i = F::sigmoid(F::slice(u, 0, 0, out_size));
+    const Var f = F::sigmoid(F::slice(u, 0, out_size, 2 * out_size));
+    const Var o = F::sigmoid(F::slice(u, 0, 2 * out_size, 3 * out_size));
+    const Var j = F::tanh(F::slice(u, 0, 3 * out_size, 4 * out_size));
+    c = i * j + f * c;
+    h = o * F::tanh(c);
+    return h;
+  }
+};
+
+template<typename Var>
+class RNNLM : public Model {
+  Parameter plookup;
+  LSTM<Var> lstm;
+  Parameter pwhy, pby;
+  Var lookup, why, by;
+
+public:
+  RNNLM(unsigned vocab_size, unsigned embed_size, unsigned hidden_size)
+    : plookup({embed_size, vocab_size}, XavierUniform())
+    , lstm(embed_size, hidden_size)
+    , pwhy({vocab_size, hidden_size}, XavierUniform())
+    , pby({vocab_size}, Constant(0)) {
+    add_parameter("plookup", plookup);
+    add_submodel("lstm", lstm);
+    add_parameter("pwhy", pwhy);
+    add_parameter("pby", pby);
+  }
+
+  Var forward(const vector<unsigned> &input) {
+    Var x = F::pick(lookup, input, 1);
+    Var h = F::sigmoid(lstm.forward(x));
+    return F::matmul(why, h) + by;
+  }
+
+  Var loss(const vector<vector<unsigned>> &inputs) {
+    lookup = F::parameter<Var>(plookup);
+    why = F::parameter<Var>(pwhy);
+    by = F::parameter<Var>(pby);
+    lstm.init();
+
+    vector<Var> losses;
+    for (unsigned i = 0; i < inputs.size()-1; i++) {
+      const auto output = forward(inputs[i]);
+      losses.emplace_back(
+          F::softmax_cross_entropy(output, inputs[i + 1], 0));
+    }
+    return F::batch::mean(F::sum(losses));
+  }
+};
+
+
+unordered_map<string, unsigned> make_vocab(
+    const string &filename) {
+  ifstream ifs(filename);
+  if (!ifs.is_open()) {
+    cerr << "File could not be opened: " << filename << endl;
+    exit(1);
+  }
+  unordered_map<string, unsigned> vocab;
+  string line, word;
+  while (getline(ifs, line)) {
+    // "<s>" is assumed as the end-of-sentence marker appended to each line.
+    line = line + " <s>";
+    stringstream ss(line);
+    while (getline(ss, word, ' ')) {
+      if (vocab.find(word) != vocab.end())
+        continue;
+      const unsigned id = vocab.size();
+      vocab.emplace(make_pair(word, id));
+    }
+  }
+  return vocab;
+}
+
+vector<vector<unsigned>> load_corpus(
+    const string &filename,
+    const unordered_map<string, unsigned> &vocab) {
+  ifstream ifs(filename);
+  if (!ifs.is_open()) {
+    cerr << "File could not be opened: " << filename << endl;
+    exit(1);
+  }
+  vector<vector<unsigned>> corpus;
+  string line, word;
+  while (getline(ifs, line)) {
+    line = line + " <s>";
+    stringstream ss(line);
+    vector<unsigned> sentence;
+    while (getline(ss, word, ' ')) {
+      sentence.emplace_back(vocab.at(word));
+    }
+    corpus.emplace_back(move(sentence));
+  }
+  return corpus;
+}
+
+unsigned count_labels(const vector<vector<unsigned>> &corpus) {
+  unsigned ret = 0;
+  for (const auto &sent : corpus) ret += sent.size() - 1;
+  return ret;
+}
+
+vector<vector<unsigned>> make_batch(
+    const vector<vector<unsigned>> &corpus,
+    const vector<unsigned> &sent_ids,
+    unsigned eos_id) {
+
+  const unsigned batch_size = sent_ids.size();
+  unsigned max_len = 0;
+  for (const unsigned sid : sent_ids)
+    max_len = max<unsigned>(max_len, corpus[sid].size());
+
+  vector<vector<unsigned>> batch(max_len,
+      vector<unsigned>(batch_size, eos_id));
+  for (unsigned i = 0; i < batch_size; i++) {
+    const auto &sent = corpus[sent_ids[i]];
+    for (unsigned j = 0; j < sent.size(); j++) {
+      batch[j][i] = sent[j];
+    }
+  }
+  return batch;
+}
+
+int main(int argc, char** argv) {
+
+  auto start = system_clock::now();
+
+  if (argc != 5) {
+    cerr << "Usage: " << argv[0];
+    cerr << " gpu_id embed_size hidden_size minibatch_size" << endl;
+    return 1;
+  }
+  const int gpu_device = atoi(argv[1]);
+  const int embed = atoi(argv[2]);
+  const int hidden = atoi(argv[3]);
+  const int minibatch = atoi(argv[4]);
+
+  auto vocab = make_vocab(TRAIN_FILE);
+  // The "<s>" marker is also used as padding in make_batch.
+  unsigned eos_id = vocab["<s>"];
+
+  const auto train_corpus = load_corpus(TRAIN_FILE, vocab);
+  const auto test_corpus = load_corpus(TEST_FILE, vocab);
+  const unsigned num_train_sents = train_corpus.size();
+  const unsigned num_test_sents = test_corpus.size();
+  const unsigned num_train_labels = count_labels(train_corpus);
+  const unsigned num_test_labels = count_labels(test_corpus);
+
+  static Device *dev;
+  if (gpu_device >= 0)
+    dev = new devices::CUDA(gpu_device);
+  else
+    dev = new devices::Naive();
+  Device::set_default(*dev);
+
+  Graph g;
+  Graph::set_default(g);
+
+  RNNLM<Node> rnnlm(vocab.size(), embed, hidden);
+
+  Adam optimizer;
+  optimizer.add_model(rnnlm);
+
+  random_device rd;
+  mt19937 rng(rd());
+
+  vector<unsigned> train_ids(num_train_sents);
+  vector<unsigned> test_ids(num_test_sents);
+  iota(begin(train_ids), end(train_ids), 0);
+  iota(begin(test_ids), end(test_ids), 0);
+
+  {
+    duration<float> fs = system_clock::now() - start;
+    float startup_time = duration_cast<milliseconds>(fs).count();
+    cout << "startup time=" << startup_time / 1000. << endl;
+  }
+
+  for (unsigned epoch = 0; epoch < MAX_EPOCH; epoch++) {
+    start = system_clock::now();
+
+    // float train_loss = 0;
+    shuffle(begin(train_ids), end(train_ids), rng);
+    for (unsigned ofs = 0; ofs < num_train_sents; ofs += minibatch) {
+      const vector<unsigned> batch_ids(
+          begin(train_ids) + ofs,
+          begin(train_ids) + min(
+              ofs+minibatch, num_train_sents));
+      const auto batch = make_batch(train_corpus, batch_ids, eos_id);
+
+      g.clear();
+      const auto loss = rnnlm.loss(batch);
+      // train_loss += loss.to_float() * batch_ids.size();
+      optimizer.reset_gradients();
+      loss.backward();
+      optimizer.update();
+    }
+    // const float train_ppl = exp(train_loss / num_train_labels);
+
+    duration<float> fs = system_clock::now() - start;
+    float train_time = duration_cast<milliseconds>(fs).count() / 1000.;
+
+    float test_loss = 0;
+    for (unsigned ofs = 0; ofs < num_test_sents; ofs += minibatch) {
+      const vector<unsigned> batch_ids(
+          begin(test_ids) + ofs,
+          begin(test_ids) + min(
+              ofs+minibatch, num_test_sents));
+      const auto batch = make_batch(test_corpus, batch_ids, eos_id);
+
+      g.clear();
+
+      const auto loss = rnnlm.loss(batch);
+      test_loss += loss.to_float() * batch_ids.size();
+    }
+    const float test_ppl = exp(test_loss / num_test_labels);
+
+    cout << "epoch=" << epoch + 1 << ", ";
+    cout << "time=" << train_time << ", ";
+    cout << "ppl=" << test_ppl << ", ";
+    cout << "word_per_sec=" << num_train_labels / train_time << endl;
+  }
+
+  return 0;
+}
diff --git a/primitiv-python/rnnlm.py b/primitiv-python/rnnlm.py
new file mode 100644
index 0000000..ee83bd7
--- /dev/null
+++ b/primitiv-python/rnnlm.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+import time
+startup_time = time.time()
+
+import sys
+import random
+import math
+
+from argparse import ArgumentParser
+
+from primitiv import Device, Graph, Optimizer
+from primitiv import Model, Parameter, Node
+from primitiv import devices as D
+from primitiv import operators as F
+from primitiv import initializers as I
+from primitiv import optimizers as O
+
+TRAIN_FILE = "../data/text/ptb.train.txt"
+TEST_FILE = "../data/text/ptb.test.txt"
+MAX_EPOCH = 30
+
+class LSTM(Model):
+    def __init__(self):
+        self.pwxh = Parameter()
+        self.pwhh = Parameter()
+        self.pbh = Parameter()
+        self.add_all_parameters()
+
+    def init(self, in_size, out_size):
+        self.pwxh.init([4 * out_size, in_size], I.XavierUniform())
+        self.pwhh.init([4 * out_size, out_size], I.XavierUniform())
+        self.pbh.init([4 * out_size], I.Constant(0))
+
+    def reset(self, init_c = Node(), init_h = Node()):
+        out_size = self.pwhh.shape()[1]
+        self.wxh = F.parameter(self.pwxh)
+        self.whh = F.parameter(self.pwhh)
+        self.bh = F.parameter(self.pbh)
+        self.c = init_c if init_c.valid() else F.zeros([out_size])
+        self.h = init_h if init_h.valid() else F.zeros([out_size])
+
+    def forward(self, x):
+        out_size = self.pwhh.shape()[1]
+        u = self.wxh @ x + self.whh @ self.h + self.bh
+        i = F.sigmoid(F.slice(u, 0, 0, out_size))
+        f = F.sigmoid(F.slice(u, 0, out_size, 2 * out_size))
+        o = F.sigmoid(F.slice(u, 0, 2 * out_size, 3 * out_size))
+        j = F.tanh(F.slice(u, 0, 3 * out_size, 4 * out_size))
+        self.c = i * j + f * self.c
+        self.h = o * F.tanh(self.c)
+        return self.h
+
+
+class RNNLM(Model):
+    def __init__(self):
+        self.plookup = Parameter()
+        self.lstm = LSTM()
+        self.pwhy = Parameter()
+        self.pby = Parameter()
+
+        self.add_all_parameters()
+        self.add_all_submodels()
+
+    def init(self, vocab_size, embed_size, hidden_size):
+        self.plookup.init([embed_size, vocab_size], I.XavierUniform())
+        self.lstm.init(embed_size, hidden_size)
+        self.pwhy.init([vocab_size, hidden_size], I.XavierUniform())
+        self.pby.init([vocab_size], I.Constant(0))
+
+    def forward(self, word):
+        x = F.pick(self.lookup, word, 1)
+        h = F.sigmoid(self.lstm.forward(x))
+        return self.why @ h + self.by
+
+    def loss(self, inputs):
+        self.lookup = F.parameter(self.plookup)
+        self.lstm.reset()
+        self.why = F.parameter(self.pwhy)
+        self.by = F.parameter(self.pby)
+
+        losses = []
+        for i in range(len(inputs)-1):
+            output = self.forward(inputs[i])
+            losses.append(F.softmax_cross_entropy(output, inputs[i+1], 0))
+        return F.batch.mean(F.sum(losses))
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument('--gpu', metavar="INT", type=int, default=-1,
+                        help="GPU device ID (default: %(default)d)")
+    parser.add_argument('embed', type=int, help="embedding layer size")
+    parser.add_argument('hidden', type=int, help="hidden layer size")
+    parser.add_argument("minibatch", type=int, help="minibatch size")
+    return parser.parse_args()
+
+def make_vocab(filename):
+    vocab = {}
+    with open(filename, "r") as ifs:
+        for line in ifs:
+            # "<s>" is assumed as the end-of-sentence marker (also used as padding)
+            line = line.strip() + " <s>"
+            for word in line.split():
+                if word not in vocab:
+                    vocab[word] = len(vocab)
+    return vocab
+
+def load_corpus(filename, vocab):
+    corpus = []
+    with open(filename, "r") as ifs:
+        for line in ifs:
+            line = line.strip() + " <s>"
+            sentence = [vocab[word] for word in line.split()]
+            corpus.append(sentence)
+    return corpus
+
+def count_labels(corpus):
+    ret = 0
+    for sent in corpus:
+        ret += len(sent) - 1
+    return ret
+
+def make_batch(corpus, sent_ids, eos_id):
+    batch_size = len(sent_ids)
+    max_len = 0
+    for sid in sent_ids:
+        max_len = max(max_len, len(corpus[sid]))
+    batch = [[eos_id] * batch_size for i in range(max_len)]
+    for i in range(batch_size):
+        sent = corpus[sent_ids[i]]
+        for j in range(len(sent)):
+            batch[j][i] = sent[j]
+    return batch
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    vocab = make_vocab(TRAIN_FILE)
+    vocab_size = len(vocab)
+    eos_id = vocab["<s>"]
+
+    train_corpus = load_corpus(TRAIN_FILE, vocab)
+    test_corpus = load_corpus(TEST_FILE, vocab)
+    num_train_sents = len(train_corpus)
+    num_test_sents = len(test_corpus)
+    num_train_labels = count_labels(train_corpus)
+    num_test_labels = count_labels(test_corpus)
+
+    if args.gpu >= 0:
+        dev = D.CUDA(args.gpu)
+    else:
+        dev = D.Naive()
+    Device.set_default(dev)
+
+    g = Graph()
+    Graph.set_default(g)
+
+    rnnlm = RNNLM()
+    rnnlm.init(vocab_size, args.embed, args.hidden)
+
+    optimizer = O.Adam()
+    optimizer.add_model(rnnlm)
+
+    train_ids = list(range(num_train_sents))
+    test_ids = list(range(num_test_sents))
+
+    print("startup time=%r" % (time.time() - startup_time))
+
+    for epoch in range(MAX_EPOCH):
+        train_time = time.time()
+
+        # train_loss = 0
+        random.shuffle(train_ids)
+        for ofs in range(0, num_train_sents, args.minibatch):
+            batch_ids = train_ids[ofs : min(ofs+args.minibatch, num_train_sents)]
+            batch = make_batch(train_corpus, batch_ids, eos_id)
+
+            g.clear()
+            loss = rnnlm.loss(batch)
+            # train_loss += loss.to_float() * len(batch_ids)
+            optimizer.reset_gradients()
+            loss.backward()
+            optimizer.update()
+        # train_ppl = math.exp(train_loss / num_train_labels)
+        train_time = time.time() - train_time
+
+        test_loss = 0
+        for ofs in range(0, num_test_sents, args.minibatch):
+            batch_ids = test_ids[ofs : min(ofs+args.minibatch, num_test_sents)]
+            batch = make_batch(test_corpus, batch_ids, eos_id)
+
+            g.clear()
+            loss = rnnlm.loss(batch)
+            test_loss += loss.to_float() * len(batch_ids)
+        test_ppl = math.exp(test_loss / num_test_labels)
+
+        print("epoch=%d, time=%.4f, ppl=%.4f, word_per_sec=%.4f" % (
+            epoch+1, train_time, test_ppl, num_train_labels / train_time))
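
A minimal usage sketch, assuming primitiv's C++ library is installed where the compiler can find it and links as -lprimitiv (the exact flags, and whether a CUDA-enabled build is needed for the devices::CUDA symbol, depend on the installation), and that the primitiv Python package is importable. The embed/hidden/minibatch values are placeholders; a negative GPU ID selects the CPU (Naive) device:

  # fetch the PTB data used by both programs
  (cd data/text && ./download_data.sh)

  # C++ version; run from primitiv-cpp/ so that ../data/text/... resolves
  (cd primitiv-cpp && g++ -std=c++11 -O2 rnnlm.cc -o rnnlm -lprimitiv && ./rnnlm -1 512 512 64)

  # Python version
  (cd primitiv-python && python3 rnnlm.py --gpu -1 512 512 64)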