diff --git a/README b/README index 4a0de890..5b33c65a 100644 --- a/README +++ b/README @@ -16,7 +16,7 @@ Executables built by this pacage: * `lt-comp`: compiler, execute without parameters to show usage instructions. - + * `lt-proc`: processor, typical options are -a (lexical analyser, default option), -g (lexical generator) and -p (lexical post-generator). Using -h will show all flags. @@ -31,6 +31,9 @@ Executables built by this pacage: * `lt-print`: print the arcs of a transducer in [ATT format][3]. +* `lt-reweight`: (experimental) utility to assign weights to a + compiled transducer based on a corpus. + There is also a C++ API that you can link to (see how [apertium][1] or [apertium-lex-tools][2] do this). @@ -54,4 +57,4 @@ Building & installing: [1]: https://github.com/apertium/apertium [2]: https://github.com/apertium/apertium-lex-tools -[3]: http://wiki.apertium.org/wiki/ATT_format +[3]: http://wiki.apertium.org/wiki/ATT_format \ No newline at end of file diff --git a/lttoolbox/Makefile.am b/lttoolbox/Makefile.am index fb44eebf..cc24cb18 100644 --- a/lttoolbox/Makefile.am +++ b/lttoolbox/Makefile.am @@ -14,7 +14,7 @@ cc_sources = alphabet.cc att_compiler.cc compiler.cc compression.cc entry_token. 
library_includedir = $(includedir)/$(PACKAGE_NAME)-$(VERSION_API)/$(PACKAGE_NAME) library_include_HEADERS = $(h_sources) -bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim +bin_PROGRAMS = lt-comp lt-proc lt-expand lt-tmxcomp lt-tmxproc lt-print lt-trim lt-reweight instdir = lttoolbox lib_LTLIBRARIES= liblttoolbox3.la @@ -55,6 +55,10 @@ lt_tmxproc_SOURCES = lt_tmxproc.cc lt_tmxproc_LDADD = liblttoolbox$(VERSION_MAJOR).la lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) +lt_reweight_SOURCES = lt_reweight.cc +lt_reweight_LDADD = liblttoolbox$(VERSION_MAJOR).la +lt_reweight_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) + #lt-validate-dictionary: Makefile.am validate-header.sh # @echo "Creating lt-validate-dictionary script" # @echo "#!$(BASH)" > $@ @@ -65,7 +69,7 @@ lt_tmxproc_LDFLAGS = -llttoolbox$(VERSION_MAJOR) $(LTTOOLBOX_LIBS) -man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 +man_MANS = lt-comp.1 lt-expand.1 lt-proc.1 lt-tmxcomp.1 lt-tmxproc.1 lt-print.1 lt-trim.1 lt-reweight.1 INCLUDES = -I$(top_srcdir) $(LTTOOLBOX_CFLAGS) if WINDOWS diff --git a/lttoolbox/lt-reweight.1 b/lttoolbox/lt-reweight.1 new file mode 100644 index 00000000..8d5bfe68 --- /dev/null +++ b/lttoolbox/lt-reweight.1 @@ -0,0 +1,66 @@ +.TH lt-reweight 1 2014-02-07 "" "" +.SH NAME +lt-reweight \- This application is part of the lexical processing modules +and tools ( +.B lttoolbox +) +.PP +This tool is part of the apertium machine translation +architecture: \fBhttp://www.apertium.org\fR. +.SH SYNOPSIS +.B lt-trim +analyser_binary bidix_binary trimmed_analyser_binary +.PP +.SH DESCRIPTION +.BR lt-trim +is the application responsible for trimming compiled dictionaries. 
The +analyses (right-side when compiling lr) of analyser_binary are trimmed +to the input side of bidix_binary (left-side when compiling lr, +right-side when compiling rl), such that only analyses which would +pass through `lt-proc \-b bidix_binary' are kept. + +\fBWarning: this program is experimental!\fR It has been tested, but +not deployed extensively yet. + +Both compound tags (`<compound-only-L>', `<compound-R>') and join +elements (`<j/>' in XML, `+' in the stream) and the group element +(`<g/>' in XML, `#' in the stream) should be handled correctly, even +combinations of + followed by # in monodix are handled. + +Some minor caveats: If you have the capitalised lemma "Foo" in the +monodix, but "foo" in the bidix, an analysis "^Foo$" would pass +through bidix when doing lt-proc \-b, but will not make it through +trimming. Make sure your lemmas have the same capitalisation in the +different dictionaries. Also, you should not have literal `+' or `#' +in your lemmas. Since lt-comp doesn't escape these, lt-trim cannot +know that they are different from `<j/>' or `<g/>', and you may get +@-marked output this way. You can analyse `+' or `#' by having the +literal symbol in the `<l>' part and some other string (e.g. "plus") +in the `<r>'. + +You should not trim a generator unless you have a \fBvery\fR simple +translator pipeline, since the output of bidix seldom goes unchanged +through transfer. +.PP +.SH FILES +.B analyser_binary +The untrimmed analyser dictionary (a finite state transducer). +.PP +.B bidix_binary +The dictionary to use as trimmer (a finite state transducer). +.PP +.B trimmed_analyser_binary +The trimmed analyser dictionary (a finite state transducer). + +.SH SEE ALSO +.I lt-comp\fR(1), +.I lt-proc\fR(1), +.I lt-print\fR(1), +.I lt-expand\fR(1), +.I lt-trim\fR(1), +.I apertium-tagger\fR(1), +.I apertium\fR(1). +.SH BUGS +Lots of...lurking in the dark and waiting for you! +.SH AUTHOR +(c) 2013--2014 Universitat d'Alacant / Universidad de Alicante. 
diff --git a/lttoolbox/lt_proc.cc b/lttoolbox/lt_proc.cc
index 0d461c94..f54ff6fa 100644
--- a/lttoolbox/lt_proc.cc
+++ b/lttoolbox/lt_proc.cc
@@ -54,7 +54,8 @@ void endProgram(char *name)
   cout << " -o, --surf-bilingual: lexical transfer with surface forms" << endl;
   cout << " -p, --post-generation: post-generation" << endl;
   cout << " -x, --inter-generation: inter-generation" << endl;
-  cout << " -s, --sao: SAO annotation system input processing" << endl;
+// Deprecated:
+// cout << " -s, --sao: SAO annotation system input processing" << endl;
   cout << " -t, --transliteration: apply transliteration dictionary" << endl;
   cout << " -v, --version: version" << endl;
   cout << " -z, --null-flush: flush output on the null character " << endl;
@@ -79,7 +80,8 @@ void endProgram(char *name)
   cout << " -o: lexical transfer with surface forms" << endl;
   cout << " -p: post-generation" << endl;
   cout << " -x: inter-generation" << endl;
-  cout << " -s: SAO annotation system input processing" << endl;
+// Deprecated:
+// cout << " -s: SAO annotation system input processing" << endl;
   cout << " -t: apply transliteration dictionary" << endl;
   cout << " -v: version" << endl;
   cout << " -z: flush output on the null character " << endl;
@@ -124,7 +126,8 @@ int main(int argc, char *argv[])
       {"tagged-nm-gen", 0, 0, 'm'},
       {"post-generation", 0, 0, 'p'},
       {"inter-generation", 0, 0, 'x'},
-      {"sao", 0, 0, 's'},
+// Deprecated:
+// {"sao", 0, 0, 's'},
       {"transliteration", 0, 0, 't'},
       {"null-flush", 0, 0, 'z'},
       {"dictionary-case", 0, 0, 'w'},
@@ -143,9 +146,9 @@ int main(int argc, char *argv[])
   {
 #if HAVE_GETOPT_LONG
     int option_index;
-    int c = getopt_long(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h", long_options, &option_index);
+    int c = getopt_long(argc, argv, "abcegi:r:lmndopxtzwvCIWN:L:h", long_options, &option_index);
 #else
-    int c = getopt(argc, argv, "abcegi:r:lmndopxstzwvCIWN:L:h");
+    int c = getopt(argc, argv, "abcegi:r:lmndopxtzwvCIWN:L:h");
 #endif
 
     if(c == -1)
@@ -210,7 +213,6 @@ int main(int argc, char *argv[])
       case 'p':
       case 'x':
       case 't':
-      case 's':
       case 'C':
         if(cmd == 0)
         {
@@ -362,12 +364,14 @@ int main(int argc, char *argv[])
       fstp.intergeneration(input, output);
       break;
 
+/** Deprecated:
+
     case 's':
       fstp.initAnalysis();
       checkValidity(fstp);
       fstp.SAO(input, output);
       break;
-
+*/
     case 't':
       fstp.initPostgeneration();
       checkValidity(fstp);
diff --git a/lttoolbox/lt_reweight.cc b/lttoolbox/lt_reweight.cc
new file mode 100644
index 00000000..3a1c234e
--- /dev/null
+++ b/lttoolbox/lt_reweight.cc
@@ -0,0 +1,247 @@
+/*
+ * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <lttoolbox/transducer.h>
+#include <lttoolbox/compression.h>
+//#include <lttoolbox/lttoolbox_config.h>
+
+#include <lttoolbox/my_stdio.h>
+#include <lttoolbox/lt_locale.h>
+
+#include <cstdlib>
+#include <iostream>
+#include <libgen.h>
+#include <string>
+
+/**
+ * Print the version banner and usage line to stdout (only when name is
+ * non-NULL), then terminate the process with EXIT_FAILURE.
+ */
+void endProgram(char *name)
+{
+  if(name != NULL)
+  {
+    cout << basename(name) << " v" << PACKAGE_VERSION <<": assign weights to a compiled transducer based on a corpus." << endl;
+    cout << "USAGE: " << basename(name) << " analyser_bin_file tagged_corpus output_bin_file" << endl;
+  }
+  exit(EXIT_FAILURE);
+}
+
+/**
+ * Read a compiled dictionary from bin_file: the letters, then the
+ * alphabet symbols, then every named transducer section.
+ * @param bin_file open stream to read from
+ * @return ((alphabet, letters), map from section name to transducer)
+ */
+std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> >
+read_fst(FILE *bin_file)
+{
+  Alphabet new_alphabet;
+  wstring letters = L"";
+
+  std::map<wstring, Transducer> transducers;
+
+  // letters
+  int len = Compression::multibyte_read(bin_file);
+  while(len > 0)
+  {
+    letters.push_back(static_cast<wchar_t>(Compression::multibyte_read(bin_file)));
+    len--;
+  }
+
+  // symbols
+  new_alphabet.read(bin_file);
+
+  len = Compression::multibyte_read(bin_file);
+
+  while(len > 0)
+  {
+    int len2 = Compression::multibyte_read(bin_file);
+    wstring name = L"";
+    while(len2 > 0)
+    {
+      name += static_cast<wchar_t>(Compression::multibyte_read(bin_file));
+      len2--;
+    }
+    transducers[name].read(bin_file);
+
+    len--;
+  }
+
+  std::pair<Alphabet, wstring> alph_letters;
+  alph_letters.first = new_alphabet;
+  alph_letters.second = letters;
+  return std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > (alph_letters, transducers);
+}
+
+/**
+ * Intersect every section of the first dictionary with a prefix
+ * transducer built from the union of all sections of the second one.
+ * Sections with no transitions, or with no final state after the
+ * intersection, are cleared instead of replaced.
+ * @param file_mono open stream of the dictionary to trim
+ * @param file_bi open stream of the dictionary to trim against
+ * @return the first dictionary's alphabet and letters with its new sections
+ */
+std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> >
+trim(FILE *file_mono, FILE *file_bi)
+{
+  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > alph_trans_mono = read_fst(file_mono);
+  Alphabet alph_mono = alph_trans_mono.first.first;
+  std::map<wstring, Transducer> trans_mono = alph_trans_mono.second;
+  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > alph_trans_bi = read_fst(file_bi);
+  Alphabet alph_bi = alph_trans_bi.first.first;
+  std::map<wstring, Transducer> trans_bi = alph_trans_bi.second;
+
+  // The prefix transducer is the union of all transducers from bidix,
+  // with a ".*" appended
+  Transducer union_transducer;
+  // The "." in ".*" is a set of equal pairs of the output symbols
+  // from the monodix alphabet (<n>:<n> etc.)
+  Alphabet alph_prefix = alph_bi;
+  set<int> loopback_symbols; // ints refer to alph_prefix
+  alph_prefix.createLoopbackSymbols(loopback_symbols, alph_mono, Alphabet::right);
+
+  for(std::map<wstring, Transducer>::iterator it = trans_bi.begin(); it != trans_bi.end(); it++)
+  {
+    Transducer union_tmp = it->second;
+    if(union_transducer.isEmpty())
+    {
+      union_transducer = union_tmp;
+    }
+    else
+    {
+      union_transducer.unionWith(alph_bi, union_tmp);
+    }
+  }
+  union_transducer.minimize();
+
+  Transducer prefix_transducer = union_transducer.appendDotStar(loopback_symbols);
+  // prefix_transducer should _not_ be minimized (both useless and takes forever)
+  Transducer moved_transducer = prefix_transducer.moveLemqsLast(alph_prefix);
+
+
+  for(std::map<wstring, Transducer>::iterator it = trans_mono.begin(); it != trans_mono.end(); it++)
+  {
+    Transducer trimmed = it->second.intersect(moved_transducer,
+                                              alph_mono,
+                                              alph_prefix);
+
+    wcout << it->first << " " << it->second.size();
+    wcout << " " << it->second.numberOfTransitions() << endl;
+    if(it->second.numberOfTransitions() == 0)
+    {
+      wcerr << L"Warning: empty section! Skipping it ..."<<endl;
+      trans_mono[it->first].clear();
+    }
+    else if(trimmed.hasNoFinals()) {
+      wcerr << L"Warning: section had no final state after trimming! Skipping it ..."<<endl;
+      trans_mono[it->first].clear();
+    }
+    else {
+      trimmed.minimize();
+      trans_mono[it->first] = trimmed;
+    }
+  }
+
+  alph_trans_mono.second = trans_mono;
+  return alph_trans_mono;
+}
+
+
+/**
+ * Entry point: lt-reweight analyser_bin_file tagged_corpus output_bin_file
+ *
+ * Reads the two input files, runs trim() on them and writes the letters,
+ * the alphabet and every non-empty section to the output file.
+ * NOTE(review): the second operand is still read with read_fst(), i.e. as
+ * a compiled dictionary, although the usage text calls it a corpus.
+ */
+int main(int argc, char *argv[])
+{
+  // argv[1], argv[2] (inputs) and argv[3] (output) are all used below;
+  // argv[argc] is a null pointer, so exactly three operands are required.
+  if(argc != 4)
+  {
+    endProgram(argv[0]);
+  }
+
+  LtLocale::tryToSetLocale();
+
+  FILE *analyser = fopen(argv[1], "rb");
+  if(!analyser)
+  {
+    wcerr << "Error: Cannot open file '" << argv[1] << "'." << endl << endl;
+    exit(EXIT_FAILURE);
+  }
+  FILE *bidix = fopen(argv[2], "rb");
+  if(!bidix)
+  {
+    wcerr << "Error: Cannot open file '" << argv[2] << "'." << endl << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  std::pair<std::pair<Alphabet, wstring>, std::map<wstring, Transducer> > trimmed = trim(analyser, bidix);
+  Alphabet alph_t = trimmed.first.first;
+  wstring letters = trimmed.first.second;
+  std::map<wstring, Transducer> trans_t = trimmed.second;
+
+  int n_transducers = 0;
+  for(std::map<wstring, Transducer>::iterator it = trans_t.begin(); it != trans_t.end(); it++)
+  {
+    if(!(it->second.isEmpty()))
+    {
+      n_transducers++;
+    }
+  }
+
+  if(n_transducers == 0)
+  {
+    wcerr << L"Error: Trimming gave empty transducer!" << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // Write the file:
+  FILE *output = fopen(argv[3], "wb");
+  if(!output)
+  {
+    wcerr << "Error: Cannot open file '" << argv[3] << "'." << endl << endl;
+    exit(EXIT_FAILURE);
+  }
+
+  // letters
+  Compression::wstring_write(letters, output);
+
+  // symbols
+  alph_t.write(output);
+
+  // transducers
+  Compression::multibyte_write(n_transducers, output);
+  for(std::map<wstring, Transducer>::iterator it = trans_t.begin(); it != trans_t.end(); it++)
+  {
+    if(!(it->second.isEmpty()))
+    {
+      Compression::wstring_write(it->first, output);
+      it->second.write(output);
+    }
+  }
+
+  fclose(analyser);
+  fclose(bidix);
+  fclose(output);
+
+  return 0;
+}