47 commits
52d6364
test... help?
AnjaRy Nov 14, 2021
cf02bf8
Your call is answered
vera-bernhard Nov 14, 2021
2cff918
Merge branch 'tannonk:master' into anja
AnjaRy Nov 15, 2021
5e94b7a
Add commands.txt
vera-bernhard Nov 15, 2021
2d20bcd
Merge branch 'vera' into anja
vera-bernhard Nov 15, 2021
78900f7
Update gitignore
vera-bernhard Nov 15, 2021
7387523
Update train to enable debugging for anja
vera-bernhard Nov 15, 2021
cf884c7
made some preparation for the bpe dropout
AnjaRy Nov 15, 2021
425f3ce
Update commands
vera-bernhard Nov 15, 2021
209513b
minor debugging in existing files
AnjaRy Nov 15, 2021
327449c
Merge remote-tracking branch 'origin/vera' into anja
AnjaRy Nov 15, 2021
d326fbd
try to find structure in train
AnjaRy Nov 15, 2021
f258acc
create bpe vocabulary for bpe train
AnjaRy Nov 15, 2021
a9acca4
Add lexical model
vera-bernhard Nov 15, 2021
eed3100
new stuff! take it!
AnjaRy Nov 15, 2021
a5de108
new stuff the second! take it!
AnjaRy Nov 15, 2021
037b62c
Add main to bpe
vera-bernhard Nov 15, 2021
8b5605c
Merge branch 'anja' into vera
vera-bernhard Nov 15, 2021
8e82959
Add lexical model
vera-bernhard Nov 15, 2021
713a4db
Merge branch 'vera' of github.com:vera-bernhard/atmt into vera
vera-bernhard Nov 15, 2021
abac85f
Update readme
vera-bernhard Nov 15, 2021
f44f16a
Update commands
vera-bernhard Nov 15, 2021
8a63625
new stuff the third! take it!
AnjaRy Nov 15, 2021
9213d3b
saving in between, small changes in bpe, adapt train.py
AnjaRy Nov 16, 2021
9736963
some more changes, pls work, I'm tired
AnjaRy Nov 16, 2021
6aaf35b
Add recursive bpe_segmentation
vera-bernhard Nov 16, 2021
61e2eea
Merge branch 'anja' of github.com:vera-bernhard/atmt into anja
vera-bernhard Nov 16, 2021
a6308a4
Update bpe
vera-bernhard Nov 16, 2021
73bf72f
update bpe segmentation
vera-bernhard Nov 16, 2021
e00b9a1
I don't know anymore
AnjaRy Nov 16, 2021
09d1046
Update bpe
vera-bernhard Nov 16, 2021
81f008e
sleep? haven't heard that name in a long time
AnjaRy Nov 16, 2021
55eeade
Merge remote-tracking branch 'origin/anja' into anja
AnjaRy Nov 16, 2021
5fc498d
Update bpe
vera-bernhard Nov 16, 2021
e951f6e
sleep? haven't heard that name in a long time
AnjaRy Nov 16, 2021
36c9ec9
last update
vera-bernhard Nov 16, 2021
3476a3b
add bpe files
vera-bernhard Nov 16, 2021
344cc52
Merge branch 'anja' of github.com:vera-bernhard/atmt into anja
vera-bernhard Nov 16, 2021
e0b431a
last trial, still hopeless... good night
AnjaRy Nov 16, 2021
a9d067b
Merge remote-tracking branch 'origin/anja' into anja
AnjaRy Nov 16, 2021
4953f18
last trial, still hopeless... good night!
AnjaRy Nov 16, 2021
9754030
Update bpe
vera-bernhard Nov 16, 2021
4853a69
Merge branch 'anja' of github.com:vera-bernhard/atmt into anja
vera-bernhard Nov 16, 2021
ff51639
Update commands.txt
vera-bernhard Nov 16, 2021
97c7475
Merge branch 'anja' into bpe
vera-bernhard Nov 16, 2021
e6cb2f9
add last changes
AnjaRy Nov 16, 2021
770b6af
commit last changes to train
AnjaRy Nov 16, 2021
36 changes: 36 additions & 0 deletions .gitignore
@@ -16,3 +16,39 @@ seq2seq/div_beam.py
translate_beam.py
translate_mbr.py
translate_div_beam.py

### venv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
.Python
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
.venv
pip-selfcheck.

ex3_env/

### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Local History for Visual Studio Code
.history/

### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide

# Support for Project snippet scope
!.vscode/*.code-snippets

3 changes: 3 additions & 0 deletions Test_help
@@ -0,0 +1,3 @@
help

I'm here for you;)
2 changes: 1 addition & 1 deletion assignments/03/preprocess_data.sh
@@ -39,6 +39,6 @@ rm $data/preprocessed/train.$src.p
rm $data/preprocessed/train.$tgt.p

# preprocess all files for model training
python preprocess.py --target-lang $tgt --source-lang $src --dest-dir $data/prepared/ --train-prefix $data/preprocessed/train --valid-prefix $data/preprocessed/valid --test-prefix $data/preprocessed/test --tiny-train-prefix $data/preprocessed/tiny_train --threshold-src 1 --threshold-tgt 1 --num-words-src 4000 --num-words-tgt 4000
python preprocess.py --target-lang $tgt --source-lang $src --dest-dir $data/prepared/ --train-prefix $data/preprocessed/train --valid-prefix $data/preprocessed/valid --test-prefix $data/preprocessed/test --tiny-train-prefix $data/preprocessed/tiny_train --threshold-src 1 --threshold-tgt 1

echo "done!"
170 changes: 170 additions & 0 deletions bpe.py
@@ -0,0 +1,170 @@
from seq2seq.data.dictionary import Dictionary
from collections import defaultdict
import re
import random


class BPE():
    def __init__(self, merges=2000):
        self.merges = merges
        self.bpe_vocabulary = Dictionary()
        self.eow = '</w>'
        self.space_words = []

    def update_spacewords(self, char1, char2):
        # merge every adjacent occurrence of (char1, char2) in space_words
        for index, word in enumerate(self.space_words):
            number = 0
            while number < len(word) - 1:
                if word[number] == char1 and word[number + 1] == char2:
                    word[number:number + 2] = [''.join(word[number:number + 2])]
                number += 1
            self.space_words[index] = word

    def create_vocabulary(self, vocabulary: Dictionary, outputfile):
        '''Create a BPE vocabulary from an existing word vocabulary.'''
        words = vocabulary.words
        space_words = []
        counts = vocabulary.counts
        pairs = defaultdict(int)

        # split every word into single characters and append the eow tag
        for word in words:
            chars = list(word)
            chars[-1] = chars[-1] + self.eow
            space_words.append(chars)
        self.space_words = space_words

        for i, word in enumerate(self.space_words):
            count = counts[i]
            # add each single character to the vocabulary with its count
            # and build the pair-frequency dictionary
            for y in range(len(word) - 1):
                self.bpe_vocabulary.add_word(word[y], count)
                pairs[word[y], word[y + 1]] += count
            self.bpe_vocabulary.add_word(word[-1], count)

        # merge the most frequent pair, self.merges times
        for a in range(self.merges):
            # look for the highest frequency
            char1, char2 = max(pairs, key=pairs.get)
            new_pair = char1 + char2
            new_count = pairs[char1, char2]
            # add the merged symbol to the vocabulary
            self.bpe_vocabulary.add_word(new_pair, new_count)
            # subtract the merged occurrences from the component symbols
            char1_count = self.bpe_vocabulary.counts[self.bpe_vocabulary.word2idx[char1]]
            char2_count = self.bpe_vocabulary.counts[self.bpe_vocabulary.word2idx[char2]]

            self.bpe_vocabulary.counts[self.bpe_vocabulary.word2idx[char1]] = char1_count - new_count
            self.bpe_vocabulary.counts[self.bpe_vocabulary.word2idx[char2]] = char2_count - new_count

            self.update_spacewords(char1, char2)

            # recompute pair frequencies over the merged words
            pairs = defaultdict(int)
            for u, word in enumerate(self.space_words):
                count = counts[u]
                for j in range(len(word) - 1):
                    pairs[word[j], word[j + 1]] += count

        with open(outputfile, 'w') as f:
            for number, word in enumerate(self.bpe_vocabulary.words):
                f.write(word + ' ' + str(self.bpe_vocabulary.counts[number]) + '\n')
        return outputfile, self.bpe_vocabulary

    def apply_bpe_to_file(self, input_file, vocabulary):
        '''Write a copy of input_file with BPE segmentation applied
        (eow tags, whitespace between byte pairs), e.g.
        preprocessed/train.en -> preprocessed/bpe/train.en
        '''
        path = 'data/en-fr/preprocessed/bpe/'
        with open(input_file, 'r') as f:
            data = f.readlines()

        file_name = re.match(r'data/en-fr/preprocessed/(.+?$)', input_file).group(1)
        output_file = path + file_name

        with open(output_file, 'w') as o:
            for line in data:
                line = line.strip()
                line = self.bpe_segmentation(line, vocabulary)
                o.write(line)
                o.write('\n')

        return output_file

    def bpe_segmentation(self, sent: str, vocab: Dictionary) -> str:
        '''Encode a sentence given a BPE dictionary.'''
        sorted_bpe_voc = sorted(vocab.words, key=len)

        def split_with_bpe_dict(word: str):
            if len(word) <= 1:
                return word

            # try the longest byte pairs first
            for pair in reversed(sorted_bpe_voc):
                if word == pair:
                    return ['', word, '']
                elif pair in word:
                    # mark the first occurrence of the pair, then split on the marker
                    word_replaced = re.sub(re.escape(pair), '<' + pair + '>', word, count=1)
                    left, right = word_replaced.split('<' + pair + '>')
                    encoded_left = split_with_bpe_dict(left)
                    encoded_right = split_with_bpe_dict(right)
                    return [encoded_left, pair, encoded_right]

            # no vocabulary entry matched: keep the word as-is
            return word

        result_sent = []
        for word in sent.split(' '):
            word = word + '</w>'
            word_bpe_list = [x for x in list(self.flatten(split_with_bpe_dict(word))) if x is not None]
            word_bpe = ' '.join(word_bpe_list)
            result_sent.append(word_bpe)

        result_sent = ' '.join(result_sent)
        result_sent = re.sub(' +', ' ', result_sent).strip()

        return result_sent

    def dropout(self, outputfile, probability=0.5):
        '''Sample a random subset of the BPE vocabulary (BPE-dropout).'''
        vocab = self.bpe_vocabulary
        new_vocab = Dictionary()
        number_random_samples = int(len(vocab) * probability)

        random_list = random.sample(range(0, len(self.bpe_vocabulary)), number_random_samples)

        for element in random_list:
            word = self.bpe_vocabulary.words[element]
            count = self.bpe_vocabulary.counts[element]
            new_vocab.add_word(word, count)

        # write the sampled vocabulary (not the full one) to outputfile
        with open(outputfile, 'w') as f:
            for number, word in enumerate(new_vocab.words):
                f.write(word + ' ' + str(new_vocab.counts[number]) + '\n')
        return outputfile, new_vocab

    @classmethod
    def flatten(cls, L):
        for l in L:
            if isinstance(l, list):
                yield from cls.flatten(l)
            else:
                yield l


if __name__ == '__main__':
    src_dict = Dictionary.load('data/en-fr/prepared/dict.fr')
    tgt_dict = Dictionary.load('data/en-fr/prepared/dict.en')

    myBPE = BPE()
    myBPE.create_vocabulary(src_dict, 'ladidaa')
    myBPE.bpe_segmentation('durant la fin du XXe siècle , la Yougoslavie était considérée comme un État voyou par les États @-@ Unis .', myBPE.bpe_vocabulary)
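The merge loop that `create_vocabulary` implements above can be sketched standalone on a toy corpus. This is a minimal illustration under our own names (`learn_bpe` and its toy word list are ours, not part of the repository):

```python
# Minimal sketch of the BPE merge loop: repeatedly merge the most
# frequent adjacent symbol pair, weighted by word frequency.
from collections import defaultdict

def learn_bpe(words, counts, merges):
    """Return the list of learned merge pairs, most frequent first."""
    # split words into symbols with an end-of-word marker on the last one
    space_words = [list(w[:-1]) + [w[-1] + '</w>'] for w in words]
    learned = []
    for _ in range(merges):
        # count adjacent pairs, weighted by word frequency
        pairs = defaultdict(int)
        for word, count in zip(space_words, counts):
            for i in range(len(word) - 1):
                pairs[word[i], word[i + 1]] += count
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        learned.append(best)
        # merge the best pair everywhere it occurs
        merged = []
        for word in space_words:
            i, out = 0, []
            while i < len(word):
                if i < len(word) - 1 and (word[i], word[i + 1]) == best:
                    out.append(word[i] + word[i + 1])
                    i += 2
                else:
                    out.append(word[i])
                    i += 1
            merged.append(out)
        space_words = merged
    return learned

merges_learned = learn_bpe(['low', 'lower', 'lowest'], [5, 2, 2], merges=3)
print(merges_learned)  # the first learned merge is ('l', 'o')
```

Unlike `bpe.py`, which keeps a running `Dictionary` of symbol counts, this sketch just recomputes pair frequencies from scratch each round; the learned merges are the same.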
94 changes: 94 additions & 0 deletions commands.txt
@@ -0,0 +1,94 @@

Remark: I created the following folders in 03:
--03
  --vera
    --checkpoints
      --tiny
      --all
    --translations
      --tiny
      --all

Preprocess
--------------------------------
According to the exercise description, preprocessing is done as follows:

python preprocess.py \
--source-lang fr \
--target-lang en \
--num-words-src 4000 \
--threshold-src 1 \
--dest-dir <tobereplaced> \
--vocab-src <tobereplaced> \
--vocab-trg <tobereplaced>

Train
--------------------------------

# train on tiny dataset and save checkpoints

python train.py \
--data data/en-fr/prepared \
--source-lang fr \
--target-lang en \
--save-dir assignments/03/vera/checkpoints/tiny \
--train-on-tiny

# train on entire dataset and save checkpoints
python train.py \
--data data/en-fr/prepared \
--source-lang fr \
--target-lang en \
--save-dir assignments/03/vera/checkpoints/all

# train on tiny dataset, use lexical model and save checkpoints
python train.py \
--data data/en-fr/prepared \
--source-lang fr \
--target-lang en \
--save-dir assignments/03/vera/checkpoints/tiny \
--train-on-tiny \
--decoder-use-lexical-model True


Translate
--------------------------------
# translate test with checkpoint trained on tiny
# TODO: should one take the last or the best checkpoint?

python translate.py \
--data data/en-fr/prepared \
--dicts data/en-fr/prepared \
--checkpoint-path assignments/03/vera/checkpoints/tiny/checkpoint_best.pt \
--output assignments/03/vera/translations/tiny/translations.txt

# translate test with checkpoint trained on all data
python translate.py \
--data data/en-fr/prepared \
--dicts data/en-fr/prepared \
--checkpoint-path assignments/03/vera/checkpoints/all/checkpoint_best.pt \
--output assignments/03/vera/translations/all/translations.txt

Postprocess
--------------------------------
# postprocess tiny translation
bash scripts/postprocess.sh \
assignments/03/vera/translations/tiny/translations.txt \
assignments/03/vera/translations/tiny/translations.p.txt en

# postprocess all translation
bash scripts/postprocess.sh \
assignments/03/vera/translations/all/translations.txt \
assignments/03/vera/translations/all/translations.p.txt en

Evaluation
--------------------------------
# evaluate the translation from the model trained on tiny
cat \
assignments/03/vera/translations/tiny/translations.p.txt \
| sacrebleu data/en-fr/raw/test.en

# evaluate the translation from the model trained on all data
cat \
assignments/03/vera/translations/all/translations.p.txt \
| sacrebleu data/en-fr/raw/test.en