From 14c2d48108c41881f3b0f24b9ea1ae2afd87ec90 Mon Sep 17 00:00:00 2001 From: Ussama Naal <606033+Samahu@users.noreply.github.com> Date: Sat, 4 May 2019 13:50:27 -0500 Subject: [PATCH 1/4] Update compilation instructions for fastBPE --- NMT/get_data_enfr.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NMT/get_data_enfr.sh b/NMT/get_data_enfr.sh index 15e404e..9e9c107 100755 --- a/NMT/get_data_enfr.sh +++ b/NMT/get_data_enfr.sh @@ -90,7 +90,7 @@ cd $TOOLS_PATH if [ ! -f "$FASTBPE" ]; then echo "Compiling fastBPE..." cd $FASTBPE_DIR - g++ -std=c++11 -pthread -O3 fast.cc -o fast + g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast fi echo "fastBPE compiled in: $FASTBPE" From bcf7b2abdbff67c7aa240feebd39cdfa0c27ad6b Mon Sep 17 00:00:00 2001 From: Ussama Naal <606033+Samahu@users.noreply.github.com> Date: Sat, 4 May 2019 15:11:50 -0500 Subject: [PATCH 2/4] Explicitly invoke perl for tokenizer.perl and normalize_punctuation.perl files to avoid permission-denied errors --- NMT/get_data_enfr.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/NMT/get_data_enfr.sh b/NMT/get_data_enfr.sh index 9e9c107..d952160 100755 --- a/NMT/get_data_enfr.sh +++ b/NMT/get_data_enfr.sh @@ -171,8 +171,8 @@ if ! [[ "$(wc -l < $TGT_RAW)" -eq "$N_MONO" ]]; then echo "ERROR: Number of line # tokenize data if ! [[ -f "$SRC_TOK" && -f "$TGT_TOK" ]]; then echo "Tokenize monolingual data..." - cat $SRC_RAW | $NORM_PUNC -l en | $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TOK - cat $TGT_RAW | $NORM_PUNC -l fr | $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TOK + cat $SRC_RAW | perl $NORM_PUNC -l en | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TOK + cat $TGT_RAW | perl $NORM_PUNC -l fr | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TOK fi echo "EN monolingual data tokenized in: $SRC_TOK" echo "FR monolingual data tokenized in: $TGT_TOK" @@ -233,10 +233,10 @@ if ! 
[[ -f "$SRC_TEST.sgm" ]]; then echo "$SRC_TEST.sgm is not found!"; exit; fi if ! [[ -f "$TGT_TEST.sgm" ]]; then echo "$TGT_TEST.sgm is not found!"; exit; fi echo "Tokenizing valid and test data..." -$INPUT_FROM_SGM < $SRC_VALID.sgm | $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_VALID -$INPUT_FROM_SGM < $TGT_VALID.sgm | $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_VALID -$INPUT_FROM_SGM < $SRC_TEST.sgm | $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TEST -$INPUT_FROM_SGM < $TGT_TEST.sgm | $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TEST +$INPUT_FROM_SGM < $SRC_VALID.sgm | perl $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_VALID +$INPUT_FROM_SGM < $TGT_VALID.sgm | perl $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_VALID +$INPUT_FROM_SGM < $SRC_TEST.sgm | perl $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TEST +$INPUT_FROM_SGM < $TGT_TEST.sgm | perl $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TEST echo "Applying BPE to valid and test files..." $FASTBPE applybpe $SRC_VALID.$CODES $SRC_VALID $BPE_CODES $SRC_VOCAB From f0991eaaa6b644dbe32de8e9533af511e629b92e Mon Sep 17 00:00:00 2001 From: Ussama Naal <606033+Samahu@users.noreply.github.com> Date: Sat, 4 May 2019 18:35:35 -0500 Subject: [PATCH 3/4] Explicitly call python on .py files --- NMT/get_data_enfr.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/NMT/get_data_enfr.sh b/NMT/get_data_enfr.sh index d952160..6c84399 100755 --- a/NMT/get_data_enfr.sh +++ b/NMT/get_data_enfr.sh @@ -207,8 +207,8 @@ echo "Full vocab in: $FULL_VOCAB" # binarize data if ! 
[[ -f "$SRC_TOK.$CODES.pth" && -f "$TGT_TOK.$CODES.pth" ]]; then echo "Binarizing data..." - $UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TOK.$CODES - $UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TOK.$CODES + python $UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TOK.$CODES + python $UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TOK.$CODES fi echo "EN binarized data in: $SRC_TOK.$CODES.pth" echo "FR binarized data in: $TGT_TOK.$CODES.pth" @@ -246,10 +246,10 @@ $FASTBPE applybpe $TGT_TEST.$CODES $TGT_TEST $BPE_CODES $TGT_VOCAB echo "Binarizing data..." rm -f $SRC_VALID.$CODES.pth $TGT_VALID.$CODES.pth $SRC_TEST.$CODES.pth $TGT_TEST.$CODES.pth -$UMT_PATH/preprocess.py $FULL_VOCAB $SRC_VALID.$CODES -$UMT_PATH/preprocess.py $FULL_VOCAB $TGT_VALID.$CODES -$UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TEST.$CODES -$UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TEST.$CODES +python $UMT_PATH/preprocess.py $FULL_VOCAB $SRC_VALID.$CODES +python $UMT_PATH/preprocess.py $FULL_VOCAB $TGT_VALID.$CODES +python $UMT_PATH/preprocess.py $FULL_VOCAB $SRC_TEST.$CODES +python $UMT_PATH/preprocess.py $FULL_VOCAB $TGT_TEST.$CODES # From 27e4fcaad633b5d776b72a1d4aef9bd3ad0f8349 Mon Sep 17 00:00:00 2001 From: Ussama Naal <606033+Samahu@users.noreply.github.com> Date: Sat, 4 May 2019 18:46:33 -0500 Subject: [PATCH 4/4] Invoke perl explicitly on the remainder of perl files --- NMT/get_data_enfr.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/NMT/get_data_enfr.sh b/NMT/get_data_enfr.sh index 6c84399..b461280 100755 --- a/NMT/get_data_enfr.sh +++ b/NMT/get_data_enfr.sh @@ -233,10 +233,10 @@ if ! [[ -f "$SRC_TEST.sgm" ]]; then echo "$SRC_TEST.sgm is not found!"; exit; fi if ! [[ -f "$TGT_TEST.sgm" ]]; then echo "$TGT_TEST.sgm is not found!"; exit; fi echo "Tokenizing valid and test data..." 
-$INPUT_FROM_SGM < $SRC_VALID.sgm | perl $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_VALID -$INPUT_FROM_SGM < $TGT_VALID.sgm | perl $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_VALID -$INPUT_FROM_SGM < $SRC_TEST.sgm | perl $NORM_PUNC -l en | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TEST -$INPUT_FROM_SGM < $TGT_TEST.sgm | perl $NORM_PUNC -l fr | $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TEST +perl $INPUT_FROM_SGM < $SRC_VALID.sgm | perl $NORM_PUNC -l en | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_VALID +perl $INPUT_FROM_SGM < $TGT_VALID.sgm | perl $NORM_PUNC -l fr | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_VALID +perl $INPUT_FROM_SGM < $SRC_TEST.sgm | perl $NORM_PUNC -l en | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l en -no-escape -threads $N_THREADS > $SRC_TEST +perl $INPUT_FROM_SGM < $TGT_TEST.sgm | perl $NORM_PUNC -l fr | perl $REM_NON_PRINT_CHAR | perl $TOKENIZER -l fr -no-escape -threads $N_THREADS > $TGT_TEST echo "Applying BPE to valid and test files..." $FASTBPE applybpe $SRC_VALID.$CODES $SRC_VALID $BPE_CODES $SRC_VOCAB