From 5b376a8d62a13b1d4ab46026aed108f5a3caabc1 Mon Sep 17 00:00:00 2001
From: jiaruifang
Date: Mon, 8 May 2023 12:06:44 +0800
Subject: [PATCH] Fix some errors in the Makefile for the lm target

---
 data_prep/cc/cc_net/Makefile                  | 23 +++++++++++--------
 data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py | 14 +++++++----
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/data_prep/cc/cc_net/Makefile b/data_prep/cc/cc_net/Makefile
index e887ac4..bf6d3c2 100644
--- a/data_prep/cc/cc_net/Makefile
+++ b/data_prep/cc/cc_net/Makefile
@@ -1,6 +1,7 @@
 # Makefile to install CC-Net and train the LMs.
 # `make` or `make help` to get some help.
 
+
 # Arguments:
 lang?=en
 process?=8
@@ -58,6 +59,7 @@ dl_lm:
 lm: data/lm_sp/$(lang).sp.model data/lm_sp/$(lang).arpa.bin
 	# Computes a 5-gram LM for the given language -> make lang=it lm
 	# Restricted to the first NDOC_FOR_LM documents
+	mkdir -p data/lm_sp
 
 sp: data/lm_sp/$(lang).sp.model
 	# Train a sentence piece model on Wikipedia -> make lang=it sp
@@ -111,20 +113,21 @@ data/lm_sp/%.sp.model: data/cirrus/txt/%.opening.txt
 	echo "Trained SentencePiece model with `wc -l $(basename $@).vocab` pieces"
 
 data/cirrus/sp/%.opening.txt: data/cirrus/gz/%.json.gz data/lm_sp/%.sp.model
+	mkdir -p data/cirrus/sp
 	$(SPM_ENCODE) \
 		--model=$(word 2,$^) \
 		--output_format=piece \
-		< <(python get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
+		< <(python cc_net/get_wiki_cirrus.py opening --file $< --n_docs $(NDOC_FOR_LM)) \
 		> $@
 
 data/cirrus/txt/%.opening.txt: data/cirrus/gz/%.json.gz
-	python get_wiki_cirrus.py opening \
+	python cc_net/get_wiki_cirrus.py opening \
 		--n_docs $(NDOC_FOR_LM) \
 		--file $< --output $@
 
 data/cirrus/gz/%.json.gz:
-	mkdir $(@D)
-	python get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
+	mkdir -p $(@D)
+	python cc_net/get_wiki_cirrus.py dl --lang $(call get_lang,$(@F)) --output_dir $(@D)
 
 clean:
 	# Remove intemediary files, dataset, third_party sources
@@ -155,11 +158,8 @@ bin/lmplz: third_party/kenlm
 third_party/sentencepiece:
 	# Download sentencepiece sources: https://github.com/google/sentencepiece
 	mkdir -p $(@D)
-	wget -c -O $(@D)/sentencepiece.zip https://github.com/google/sentencepiece/archive/v0.1.83.zip
-	unzip -o -d $(@D) $(@D)/sentencepiece.zip
-	rm $(@D)/sentencepiece.zip
-	# remove the version id from the folder name
-	mv $(@D)/sentencepiece-* $@
+	git clone https://github.com/google/sentencepiece.git $(@D)/sentencepiece
+
 
 bin/spm_train: third_party/sentencepiece
 	# Compiles sentencepiece binaries
@@ -172,7 +172,10 @@ bin/spm_train: third_party/sentencepiece
 	# $ cd $

diff --git a/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py b/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
--- a/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
+++ b/data_prep/cc/cc_net/cc_net/get_wiki_cirrus.py
@@ ... @@ ... Dict[str, str]:
 def wget(url: str, output: Path):
-    subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True)
-    tmp(output).replace(output)
+    if not output.is_file():
+        subprocess.run(["wget", url, "-O", tmp(output), "-q"], check=True)
+        tmp(output).replace(output)
+    else:
+        print(f"File {output} already exists, skipping download")
+
     assert (
         output.stat().st_size > 10_000
     ), f"File {output} downloaded from {url} looks too small"
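
Note on the wget() change: the patched helper makes the Wikipedia dump download idempotent, so re-running `make lm` after an interrupted run skips files that were already fetched, while the existing tmp()-then-rename convention keeps a partial download from ever shadowing the final path. The same concern motivates the added `mkdir -p` lines, which let each rule create its own output directory instead of assuming a previous rule did. Below is a minimal, self-contained sketch of the download pattern under those assumptions; the tmp() helper here is a hypothetical stand-in for the repository's own, and download_once is an illustrative name, not a function from cc_net.

    import subprocess
    from pathlib import Path

    def tmp(output: Path) -> Path:
        # Hypothetical stand-in for cc_net's tmp() helper: fetch into a
        # sibling "tmp." file so an interrupted wget never leaves a
        # half-written file at the final path.
        return output.with_name("tmp." + output.name)

    def download_once(url: str, output: Path) -> None:
        # Skip the download if a previous run already produced the target.
        if output.is_file():
            print(f"File {output} already exists, skipping download")
        else:
            subprocess.run(["wget", url, "-O", str(tmp(output)), "-q"], check=True)
            # Rename only after wget exited successfully (check=True),
            # so the final path appears atomically.
            tmp(output).replace(output)
        # Sanity check kept from the original helper: a real dump is large.
        assert output.stat().st_size > 10_000, (
            f"File {output} downloaded from {url} looks too small"
        )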