From 1cbd3532c2a6ceafe403949b56993fe8a7760240 Mon Sep 17 00:00:00 2001
From: Miracleyin
Date: Fri, 16 Jun 2023 15:44:31 +0800
Subject: [PATCH 1/2] adding single machine download script

---
 .../arxiv/scripts/arxiv-single-download.sh    | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 data_prep/arxiv/scripts/arxiv-single-download.sh

diff --git a/data_prep/arxiv/scripts/arxiv-single-download.sh b/data_prep/arxiv/scripts/arxiv-single-download.sh
new file mode 100644
index 0000000..610d045
--- /dev/null
+++ b/data_prep/arxiv/scripts/arxiv-single-download.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Download the arXiv partitions on a single machine.
+# Runs run_download.py once per partition file, up to WORKERS at a time.
+
+set -e
+
+# Number of parallel download processes; tune to your CPU.
+WORKERS=8
+
+export DATA_DIR="./data/arxiv"
+
+# Set up the partition files consumed below.
+python run_download.py --aws_config aws_config.ini --workers "$WORKERS" --target_dir "$DATA_DIR" --setup
+
+# Run the downloader on one partition file ($1).
+process_file() {
+    local INPUT_FILE="$1"
+    echo "Processing input file is ${INPUT_FILE}"
+    python run_download.py --aws_config aws_config.ini --target_dir "$DATA_DIR" --input "$INPUT_FILE"
+}
+
+# Export the function so the bash child spawned by xargs can call it.
+export -f process_file
+
+# -I already runs one command per input line; -n 1 is mutually exclusive with it.
+ls "${DATA_DIR}"/partitions/*.txt | xargs -I {} -P "${WORKERS}" bash -c 'process_file "$@"' _ {}

From d6dca965d1b30ed82a7df017f21e5858d516f735 Mon Sep 17 00:00:00 2001
From: Miracleyin
Date: Fri, 16 Jun 2023 15:50:10 +0800
Subject: [PATCH 2/2] adding downloaded files check

---
 data_prep/arxiv/run_download.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/data_prep/arxiv/run_download.py b/data_prep/arxiv/run_download.py
index 8cb6cda..388bcfb 100644
--- a/data_prep/arxiv/run_download.py
+++ b/data_prep/arxiv/run_download.py
@@ -51,9 +51,14 @@ def run(self, input_file: str, tgt_dir: pathlib.Path, max_files=-1):
                 break
 
     def __download_file(self, key, tgt_dir: pathlib.Path):
-        print('\nDownloading s3://arxiv/{} t'
-              'o {}...'.format(key, pathlib.Path(tgt_dir, key)))
+        filename = pathlib.Path(tgt_dir, key)
+
+        if filename.exists():
+            print(f'File {filename} already exists, skipping download...')
+            return
+
+        print('\nDownloading s3://arxiv/{} to {}...'.format(key, filename))
 
         try:
             self.s3resource.meta.client.download_file(
                 Bucket='arxiv',