diff --git a/.gitignore b/.gitignore index bce8ecbc3..47a5552a5 100644 --- a/.gitignore +++ b/.gitignore @@ -55,3 +55,4 @@ site/ /wandb/ /scratch/ core +/workspace/ diff --git a/configs/tiny/OLMo-20M.yaml b/configs/tiny/OLMo-20M.yaml index 04ffc6129..8fddfe84e 100644 --- a/configs/tiny/OLMo-20M.yaml +++ b/configs/tiny/OLMo-20M.yaml @@ -67,7 +67,7 @@ tokenizer: truncate_direction: right save_folder: workspace/${run_name} # doesn't matter since we'll upload to S3 -remote_save_folder: s3://ai2-llm/checkpoints/olmo-tiny/${run_name} +remote_save_folder: workspace/${run_name}/checkpoints # store locally save_overwrite: false # Unsharded checkpoints (for ddp) @@ -218,7 +218,7 @@ data: paths: ######### NON WEB DATA ######### # ~> GUTENBERG BOOKS (5.256 GT) - - s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy + - https://olmo-data.org/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy - https://olmo-data.org/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy - https://olmo-data.org/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy # ~> PES2O STEM PAPERS (57.21 GT) diff --git a/olmo/util.py b/olmo/util.py index af163e524..eaa49c298 100644 --- a/olmo/util.py +++ b/olmo/util.py @@ -4,6 +4,7 @@ import logging import os import re +import shutil import socket import sys import time @@ -361,6 +362,8 @@ def upload(source: PathOrStr, target: str, save_overwrite: bool = False): _gcs_upload(source, parsed.netloc, parsed.path.strip("/"), save_overwrite=save_overwrite) elif parsed.scheme in ("s3", "r2", "weka"): _s3_upload(source, parsed.scheme, parsed.netloc, parsed.path.strip("/"), save_overwrite=save_overwrite) + elif parsed.scheme in (""): + _file_upload(source, target, save_overwrite=save_overwrite) else: raise NotImplementedError(f"Upload not implemented for '{parsed.scheme}' scheme") @@ -579,6 +582,22 @@ def 
_wait_before_retry(attempt: int):
     time.sleep(min(0.5 * 2**attempt, 3.0))
 
 
+def _file_upload(source: Path, target: str, save_overwrite: bool = False):
+    """Copy the file ``source`` to the local path ``target``, creating missing parent directories.
+
+    :raises FileExistsError: If ``target`` exists and ``save_overwrite`` is false.
+    :raises OLMoNetworkError: If creating the directories or copying the file fails with an ``OSError``.
+    """
+    target_path = Path(target)
+    if not save_overwrite and target_path.exists():
+        raise FileExistsError(f"{target} already exists. Use save_overwrite to overwrite it.")
+    try:
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copyfile(source, target_path)
+    except OSError as err:  # filesystem failures only; a bare except would also trap KeyboardInterrupt
+        raise OLMoNetworkError(f"Failed to upload to {target}") from err
+
+
 def _s3_upload(
     source: Path, scheme: str, bucket_name: str, key: str, save_overwrite: bool = False, max_attempts: int = 3
 ):