Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,4 @@ site/
/wandb/
/scratch/
core
/workspace/
4 changes: 2 additions & 2 deletions configs/tiny/OLMo-20M.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ tokenizer:
truncate_direction: right

save_folder: workspace/${run_name} # doesn't matter since we'll upload to S3
remote_save_folder: s3://ai2-llm/checkpoints/olmo-tiny/${run_name}
remote_save_folder: workspace/${run_name}/checkpoints # store locally
save_overwrite: false

# Unsharded checkpoints (for ddp)
Expand Down Expand Up @@ -218,7 +218,7 @@ data:
paths:
######### NON WEB DATA #########
# ~> GUTENBERG BOOKS (5.256 GT)
- s3://ai2-llm/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
- https://olmo-data.org/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-0-00000.npy
- https://olmo-data.org/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-1-00000.npy
- https://olmo-data.org/preprocessed/olmo-mix/v1_6-decontaminated/books/gpt-neox-olmo-dolma-v1_5/part-2-00000.npy
# ~> PES2O STEM PAPERS (57.21 GT)
Expand Down
19 changes: 19 additions & 0 deletions olmo/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import os
import re
import shutil
import socket
import sys
import time
Expand Down Expand Up @@ -361,6 +362,8 @@ def upload(source: PathOrStr, target: str, save_overwrite: bool = False):
_gcs_upload(source, parsed.netloc, parsed.path.strip("/"), save_overwrite=save_overwrite)
elif parsed.scheme in ("s3", "r2", "weka"):
_s3_upload(source, parsed.scheme, parsed.netloc, parsed.path.strip("/"), save_overwrite=save_overwrite)
elif parsed.scheme in (""):
_file_upload(source, target, save_overwrite=save_overwrite)
else:
raise NotImplementedError(f"Upload not implemented for '{parsed.scheme}' scheme")

Expand Down Expand Up @@ -579,6 +582,22 @@ def _wait_before_retry(attempt: int):
time.sleep(min(0.5 * 2**attempt, 3.0))


def _file_upload(
    source: Path, target: str, save_overwrite: bool = False
):
    """
    Copy a local file to ``target`` on the local filesystem.

    This is the "upload" backend for targets that have no URL scheme, i.e.
    plain local paths.

    :param source: Path of the existing local file to copy.
    :param target: Destination file path. Missing parent directories are created.
    :param save_overwrite: If ``False`` (the default), refuse to replace an
        existing ``target``.

    :raises FileExistsError: If ``target`` already exists and ``save_overwrite``
        is ``False``.
    :raises OLMoNetworkError: If creating the parent directories or copying the
        file fails. The underlying exception is attached as ``__cause__``.
    """
    target_path = Path(target)

    if not save_overwrite and target_path.exists():
        raise FileExistsError(f"{target} already exists. Use save_overwrite to overwrite it.")

    try:
        target_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(source, target_path)
    except Exception as err:
        # Catch ``Exception`` rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit propagate untouched instead of being
        # misreported as an upload failure. Chaining with ``from err`` keeps
        # the real cause (permissions, disk full, missing source, ...) visible.
        raise OLMoNetworkError(f"Failed to upload to {target}") from err


def _s3_upload(
source: Path, scheme: str, bucket_name: str, key: str, save_overwrite: bool = False, max_attempts: int = 3
):
Expand Down