From d9588f6677e4227e69bb4961ed6e238bca58e6fa Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 15 Sep 2025 14:21:23 +1000 Subject: [PATCH] add s3_list_files_dir s3_file_presignedurl --- DESCRIPTION | 1 + NAMESPACE | 2 + R/fs_s3.R | 79 +++++++++++++++++++++++++++++++++ deploy/conda/recipe/recipe.yaml | 2 + man/s3_file_presignedurl.Rd | 29 ++++++++++++ man/s3_list_files_dir.Rd | 29 ++++++++++++ 6 files changed, 142 insertions(+) create mode 100644 R/fs_s3.R create mode 100644 man/s3_file_presignedurl.Rd create mode 100644 man/s3_list_files_dir.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 1ce198f..4c264dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,6 +23,7 @@ Imports: fs, glue, log4r, + paws.storage, purrr, R6, readr, diff --git a/NAMESPACE b/NAMESPACE index e6b9db2..744c8fb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,8 @@ export(nemoverse_wf_dispatch) export(parse_file) export(parse_file_nohead) export(pkg_found) +export(s3_file_presignedurl) +export(s3_list_files_dir) export(schema_guess) export(set_tbl_version_attr) export(valid_out_fmt) diff --git a/R/fs_s3.R b/R/fs_s3.R new file mode 100644 index 0000000..a2494b6 --- /dev/null +++ b/R/fs_s3.R @@ -0,0 +1,79 @@ +#' List Objects in AWS S3 Directory +#' +#' Returns some or all (up to 1,000) of the objects in an S3 directory. +#' +#' @param s3dir S3 directory. +#' @param max_objects Maximum objects returned. +#' +#' +#' @return A tibble with object basename, size, last modified timestamp, and +#' full S3 path. 
+#' @examples
+#' \dontrun{
+#' p1 <- "s3://project-data-889522050439-ap-southeast-2/byob-icav2"
+#' p2 <- "project-wgs-accreditation/analysis/oncoanalyser-wgts-dna"
+#' p3 <- "20250910013ce65a/L2100216__L2100215"
+#' s3dir <- file.path(p1, p2, p3)
+#' s3_list_files_dir(s3dir, max_objects = 15)
+#' }
+#' @export
+s3_list_files_dir <- function(s3dir, max_objects = 1000) {
+  # Accept exactly one "s3://bucket[/prefix]" URI.
+  stopifnot(
+    is.character(s3dir),
+    length(s3dir) == 1,
+    grepl("^s3://", s3dir)
+  )
+  # Split on the first "/" after the scheme so that a bare "s3://bucket"
+  # yields an empty prefix instead of the unparsed input string.
+  uri <- sub("^s3://", "", s3dir)
+  bucket <- sub("/.*$", "", uri)
+  prefix <- sub("^[^/]*/?", "", uri)
+  stopifnot(nzchar(bucket))
+  s3 <- paws.storage::s3()
+  # Single request: no continuation token is followed, so only the first
+  # page of results (at most `max_objects` keys) is returned.
+  l <- s3$list_objects_v2(
+    Bucket = bucket,
+    Prefix = prefix,
+    MaxKeys = max_objects
+  )
+  # Fail fast if the response lacks the fields used below.
+  stopifnot(all(c("Contents", "KeyCount") %in% names(l)))
+  cols_sel <- c("bname", "size", "lastmodified", "path")
+  # No matching objects: return an empty table with the same column names.
+  # NOTE(review): "cccc" presumably makes every column character, whereas
+  # `size` is fs_bytes in the non-empty result - confirm this is intended.
+  if (l[["KeyCount"]] == 0) {
+    return(empty_tbl(cnames = cols_sel, ctypes = "cccc"))
+  }
+  # One row per returned object, then derive the full S3 path, the basename
+  # and a human-readable size before keeping only the documented columns.
+  l[["Contents"]] |>
+    purrr::map(
+      \(x) {
+        tibble::tibble(
+          Key = x[["Key"]],
+          Size = x[["Size"]],
+          lastmodified = as.character(x[["LastModified"]])
+        )
+      }
+    ) |>
+    dplyr::bind_rows() |>
+    dplyr::mutate(
+      path = paste0("s3://", bucket, "/", .data$Key),
+      bname = basename(.data$path),
+      size = fs::as_fs_bytes(.data$Size)
+    ) |>
+    dplyr::select(dplyr::all_of(cols_sel))
+}
+
+#' S3 Generate Presigned URL
+#'
+#' @param client S3 client. Make sure you use `signature_version = "s3v4"` (see example).
+#' @param s3path Full path to S3 object.
+#' @param expiry_seconds Number of seconds the presigned URL is valid for (3600 = 1 hour).
+#'
+#' @return An S3 presigned URL.
+#' @examples
+#' \dontrun{
+#' client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4"))
+#' s3path <- "s3://bucket1/path/to/file.tsv"
+#' s3_file_presignedurl(client, s3path)
+#' }
+#'
+#' @export
+s3_file_presignedurl <- function(client, s3path, expiry_seconds = 604800) {
+  # Require exactly one "s3://bucket/key" URI.
+  stopifnot(
+    is.character(s3path),
+    length(s3path) == 1,
+    grepl("^s3://", s3path)
+  )
+  # Bucket is everything up to the first "/" after the scheme; the object
+  # key is the remainder.
+  uri <- sub("^s3://", "", s3path)
+  bucket <- sub("/.*$", "", uri)
+  key <- sub("^[^/]*/?", "", uri)
+  # A presigned GET needs both parts. A key-less "s3://bucket" used to slip
+  # through with the whole URI used as both bucket and key.
+  stopifnot(nzchar(bucket), nzchar(key))
+  # The default expiry of 604800 seconds is 7 days.
+  client$generate_presigned_url(
+    client_method = "get_object",
+    params = list(Bucket = bucket, Key = key),
+    expires_in = expiry_seconds
+  )
+}
diff --git a/deploy/conda/recipe/recipe.yaml b/deploy/conda/recipe/recipe.yaml index 564598c..ada5cfa 100644 --- a/deploy/conda/recipe/recipe.yaml +++ b/deploy/conda/recipe/recipe.yaml @@ -25,6 +25,7 @@ requirements: - r-glue - r-knitr - r-log4r + - r-paws.storage - r-purrr - r-r6 - r-readr @@ -44,6 +45,7 @@ requirements: - r-glue - r-knitr - r-log4r + - r-paws.storage - r-purrr - r-r6 - r-readr diff --git a/man/s3_file_presignedurl.Rd b/man/s3_file_presignedurl.Rd new file mode 100644 index 0000000..f7dddd2 --- /dev/null +++ b/man/s3_file_presignedurl.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fs_s3.R +\name{s3_file_presignedurl} +\alias{s3_file_presignedurl} +\title{S3 Generate Presigned URL} +\usage{ +s3_file_presignedurl(client, s3path, expiry_seconds = 604800) +} +\arguments{ +\item{client}{S3 client. Make sure you use \code{signature_version = "s3v4"} (see example).} + +\item{s3path}{Full path to S3 object.} + +\item{expiry_seconds}{Number of seconds the presigned URL is valid for (3600 = 1 hour).} +} +\value{ +An S3 presigned URL.
+} +\description{ +S3 Generate Presigned URL +} +\examples{ +\dontrun{ +client <- paws.storage::s3(paws.storage::config(signature_version = "s3v4")) +s3path <- "s3://bucket1/path/to/file.tsv" +s3_file_presignedurl(client, s3path) +} + +} diff --git a/man/s3_list_files_dir.Rd b/man/s3_list_files_dir.Rd new file mode 100644 index 0000000..a5aada3 --- /dev/null +++ b/man/s3_list_files_dir.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fs_s3.R +\name{s3_list_files_dir} +\alias{s3_list_files_dir} +\title{List Objects in AWS S3 Directory} +\usage{ +s3_list_files_dir(s3dir, max_objects = 1000) +} +\arguments{ +\item{s3dir}{S3 directory.} + +\item{max_objects}{Maximum objects returned.} +} +\value{ +A tibble with object basename, size, last modified timestamp, and +full S3 path. +} +\description{ +Returns some or all (up to 1,000) of the objects in an S3 directory. +} +\examples{ +\dontrun{ +p1 <- "s3://project-data-889522050439-ap-southeast-2/byob-icav2" +p2 <- "project-wgs-accreditation/analysis/oncoanalyser-wgts-dna" +p3 <- "20250910013ce65a/L2100216__L2100215" +s3dir <- file.path(p1, p2, p3) +s3_list_files_dir(s3dir, max_objects = 15) +} +}