Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: SomaDataIO
Title: Input/Output 'SomaScan' Data
Version: 6.5.0
Version: 6.5.0.9000
Authors@R: c(
person(given = "Stu",
family = "Field",
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,6 @@ importFrom(tidyr,gather)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,separate)
importFrom(tidyr,unite)
importFrom(tools,md5sum)
importFrom(utils,capture.output)
importFrom(utils,head)
importFrom(utils,read.csv)
Expand Down
12 changes: 12 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
# SomaDataIO (6.5.0.9000)

### Function and Object Improvements

* Removed restrictive file validation from `read_annotations()`
  - removed the `md5sum` checksum validation and version dictionary checks,
    which resulted in misleading warnings about unknown annotations files
  - the warning was often misleading because updates to the menu annotations
    file are not always aligned with the timing of CRAN releases
  - removed the `getAnnoVer()` function and the `ver_dict` object
  - removed the `tools::md5sum` import dependency

# SomaDataIO (6.5.0)

### Function and Object Improvements
Expand Down
137 changes: 18 additions & 119 deletions R/read-annotations.R
Original file line number Diff line number Diff line change
@@ -1,61 +1,27 @@
#' Import a SomaLogic Annotations File
#'
#' @param file A path to an annotations file location.
#' This is a sanctioned, versioned file provided by
#' SomaLogic Operating Co., Inc. and should be an _unmodified_
#' `*.xlsx` file.
#' This should be a SomaLogic annotations file in
#' `*.xlsx` format.
#' @return A `tibble` containing analyte-specific annotations and
#' related (e.g. lift/bridging) information, keyed on SomaLogic
#' [SeqId], the unique SomaScan analyte identifier.
#' @examples
#' \dontrun{
#' # for example
#' file <- "~/Downloads/SomaScan_11K_Annotated_Content.xlsx"
#' file <- "~/Downloads/SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu.xlsx"
#' anno_tbl <- read_annotations(file)
#' }
#' @importFrom readxl read_xlsx
#' @importFrom tools md5sum
#' @export
read_annotations <- function(file) {

if ( !(endsWith(file, "xlsx") || endsWith(file, "json")) ) {
stop("Annotations file must be either ", .value("*.xlsx"),
" or ", .value("*.json"), ".", call. = FALSE)
if ( !grepl("\\.xlsx$", file, ignore.case = TRUE) ) {
stop("Annotations file must be in `*.xlsx` format.", call. = FALSE)
}

ver <- getAnnoVer(file)

# cannot determine version
if ( !grepl("^SL-[0-9]+-rev[0-9]+", ver) ) {
stop(
"Unable to determine annotations file version: ", .value(ver),
".\nA valid annotations file version is required to proceed.",
call. = FALSE
)
}

# check if recognized version
if ( ver %in% names(ver_dict) ) {
md5_file <- strtrim(md5sum(file), 7L) |> unname()
md5_true <- strtrim(ver_dict[[ver]]$sha, 7L)

# file modified
if ( !identical(md5_file, md5_true) ) {
warning(
"Checksum mismatch. ", basename(file), " may have been modified.",
call. = FALSE
)
}
skip <- ver_dict[[ver]]$skip
} else {
warning(
"Unknown version of the annotations file: ", ver, ".",
call. = FALSE
)
skip <- 8L
}

tbl <- readxl::read_xlsx(file, sheet = "Annotations", skip = skip)
# Read the annotations file with standard skip value = 8L
tbl <- readxl::read_xlsx(file, sheet = "Annotations", skip = 8L)
Comment thread
scheidec marked this conversation as resolved.

# map these fields to match those in ADATs
map <- c(Target = "Target Name",
Expand All @@ -64,84 +30,17 @@ read_annotations <- function(file) {
EntrezGeneID = "Entrez Gene ID",
EntrezGeneSymbol = "Entrez Gene Name")
tbl <- dplyr::rename(tbl, !!!map)
stopifnot(
all(c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
"Organism", "UniProt", "Dilution", "EntrezGeneID",
"EntrezGeneSymbol") %in% names(tbl)
)
)
structure(tbl, version = ver)
}

# Build the annotations-file version string from the "Annotations" sheet.
# Assumes line 7 of the sheet holds the version info: the first 6 lines are
# skipped and exactly one row is read, with every cell treated as text.
getAnnoVer <- function(file) {
  ver_row <- readxl::read_xlsx(
    file,
    sheet     = "Annotations",
    skip      = 6L,
    n_max     = 1L,
    col_names = c("text", "doc", "version", "date"),
    col_types = "text"
  )
  # upper-case the leading text field, lower-case the version field
  txt    <- toupper(ver_row$text)
  doc_id <- ver_row$doc
  rev_id <- tolower(ver_row$version)
  dated  <- ver_row$date
  # join the four fields with "-" and drop any spaces
  joined <- paste(txt, doc_id, rev_id, dated, sep = "-")
  gsub(" +", "", joined)
}
# check for expected fields in annotations file
required_cols <- c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
"Organism", "UniProt", "EntrezGeneID", "EntrezGeneSymbol")
missing_cols <- setdiff(required_cols, names(tbl))

# version dictionary of key-value pairs
# for file characteristics
# SHA hashes are calculated with `tools::md5sum()`
ver_dict <- list(
# The first 2 are for testing
# dummy version; 5k -> 7k
"SL-99999999-rev99-1999-01" = list(col_serum = "Serum Scalar v4.0 to v4.1",
col_plasma = "Plasma Scalar v4.0 to v4.1"),
# test-anno.xlsx file; 7k -> 5k
"SL-12345678-rev0-2021-01" = list(sha = "8a345fa621377d0bac40fc8c47f5579d",
col_serum = "Serum Scalar v4.1 to v4.0",
col_plasma = "Plasma Scalar v4.1 to v4.0",
which_serum = 40,
which_plasma = 42,
skip = 8L,
rows = 1,
cols = 43),
# 7k -> 5k
"SL-00000571-rev2-2021-06" = list(sha = "5fa46834ed826eb1e8dba88698cf7a76",
col_serum = "Serum Scalar v4.1 to v4.0",
col_plasma = "Plasma Scalar v4.1 to v4.0",
which_serum = 40,
which_plasma = 42,
skip = 8L,
rows = 7605,
cols = 43),
# 5k -> 7k
"SL-00000246-rev5-2021-06" = list(sha = "7d92666369d4e33364b11804f2d1f8ce",
col_serum = "Serum Scalar v4.0 to v4.1",
col_plasma = "Plasma Scalar v4.0 to v4.1",
which_serum = 40,
which_plasma = 42,
skip = 8L,
rows = 5293,
cols = 43),
if ( length(missing_cols) > 0 ) {
stop("Missing required columns in annotations file: ",
paste(missing_cols, collapse = ", "), call. = FALSE)
}

tbl
}

# source 7k ----
# https://menu.somalogic.com/file-downloads/menu-annotations
# SL00000571_SomaScan_7K_v4.1_Plasma_Serum_Annotated_Menu.xlsx
"SL-00000571-rev11-2025-09" = list(sha = "f13dbe8d5f97bdf56eb107d2cff15408",
col_serum = c("Serum Scalar v4.1 7K to v4.0 5K",
"Serum Scalar v4.1 7K to v5.0 11K"),
col_plasma = c("Plasma Scalar v4.1 7K to v4.0 5K",
"Plasma Scalar v4.1 7K to v5.0 11K"),
which_serum = c(43, 47),
which_plasma = c(45, 49),
skip = 8L,
rows = 7605,
cols = 50),
# source 11k ----
# https://menu.somalogic.com/file-downloads/menu-annotations
# SL00000906_SomaScan_11K_v5.0_Plasma_Serum_Annotated_Menu.xlsx
"SL-00000906-rev8-2025-09" = list(sha = "48f7aafc713acdd7896f010f62506b51",
col_serum = c("Serum Scalar v5.0 11K to v4.1 7K",
"Serum Scalar v5.0 11K to v4.0 5K"),
col_plasma = c("Plasma Scalar v5.0 11K to v4.1 7K",
"Plasma Scalar v5.0 11K to v4.0 5K"),
which_serum = c(43, 47),
which_plasma = c(45, 49),
skip = 8L,
rows = 11092,
cols = 51)
)
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
<!-- badges: start -->

![GitHub
version](https://img.shields.io/badge/Version-6.4.0.9000-success.svg?style=flat&logo=github)
version](https://img.shields.io/badge/Version-6.5.0.9000-success.svg?style=flat&logo=github)
[![CRAN
status](http://www.r-pkg.org/badges/version/SomaDataIO)](https://cran.r-project.org/package=SomaDataIO)
[![Downloads](https://cranlogs.r-pkg.org/badges/SomaDataIO)](https://cran.r-project.org/package=SomaDataIO)
Expand Down
7 changes: 3 additions & 4 deletions man/read_annotations.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 10 additions & 35 deletions tests/testthat/test-read-annotations.R
Original file line number Diff line number Diff line change
@@ -1,48 +1,23 @@

# Setup ----
file <- test_path("testdata", "test-anno.xlsx")

test_that("`ver_dict` is updated and correct", {
  # keys expected in the version dictionary, in order
  expected_keys <- c(
    "SL-99999999-rev99-1999-01",
    "SL-12345678-rev0-2021-01",
    "SL-00000571-rev2-2021-06",
    "SL-00000246-rev5-2021-06",
    "SL-00000571-rev11-2025-09",
    "SL-00000906-rev8-2025-09"
  )
  expect_length(ver_dict, 6L)
  expect_named(ver_dict, expected_keys)
})

test_that("`getAnnoVer()` parses the version correctly", {
  # version string parsed from the test annotations fixture
  parsed_ver <- getAnnoVer(file)
  expect_equal(parsed_ver, "SL-12345678-rev0-2021-01")
})

# Testing ----
test_that("`read_annotations()` parses the annotations file correctly", {
tbl <- read_annotations(file)
expect_s3_class(tbl, "tbl_df")
expect_equal(dim(tbl), c(1L, 43L))
ver <- attr(tbl, "version")
expect_equal(ver, "SL-12345678-rev0-2021-01")
expect_true(ver_dict[[ver]]$col_serum == names(tbl)[ver_dict[[ver]]$which_serum])
expect_true(ver_dict[[ver]]$col_plasma == names(tbl)[ver_dict[[ver]]$which_plasma])
})

test_that("error conditions trigger stop and warnings when appropriate", {
# Check that required columns are present after field mapping
expected_cols <- c("SeqId", "SomaId", "Target", "Type", "TargetFullName",
"Organism", "UniProt", "EntrezGeneID",
"EntrezGeneSymbol")
expect_true(all(expected_cols %in% names(tbl)))
})

test_that("error conditions trigger appropriate errors", {
expect_error(
read_annotations("foo.txt"),
"Annotations file must be either"
)

expect_warning(
with_pkg_object(SomaDataIO:::ver_dict[-2L], read_annotations(file)),
"Unknown version of the annotations file:"
)

# temp modify md5sha
tmp <- SomaDataIO:::ver_dict
tmp$`SL-12345678-rev0-2021-01`$sha <- "x0x0x0x0x"
expect_warning(
with_pkg_object(tmp, read_annotations(file)),
"Checksum mismatch. test-anno.xlsx may have been modified"
"Annotations file must be"
)
})