From acab990846295ac1d8dc77eb60ebffd9c9c82add Mon Sep 17 00:00:00 2001 From: Hannes Reinwald Date: Fri, 13 Jun 2025 14:07:13 +0200 Subject: [PATCH 1/6] Added small stx_tax_groups function to retrieve all defined taxonomic groups from the taxa table. --- R/standartox.R | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/R/standartox.R b/R/standartox.R index fea1f76..e8c6fd6 100644 --- a/R/standartox.R +++ b/R/standartox.R @@ -330,6 +330,24 @@ stx_taxa = function(silent = TRUE, ...) { return(result) } +#' Retrieve taxa groups +#' +#' Retrieve a character vector of all tax_groups defined in Standartox. +#' +#' @return Returns a character vector of all tax_groups defined in Standartox. +#' @author Hannes Reinwald \email{hannes.reinwald@@bayer.com} +#' @examples +#' \donttest{ +#' # might fail if there is no internet connection or Zenodo.org not not available +#' stx_tax_groups() +#' } +#' +#' @export +stx_tax_groups = function(...){ + taxa = na.omit( unique(stx_taxa(...)$tax_group) ) + taxa = unique( unlist( strsplit(taxa, "\\s*\\|\\|\\s*") ) ) + return(sort(taxa)) +} # IDEA # microbenchmark::microbenchmark({ From a9e9cdda6ac7520f72f09cb677e57760649c8be8 Mon Sep 17 00:00:00 2001 From: Hannes Reinwald Date: Mon, 16 Jun 2025 12:44:02 +0200 Subject: [PATCH 2/6] Added stx_query() function. --- R/standartox.R | 292 +++++++++++++++++++++++++++++++------------------ 1 file changed, 183 insertions(+), 109 deletions(-) diff --git a/R/standartox.R b/R/standartox.R index e8c6fd6..cac2083 100644 --- a/R/standartox.R +++ b/R/standartox.R @@ -149,119 +149,193 @@ stx_catalog = function(silent = TRUE, ...) { #' } #' #' @export -stx_query = function(cas = NULL, - concentration_unit = NULL, - concentration_type = NULL, - duration = NULL, - endpoint = c('XX50', 'NOEX', 'LOEX'), - effect = NULL, - exposure = NULL, - chemical_role = NULL, - chemical_class = NULL, - taxa = NULL, - ecotox_grp = NULL, - trophic_lvl = NULL, - habitat = NULL, - region = NULL, - vers = NULL, - ...) { - # to avoid NOTE in R CMD check --as-cran - casnr = outlier = concentration = cname = NULL - # debuging - # browser() # debuging - # cas = '1071-83-6'; concentration_unit = NULL; concentration_type = NULL; duration = NULL; endpoint = 'XX50'; effect = NULL; exposure = NULL; chemical_role = NULL; chemical_class = NULL; taxa = NULL; ecotox_grp = NULL; trophic_lvl = NULL; habitat = NULL; region = NULL; vers = NULL - # checks - endpoint = match.arg(endpoint) - # request - body = list(casnr = cas, - concentration_unit = concentration_unit, - concentration_type = concentration_type, - duration = duration, - endpoint = endpoint, - effect = effect, - exposure = exposure, - chemical_role = chemical_role, - chemical_class = chemical_class, - taxa = taxa, - ecotox_grp = ecotox_grp, - trophic_lvl = trophic_lvl, - habitat = habitat, - region = region, - vers = vers) - # POST - stx_message(body) - res = try(httr::POST( - file.path(domain(), 'filter'), - body = body, - encode = 'json', - ), silent = TRUE) - stx_availability(res) - if (res$status_code == 400) { - warning(jsonlite::fromJSON(httr::content(res, type = 'text', encoding = 'UTF-8'))) - out_fil = data.table(NA) - filtered = data.table(NA) - out_agg = data.table(NA) +#' +stx_query_import = function(silent = TRUE, + data_type = c('test_fin.fst','phch.fst','taxa.fst','refs.fst'), + ...) { + if (!silent) message('Retrieving Standartox listed Chemicals ...') + if (silent) { + result = suppressMessages( stx_download(data_type = data_type, ...) ) + } else { + result = stx_download(data_type = data_type, ...) } - if (res$status_code == 200) { - out_fil = read_bin_vec(res$content, type = 'fst') - if (nrow(out_fil) == 0) { - warning('No results found.') - out_fil = data.table(NA) - filtered = data.table(NA) - out_agg = data.table(NA) - } else { - # CAS column (not sent through API to reduce size) - out_fil[ , cas := cas_conv(casnr) ][ , casnr := NULL ] - # outliers - para = c('cas', - 'concentration_unit', - 'concentration_type', - 'duration', - 'tax_taxon', - 'effect', - 'endpoint', - 'exposure') - out_fil[ , - outlier := flag_outliers(concentration), - by = para ] - # colorder - nam = names(out_fil) - col_order = c('cname', 'cas', 'iupac_name', 'inchikey', 'inchi', 'molecularweight', - 'result_id', 'endpoint', 'effect', 'exposure', 'trophic_lvl', 'ecotox_grp', 'concentration_type', - 'concentration', 'concentration_unit', 'concentration_orig', 'concentration_unit_orig', - 'duration', 'duration_unit', 'outlier', - c('species_number', grep('tax_|hab_|reg_', nam, value = TRUE)), - grep('cro_|ccl_', nam, value = TRUE), - grep('ref', nam, value = TRUE)) - setcolorder(out_fil, col_order) - # order - setorder(out_fil, cname) - # id - col_id = c('cname', 'cas', 'inchikey', 'inchi', 'result_id', 'species_number', 'ref_number') - id = out_fil[ , .SD, .SDcols = col_id ] - # short - col_short = c('cname', 'cas', 'inchikey', - 'endpoint', 'effect', 'exposure', 'trophic_lvl', 'ecotox_grp', 'concentration_type', - 'concentration', 'concentration_unit', 'concentration_orig', 'concentration_unit_orig', - 'duration', 'duration_unit', 'outlier', - grep('tax_', nam, value = TRUE)) - filtered = out_fil[ , .SD, .SDcols = col_short ] - # aggregate - out_agg = suppressWarnings( - stx_aggregate(out_fil) - ) + # Convert data frame to data table + result = lapply( result, function(dt) data.table::setDT(dt) ) + return(result) +} + +filter_dt = function(dt, var_ls){ + stopifnot(is.data.frame(dt)) + stopifnot(is.list(var_ls)) + data.table::setDT(dt) # Ensure dt is a data.table + + var_check = !unlist( lapply(var_ls, is.null) ) + var_check = names( which(var_check) ) # select only those variables that are not NULL + + # Loop through the variables and filter the taxa.fst data frame + if (length(var_check) > 0) { + dt.out = list() + + for(var in var_check){ + message("Filtering for variable: ", var) + if (var %in% colnames(dt)) { # Check if the variable exists as a column in data table + if( !is.null(var_ls[[var]]) ){ + # Filter the data table based on the specified column and the values in the variable + filter_val = var_ls[[var]] + dt.out[[var]] <- dt[ dt[[var]] %in% filter_val ] + if(nrow(dt.out[[var]]) == 0){ warning( paste("No query matches found for:\t", var) ) } + } + } else { + warning(paste("Variable", var, "not found in data table. Skipping filter.")) + } } - } - # meta - out_meta = stx_meta() - # return - list(filtered = filtered, - filtered_all = rm_col_na(out_fil), - # TODO id = id, - aggregated = out_agg, - meta = out_meta) + + # Combine results + dt.out = distinct( data.table::rbindlist(dt.out) ) + # Check + if( nrow(dt.out) == 0 ) { + warning("No query results for the provided ",paste(var_check, collapse = " & "), + ". Please check the input values.") + return(NULL) } + return(dt.out) + } else { dt } } +stx_query = function( + ## COMPOUND FILTERING ## + cas_number = NULL, + ## BASIC TOX DATA FILTERING ## + endpoint_group = c('XX50', 'NOEX', 'LOEX'), + exposure = NULL, # character vector + effect = NULL, # character vector + duration = c(0, Inf), # numeric vector + duration_unit = "h", # character vector; set to NULL if you want to keep all results! + concentration_unit = NULL, # character vector + concentration_type = NULL, # character vector + ## TAXA FILTERING ## + tax_columns = c('tax_group', 'tax_taxon', 'tax_genus', 'tax_family'), # Taxonomy columns to append to the query results. DEFAULT: c('tax_group', 'tax_taxon', 'tax_genus', 'tax_family') + tax_genus = NULL, # character vector + tax_family = NULL, # character vector + tax_order = NULL, # character vector + tax_class = NULL, # character vector + ecotox_grp = NULL, # character vector + ## REFERENCE SECTION ## + include_reference = FALSE, # Default FALSE + ...){ + # Import stxDb + message("Reading in Standartox Data ...") + stx_table = c('test_fin.fst','phch.fst','taxa.fst') # + if(include_reference) { stx_table = unique(c(stx_table,'refs.fst')) } + stxDb = stx_query_import(data_type = stx_table, ...) + # stxDb = stx_query_import(data_type = stx_table) + tox.dt = stxDb$test_fin.fst # final output object. LARGE right after import! + stxDb = stxDb[stx_table[-1]] # dump the largest object! + + # First quick filter steps: + # Remove rows where the specified columns contain "NR" <- NA values + message("Removing 'NA' values ...") + tox.dt <- tox.dt[!grepl("NR", endpoint) & !grepl("NR", duration_unit) ] + # Quick fix for endpoint values + tox.dt[, endpoint := sub("[/*]+$","", endpoint)] + if(!is.null(endpoint_group)){ + tmp_var = endpoint_group # quick fix + tox.dt = tox.dt[endpoint_group %in% tmp_var] + } + if(!is.null(duration_unit)){ + tmp_var = duration_unit # quick fix + tox.dt = tox.dt[duration_unit %in% tmp_var] + } + if( nrow(tox.dt) == 0 ) { + warning("No query matches found for the provided endpoint_group or duration_unit. Please check the input values.") + } + + # Step 1: Filter for cas_number then merge with toxdata + message("Appending chemical information ...") + ## Filter chem data for cas_number + if(!is.null(cas_number)){ + stxDb$phch.fst <- stxDb$phch.fst[cas %in% cas_number] + if( nrow(stxDb$phch.fst) == 0 ) { + warning("No query matches found for the provided CAS numbers. Please check the input values.") + } + } + ## Merge with tox data by cl_id + merge(stxDb$phch.fst, tox.dt, all.x = TRUE, by = "cl_id") -> tox.dt + tox.dt[, c("chem_class","casnr", "cl_id") := NULL] # don't need the cl_id column anymore. + + + # Step 2: Filter for taxonomic groups then merge with toxdata + message("Appending taxonomic information ...") + ## Filtering ecotox_grp ## + if(!is.null(ecotox_grp)){ + regstr = paste(ecotox_grp, collapse = "|") + stxDb$taxa.fst = stxDb$taxa.fst[ grepl(regstr, stxDb$taxa.fst$tax_group) ] + if( nrow( stxDb$taxa.fst ) == 0 ) { + warning("No query matches found for the provided tax_group. Please check the input values.") + return(NULL) + } + } + ## Filtering tax_columns ## + # Specify taxonomy columns for which tax filtering can be applied + var_ls = list( + tax_class = tax_class, + tax_order = tax_order, + tax_family = tax_family, + tax_genus = tax_genus + ) + tax.out = filter_dt( stxDb$taxa.fst, var_ls) + if( is.null(tax.out) ) { return(NULL) } # Check + + # Select pre-defined columns for output + tax_key = "tl_id" + tax.out = tax.out[, c(tax_key, tax_columns), with = FALSE] + # Merge taxonomy data with tox data by tl_id + merge(tax.out, tox.dt, all.x = TRUE, by = tax_key) -> tox.dt + tox.dt[, (tax_key) := NULL] # drop columns here + + + # Step 3: Final Tox data filtering + # rmv any rows with NA in result_id + tox.dt = tox.dt[!is.na(result_id)] + # Filter for the selected columns + var_ls = list( + concentration_unit = concentration_unit, + concentration_type = concentration_type, + effect = effect, + exposure = exposure + ) + tox.dt = filter_dt( tox.dt, var_ls ) + if( is.null(tox.dt) ) { return(NULL) } # Check + + # Filter for duration + lower_bound = min(duration) + upper_bound = max(duration) + tox.dt = tox.dt[duration >= lower_bound & duration <= upper_bound] + # Check + if( nrow( tox.dt ) == 0 ) { + warning("No query matches found. Please check the input filter values.") + return(NULL) + } + + # Step 4: Append references if wanted + if(include_reference){ + message("Appending reference information ...") + tox.dt = merge(tox.dt, stxDb$refs.fst, all.x = TRUE, by = "ref_number") + tox.dt[, c("ref_number") := NULL] + } + + # Step 5: Final Cleanup + # Replace all occurrences of "NR" with NA + tox.dt = tox.dt[, lapply(.SD, function(x) { + if (is.character(x)) { + x[x == "NR"] <- NA # Replace "NR" with NA for character columns + } + return(x) # Return the modified column + })] + # Filter out the "result_id" column + message("Done!\n") + return( tox.dt[, c("result_id") := NULL] ) +} + #' Retrieve chemical data #' From 9d1998b34a5b5643a9ee9e9e055a1d61fd3fedc4 Mon Sep 17 00:00:00 2001 From: Hannes Reinwald Date: Mon, 16 Jun 2025 19:21:01 +0200 Subject: [PATCH 3/6] Added some example code here. Does not need to be merged to Andrea's the main branch. --- R/example_code.R | 54 ++++++ R/standartox.R | 464 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 518 insertions(+) create mode 100644 R/example_code.R create mode 100644 R/standartox.R diff --git a/R/example_code.R b/R/example_code.R new file mode 100644 index 0000000..1506798 --- /dev/null +++ b/R/example_code.R @@ -0,0 +1,54 @@ +# filter_dt ----------------------------------- +taxa.dt = stx_taxa() +colnames(taxa.dt) # inspect column names to filter for + +# Specify taxonomy columns for which tax filtering can be applied. +# Make sure that list names match the column names in your data table! +var_ls = list( + family = 'Cyprinidae', + genus = c('Daphnia','Ceriodaphnia') +) + +tax.out = filter_dt( taxa.dt, var_ls, silent = FALSE) +View(tax.out) # inspect the output + + +# stx_query --------------------------------- +# Note that by default stx_query filters for EC50, NOEC and LOEC values and for duration_unit = "h" (hours). +# if you need something else specify in filter! + +# Example 1) - basix stx query +tox.dt1 = stx_query(verbose = T) # will return results filtered for duration_unit = "h" (hours) + +# to keep all duration values, set duration_unit = NULL, default is "h" +tox.dt2 = stx_query(duration_unit = NULL, verbose = T) # keep NA values ) + +# To get everything! +tox.dt3 = stx_query( endpoint_group = NULL, + duration_unit = NULL, + verbose = T, rm_na = F) # remove NA values + +# NOTE: stx_data() will deliver ALL available results without any pre-filtering! +dt = stx_data() + +# Example 2) +tox.dt = stx_query( + verbose = T, + endpoint_group = "XX50", + duration = c(12, 96), + concentration_unit = "g/l", + tax_genus = c("Danio","Pimephales") + #tax_family = c("Cyprinidae","Salmonidae") + #tax_group = c("fish","algae") +) + +# Example 3) +tox.dt = stx_query( + verbose = T, + cas_number = c("482-89-3", "50-00-0","1071-83-6"), + #endpoint_group = "XX50", + duration = c(12, 96), + concentration_unit = "g/l", + tax_genus = c("Danio","Daphnia"), + include_reference = TRUE +) diff --git a/R/standartox.R b/R/standartox.R new file mode 100644 index 0000000..a0341e8 --- /dev/null +++ b/R/standartox.R @@ -0,0 +1,464 @@ +#' Download Standartox Data Tables from Zenodo.org +#' +#' Downloads the Standartox data tables from Zenodo.org and reads them into R. Specific data_types can be specified. +#' +#' @return Returns a list of data.tables containing the downloaded data. +#' +#' @param data_type character; Specify the type of data to download. Can be one of NULL (default, downloads and imports all), "meta.fst", "phch.fst", "refs.fst", "test_fin.fst", "taxa.fst", etc. +#' @param dir_out character; Directory to which the downloaded files should be saved. Default is a temporary directory. +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' @author Hannes Reinwald +#' +#' @examples +#' \donttest{ +#' # might fail if there is no internet connection or Zenodo.org not not available +#' stxDb = stx_download() +#' names(stxDb) # files downloaded from zenodo.org +#' } +#' @noRd +#' +stx_download = function(data_type, dir_out = file.path(tempdir(), "standartox")) { + # Check + data_type = match.arg( + data_type, + c("meta", "phch", "refs", "test_fin", "taxa", "catalog"), + several.ok = TRUE + ) + + # Output directory + dir.create(dir_out, showWarnings = FALSE, recursive = TRUE) + + # Find download URLS + # HACK this has to be done, because doi.org is the only permanent link between versions + qurl_permanent = 'https://doi.org/10.5281/zenodo.3785030' + req = httr::GET(qurl_permanent) + cont = httr::content(req, as = 'text') + + # Extract all zenodo download links from the content + qurl = unique(stringr::str_extract_all(cont, 'https://zenodo.org/records/[0-9]+/files/.+')[[1]]) + qurl = grep("[.]rds|[.]fst", qurl, value = TRUE) # filter for .rsd and .fst files only! + qurl = sub('\">', '', qurl, fixed = TRUE) + qurl = grep(qurl, pattern = paste0(data_type, collapse = '|'), value = TRUE) # filter for data_type + + # For each link in qurl check if destination file exists and if not download it. + l = list() + for(URL in qurl){ + fl = basename(URL) + fl_name = sub('.rds|.fst', '', fl) # remove file extension for list name + + # Define destination file path + destfile = file.path(dir_out, fl) + + # Check if the file already exists + if ( !file.exists(destfile) ) { + curl::curl_download(url = URL, + destfile = destfile, + quiet = TRUE) + } + + # Read in the downloaded files based on their extension + sfx = sub("^.+[.]", "", fl) + if( sfx == "fst" ) { + out = fst::read_fst(destfile, as.data.table = TRUE) + } else if (sfx == "rds") { + out = readRDS(destfile) + } else { + stop("Unknown file format: ", sfx, "Expecting .fst or .rds files.") + } + } + + return(out) +} + + + +stx_query_import = function(silent = TRUE, + data_type = c('test_fin.fst','phch.fst','taxa.fst','refs.fst'), + ...) { + if (!silent) message('Retrieving Standartox listed Chemicals ...') + if (silent) { + result = suppressMessages( stx_download(data_type = data_type, ...) ) + } else { + result = stx_download(data_type = data_type, ...) + } + # Convert data frame to data table + result = lapply( result, function(dt) data.table::setDT(dt) ) + return(result) +} + + + +filter_dt = function(dt, var_ls){ + stopifnot(is.data.frame(dt)) + stopifnot(is.list(var_ls)) + data.table::setDT(dt) # Ensure dt is a data.table + + var_check = !unlist( lapply(var_ls, is.null) ) + var_check = names( which(var_check) ) # select only those variables that are not NULL + + # Loop through the variables and filter the taxa.fst data frame + if (length(var_check) > 0) { + dt.out = list() + + for(var in var_check){ + message("Filtering for variable: ", var) + if (var %in% colnames(dt)) { # Check if the variable exists as a column in data table + if( !is.null(var_ls[[var]]) ){ + # Filter the data table based on the specified column and the values in the variable + filter_val = var_ls[[var]] + dt.out[[var]] <- dt[ dt[[var]] %in% filter_val ] + if(nrow(dt.out[[var]]) == 0){ warning( paste("No query matches found for:\t", var) ) } + } + } else { + warning(paste("Variable", var, "not found in data table. Skipping filter.")) + } + } + + # Combine results + dt.out = distinct( data.table::rbindlist(dt.out) ) + # Check + if( nrow(dt.out) == 0 ) { + warning("No query results for the provided ",paste(var_check, collapse = " & "), + ". Please check the input values.") + return(NULL) } + return(dt.out) + } else { dt } +} + + +stx_query = function( + ## COMPOUND FILTERING ## + cas_number = NULL, + ## BASIC TOX DATA FILTERING ## + endpoint_group = c('XX50', 'NOEX', 'LOEX'), + exposure = NULL, # character vector + effect = NULL, # character vector + duration = c(0, Inf), # numeric vector + duration_unit = "h", # character vector; set to NULL if you want to keep all results! + concentration_unit = NULL, # character vector + concentration_type = NULL, # character vector + ## TAXA FILTERING ## + tax_columns = c('tax_group', 'tax_taxon', 'tax_genus', 'tax_family'), # Taxonomy columns to append to the query results. DEFAULT: c('tax_group', 'tax_taxon', 'tax_genus', 'tax_family') + tax_genus = NULL, # character vector + tax_family = NULL, # character vector + tax_order = NULL, # character vector + tax_class = NULL, # character vector + ecotox_grp = NULL, # character vector + ## REFERENCE SECTION ## + include_reference = FALSE, # Default FALSE + ...){ + # Import stxDb + message("Reading in Standartox Data ...") + stx_table = c('test_fin.fst','phch.fst','taxa.fst') # + if(include_reference) { stx_table = unique(c(stx_table,'refs.fst')) } + stxDb = stx_query_import(data_type = stx_table, ...) + tox.dt = stxDb$test_fin.fst # final output object. LARGE right after import! + stxDb = stxDb[stx_table[-1]] # dump the largest object! + + # First quick filter steps: + # Remove rows where the specified columns contain "NR" <- NA values + message("Removing 'NA' values ...") + tox.dt <- tox.dt[!grepl("NR", endpoint) & !grepl("NR", duration_unit) ] + # Quick fix for endpoint values + tox.dt[, endpoint := sub("[/*]+$","", endpoint)] + if(!is.null(endpoint_group)){ + tmp_var = endpoint_group # quick fix + tox.dt = tox.dt[endpoint_group %in% tmp_var] + } + if(!is.null(duration_unit)){ + tmp_var = duration_unit # quick fix + tox.dt = tox.dt[duration_unit %in% tmp_var] + } + if( nrow(tox.dt) == 0 ) { + warning("No query matches found for the provided endpoint_group or duration_unit. Please check the input values.") + } + + # Step 1: Filter for cas_number then merge with toxdata + message("Appending chemical information ...") + ## Filter chem data for cas_number + if(!is.null(cas_number)){ + stxDb$phch.fst <- stxDb$phch.fst[cas %in% cas_number] + if( nrow(stxDb$phch.fst) == 0 ) { + warning("No query matches found for the provided CAS numbers. Please check the input values.") + } + } + ## Merge with tox data by cl_id + merge(stxDb$phch.fst, tox.dt, all.x = TRUE, by = "cl_id") -> tox.dt + tox.dt[, c("chem_class","casnr", "cl_id") := NULL] # don't need the cl_id column anymore. + + + # Step 2: Filter for taxonomic groups then merge with toxdata + message("Appending taxonomic information ...") + ## Filtering ecotox_grp ## + if(!is.null(ecotox_grp)){ + regstr = paste(ecotox_grp, collapse = "|") + stxDb$taxa.fst = stxDb$taxa.fst[ grepl(regstr, stxDb$taxa.fst$tax_group) ] + if( nrow( stxDb$taxa.fst ) == 0 ) { + warning("No query matches found for the provided tax_group. Please check the input values.") + return(NULL) + } + } + ## Filtering tax_columns ## + # Specify taxonomy columns for which tax filtering can be applied + var_ls = list( + tax_class = tax_class, + tax_order = tax_order, + tax_family = tax_family, + tax_genus = tax_genus + ) + tax.out = filter_dt( stxDb$taxa.fst, var_ls) + if( is.null(tax.out) ) { return(NULL) } # Check + + # Select pre-defined columns for output + tax_key = "tl_id" + tax.out = tax.out[, c(tax_key, tax_columns), with = FALSE] + # Merge taxonomy data with tox data by tl_id + merge(tax.out, tox.dt, all.x = TRUE, by = tax_key) -> tox.dt + tox.dt[, (tax_key) := NULL] # drop columns here + + + # Step 3: Final Tox data filtering + # rmv any rows with NA in result_id + tox.dt = tox.dt[!is.na(result_id)] + # Filter for the selected columns + var_ls = list( + concentration_unit = concentration_unit, + concentration_type = concentration_type, + effect = effect, + exposure = exposure + ) + tox.dt = filter_dt( tox.dt, var_ls ) + if( is.null(tox.dt) ) { return(NULL) } # Check + + # Filter for duration + tox.dt = tox.dt[duration %between% c(min(duration),max(duration))] + # Check + if( nrow( tox.dt ) == 0 ) { + warning("No query matches found. Please check the input filter values.") + return(NULL) + } + + # Step 4: Append references if wanted + if(include_reference){ + message("Appending reference information ...") + tox.dt = merge(tox.dt, stxDb$refs.fst, all.x = TRUE, by = "ref_number") + tox.dt[, c("ref_number") := NULL] + } + + # Step 5: Final Cleanup + # Replace all occurrences of "NR" with NA + tox.dt = tox.dt[, lapply(.SD, function(x) { + if (is.character(x)) { + x[x == "NR"] <- NA # Replace "NR" with NA for character columns + } + return(x) # Return the modified column + })] + # Filter out the "result_id" column + message("Done!\n") + return( tox.dt[, c("result_id") := NULL] ) +} + + + +#' Retrieve data catalog +#' +#' Retrieve a data catalog for all variables (and their values) that can be retrieved with stx_query() +#' +#' @param silent logical; If TRUE, suppresses messages. Default is FALSE. +#' @param dir_out character; Directory to which the catalog should be downloaded. Default is a temporary directory. +#' +#' @return Returns a list of data.frames containing information on data base variables +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' @author Hannes Reinwald +#' +#' @examples +#' \donttest{ +#' # might fail if there is no internet connection or Zenodo.org not not available +#' # basic function call +#' l = stx_catalog() +#' +#' # to get verbose output from the function +#' l = stx_catalog(silent = FALSE) +#' +#' # to specify a directory to which the catalog should be downloaded +#' l = stx_catalog(silent = FALSE, dir_out = "~/tmp") +#' # This will create a directory under ~/tmp and download the catalog.rds file to that directory. +#' # The files are then permanently stored in that directory and can be directly read when restarting your R session. +#' } +#' @export +#' +stx_catalog = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { + if (!silent) message('Retrieving Standartox catalog..') + l = stx_download(data_type = 'catalog', dir_out = dir_out) + + return(l) +} + +#' Retrieve Standartox toxicity values +#' +#' Retrieve a data.table containing the Standartox toxicity data +#' +#' @param silent logical; If TRUE, suppresses messages. Default is FALSE. +#' @param dir_out character; Directory to which the catalog should be downloaded. Default is a temporary directory. +#' +#' @return Returns a data.table. +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' @author Hannes Reinwald +#' +#' @examples +#' \donttest{ +#' # might fail if there is no internet connection or Zenodo.org not not available +#' # basic function call +#' +#' dt = stx_data() +#' +#' } +#' @export +#' +stx_data = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { + if (!silent) message('Retrieving Standartox data..') + out = stx_download(data_type = 'test_fin', dir_out = dir_out) + + return(out) +} + +#' Retrieve chemical data +#' +#' Retrieve data on all chemicals in Standartox. +#' +#' @return Returns a data.table containing informaiton on chemicals in Standartox. +#' +#' @param silent logical; If TRUE, suppresses messages. Default is FALSE. +#' @param dir_out character; Directory to which the chemical information should be downloaded. Default is a temporary directory. +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' @author Hannes Reinwald +#' +#' @examples +#' \donttest{ +#' # might fail if there is no internet connection or Zenodo.org not not available +#' # basic function call +#' df = stx_chem() +#' +#' # to get verbose output from the function +#' df = stx_chem(silent = FALSE) +#' +#' # to specify a directory to which the chemical information should be downloaded +#' df = stx_chem(silent = FALSE, dir_out = "~/tmp") +#' # This will create a directory under ~/tmp and download the respective standartox file to that directory. +#' # The files are then permanently stored in that directory and can be directly read when restarting your R session. +#' } +#' +#' @export +#' +stx_chem = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { + if (!silent) message('Retrieving Standartox chemical information..') + out = stx_download(data_type = 'phch', dir_out = dir_out) + + return(out) +} + +#' Retrieve taxa data +#' +#' Retrieve data on all taxa in Standartox. +#' +#' @return Returns a data.table containing informaiton on taxa in Standartox. +#' +#' @param silent logical; If TRUE, suppresses messages. Default is FALSE. +#' @param dir_out character; Directory to which the taxa information should be downloaded. Default is a temporary directory. +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' @author Hannes Reinwald +#' +#' @examples +#' \donttest{ +#' # might fail if there is no internet connection or Zenodo.org not not available +#' # basic function call +#' df = stx_taxa() +#' +#' # to get verbose output from the function +#' df = stx_taxa(silent = FALSE) +#' +#' # to specify a directory to which the taxa information should be downloaded +#' df = stx_taxa(silent = FALSE, dir_out = "~/tmp") +#' # This will create a directory under ~/tmp and download the respective standartox file to that directory. +#' # The files are then permanently stored in that directory and can be directly read when restarting your R session. +#' } +#' +#' @export +#' +stx_taxa = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { + if (!silent) message('Retrieving Standartox taxa information..') + out = stx_download(data_type = 'taxa', dir_out = dir_out) + + return(out) +} + +#' Function to aggregate filtered test results +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +#' @noRd +#' +stx_aggregate = function(dat = NULL) { + # assign variables to avoid R CMD check NOTES + . = concentration = cname = cas = tax_taxon = gmn = gmnsd = n = NULL + # checking + if (is.null(dat)) stop('Provide table.') + # aggregation + dat[ + , + .(gmn = gm_mean(concentration), + gmnsd = gm_sd(concentration), + n = .N), + .(cname, cas, tax_taxon) + ][ + , + .(min = min(gmn), + tax_min = .SD[ which.min(gmn), tax_taxon ], + gmn = gm_mean(gmn), + gmnsd = gm_sd(gmnsd), + max = max(gmn), + tax_max = .SD[ which.max(gmn), tax_taxon ], + n = sum(n), + tax_all = paste0(sort(unique(tax_taxon)), collapse = ', ')), + .(cname, cas) + ] +} + +#' Retrieve meta data +#' +#' @return Returns a data.table containing meta informaiton on Standartox. +#' +#' @param silent logical; If TRUE, suppresses messages. Default is FALSE. +#' @param dir_out character; Directory to which the meta information should be downloaded. Default is a temporary directory.#' +#' +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' @author Hannes Reinwald +#' +#' @examples +#' \donttest{ +#' # might fail if there is no internet connection or Zenodo.org not not available +#' # basic function call +#' df = stx_meta() +#' +#' # to get verbose output from the function +#' df = stx_meta(silent = FALSE) +#' +#' # to specify a directory to which the taxa information should be downloaded +#' df = stx_meta(silent = FALSE, dir_out = "~/tmp") +#' # This will create a directory under ~/tmp and download the respective standartox file to that directory. +#' # The files are then permanently stored in that directory and can be directly read when restarting your R session. +#' } +#' +#' @export +#' +stx_meta = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { + if (!silent) message('Retrieving Standartox meta information..') + out = stx_download(data_type = 'meta', dir_out = dir_out) + + return(out) +} From da0a45eb47baca08d5779afae70b7e60e99da08e Mon Sep 17 00:00:00 2001 From: Hannes Reinwald Date: Mon, 16 Jun 2025 19:21:23 +0200 Subject: [PATCH 4/6] Added stx_query() function --- R/standartox.R | 260 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 169 insertions(+), 91 deletions(-) diff --git a/R/standartox.R b/R/standartox.R index a0341e8..6e02edb 100644 --- a/R/standartox.R +++ b/R/standartox.R @@ -4,8 +4,9 @@ #' #' @return Returns a list of data.tables containing the downloaded data. #' -#' @param data_type character; Specify the type of data to download. Can be one of NULL (default, downloads and imports all), "meta.fst", "phch.fst", "refs.fst", "test_fin.fst", "taxa.fst", etc. +#' @param data_type character; Specify the type of data to download. Select from c("meta", "phch", "refs", "test_fin", "taxa", "catalog"). NULL (default) will download and imports all, #' @param dir_out character; Directory to which the downloaded files should be saved. Default is a temporary directory. +#' @param silent logical; If TRUE, suppresses messages. Default is TRUE. #' #' @author Andreas Scharmueller \email{andschar@@protonmail.com} #' @author Hannes Reinwald @@ -18,78 +19,104 @@ #' } #' @noRd #' -stx_download = function(data_type, dir_out = file.path(tempdir(), "standartox")) { +stx_download = function(data_type = NULL, dir_out = file.path(tempdir(),"standartox"), silent = TRUE) { + + # please keep this. Makes it easier to quickly pull everything without the need of having to specify specific + stx_files = c("meta", "phch", "refs", "test_fin", "taxa", "catalog") + if(is.null(data_type)){ data_type = stx_files} # Check - data_type = match.arg( - data_type, - c("meta", "phch", "refs", "test_fin", "taxa", "catalog"), - several.ok = TRUE - ) + data_type = match.arg( data_type, stx_files, several.ok = TRUE ) - # Output directory - dir.create(dir_out, showWarnings = FALSE, recursive = TRUE) + # Check if the output directory exists, if not create it + if (!dir.exists(dir_out)) { dir.create(dir_out, recursive = TRUE) } - # Find download URLS # HACK this has to be done, because doi.org is the only permanent link between versions qurl_permanent = 'https://doi.org/10.5281/zenodo.3785030' req = httr::GET(qurl_permanent) cont = httr::content(req, as = 'text') - - # Extract all zenodo download links from the content + # extract all zenodo download links from the content qurl = unique(stringr::str_extract_all(cont, 'https://zenodo.org/records/[0-9]+/files/.+')[[1]]) qurl = grep("[.]rds|[.]fst", qurl, value = TRUE) # filter for .rsd and .fst files only! - qurl = sub('\">', '', qurl, fixed = TRUE) - qurl = grep(qurl, pattern = paste0(data_type, collapse = '|'), value = TRUE) # filter for data_type + + # Filter for specific data_type if provided + if(!is.null(data_type)){ + # could be one of: "meta.fst","phch.fst","refs.fst","test_fin.fst","taxa.fst", ... + regx_str = paste0("/files/", sub("[.]fst$","[.]fst", sub("[.]rds$","[.]rds", data_type)) ) + regx_str = paste(regx_str, collapse = "|") + qurl = grep(regx_str, qurl, value = TRUE) + } # For each link in qurl check if destination file exists and if not download it. - l = list() - for(URL in qurl){ - fl = basename(URL) - fl_name = sub('.rds|.fst', '', fl) # remove file extension for list name - + stxDb_ls = list() # output list + for(k in qurl){ + URL = sub('\">','', k) + n = sub("^.+/files/","", URL) + if(!silent) message('\nChecking standartox file: ', n) + # Define destination file path - destfile = file.path(dir_out, fl) + destfile = file.path(dir_out, n) # Check if the file already exists if ( !file.exists(destfile) ) { + if(!silent) message('Downloading standartox ',n,' ...') curl::curl_download(url = URL, destfile = destfile, quiet = TRUE) + if(!silent) message('Done! Downloaded to:\n', destfile) } + else { if(!silent) message('File ', n, ' already exists, skipping download.') } + if(!silent) message('Reading in file:\t', n) # Read in the downloaded files based on their extension - sfx = sub("^.+[.]", "", fl) + # for .fst: fst::read_fst(); for .rds: readRDS() + sfx = sub("^.+[.]","", basename(destfile)) if( sfx == "fst" ) { - out = fst::read_fst(destfile, as.data.table = TRUE) + stxDb_ls[[n]] = try ( fst::read_fst(destfile) ) + if(!silent) message("Done!\n") } else if (sfx == "rds") { - out = readRDS(destfile) - } else { - stop("Unknown file format: ", sfx, "Expecting .fst or .rds files.") - } + stxDb_ls[[n]] = readRDS(destfile) + if(!silent) message("Done!\n") + } else { warning("Unknown file format: ", sfx, "Expecting .fst or .rds files.\n") } } - - return(out) + # Return the list of data frames + return(stxDb_ls) } - -stx_query_import = function(silent = TRUE, - data_type = c('test_fin.fst','phch.fst','taxa.fst','refs.fst'), - ...) { - if (!silent) message('Retrieving Standartox listed Chemicals ...') - if (silent) { - result = suppressMessages( stx_download(data_type = data_type, ...) ) - } else { - result = stx_download(data_type = data_type, ...) - } - # Convert data frame to data table - result = lapply( result, function(dt) data.table::setDT(dt) ) - return(result) -} - - - -filter_dt = function(dt, var_ls){ +#' Filter data.table based on a list of variables +#' +#' This function filters a data.table based on specified values in one or more columns. +#' It checks for the existence of the specified columns and applies the filters accordingly. +#' +#' @param dt data.table; The data.table to filter. +#' @param var_ls list; A named list where each element is a vector of values to filter the corresponding column in the data.table. The names of the list should match the column names in the data.table. +#' @param silent logical; If TRUE, suppresses messages. Default is TRUE. +#' +#' @return Returns a filtered data.table containing only the rows that match the specified values in the columns. If no matches are found, a warning is issued and NULL is returned. +#' +#' @author Hannes Reinwald +#' +#' +#' @examples +#' \donttest{ +#' # Import the standartox taxonomy data table as example +#' taxa.dt = stx_taxa() +#' colnames(taxa.dt) # inspect column names to filter for +#' +#' # Specify taxonomy columns for which tax filtering can be applied. +#' # Make sure that list names match the column names in your data table! +#' var_ls = list( +#' family = 'Cyprinidae', +#' genus = c('Daphnia','Ceriodaphnia') +#' ) +#' +#' # Filter your taxonomy table for genus and family specified. +#' tax.out = filter_dt( taxa.dt, var_ls, silent = FALSE) +#' View(tax.out) # inspect the output +#' } +#' +#' @noRd +filter_dt = function(dt, var_ls, silent = TRUE){ stopifnot(is.data.frame(dt)) stopifnot(is.list(var_ls)) data.table::setDT(dt) # Ensure dt is a data.table @@ -102,7 +129,7 @@ filter_dt = function(dt, var_ls){ dt.out = list() for(var in var_check){ - message("Filtering for variable: ", var) + if(!silent) message("Filtering for variable: ", var) if (var %in% colnames(dt)) { # Check if the variable exists as a column in data table if( !is.null(var_ls[[var]]) ){ # Filter the data table based on the specified column and the values in the variable @@ -116,7 +143,7 @@ filter_dt = function(dt, var_ls){ } # Combine results - dt.out = distinct( data.table::rbindlist(dt.out) ) + dt.out = unique( data.table::rbindlist(dt.out) ) # Check if( nrow(dt.out) == 0 ) { warning("No query results for the provided ",paste(var_check, collapse = " & "), @@ -127,6 +154,37 @@ filter_dt = function(dt, var_ls){ } +#' Query Standartox toxicity values +#' +#' Retrieve toxicity values from the Standartox data base on Zenodo.org \url{https://doi.org/10.5281/zenodo.3785030}. +#' +#' @return Returns a data.table containing Standartox data base query results. +#' +#' @param cas character, integer; Limit data base query to specific CAS numbers, multiple entries possible (e.g. c('1071-83-6', '1071836'), NULL (default). +#' @param concentration_unit character; Limit data base query to specific concentration units (e.g. ug/l - default). +#' @param concentration_type character; Limit data base query to specific concentration types, can be one of NULL (default), 'active ingredient', 'formulation', 'total', 'not reported', 'unionized', 'dissolved', 'labile'. See \url{https://cfpub.epa.gov/ecotox/pdf/codeappendix.pdf} p.4. +#' @param duration integer vector of length two; Limit data base query to specific test durations (hours) (e.g. c(24, 48)). NULL (default). +#' @param endpoint character; Choose endypoint type, must be one of 'XX50' (default), 'NOEX', 'LOEX'. +#' @param effect character; Limit data base query to specific effect groups, multiple entries possible (e.g. 'Mortality', 'Intoxication', 'Growth'). See \url{https://cfpub.epa.gov/ecotox/pdf/codeappendix.pdf} p.95. NULL (default). +#' @param exposure character; Choose exposure type, (e.g. aquatic, environmental, diet). NULL (default). +#' @param chemical_role character; Limit data base query to specific chemical roles (e.g. insecticide), multiple entries possible. NULL (default). +#' @param chemical_class character; Limit data base query to specific chemical classes (e.g. neonicotinoid), multiple entries possible. NULL (default). +#' @param taxa character; Limit data base query to specific taxa, multiple entries possible. NULL (default). +#' @param ecotox_grp character; Convenience grouping of organisms in ecotoxicology, must be one of NULL (default), 'invertebrate', 'fish', 'plant_land', 'macrophyte', 'algae'. +#' @param trophic_lvl character; Trophic level of organism, must be one of NULL (default), 'autotroph', 'heterotroph'. +#' @param habitat character; Limit data base query to specific organism habitats, can be one of NULL (default) 'marine', 'brackish', 'freshwater', 'terrestrial'. +#' @param region character; Limit data base query to organisms occurring in specific regions, can be one of NULL (default) 'africa', 'america_north', 'america_south', 'asia', 'europe', 'oceania'. +#' @param vers integer; Choose the version of the EPA Ecotox on which Standartox is based on. NULL (default) accesses the most recent version. +#' @param ... currently not used +#' +#' @return Returns a list of three data.tables (filtered data base query results, aggregated data base query results, meta information) +#' @author Andreas Scharmueller \email{andschar@@protonmail.com} +#' +#' @examples +#' \donttest{ +#' +#' @export +#' stx_query = function( ## COMPOUND FILTERING ## cas_number = NULL, @@ -147,20 +205,31 @@ stx_query = function( ecotox_grp = NULL, # character vector ## REFERENCE SECTION ## include_reference = FALSE, # Default FALSE - ...){ + rm_na = TRUE, # Default TRUE; if FALSE, keep NR values in the result + verbose = FALSE, # Default TRUE; if FALSE, print messages + ...){ + message("Querying Standartox data base ...") + # Import stxDb - message("Reading in Standartox Data ...") - stx_table = c('test_fin.fst','phch.fst','taxa.fst') # - if(include_reference) { stx_table = unique(c(stx_table,'refs.fst')) } - stxDb = stx_query_import(data_type = stx_table, ...) - tox.dt = stxDb$test_fin.fst # final output object. LARGE right after import! - stxDb = stxDb[stx_table[-1]] # dump the largest object! + if(verbose) message("Reading in Standartox Data ...") + stx_table = c('test_fin','phch','taxa') # + if(include_reference) { stx_table = unique(c(stx_table,'refs')) } + stxDb = stx_download(data_type = stx_table) #, ...) + names(stxDb) = sub("[.]fst$","",names(stxDb)) # FIX + # Convert data frame to data table + stxDb = lapply( stxDb, function(dt) data.table::setDT(dt) ) + tox.dt = stxDb$test_fin # final output object. LARGE right after import! + stxDb = stxDb[stx_table[-1]] # dump the largest object! <- hope to save some memory with that. + # First quick filter steps: # Remove rows where the specified columns contain "NR" <- NA values - message("Removing 'NA' values ...") - tox.dt <- tox.dt[!grepl("NR", endpoint) & !grepl("NR", duration_unit) ] - # Quick fix for endpoint values + if(rm_na){ + if(verbose) message("Removing 'NA' values ...") + tox.dt = tox.dt[!grepl("NR", endpoint) & !grepl("NR", duration_unit) ] + } + + # Quick fix for endpoint values - some contain weird string endings. tox.dt[, endpoint := sub("[/*]+$","", endpoint)] if(!is.null(endpoint_group)){ tmp_var = endpoint_group # quick fix @@ -175,52 +244,55 @@ stx_query = function( } # Step 1: Filter for cas_number then merge with toxdata - message("Appending chemical information ...") + if(verbose) message("Appending chemical information ...") ## Filter chem data for cas_number if(!is.null(cas_number)){ - stxDb$phch.fst <- stxDb$phch.fst[cas %in% cas_number] - if( nrow(stxDb$phch.fst) == 0 ) { + stxDb$phch <- stxDb$phch[cas %in% cas_number] + if( nrow(stxDb$phch) == 0 ) { warning("No query matches found for the provided CAS numbers. Please check the input values.") } + merge(stxDb$phch, tox.dt, all.x = TRUE, by = "cl_id") -> tox.dt + } else { + merge(stxDb$phch, tox.dt, all.y = TRUE, by = "cl_id") -> tox.dt } - ## Merge with tox data by cl_id - merge(stxDb$phch.fst, tox.dt, all.x = TRUE, by = "cl_id") -> tox.dt - tox.dt[, c("chem_class","casnr", "cl_id") := NULL] # don't need the cl_id column anymore. + suppressWarnings( tox.dt[, c("chem_class","casnr", "cl_id") := NULL] ) # don't need the cl_id column anymore. # Step 2: Filter for taxonomic groups then merge with toxdata - message("Appending taxonomic information ...") + if(verbose) message("Appending taxonomic information ...") + + # Append 'tax_' prefix to the taxonomic columns (except for tax_key) + tax_key = "tl_id" + tax_col = paste0("tax_", colnames(stxDb$taxa)) + tax_col[ grep(paste0("^tax_",tax_key,"$"),tax_col) ] <- tax_key # replace tax_key with tl_id + colnames(stxDb$taxa) <- tax_col # set new column names + + # Select pre-defined columns for output + tax.out = stxDb$taxa[, c(tax_key, tax_columns), with = FALSE] + # Merge taxonomy data with tox data by tl_id + tox.dt = merge(tox.dt, tax.out, all.x = TRUE, by = tax_key) + ## Filtering ecotox_grp ## if(!is.null(ecotox_grp)){ regstr = paste(ecotox_grp, collapse = "|") - stxDb$taxa.fst = stxDb$taxa.fst[ grepl(regstr, stxDb$taxa.fst$tax_group) ] - if( nrow( stxDb$taxa.fst ) == 0 ) { + tox.dt = tox.dt[ grepl(regstr, tox.dt$tax_group) ] + if( nrow( tox.dt ) == 0 ) { warning("No query matches found for the provided tax_group. Please check the input values.") return(NULL) } } ## Filtering tax_columns ## - # Specify taxonomy columns for which tax filtering can be applied - var_ls = list( + var_ls = list( # Specify taxonomy columns for which tax filtering can be applied tax_class = tax_class, tax_order = tax_order, tax_family = tax_family, tax_genus = tax_genus ) - tax.out = filter_dt( stxDb$taxa.fst, var_ls) - if( is.null(tax.out) ) { return(NULL) } # Check - - # Select pre-defined columns for output - tax_key = "tl_id" - tax.out = tax.out[, c(tax_key, tax_columns), with = FALSE] - # Merge taxonomy data with tox data by tl_id - merge(tax.out, tox.dt, all.x = TRUE, by = tax_key) -> tox.dt - tox.dt[, (tax_key) := NULL] # drop columns here - + tox.dt = filter_dt( tox.dt, var_ls) + if( is.null(tox.dt) ) { return(NULL) } # Check # Step 3: Final Tox data filtering - # rmv any rows with NA in result_id - tox.dt = tox.dt[!is.na(result_id)] + tox.dt = tox.dt[!is.na(result_id)] # rmv any rows with NA in result_id <- this should not be the case but to be save! # Filter for the selected columns var_ls = list( concentration_unit = concentration_unit, @@ -232,7 +304,13 @@ stx_query = function( if( is.null(tox.dt) ) { return(NULL) } # Check # Filter for duration - tox.dt = tox.dt[duration %between% c(min(duration),max(duration))] + lower = min(duration) + upper = max(duration) + if (lower != 0 | upper != Inf) { + #tox.dt = tox.dt[duration %between% c(lower, upper)] # <- this works only when data.table is loaded + tox.dt = tox.dt[duration >= lower & duration <= upper] # <- this works always + } + # Check if( nrow( tox.dt ) == 0 ) { warning("No query matches found. Please check the input filter values.") @@ -241,8 +319,8 @@ stx_query = function( # Step 4: Append references if wanted if(include_reference){ - message("Appending reference information ...") - tox.dt = merge(tox.dt, stxDb$refs.fst, all.x = TRUE, by = "ref_number") + if(verbose) message("Appending reference information ...") + tox.dt = merge(tox.dt, stxDb$refs, all.x = TRUE, by = "ref_number") tox.dt[, c("ref_number") := NULL] } @@ -291,9 +369,9 @@ stx_query = function( #' stx_catalog = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { if (!silent) message('Retrieving Standartox catalog..') - l = stx_download(data_type = 'catalog', dir_out = dir_out) + ls = stx_download(data_type = 'catalog', dir_out = dir_out)[[1]] - return(l) + return(ls) } #' Retrieve Standartox toxicity values @@ -319,8 +397,8 @@ stx_catalog = function(silent = FALSE, dir_out = file.path(tempdir(), "standarto #' @export #' stx_data = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { - if (!silent) message('Retrieving Standartox data..') - out = stx_download(data_type = 'test_fin', dir_out = dir_out) + if(!silent) message('Retrieving Standartox data..') + out = stx_download(data_type = 'test_fin', dir_out = dir_out)[[1]] return(out) } @@ -356,7 +434,7 @@ stx_data = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox") #' stx_chem = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { if (!silent) message('Retrieving Standartox chemical information..') - out = stx_download(data_type = 'phch', dir_out = dir_out) + out = stx_download(data_type = 'phch', dir_out = dir_out)[[1]] return(out) } @@ -392,7 +470,7 @@ stx_chem = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox") #' stx_taxa = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { if (!silent) message('Retrieving Standartox taxa information..') - out = stx_download(data_type = 'taxa', dir_out = dir_out) + out = stx_download(data_type = 'taxa', dir_out = dir_out)[[1]] return(out) } @@ -458,7 +536,7 @@ stx_aggregate = function(dat = NULL) { #' stx_meta = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox")) { if (!silent) message('Retrieving Standartox meta information..') - out = stx_download(data_type = 'meta', dir_out = dir_out) + out = stx_download(data_type = 'meta', dir_out = dir_out)[[1]] return(out) } From cea60b1644433befcf30093f2bdcbd668ac406c3 Mon Sep 17 00:00:00 2001 From: Hannes Reinwald Date: Tue, 17 Jun 2025 14:00:20 +0200 Subject: [PATCH 5/6] Added more examples for stx_query() --- R/example_code.R | 94 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 27 deletions(-) diff --git a/R/example_code.R b/R/example_code.R index 1506798..c290fa1 100644 --- a/R/example_code.R +++ b/R/example_code.R @@ -1,54 +1,94 @@ -# filter_dt ----------------------------------- +# filter_dt() ----------------------------------- taxa.dt = stx_taxa() colnames(taxa.dt) # inspect column names to filter for # Specify taxonomy columns for which tax filtering can be applied. # Make sure that list names match the column names in your data table! var_ls = list( - family = 'Cyprinidae', - genus = c('Daphnia','Ceriodaphnia') + family = "Cyprinidae", + genus = c("Daphnia","Ceriodaphnia") ) tax.out = filter_dt( taxa.dt, var_ls, silent = FALSE) View(tax.out) # inspect the output -# stx_query --------------------------------- -# Note that by default stx_query filters for EC50, NOEC and LOEC values and for duration_unit = "h" (hours). -# if you need something else specify in filter! +# stx_query() --------------------------------- -# Example 1) - basix stx query -tox.dt1 = stx_query(verbose = T) # will return results filtered for duration_unit = "h" (hours) +# Example 1) - Basic stx_query() call +# will return results filtered for default endpoint_group = c("XX50", "NOEX", "LOEX") and duration_unit = "h" +dt = stx_query(verbose = T) -# to keep all duration values, set duration_unit = NULL, default is "h" -tox.dt2 = stx_query(duration_unit = NULL, verbose = T) # keep NA values ) +# If you wish to filter for different endpoint groups, you can specify them in the query. +stx_catalog()$endpoint_group # to view available endpoint groups +dt1 = stx_query(endpoint_group = c("Bioconc","MATC","MCIG")) -# To get everything! -tox.dt3 = stx_query( endpoint_group = NULL, - duration_unit = NULL, - verbose = T, rm_na = F) # remove NA values +# if you wish to keep all duration_unit and endpoint_groups, set them to NULL. +dt2 = stx_query(duration_unit = NULL, endpoint_group = NULL, verbose = T) +# including "NR" (not reported) values. This is everything you can retrieve via stx_data() as well. +dt3 = stx_query(duration_unit = NULL, endpoint_group = NULL, verbose = T, rm_na = F) -# NOTE: stx_data() will deliver ALL available results without any pre-filtering! -dt = stx_data() -# Example 2) -tox.dt = stx_query( +# Example 2) - Filter for specific taxonomic groups +# Filter for fish and algae for an exposure duration of 12 to 120 hours and concentration unit in g/l for endpoint group XX50 (50% effect/lethality values) +stx_catalog()$group # to get an idea about available taxonomic groups +stx_catalog()$concentration_unit +dt4 = stx_query( verbose = T, endpoint_group = "XX50", - duration = c(12, 96), + duration = c(12, 120), concentration_unit = "g/l", - tax_genus = c("Danio","Pimephales") - #tax_family = c("Cyprinidae","Salmonidae") - #tax_group = c("fish","algae") + tax_group = c("fish","algae") ) -# Example 3) -tox.dt = stx_query( + +# Example 3) - Filter for specific genus and chemical compounds + append reference data +# make sure to extract only EC50/LC50 values from 12 - 96 hours and of comparable concentration units +dt5 = stx_query( verbose = T, - cas_number = c("482-89-3", "50-00-0","1071-83-6"), - #endpoint_group = "XX50", + cas_number = c("1071-83-6","63-25-2","138261-41-3"), # glyphosate, carbaryl and imidacloprid + endpoint_group = "XX50", # to get only 50% effect/lethality values duration = c(12, 96), concentration_unit = "g/l", - tax_genus = c("Danio","Daphnia"), + tax_genus = c("Danio","Daphnia","Ceriodaphnia","Chironomus"), + include_reference = TRUE +) + + +# Example 4) - filter for specific taxonomic families and append reference data + more taxonomic lineage information +stx_catalog()$family # get an idea about which family to use by inspecting the taxa catalog +colnames(stx_taxa())[-1] # all possible taxonomic columns to extract from stx_taxa() + +dt6 = stx_query( + verbose = T, + endpoint_group = "XX50", + duration = c(12, 120), + concentration_unit = "g/l", + tax_family = c("Cyprinidae","Salmonidae","Daphniidae"), + tax_columns = c("group","taxon","genus","family","order","class","habitat"), # to get more taxonomic lineage information include_reference = TRUE ) + + +# Example 5) - Specific data selection +# get ALL LC50 values for 96 - 120 h of exposure for zebra fish (Danio rerio) +danio = stx_query( + endpoint_group = "XX50", + duration = c(96, 120), + effect = "mortality", + concentration_unit = "g/l", + concentration_type = "active ingredient", + tax_genus = "Danio", + include_reference = TRUE +) + +# get ALL LC50 values for 24 - 48 h of exposure for Daphnia +dmag = stx_query( + endpoint_group = "XX50", + duration = c(24, 48), + effect = "mortality", + concentration_unit = "g/l", + concentration_type = "active ingredient", + tax_genus = "Daphnia", + include_reference = TRUE +) \ No newline at end of file From e91a2e15467ad5babc91d8847cb33f8101e6721e Mon Sep 17 00:00:00 2001 From: Hannes Reinwald Date: Tue, 17 Jun 2025 14:03:59 +0200 Subject: [PATCH 6/6] Updated function one more time and added documentation and code examples --- R/standartox.R | 101 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 30 deletions(-) diff --git a/R/standartox.R b/R/standartox.R index 6e02edb..aad3924 100644 --- a/R/standartox.R +++ b/R/standartox.R @@ -154,35 +154,75 @@ filter_dt = function(dt, var_ls, silent = TRUE){ } -#' Query Standartox toxicity values +#' Query Standartox Toxicity Data #' -#' Retrieve toxicity values from the Standartox data base on Zenodo.org \url{https://doi.org/10.5281/zenodo.3785030}. -#' -#' @return Returns a data.table containing Standartox data base query results. +#' Retrieve and filter toxicity data from the Standartox database (\url{https://doi.org/10.5281/zenodo.3785030}) using chemical, experimental, and taxonomic criteria. #' -#' @param cas character, integer; Limit data base query to specific CAS numbers, multiple entries possible (e.g. c('1071-83-6', '1071836'), NULL (default). -#' @param concentration_unit character; Limit data base query to specific concentration units (e.g. ug/l - default). -#' @param concentration_type character; Limit data base query to specific concentration types, can be one of NULL (default), 'active ingredient', 'formulation', 'total', 'not reported', 'unionized', 'dissolved', 'labile'. See \url{https://cfpub.epa.gov/ecotox/pdf/codeappendix.pdf} p.4. -#' @param duration integer vector of length two; Limit data base query to specific test durations (hours) (e.g. c(24, 48)). NULL (default). -#' @param endpoint character; Choose endypoint type, must be one of 'XX50' (default), 'NOEX', 'LOEX'. -#' @param effect character; Limit data base query to specific effect groups, multiple entries possible (e.g. 'Mortality', 'Intoxication', 'Growth'). See \url{https://cfpub.epa.gov/ecotox/pdf/codeappendix.pdf} p.95. NULL (default). -#' @param exposure character; Choose exposure type, (e.g. aquatic, environmental, diet). NULL (default). -#' @param chemical_role character; Limit data base query to specific chemical roles (e.g. insecticide), multiple entries possible. NULL (default). -#' @param chemical_class character; Limit data base query to specific chemical classes (e.g. neonicotinoid), multiple entries possible. NULL (default). -#' @param taxa character; Limit data base query to specific taxa, multiple entries possible. NULL (default). -#' @param ecotox_grp character; Convenience grouping of organisms in ecotoxicology, must be one of NULL (default), 'invertebrate', 'fish', 'plant_land', 'macrophyte', 'algae'. -#' @param trophic_lvl character; Trophic level of organism, must be one of NULL (default), 'autotroph', 'heterotroph'. -#' @param habitat character; Limit data base query to specific organism habitats, can be one of NULL (default) 'marine', 'brackish', 'freshwater', 'terrestrial'. -#' @param region character; Limit data base query to organisms occurring in specific regions, can be one of NULL (default) 'africa', 'america_north', 'america_south', 'asia', 'europe', 'oceania'. -#' @param vers integer; Choose the version of the EPA Ecotox on which Standartox is based on. NULL (default) accesses the most recent version. -#' @param ... currently not used +#' @param cas_number character; Optional. Vector of CAS numbers to filter chemicals (e.g. \code{c("1071-83-6","63-25-2","138261-41-3")}). Default is \code{NULL} (no filtering). +#' @param endpoint_group character; Optional. Endpoint group(s) to filter results. All possible endpoint groups can be checked via \code{stx_catalog()$endpoint_group}. Default is \code{c("XX50", "NOEX", "LOEX")}. +#' @param exposure character; Optional. Vector of exposure types (e.g. \code{"aquatic"}). All possible exposure types can be checked via \code{stx_catalog()$exposure}. Default is \code{NULL}. +#' @param effect character; Optional. Vector of effect types (e.g. \code{"Mortality", "Growth"}). All possible effect types can be checked via \code{stx_catalog()$effect}. Default is \code{NULL}. +#' @param duration numeric; Optional. Numeric vector of length two specifying minimum and maximum test duration (in hours), e.g. \code{c(0, 48)}. Default is \code{c(0, Inf)}. +#' @param duration_unit character; Optional. Filter by duration unit (e.g. \code{"h"} for hours). All possible duration units can be checked via \code{stx_catalog()$duration_unit}. Set to \code{NULL} to keep all. Default is \code{"h"}. +#' @param concentration_unit character; Optional. Filter by concentration unit (e.g. \code{"g/l"}). All possible concentration units can be checked via \code{stx_catalog()$concentration_unit}. Default is \code{NULL}. +#' @param concentration_type character; Optional. Filter by concentration type (e.g. \code{"active ingredient"}). All possible concentration types can be checked via \code{stx_catalog()$concentration_type}. Default is \code{NULL}. +#' @param tax_columns character; Columns of taxonomic information to append to results. All possible columns can be checked via \code{colnames(stx_taxa())}. Default is \code{c("group", "taxon", "genus", "family")}. +#' @param tax_genus character; Optional. Filter by genus. All possible genera can be checked via \code{stx_catalog()$genus}. Default is \code{NULL}. +#' @param tax_family character; Optional. Filter by family. All possible families can be checked via \code{stx_catalog()$family}. Default is \code{NULL}. +#' @param tax_order character; Optional. Filter by order. All possible orders can be checked via \code{stx_catalog()$order}. Default is \code{NULL}. +#' @param tax_class character; Optional. Filter by class. All possible classes can be checked via \code{stx_catalog()$class}. Default is \code{NULL}. +#' @param ecotox_grp character; Optional. Filter by one or more ecotoxicological groups. Possible values are \code{"invertebrate"}, \code{"plant"}, \code{"fish"}, \code{"fungi"}, \code{"algae"}, \code{"aves"}, \code{"amphibia"}, \code{"mammalia"}, \code{"reptilia"}, \code{"macrophyte"}. All possible ecotox groups can be checked via \code{stx_catalog()$group}. Multiple entries possible. Default is \code{NULL}. +#' @param include_reference logical; If \code{TRUE}, append reference information. Default is \code{FALSE}. +#' @param rm_na logical; If \code{TRUE}, remove rows with "NR" (not reported) values. Default is \code{TRUE}. +#' @param verbose logical; If \code{TRUE}, print progress messages. Default is \code{FALSE}. +#' @param ... Additional arguments passed to \code{stx_download}. +#' +#' @return Returns a \code{data.table} with filtered Standartox toxicity data. +#' +#' @author Hannes Reinwald #' -#' @return Returns a list of three data.tables (filtered data base query results, aggregated data base query results, meta information) -#' @author Andreas Scharmueller \email{andschar@@protonmail.com} -#' #' @examples #' \donttest{ #' +#' Basic stx_query() call: Will return results filtered for default endpoint_group = c("XX50", "NOEX", "LOEX") and duration_unit = "h" +#' stx_query(verbose = T) +#' +#' # If you wish to filter for different endpoint groups, you can specify them in the query. +#' stx_catalog()$endpoint_group # to view available endpoint groups +#' stx_query(endpoint_group = c("Bioconc","MATC","MCIG")) +#' +#' # Query for a specific CAS number, endpoint group, and tax group(s) +#' stx_query( +#' cas_number = "1071-83-6", +#' endpoint_group = "NOEX", +#' duration = c(0, 48), +#' ecotox_group = c("invertebrate", "fish"), +#' tax_family = "Daphniidae" +#' ) +#' +#' # get ALL LC50 values for 96 - 120 h of exposure for zebra fish (Danio rerio) +#' stx_query( +#' endpoint_group = "XX50", +#' duration = c(96, 120), +#' effect = "mortality", +#' concentration_unit = "g/l", +#' concentration_type = "active ingredient", +#' tax_genus = "Danio", +#' include_reference = TRUE +#' ) +#' +#' # get ALL LC50 values for 24 - 48 h of exposure for Daphnia +#' stx_query( +#' endpoint_group = "XX50", +#' duration = c(24, 48), +#' effect = "mortality", +#' concentration_unit = "g/l", +#' concentration_type = "active ingredient", +#' tax_genus = "Daphnia", +#' include_reference = TRUE +#' ) +#' } +#' #' @export #' stx_query = function( @@ -197,12 +237,12 @@ stx_query = function( concentration_unit = NULL, # character vector concentration_type = NULL, # character vector ## TAXA FILTERING ## - tax_columns = c('tax_group', 'tax_taxon', 'tax_genus', 'tax_family'), # Taxonomy columns to append to the query results. DEFAULT: c('tax_group', 'tax_taxon', 'tax_genus', 'tax_family') + tax_columns = c('group', 'taxon', 'genus', 'family'), # Taxonomy columns to append to the query results. DEFAULT: c('tax_group', 'tax_taxon', 'tax_genus', 'tax_family') tax_genus = NULL, # character vector tax_family = NULL, # character vector tax_order = NULL, # character vector tax_class = NULL, # character vector - ecotox_grp = NULL, # character vector + tax_group = NULL, # character vector ## REFERENCE SECTION ## include_reference = FALSE, # Default FALSE rm_na = TRUE, # Default TRUE; if FALSE, keep NR values in the result @@ -266,6 +306,7 @@ stx_query = function( tax_col = paste0("tax_", colnames(stxDb$taxa)) tax_col[ grep(paste0("^tax_",tax_key,"$"),tax_col) ] <- tax_key # replace tax_key with tl_id colnames(stxDb$taxa) <- tax_col # set new column names + tax_columns = paste0("tax_", tax_columns) # append 'tax_' prefix to tax_columns # Select pre-defined columns for output tax.out = stxDb$taxa[, c(tax_key, tax_columns), with = FALSE] @@ -273,8 +314,8 @@ stx_query = function( tox.dt = merge(tox.dt, tax.out, all.x = TRUE, by = tax_key) ## Filtering ecotox_grp ## - if(!is.null(ecotox_grp)){ - regstr = paste(ecotox_grp, collapse = "|") + if(!is.null(tax_group)){ + regstr = paste(tax_group, collapse = "|") tox.dt = tox.dt[ grepl(regstr, tox.dt$tax_group) ] if( nrow( tox.dt ) == 0 ) { warning("No query matches found for the provided tax_group. Please check the input values.") @@ -289,7 +330,8 @@ stx_query = function( tax_genus = tax_genus ) tox.dt = filter_dt( tox.dt, var_ls) - if( is.null(tox.dt) ) { return(NULL) } # Check + if( is.null(tox.dt) ) { return(NULL) } # Check + suppressWarnings( tox.dt[, (tax_key) := NULL] ) # don't need the tl_id column anymore. # Step 3: Final Tox data filtering tox.dt = tox.dt[!is.na(result_id)] # rmv any rows with NA in result_id <- this should not be the case but to be save! @@ -338,7 +380,6 @@ stx_query = function( } - #' Retrieve data catalog #' #' Retrieve a data catalog for all variables (and their values) that can be retrieved with stx_query() @@ -539,4 +580,4 @@ stx_meta = function(silent = FALSE, dir_out = file.path(tempdir(), "standartox") out = stx_download(data_type = 'meta', dir_out = dir_out)[[1]] return(out) -} +} \ No newline at end of file