Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
webchem 0.5.0.9005
======================
webchem 0.5.0.9010

NEW FEATURES

* get_cid() now can search by registry IDs (e.g. CAS RN), and can handle more complex requests like searching for similar compounds.
* Retrieve chemical data from PubChem content pages with pc_sect().
* get_etoxid() now can search by CAS, EC, GSBL and RTECS numbers. Added `from = ` argument. [PR #241, added by @andschar]
* nist_ri() now can search by name, InChI, InChIKey, or CAS. The `cas` argument is deprecated. Use `query` instead with `from = "cas"`
Expand Down
3 changes: 2 additions & 1 deletion R/flavornet.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
fn_percept <- function(CAS, verbose = TRUE, ...)
{
foo <- function (CAS, verbose){
qurl = paste0("http://www.flavornet.org/info/",CAS,".html")
qurl <- paste0("http://www.flavornet.org/info/",CAS,".html")
if (verbose)
message(qurl)
Sys.sleep(rgamma(1, shape = 10, scale = 1/10))
Expand All @@ -43,5 +43,6 @@ fn_percept <- function(CAS, verbose = TRUE, ...)
}
percepts <- sapply(CAS, foo, verbose = verbose)
percepts <- setNames(percepts, CAS)
suppressWarnings(closeAllConnections())
return(percepts)
}
241 changes: 170 additions & 71 deletions R/pubchem.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,54 @@
#' Retrieve Pubchem Id (CID)
#' Retrieve Pubchem Compound ID (CID)
#'
#' Return CompoundID (CID) for a search query using PUG-REST,
#' see \url{https://pubchem.ncbi.nlm.nih.gov/}.
#' @param query character; search term.
#' @param from character; type of input, can be one of "name" (default), "cid",
#' "sid", "aid", "smiles", "inchi", "inchikey"
#' @param match character; How should multiple hits be handled?, "all" all matches are returned, "best" the best matching is returned, "ask" enters an interactive mode and the user is asked for input, "na" returns NA if multiple hits are found.
#' @param search_substances logical; If TRUE also searches PubChem SIDs
#' Retrieve compound IDs (CIDs) from PubChem.
#' @param query character; search term, one or more compounds.
#' @param from character; type of input. See details for more information.
#' @param domain character; query domain, can be one of \code{"compound"},
#' \code{"substance"}, \code{"assay"}.
#' @param match character; How should multiple hits be handled?, \code{"all"}
#' all matches are returned, \code{"best"} the best matching is returned,
#' \code{"ask"} enters an interactive mode and the user is asked for input,
#' \code{"na"} returns NA if multiple hits are found.
#' @param verbose logical; should a verbose output be printed on the console?
#' @param arg character; optinal arguments like "name_type=word" to match
#' individual words.
#' @param first deprecated. Use `match` instead.
#' @param ... currently unused.
#' @return a tibble.
#'
#' @details Valid values for the \code{from} argument depend on the
#' \code{domain}:
#' \itemize{
#' \item{\code{compound}: \code{"name"}, \code{"smiles"}, \code{"inchi"},
#' \code{"inchikey"}, \code{"formula"}, \code{"sdf"}, <xref>,
#' <structure search>, <fast search>.}
#' \item{\code{substance}: \code{"name"}, \code{"sid"},
#' \code{<xref>}, \code{"sourceid/<source id>"} or \code{"sourceall"}.}
#' \item{\code{assay}: \code{"aid"}, \code{<assay target>}.}
#' }
#' @details <structure search> is assembled as "{\code{substructure} |
#' \code{superstructure} | \code{similarity} | \code{identity}} / {\code{smiles}
#' | \code{inchi} | \code{sdf} | \code{cid}}", e.g.
#' \code{from = "substructure/smiles"}.
#' @details \code{<xref>} is assembled as "\code{xref}/\{\code{RegistryID} |
#' \code{RN} | \code{PubMedID} | \code{MMDBID} | \code{ProteinGI},
#' \code{NucleotideGI} | \code{TaxonomyID} | \code{MIMID} | \code{GeneID} |
#' \code{ProbeID} | \code{PatentID}\}", e.g. \code{from = "xref/RN"} will query
#' by CAS RN.
#' @details <fast search> is either \code{fastformula} or it is assembled as
#' "{\code{fastidentity} | \code{fastsimilarity_2d} | \code{fastsimilarity_3d} |
#' \code{fastsubstructure} | \code{fastsuperstructure}}/{\code{smiles} |
#' \code{smarts} | \code{inchi} | \code{sdf} | \code{cid}}", e.g.
#' \code{from = "fastidentity/smiles"}.
#' @details \code{<source id>} is any valid PubChem Data Source ID. When
#' \code{from = "sourceid/<source id>"}, the query is the ID of the substance in
#' the depositor's database.
#' @details If \code{from = "sourceall"} the query is one or more valid Pubchem
#' depositor names. Depositor names are not case sensitive.
#' @details Depositor names and Data Source IDs can be found at
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @details \code{<assay target>} is assembled as "\code{target}/\{\code{gi} |
#' \code{proteinname} | \code{geneid} | \code{genesymbol} | \code{accession}\}",
#' e.g. \code{from = "target/geneid"} will query by GeneID.
#' @references Wang, Y., J. Xiao, T. O. Suzek, et al. 2009 PubChem: A Public
#' Information System for
#' Analyzing Bioactivities of Small Molecules. Nucleic Acids Research 37:
Expand All @@ -34,6 +69,7 @@
#' usage policies of the indicidual data sources
#' \url{https://pubchem.ncbi.nlm.nih.gov/sources/}.
#' @author Eduard Szoecs, \email{eduardszoecs@@gmail.com}
#' @author Tamás Stirling, \email{stirling.tamas@@gmail.com}
#' @import httr
#' @importFrom purrr map map2
#' @importFrom jsonlite fromJSON
Expand All @@ -45,102 +81,165 @@
#' # might fail if API is not available
#' get_cid("Triclosan")
#' get_cid("Triclosan", arg = "name_type=word")
#' get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")
#' # from SMILES
#' get_cid("CCCC", from = "smiles")
#' # from InChI
#' get_cid("InChI=1S/CH5N/c1-2/h2H2,1H3", from = "inchi")
#' # from InChIKey
#' get_cid("BPGDAMSIGCZZLK-UHFFFAOYSA-N", from = "inchikey")
#' # from formula
#' get_cid("C26H52NO6P", from = "formula")
#' # from CAS RN
#' get_cid("56-40-6", from = "xref/rn")
#' # similarity
#' get_cid(5564, from = "similarity/cid")
#' get_cid("CCO", from = "similarity/smiles")
#' # from SID
#' get_cid("126534046", from = "sid", domain = "substance")
#' # sourceid
#' get_cid("VCC957895", from = "sourceid/23706", domain = "substance")
#' # sourceall
#' get_cid("Optopharma Ltd", from = "sourceall", domain = "substance")
#' # from AID (CIDs of substances tested in the assay)
#' get_cid(170004, from = "aid", domain = "assay")
#' # from GeneID (CIDs of substances tested on the gene)
#' get_cid(25086, from = "target/geneid", domain = "assay")
#'
#' # multiple inputs
#' comp <- c("Triclosan", "Aspirin")
#' get_cid(comp)
#' get_cid(c("Triclosan", "Aspirin"))
#'
#' }
get_cid <-
function(query,
from = c("name", "cid", "sid", "aid", "smiles", "inchi", "inchikey"),
from = "name",
domain = c("compound", "substance", "assay"),
match = c("all", "first", "ask", "na"),
verbose = TRUE,
search_substances = FALSE,
arg = NULL,
first = NULL,
...) {

# from can be cid | name | smiles | inchi | sdf | inchikey | formula
# query <- c("Aspirin")
# from = "name"

#deprecate `first`
if (!is.null(first) && first == TRUE) {
message("`first = TRUE` is deprecated. Use `match = 'first'` instead")
match <- "first"
} else if (!is.null(first) && first==FALSE) {
} else if (!is.null(first) && first == FALSE) {
message("`first = FALSE` is deprecated. Use `match = 'all'` instead")
match <- "all"
}

from <- match.arg(from)
#input validation
from <- tolower(from)
domain <- match.arg(domain)
xref <- paste(
"xref",
c("registryid", "rn", "pubmedid", "mmdbid", "proteingi", "nucleotidegi",
"taxonomyid", "mimid", "geneid", "probeid", "patentid"),
sep = "/"
)
structure_search <- expand.grid(
c("substructure", "superstructure", "similarity", "identity"),
c("smiles", "inchi", "sdf", "cid")
)
structure_search <- paste(structure_search$Var1, structure_search$Var2,
sep = "/")
fast_search <- expand.grid(
c("fastidentity", "fastsimilarity_2d", "fastsimilarity_3d",
"fastsubstructure", "fastsuperstructure"),
c("smiles", "smarts", "inchi", "sdf", "cid")
)
fast_search <- c(with(fast_search, paste(Var1, Var2, sep = "/")),
"fastformula")
targets <- paste("target", c("gi", "proteinname", "geneid", "genesymbol",
"accession"), sep = "/")
if (domain == "compound") {
from_choices <- c("cid", "name", "smiles", "inchi", "sdf", "inchikey",
"formula", structure_search, xref, fast_search)
from <- match.arg(from, choices = from_choices)
}
if (domain == "substance") {
if (grepl("^sourceid/", from) == FALSE) {
from <- match.arg(from, choices = c("sid", "name", xref, "sourceall"))
}
}
if (domain == "assay") {
from <- match.arg(from, choices = c("aid", targets))
}
match <- match.arg(match)

foo <- function(query, from, match, scope = "compound",
verbose, arg, ...) {
if (is.na(query))
return(NA)
prolog <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
input <- paste0("/", scope, "/", from)
output <- "/cids/JSON"
if (!is.null(arg))
arg <- paste0("?", arg)
qurl <- paste0(prolog, input, output, arg)
if (verbose)
message(qurl)
Sys.sleep(rgamma(1, shape = 15, scale = 1/10))
cont <- try(
content(
POST(qurl,
body = paste0(from, "=", query)),
type = "text", encoding = "UTF-8"),
silent = TRUE
)
if (inherits(cont, "try-error")) {
warning("Problem with web service encountered... Returning NA.")
foo <- function(query, from, domain, match, verbose, arg, ...) {
if (is.na(query)) {
if (verbose) message(paste0(query, " is invalid. Returning NA."))
return(NA)
}
cont <- jsonlite::fromJSON(cont)
if (names(cont) == "Fault") {
warning(cont$Fault$Details, ". Returning NA.")
return(NA)
if (verbose) {
message(paste0("Querying ", query, ". "), appendLF = FALSE)
}
if (is.character(query)) query <- URLencode(query)
if (from %in% structure_search) {
qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug",
domain, from, query, "json", sep = "/")
}
else {
qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug",
domain, from, query, "cids", "json", sep = "/")
}
if (!is.null(arg)) qurl <- paste0(qurl, "?", arg)
Sys.sleep(rgamma(1, shape = 15, scale = 1 / 10))
if (from == "inchi") {
qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug",
domain, from, "cids", "json", sep = "/")
res <- httr::POST(qurl, body = paste0("inchi=", query),
user_agent("webchem"), handle = handle(""))
}
else {
res <- httr::POST(qurl, user_agent("webchem"),
handle = handle(""))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add user_agent() ?

}
if (res$status_code != 200) {
if (res$status_code == 202) {
cont <- httr::content(res, type = "text", encoding = "UTF-8")
listkey <- jsonlite::fromJSON(cont)$Waiting$ListKey
qurl <- paste("https://pubchem.ncbi.nlm.nih.gov/rest/pug/", domain,
"listkey", listkey, "cids", "json", sep = "/")
while (res$status_code == 202) {
Sys.sleep(5 + rgamma(1, shape = 15, scale = 1 / 10))
res <- httr::POST(qurl, user_agent("webchem"), handle = handle(""))
}
if (res$status_code != 200) {
if (verbose) message(httr::message_for_status(res))
return(NA)
}
}
else{
if (verbose) message(httr::message_for_status(res))
return(NA)
}
}
if (verbose) message(httr::message_for_status(res))
cont <- httr::content(res, type = "text", encoding = "UTF-8")
if (domain == "compound") {
cont <- jsonlite::fromJSON(cont)$IdentifierList$CID
}
if (scope == "substance") {
cont <- cont$InformationList$Information$CID
if (domain == "substance") {
cont <- jsonlite::fromJSON(cont)$InformationList$Information$CID
}
if (domain == "assay") {
cont <- jsonlite::fromJSON(cont)$InformationList$Information$CID
}
out <- unique(unlist(cont))
out <- matcher(x = out, match = match, verbose = verbose)
out <- matcher(x = out, query = query, match = match, verbose = verbose)
out <- as.character(out)
names(out) <- NULL
return(out)
}

out <- map(query,
~foo(query = .x, from = from, match = match,
out <- map(query,
~foo(query = .x, from = from, domain = domain, match = match,
verbose = verbose, arg = arg))
out <- setNames(out, query)

if (search_substances) {
out2 <- map(query,
~foo(query = .x, from = from, match = match, scope = "substance",
verbose = verbose, arg = arg))
out2 <- setNames(out2, query)

out <- map2(out, out2, c)
out <- map(out, unique)
}

out <-
out <- setNames(out, query)
out <-
lapply(out, enframe, name = NULL, value = "cid") %>%
bind_rows(.id = "query")
return(out)
return(out)
}



#' Retrieve compound properties from a pubchem CID
#'
#' Retrieve compound information from pubchem CID, see
Expand Down
Loading