
diff --git a/.Rbuildignore b/.Rbuildignore index 31147302..e5ad7ded 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -15,3 +15,5 @@ ^doc$ ^Meta$ ^CRAN-SUBMISSION$ +^paper\.md$ +^paper\.bib$ diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml new file mode 100644 index 00000000..b03847b6 --- /dev/null +++ b/.github/workflows/draft-pdf.yml @@ -0,0 +1,28 @@ +name: Draft PDF +on: + push: + paths: + - paper.md + - paper.bib + +jobs: + paper: + runs-on: ubuntu-latest + name: Paper Draft + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build draft PDF + uses: openjournals/openjournals-draft-action@master + with: + journal: joss + # This should be the path to the paper within your repo. + paper-path: paper.md + - name: Upload + uses: actions/upload-artifact@v4 + with: + name: paper + # This is the output path where Pandoc will write the compiled + # PDF. Note, this should be the same directory as the input + # paper.md + path: paper.pdf diff --git a/.github/workflows/rhub.yaml b/.github/workflows/rhub.yaml new file mode 100644 index 00000000..74ec7b05 --- /dev/null +++ b/.github/workflows/rhub.yaml @@ -0,0 +1,95 @@ +# R-hub's generic GitHub Actions workflow file. It's canonical location is at +# https://github.com/r-hub/actions/blob/v1/workflows/rhub.yaml +# You can update this file to a newer version using the rhub2 package: +# +# rhub::rhub_setup() +# +# It is unlikely that you need to modify this file manually. + +name: R-hub +run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" + +on: + workflow_dispatch: + inputs: + config: + description: 'A comma separated list of R-hub platforms to use.' + type: string + default: 'linux,windows,macos' + name: + description: 'Run name. You can leave this empty now.' + type: string + id: + description: 'Unique ID. You can leave this empty now.' 
+ type: string + +jobs: + + setup: + runs-on: ubuntu-latest + outputs: + containers: ${{ steps.rhub-setup.outputs.containers }} + platforms: ${{ steps.rhub-setup.outputs.platforms }} + + steps: + # NO NEED TO CHECKOUT HERE + - uses: r-hub/actions/setup@v1 + with: + config: ${{ github.event.inputs.config }} + id: rhub-setup + + linux-containers: + needs: setup + if: ${{ needs.setup.outputs.containers != '[]' }} + runs-on: ubuntu-latest + name: ${{ matrix.config.label }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.containers) }} + container: + image: ${{ matrix.config.container }} + + steps: + - uses: r-hub/actions/checkout@v1 + - uses: r-hub/actions/platform-info@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + - uses: r-hub/actions/setup-deps@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + - uses: r-hub/actions/run-check@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + + other-platforms: + needs: setup + if: ${{ needs.setup.outputs.platforms != '[]' }} + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.label }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.setup.outputs.platforms) }} + + steps: + - uses: r-hub/actions/checkout@v1 + - uses: r-hub/actions/setup-r@v1 + with: + job-config: ${{ matrix.config.job-config }} + token: ${{ secrets.RHUB_TOKEN }} + - uses: r-hub/actions/platform-info@v1 + with: + token: ${{ secrets.RHUB_TOKEN }} + job-config: ${{ matrix.config.job-config }} + - uses: r-hub/actions/setup-deps@v1 + with: + job-config: ${{ matrix.config.job-config }} + token: ${{ secrets.RHUB_TOKEN }} + - uses: r-hub/actions/run-check@v1 + with: + job-config: ${{ matrix.config.job-config }} + token: ${{ secrets.RHUB_TOKEN }} diff --git a/DESCRIPTION b/DESCRIPTION index d2912ecd..9799a187 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: 
dbparser Title: Drugs Databases Parser -Version: 2.0.3 +Version: 2.2.1 Authors@R: c( person("Mohammed", "Ali", email = "moh_fcis@yahoo.com", role = c("aut", "cre")), @@ -12,13 +12,15 @@ Description: This tool is for parsing public drug databases such as 'DrugBank' X License: MIT + file LICENSE Encoding: UTF-8 Imports: + data.table, dplyr, progress, purrr, tibble, tools, + utils, XML -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.3 Suggests: canvasXpress, knitr, diff --git a/NAMESPACE b/NAMESPACE index 2fa538f5..33347102 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,17 @@ # Generated by roxygen2: do not edit by hand +export(add_database_info) export(cett_nodes_options) export(drug_node_options) +export(merge_drugbank_onsides) +export(merge_drugbank_twosides) export(parseDrugBank) +export(parseOnSIDES) +export(parseTWOSIDES) export(references_node_options) export(show_dvobject_metadata) +export(subset_drugbank_dvobject) +export(subset_onsides_dvobject) import(dplyr) importFrom(XML,xmlApply) importFrom(XML,xmlChildren) @@ -16,7 +23,17 @@ importFrom(XML,xmlSize) importFrom(XML,xmlToDataFrame) importFrom(XML,xmlToList) importFrom(XML,xmlValue) +importFrom(data.table,fread) +importFrom(dplyr,.data) +importFrom(dplyr,distinct) +importFrom(dplyr,filter) +importFrom(dplyr,left_join) +importFrom(dplyr,mutate) +importFrom(dplyr,pull) +importFrom(dplyr,rename) +importFrom(dplyr,select) importFrom(progress,progress_bar) +importFrom(purrr,"%||%") importFrom(purrr,'%>%') importFrom(purrr,is_empty) importFrom(purrr,is_null) @@ -26,5 +43,6 @@ importFrom(tibble,as_tibble) importFrom(tibble,as_tibble_row) importFrom(tibble,tibble) importFrom(tibble,tibble_row) +importFrom(utils,object.size) importFrom(utils,stack) importFrom(utils,unzip) diff --git a/NEWS.md b/NEWS.md index 89f0cfe5..ab0867fa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,58 @@ +# dbparser + +# dbparser 2.2.1 + +This is a major feature release focused on expanding `dbparser`'s capabilities into real-world 
pharmacovigilance and drug-drug interaction analysis. The integration engine has been formalized around a "Hub and Spoke" model, with DrugBank acting as the central hub. + +## NEW FEATURES + +* **New Parser: `parseOnSIDES()`** + * Parses the relational CSV files from the OnSIDES database, a modern resource for adverse drug events extracted from FDA labels. + * Returns a `dvobject` containing the 7 core relational tables and an optional `high_confidence` summary table. + +* **New Parser: `parseTWOSIDES()`** + * Parses the TWOSIDES database, the leading resource for drug-drug interaction (DDI) adverse event signals from real-world data. + * Returns a `dvobject` containing the `drug_drug_interactions` table. + * The parser correctly handles known column name misspellings in the source data (e.g., `drug_1_rxnorn_id`). + +* **New Integration Function: `merge_drugbank_onsides()`** + * Merges a DrugBank `dvobject` with an OnSIDES `dvobject`. + * Automatically creates an enriched `integrated_data` list, linking OnSIDES data to DrugBank IDs via RxNorm CUIs. + * The function is **chainable**, meaning it can be used in a `%>%` pipeline after other merge functions. + +* **New Integration Function: `merge_drugbank_twosides()`** + * Merges a DrugBank `dvobject` with a TWOSIDES `dvobject`. + * Performs a "double join" to enrich both drugs in an interaction pair with their DrugBank IDs and names. + * Uses a robust "union" logic to keep interactions even if only one of the two drugs is present in the input DrugBank object. + * The function is also **chainable**. +* **Subset a DrugBank dvobject function: `subset_drugbank_dvobject()`** + * Intelligently filters a DrugBank dvobject to retain only the data associated with a specified list of drugbank_ids. It correctly handles the deep, +multi-level nested structure of the entire object, including the complex relationships within the `cett` list. 
+* **Subset an OnSIDES dvobject function: `subset_onsides_dvobject()`** + * Intelligently filters an OnSIDES dvobject by cascading filters through the +relational tables, ensuring the final subset is self-consistent. +* **Adding metadata to existing `dvobject` objects using function: `add_database_info()`** +* **Major enhancements to function `show_dvobject_metadata()`** + * Displays information about the passed dvobject including basic info, + database metadata, and all data.frames contained within nested lists. + +* Run `vignette("dbparser_2_2", package = "dbparser")` for more info + +## DOCUMENTATION + +* **New Vignette: "Integrated Pharmacovigilance"** + * A comprehensive new tutorial demonstrating a full, three-way integration of DrugBank, OnSIDES, and TWOSIDES. + * Includes a complete scientific case study analyzing single-drug vs. polypharmacy risks. + * Introduces a reproducible example workflow using a pre-computed RDS data object hosted externally to keep the package lightweight. 
+ * Run `vignette("drugbank_nside", package = "dbparser")` for more info +* Updated existing vignette and package Readme with enhanced examples + +## BUG FIXES & MINOR IMPROVEMENTS + +* Several minor bug fixes and improvements + +--- + # dbparser 2.0.3 ## Enhancements diff --git a/R/drugbank_parser.R b/R/drugbank_parser.R index 9154474e..37e0bc01 100644 --- a/R/drugbank_parser.R +++ b/R/drugbank_parser.R @@ -80,7 +80,13 @@ parseDrugBank <- function(db_path, message("Completed loading DrugBank DB into memory") message("...........................................") pkg_env$root <- XML::xmlRoot(parsed_db) - dvobject <- add_drugbank_info(dvobject = dvobject) + dvobject <- add_database_info(dvobject = dvobject, + db_version = XML::xmlGetAttr( + node = pkg_env$root, + name = "version"), + db_exported_date = XML::xmlGetAttr( + node = pkg_env$root, + name = "exported-on")) message("parsing drugs elements") dvobject[["drugs"]] <- parse_drug_nodes(drug_options) diff --git a/R/dvobject_metadata.R b/R/dvobject_metadata.R index 4e6a0e81..b7902716 100644 --- a/R/dvobject_metadata.R +++ b/R/dvobject_metadata.R @@ -12,35 +12,207 @@ init_dvobject <- function() { } -#' add_drugbank_info -#' Add passed DrugBank db metadata to passed dvobject +#' add_database_info +#' Assign the passed database metadata to the passed dvobject +#' +#' @param dvobject dvobject to assign the metadata to +#' @param db_type database type (default="DrugBank") +#' @param db_version database version as string +#' @param db_exported_date database official export date #' -#' @keywords internal #' @return dvobject -add_drugbank_info <- function(dvobject) { - db_info <- attr(dvobject, "original_db_info") - - db_info[["db_type"]] <- "DrugBank" - db_info[["db_version"]] <- XML::xmlGetAttr(node = pkg_env$root, - name = "version") - db_info[["db_exported_date"]] <- XML::xmlGetAttr(node = pkg_env$root, - name = "exported-on") +#' @family utility +#' @export +#' @importFrom dplyr .data filter select rename mutate left_join 
+add_database_info <- function(dvobject,
+                              db_type = "DrugBank",
+                              db_version = NULL,
+                              db_exported_date = NULL) {
+  db_info <- as.list(attr(dvobject, "original_db_info")) # keep existing metadata; NULL -> list()
+  db_info[["db_type"]] <- db_type
+  db_info[["db_version"]] <- db_version
+  db_info[["db_exported_date"]] <- db_exported_date
   attr(dvobject, "original_db_info") <- db_info
+  class(dvobject) <- "dvobject"
   dvobject
 }
-#' init_dvobject
-#' Returns data.frame with two columns (key, value) of dvobject attributes
+#' Display dvobject Metadata
 #'
-#' @param dvobject - dvobject list to show related metadata
+#' Displays information about the passed dvobject including basic info,
+#' database metadata, and all data.frames contained within nested lists.
+#'
+#' @param obj A dvobject
+#' @param return_df Logical. If TRUE, returns metadata data.frame without printing.
+#'   Default is FALSE.
+#'
+#' @return Invisibly returns a data.frame containing dvobject metadata
+#'
+#' @examples
+#' \dontrun{
+#' show_dvobject_metadata(drugbank)
+#' metadata <- show_dvobject_metadata(drugbank, return_df = TRUE)
+#' }
 #'
-#' @return data.frame
-#' @family utility
 #' @export
-show_dvobject_metadata <- function(dvobject) {
-  dvobject_attributes <- attr(dvobject, "original_db_info")
-  dvobject_attributes[["class"]] <- class(dvobject)
-  data.frame(Atrribute = names(dvobject_attributes), Value = stack(dvobject_attributes)[[1]])
+#' @family utility
+#' @importFrom utils object.size
+show_dvobject_metadata <- function(obj, return_df = FALSE) {
+  # Initialize return variable
+  metadata <- NULL
+
+  # Validate input
+  attrs <- attributes(obj)
+
+  if (is.null(attrs) || is.null(attrs$original_db_info)) {
+    stop("Object does not have expected database attributes (missing 'original_db_info')")
+  }
+
+  # Find all dataframes in object (NULL when the object holds none)
+  df_info <- find_dataframes_recursive(obj)
+
+  # Count only top-level lists (not data.frames)
+  top_level_lists <- count_top_level_lists(obj)
+
+  # Find second database if it exists
+  second_db_info <- find_second_database(attrs)
+
+  # Build metadata
+  metadata <- build_metadata(
+    attrs = attrs,
+    has_second_db = second_db_info$has_second_db,
+    second_db_name = second_db_info$second_db_name
+  )
+
+  # Display output if not returning data.frame
+  if (!return_df) {
+    cat("=== BASIC INFO ===\n")
+    basic_info <- data.frame(
+      Class = paste(attrs$class, collapse = ", "),
+      Total_DataFrames = NROW(df_info), # NROW(NULL) is 0; nrow(NULL) would drop the column
+      Top_Level_Lists = top_level_lists,
+      Object_Size = format(object.size(obj), units = "auto"),
+      stringsAsFactors = FALSE
+    )
+    print(basic_info)
+
+    cat("\n=== DATABASE METADATA ===\n")
+    print(metadata, row.names = FALSE)
+
+    cat("\n=== DATA.FRAMES ===\n")
+    print(df_info, row.names = FALSE)
+  }
+
+  # Return metadata
+  invisible(metadata)
+}
+
+
+# Helper Functions for Display Attributes
+# These are internal functions not exported to users
+
+#' Find All Data Frames Recursively
+#' @param x Object to search
+#' @param prefix Current path prefix
+#' @return Data.frame with paths, dimensions, and sizes
+#' @keywords internal
+find_dataframes_recursive <- function(x, prefix = "") {
+  result <- NULL
+
+  if (is.data.frame(x)) {
+    result <- data.frame(
+      Path = prefix,
+      Rows = nrow(x),
+      Cols = ncol(x),
+      Size = format(object.size(x), units = "auto"),
+      stringsAsFactors = FALSE
+    )
+  } else if (is.list(x) && (length(x) > 0)) {
+    results <- lapply(names(x), function(name) {
+      new_prefix <- if (prefix == "") name else paste0(prefix, "$", name)
+      find_dataframes_recursive(x[[name]], new_prefix)
+    })
+    result <- do.call(rbind, results[!vapply(results, is.null, logical(1))])
+  }
+
+  result
+}
+
+#' Count Top Level Lists
+#' @param obj Object to analyze
+#' @return Integer count of top-level lists (excluding data.frames)
+#' @keywords internal
+count_top_level_lists <- function(obj) {
+  sum(vapply(obj, function(x) {
+    (is.list(x) && !is.data.frame(x))
+  }, logical(1)))
+}
+
+#' Find Second Database in Attributes
+#' @param attrs Attributes list
+#' @return List with has_second_db (logical) and second_db_name (character)
+#' @keywords internal
+find_second_database <- function(attrs) {
+  result <- list(
+    has_second_db = FALSE,
+    second_db_name = NULL
+  )
+
+  standard_attrs <- c("names", "class", "original_db_info", "row.names")
+  potential_second_db <- setdiff(names(attrs), standard_attrs)
+
+  if (length(potential_second_db) > 0) {
+    db_name <- potential_second_db[length(potential_second_db)]
+    db_attr <- attrs[[db_name]]
+    required_fields <- c("db_type")
+
+    if ((is.list(db_attr)) && (any(required_fields %in% names(db_attr)))) {
+      result$has_second_db <- TRUE
+      result$second_db_name <- db_name
+    }
+  }
+
+  result
+}
+
+#' Build Metadata Data Frame
+#' @param attrs Attributes list
+#' @param has_second_db Logical indicating if second database exists
+#' @param second_db_name Name of second database attribute
+#' @return Data.frame with database metadata
+#' @keywords internal
+#' @importFrom purrr %||%
+build_metadata <- function(attrs, has_second_db, second_db_name) {
+  metadata <- NULL
+
+  if (has_second_db) {
+    metadata <- rbind(
+      data.frame(
+        Database = "First Database",
+        Type = attrs$DrugBankDB$db_type %||% "Unknown",
+        Version = attrs$DrugBankDB$db_version %||% "Unknown",
+        Export_Date = attrs$DrugBankDB$db_exported_date %||% "Unknown",
+        stringsAsFactors = FALSE
+      ),
+      data.frame(
+        Database = "Second Database",
+        Type = attrs[[second_db_name]]$db_type %||% "Unknown",
+        Version = attrs[[second_db_name]]$db_version %||% "Unknown",
+        Export_Date = attrs[[second_db_name]]$db_exported_date %||% "Unknown",
+        stringsAsFactors = FALSE
+      )
+    )
+  } else {
+    metadata <- data.frame(
+      Database = "Original",
+      Type = attrs$original_db_info$db_type %||% "Unknown",
+      Version = attrs$original_db_info$db_version %||% "Unknown",
+      Export_Date = attrs$original_db_info$db_exported_date %||% "Unknown",
+      stringsAsFactors = FALSE
+    )
+  }
+
+  metadata
 }
diff --git a/R/merge_helpers.R b/R/merge_helpers.R
new file mode 100644
index 00000000..df882137
--- /dev/null
+++ 
b/R/merge_helpers.R
@@ -0,0 +1,261 @@
+#' Merge DrugBank and OnSIDES Database Objects
+#'
+#' Creates an integrated dvobject object by linking DrugBank dvobject with
+#' OnSIDES dvobject using RxNorm CUIs as the bridge.
+#'
+#' @details
+#' This function performs the following key steps:
+#' 1. Creates a mapping table between DrugBank IDs and RxNorm CUIs from the DrugBank object.
+#' 2. Enriches the relevant OnSIDES tables (`vocab_rxnorm_ingredient` and optionally
+#'    `high_confidence`) by adding a `drugbank_id` column.
+#' 3. Assembles a new list object containing all original tables plus the enriched ones
+#'    and the ID mapping table itself.
+#'
+#' The resulting object allows for powerful queries that span both mechanistic data from
+#' DrugBank and clinical side-effect data from OnSIDES.
+#' Supports piping and chaining with other merge functions.
+#'
+#' @param db_object A dvobject from `parseDrugBank()` OR an existing merged
+#'   dvobject (containing `$drugbank`).
+#' @param onsides_db A dvobject produced by `dbparser::parseOnSIDES()`.
+#'
+#' @return A new dvobject with `$drugbank`, `$onsides` and `$integrated_data` elements.
+#'
+#' @export
+#' @family mergers
+#' @importFrom dplyr filter select rename mutate left_join .data
+#'
+#' @examples
+#' \dontrun{
+#' # First, parse the individual databases
+#' drugbank <- parseDrugBank("path/to/drugbank.xml")
+#' onsides <- parseOnSIDES("path/to/onsides_csvs/")
+#'
+#' # Now, merge them into a single, powerful object
+#' merged_db <- merge_drugbank_onsides(drugbank, onsides)
+#'
+#' # --- Example Analysis: Find the protein targets of all drugs known to ---
+#' # --- cause the side effect "Hepatitis" with high confidence. ---
+#'
+#' # 1. Find the MedDRA ID for "Hepatitis"
+#' hepatitis_id <- merged_db$onsides$vocab_meddra_adverse_effect %>%
+#'   filter(meddra_name == "Hepatitis") %>%
+#'   pull(meddra_id)
+#'
+#' # 2. Find all drug ingredients linked to this effect in the enriched high_confidence table
+#' drug_ids_causing_hepatitis <- merged_db$integrated_data$high_confidence_enriched %>%
+#'   filter(effect_meddra_id == hepatitis_id) %>%
+#'   pull(drugbank_id) %>%
+#'   na.omit() %>%
+#'   unique()
+#'
+#' # 3. Look up the targets for these DrugBank IDs
+#' targets_of_interest <- merged_db$drugbank$cett$targets$general_information %>%
+#'   filter(drugbank_id %in% drug_ids_causing_hepatitis) %>%
+#'   select(drug_id = drugbank_id, target_id, target_name = name)
+#'
+#' head(targets_of_interest)
+#' }
+merge_drugbank_onsides <- function(db_object, onsides_db) {
+
+  # --- Step 0: Input Validation and Hub Detection (enables %>% piping and chaining) ---
+
+  if ("drugbank" %in% names(db_object)) {
+    # CASE A: Input is an already-merged object (e.g., passed via pipe from another merge)
+    drugbank_db <- db_object$drugbank
+    merged_object <- db_object # Start with existing data to preserve previous merges
+  } else {
+    # CASE B: Input is a raw DrugBank object
+    drugbank_db <- db_object
+    merged_object <- init_dvobject()
+    merged_object$drugbank <- db_object
+    attr(merged_object, "DrugBankDB") <- attr(drugbank_db, "original_db_info")
+  }
+
+  # Validate the Hub
+  if (!inherits(drugbank_db, "dvobject") ||
+      (!"drugs" %in% names(drugbank_db))) {
+    stop("`db_object` must contain a valid DrugBank dvobject.")
+  }
+
+  if (!"external_identifiers" %in% names(drugbank_db$drugs)) {
+    stop("`drugbank_db` must contain external_identifiers data.")
+  }
+
+  # Validate the Spoke
+  if (!inherits(onsides_db, "dvobject") ||
+      (!"vocab_rxnorm_ingredient" %in% names(onsides_db))) {
+    stop("`onsides_db` must be a valid dvobject from parseOnSIDES().")
+  }
+
+  # --- Step 1: Create the Bridge (RxCUI Mapping Table) ---
+  message("Creating DrugBank ID <-> RxCUI mapping table...")
+  rxcui_mapping_df <- drugbank_db$drugs$external_identifiers %>%
+    dplyr::filter(.data$resource == "RxCUI") %>%
+    dplyr::select(all_of("drugbank_id"), rxcui = all_of("identifier")) %>%
+    dplyr::mutate(rxcui = as.character(.data$rxcui)) %>%
+    dplyr::distinct()
+
+  # --- Step 2: Enrich OnSIDES Tables ---
+  message("Enriching OnSIDES tables with DrugBank IDs...")
+
+  # Enrich the core ingredient vocabulary (join keys coerced to character on both sides)
+  onsides_ingredient_enriched <- onsides_db$vocab_rxnorm_ingredient %>%
+    dplyr::mutate(rxnorm_id = as.character(.data$rxnorm_id)) %>%
+    dplyr::left_join(rxcui_mapping_df, by = c("rxnorm_id" = "rxcui"))
+
+  # Optionally enrich the high_confidence table if it exists
+  if ("high_confidence" %in% names(onsides_db)) {
+    onsides_hc_enriched <- onsides_db$high_confidence %>%
+      dplyr::mutate(ingredient_id = as.character(.data$ingredient_id)) %>%
+      dplyr::left_join(rxcui_mapping_df, by = c("ingredient_id" = "rxcui"))
+  }
+
+  # --- Step 3: Assemble the Final Merged Object ---
+  message("Assembling final merged object...")
+
+  # Add OnSIDES structure (Initialize if needed, but append to existing)
+  if (is.null(merged_object$onsides)) {
+    merged_object$onsides <- list()
+  }
+
+  # Copy all OnSIDES tables
+  for (name in names(onsides_db)) {
+    merged_object$onsides[[name]] <- onsides_db[[name]]
+  }
+
+  # Ensure integrated_data list exists
+  if (is.null(merged_object$integrated_data)) {
+    merged_object$integrated_data <- list()
+  }
+
+  merged_object$integrated_data[["vocab_rxnorm_ingredient_enriched"]] <- onsides_ingredient_enriched
+
+  # inherits = FALSE: only look in this call's frame, never in enclosing/global environments
+  if (exists("onsides_hc_enriched", inherits = FALSE)) {
+    merged_object$integrated_data[["high_confidence_enriched"]] <- onsides_hc_enriched
+  }
+
+  # Add the mapping table itself for user reference
+  merged_object$integrated_data[["DrugBank_RxCUI_Mapping"]] <- rxcui_mapping_df
+
+  # Update metadata
+  attr(merged_object, "onSideDB") <- attr(onsides_db, "original_db_info")
+
+  # Assign a new class (Prepend to keep existing classes like DrugBankTWOSIDESDb)
+  class(merged_object) <- unique(c("DrugBankOnSIDESDb", class(merged_object)))
+
+  message("Merge complete.")
+  merged_object
+}
+
+
+#' Merge a DrugBank dvobject with a TWOSIDES dvobject
+#'
+#' Integrates drug-drug interaction data from TWOSIDES with the rich mechanistic
+#' information from DrugBank. This function is chainable and can accept a raw
+#' DrugBank object or an already-merged dvobject.
+#'
+#' @param db_object A dvobject from `parseDrugBank()` or an existing merged dvobject.
+#' @param twosides_db A dvobject from `parseTWOSIDES()`.
+#'
+#' @return A new, nested dvobject with the TWOSIDES data added.
+#'
+#' @importFrom dplyr filter select rename mutate left_join .data distinct
+#' @family mergers
+#' @export
+merge_drugbank_twosides <- function(db_object, twosides_db) {
+
+  # --- Step 0: Input Validation and Hub Detection (Pipe Friendly) ---
+  if ("drugbank" %in% names(db_object)) {
+    # Case A: Input is an already-merged object
+    drugbank_db <- db_object$drugbank
+    merged_object <- db_object
+  } else {
+    # Case B: Input is a raw DrugBank object
+    drugbank_db <- db_object
+    merged_object <- init_dvobject()
+    merged_object$drugbank <- db_object
+    attr(merged_object, "DrugBankDB") <- attr(drugbank_db, "original_db_info")
+  }
+
+  # Validate inputs
+  if (!inherits(drugbank_db, "dvobject") || (!"drugs" %in% names(drugbank_db))) {
+    stop("`db_object` must contain a valid DrugBank dvobject.")
+  }
+  if (!inherits(drugbank_db, "dvobject") || (!"external_identifiers" %in% names(drugbank_db$drugs))) {
+    stop("`drugbank_db` must contain external_identifiers data.")
+  }
+  if (!is.list(twosides_db) || !("drug_drug_interactions" %in% names(twosides_db))) {
+    stop("`twosides_db` must be a valid dvobject from parseTWOSIDES().")
+  }
+
+  # --- Step 1: Create Bridge ---
+  message("Creating DrugBank ID <-> RxCUI mapping table...")
+  rxcui_mapping_df <- drugbank_db$drugs$external_identifiers %>%
+    dplyr::filter(.data$resource == "RxCUI") %>%
+    dplyr::select(all_of("drugbank_id"), rxcui = all_of("identifier")) %>%
+    dplyr::mutate(rxcui = as.character(.data$rxcui)) %>%
+    dplyr::distinct()
+
+  # Drug name lookup
+  drug_name_lookup <- drugbank_db$drugs$general_information %>%
+    dplyr::select(all_of("drugbank_id"), drug_name = all_of("name"))
+
+  # --- Step 2: Enrich TWOSIDES Data ---
+  message("Enriching TWOSIDES data with DrugBank information...")
+
+  # Prepare lookup tables for double joining
+  rxcui_map_1 <- rxcui_mapping_df %>% dplyr::rename(drugbank_id_1 = all_of("drugbank_id"))
+  rxcui_map_2 <- rxcui_mapping_df %>% dplyr::rename(drugbank_id_2 = all_of("drugbank_id"))
+
+  drug_name_lookup_1 <- drug_name_lookup %>%
+    dplyr::rename(drug_name_1 = all_of("drug_name"), drugbank_id_1 = all_of("drugbank_id"))
+  drug_name_lookup_2 <- drug_name_lookup %>%
+    dplyr::rename(drug_name_2 = all_of("drug_name"), drugbank_id_2 = all_of("drugbank_id"))
+
+  enriched_ddis <- twosides_db$drug_drug_interactions %>%
+    dplyr::mutate(drug_1_rxnorn_id = as.character(.data$drug_1_rxnorn_id),
+                  drug_2_rxnorm_id = as.character(.data$drug_2_rxnorm_id)) %>%
+    # LOGIC CHANGE 1: Union (OR) Filter
+    # Note the spelling: rxnorn
+    dplyr::filter((.data$drug_1_rxnorn_id %in% rxcui_mapping_df$rxcui) |
+                    (.data$drug_2_rxnorm_id %in% rxcui_mapping_df$rxcui)) %>%
+
+    # Join for Drug 1 (using 'rxnorn' spelling)
+    dplyr::left_join(rxcui_map_1, by = c("drug_1_rxnorn_id" = "rxcui")) %>%
+    dplyr::left_join(drug_name_lookup_1, by = "drugbank_id_1") %>%
+
+    # Join for Drug 2 (Twosides seems to use 'rxnorm')
+    dplyr::left_join(rxcui_map_2, by = c("drug_2_rxnorm_id" = "rxcui")) %>%
+    dplyr::left_join(drug_name_lookup_2, by = "drugbank_id_2") %>%
+
+    # LOGIC CHANGE 2: Keep if at least one side matched
+    dplyr::filter(!is.na(.data$drugbank_id_1) | !is.na(.data$drugbank_id_2)) %>%
+
+    # LOGIC CHANGE 3: Fallback names to prevent NAs
+    dplyr::mutate(
+      drug_name_1 = dplyr::coalesce(.data$drug_name_1, .data$drug_1_concept_name),
+      drug_name_2 = dplyr::coalesce(.data$drug_name_2, .data$drug_2_concept_name)
+    )
+
+  # --- Step 3: Assemble Final Object ---
+  # Initialize integrated_data if it doesn't exist
+  if (is.null(merged_object$integrated_data)) {
+    merged_object$integrated_data <- list()
+  }
+
+  # Add the new enriched table
+  merged_object$integrated_data$drug_drug_interactions <- enriched_ddis
+
+  # Add raw Twosides data
+  merged_object$twosides <- twosides_db
+
+  # --- Step 4: Metadata ---
+  attr(merged_object, "TwoSidesDB") <- attr(twosides_db, "original_db_info")
+
+  # Prepend new class
+  class(merged_object) <- unique(c("DrugBankTWOSIDESDb", class(merged_object)))
+
+  message("Merge complete.")
+  merged_object
+}
diff --git a/R/nsides_parsers.R b/R/nsides_parsers.R
new file mode 100644
index 00000000..4f8485cf
--- /dev/null
+++ b/R/nsides_parsers.R
@@ -0,0 +1,58 @@
+#' Parse the TWOSIDES Drug-Drug Interaction Database
+#'
+#' Reads the \href{https://tatonettilab-resources.s3.amazonaws.com/nsides/TWOSIDES.csv.gz}{TWOSIDES} data file, which contains adverse event data for pairs of
+#' drugs taken concurrently (N=2 interactions).
+#'
+#' \href{https://tatonettilab-resources.s3.amazonaws.com/nsides/TWOSIDES.csv.gz}{TWOSIDES} is a database of drug-drug interaction safety signals
+#' mined from the FDA's Adverse Event Reporting System using the same
+#' approach as is used to generate OffSIDES.
+#'
+#' Database fields are as follows:
+#' \describe{
+#'   \item{drug_1_rxnorn_id}{RxNORM identifier for drug 1}
+#'   \item{drug_1_concept_name}{RxNORM name string for drug 1}
+#'   \item{drug_2_rxnorm_id}{RxNORM identifier for drug 2}
+#'   \item{drug_2_concept_name}{RxNORM name string for drug 2}
+#'   \item{condition_meddra_id}{MedDRA identifier for the side effect}
+#'   \item{condition_concpet_name}{MedDRA name string for the side effect}
+#'   \item{A}{The number of reports for the pair of drugs that report the side effect}
+#'   \item{B}{The number of reports for the pair of drugs that do not report the side effect}
+#'   \item{C}{The number of reports for other PSM matched drugs (including perhaps the single versions of drug 1 or drug 2) that report the side effect}
+#'   \item{D}{The number of reports for other PSM matched drugs and other side effects}
+#'   \item{PRR}{Proportional reporting ratio, PRR=(A/(A+B))/(C/(C+D))}
+#'   \item{PRR_error}{Error estimate of the PRR}
+#'   \item{mean_reporting_frequency}{Proportion of reports for the drug that report the side effect, A/(A+B)}
+#' }
+#'
+#' @param twosides_file_path Path to the TWOSIDES data file (e.g., 'TWOSIDES.csv.gz').
+#' @param db_version TWOSIDES version used (default = NULL)
+#' @param db_exported_date TWOSIDES release date used (default = NULL)
+#'
+#' @return A `dvobject` (database type `TWOSIDESDB`) containing the `drug_drug_interactions`
+#'   data frame and associated metadata.
+#'
+#' @export
+#' @family parsers
+#' @importFrom data.table fread
+parseTWOSIDES <- function(twosides_file_path,
+                          db_version = NULL,
+                          db_exported_date = NULL) {
+  if (!file.exists(twosides_file_path)) {
+    stop("TWOSIDES file not found at path: ", twosides_file_path)
+  }
+
+  message("Parsing TWOSIDES drug-drug interaction data...")
+  twosides_data <- data.table::fread(twosides_file_path)
+  message("Parsing complete.")
+
+  twosides_db <- init_dvobject()
+  twosides_db$drug_drug_interactions <- twosides_data
+
+  twosides_db <- add_database_info(
+    dvobject = twosides_db,
+    db_type = "TWOSIDESDB",
+    db_version = db_version,
+    db_exported_date = db_exported_date)
+
+  twosides_db
+}
diff --git a/R/onsides_parser.R b/R/onsides_parser.R
new file mode 100644
index 00000000..12678aee
--- /dev/null
+++ b/R/onsides_parser.R
@@ -0,0 +1,69 @@
+
+#' Parse the OnSIDES Core Relational Database
+#'
+#' Parses the core relational tables from the OnSIDES database.
+#'
+#' @param dataDir A string specifying the path to the directory containing the
+#'   OnSIDES CSV files.
+#' @param include_high_confidence Logical. If TRUE (the default), the function
+#'   will also parse the `high_confidence.csv` file, which is a pre-aggregated
+#'   summary of ingredient-to-effect relationships. If the file is not found,
+#'   a warning is issued.
+#' @param db_version OnSIDES version used (default = NULL)
+#' @param db_exported_date OnSIDES release date used (default = NULL)
+#'
+#' @return dvobject
+#'
+#' @export
+#' @family parsers
+#' @importFrom data.table fread
+parseOnSIDES <- function(dataDir,
+                         include_high_confidence = TRUE,
+                         db_version = NULL,
+                         db_exported_date = NULL) {
+  # The 7 canonical files
+  core_files <- c(
+    "product_label.csv",
+    "product_adverse_effect.csv",
+    "product_to_rxnorm.csv",
+    "vocab_meddra_adverse_effect.csv",
+    "vocab_rxnorm_ingredient.csv",
+    "vocab_rxnorm_product.csv",
+    "vocab_rxnorm_ingredient_to_product.csv"
+  )
+
+  file_paths <- file.path(dataDir, core_files)
+  names(file_paths) <- gsub("\\.csv$", "", core_files)
+
+  # Check the file system once and reuse the result for the error message
+  found <- file.exists(file_paths)
+  if (!all(found)) {
+    stop("Core files not found in '", dataDir, "':\n", paste(core_files[!found], collapse = "\n"))
+  }
+
+  message("Parsing the 7 core OnSIDES database tables...")
+  db_tables <- lapply(names(file_paths), function(name) {
+    message("Reading ", name, ".csv ...")
+    data.table::fread(file_paths[[name]])
+  })
+  names(db_tables) <- names(file_paths)
+
+  # --- Optional Handling of high_confidence.csv ---
+  if (include_high_confidence) {
+    hc_path <- file.path(dataDir, "high_confidence.csv")
+    if (file.exists(hc_path)) {
+      message("Reading high_confidence.csv ...")
+      hc_table <- data.table::fread(hc_path)
+      db_tables$high_confidence <- hc_table
+    } else {
+      warning("`include_high_confidence` was TRUE, but 'high_confidence.csv' was not found.")
+    }
+  }
+
+  message("Successfully parsed OnSIDES database.")
+  db_tables <- add_database_info(dvobject = db_tables,
+                                 db_type = "OnSIDES",
+                                 db_version = db_version,
+                                 db_exported_date = db_exported_date)
+  db_tables
+}
diff --git a/R/utils.R b/R/utils.R
new file mode 100644
index 00000000..fd8cd70f
--- /dev/null
+++ b/R/utils.R
@@ -0,0 +1,288 @@
+#' Create a reusable helper function to process one component (carrier, enzyme,
etc.) +#' @param component the component to subset from CETT +#' @param component_name component name +#' @param drug_ids passed drugs ids to subset for +#' +#' @importFrom dplyr .data +#' +#' @return A new, smaller dvobject with the same structure. +#' @noRd +#' @keywords internal +subset_cett_component <- function(component, component_name, drug_ids) { + new_component <- list() + + if (!is.null(component) && (NROW(component$general_information) > 0)) { + # The name of the intermediate ID, e.g., "carrier_id", "target_id" + intermediate_id_col <- paste0(sub("s$", "", component_name), "_id") + + # Step A: Filter the top-level `general_information` table by drug_id. + # This is our anchor. + general_info_filtered <- component$general_information %>% + dplyr::filter(.data$drugbank_id %in% drug_ids) + + # Step B: From this anchor, get the set of relevant intermediate IDs. + relevant_intermediate_ids <- general_info_filtered[[intermediate_id_col]] %>% + unique() + # Continue only when at least one item matches this component + if (length(relevant_intermediate_ids) > 0) { + # Step C: Use these intermediate IDs to filter all other tables in the component.
+ new_component <- list() + new_component$general_information <- general_info_filtered + new_component$actions <- component$actions %>% + dplyr::filter(.data[[intermediate_id_col]] %in% relevant_intermediate_ids) + + # Step D: Recurse into the `polypeptides` list, using the same intermediate IDs + if (!is.null(component$polypeptides)) { + new_component$polypeptides <- list() + for (poly_table_name in names(component$polypeptides)) { + poly_table <- component$polypeptides[[poly_table_name]] + if (is.data.frame(poly_table) && intermediate_id_col %in% names(poly_table)) { + new_component$polypeptides[[poly_table_name]] <- poly_table %>% + dplyr::filter(.data[[intermediate_id_col]] %in% relevant_intermediate_ids) + } + } + } + } + } + + new_component +} + + +#' Subset a DrugBank dvobject by a vector of DrugBank IDs +#' +#' @details +#' Intelligently filters a DrugBank dvobject to retain only the data associated +#' with a specified list of drugbank_ids. It correctly handles the deep, +#' multi-level nested structure of the entire object, including the complex +#' relationships within the `cett` list. +#' +#' @param dvobject The dvobject from `parseDrugBank()`. +#' @param drug_ids A character vector of `drugbank_id` values to keep. +#' +#' @return A new, smaller dvobject with the same structure and attributes. +#' +#' @export +#' @importFrom dplyr filter .data +#' +#' @examples +#' \dontrun{ +#' library(dbparser) +#' one_drug <- subset_drugbank_dvobject(dvobject = dbdataset::drugbank, +#' drug_ids = "DB00001") +#' } +#' @family utility +subset_drugbank_dvobject <- function(dvobject, drug_ids) { + new_dvobject <- NULL + + if ((length(drug_ids) == 0) || (sum(nchar(drug_ids)) == 0)) { + warning("`drug_ids` is empty. Returning NULL") + } else { + new_dvobject <- init_dvobject() + + # --- 1. 
Filter the `drugs` list (many sub-tables) --- + if (!is.null(dvobject$drugs)) { + message("Subsetting `drugs` list...") + new_dvobject$drugs <- list() + for (name in names(dvobject$drugs)) { + sub_table <- dvobject$drugs[[name]] + # Most tables here link directly via drugbank_id + if (is.data.frame(sub_table) && ("drugbank_id" %in% names(sub_table))) { + filtered_subtable<- sub_table %>% + dplyr::filter(.data$drugbank_id %in% drug_ids) + if (NROW(filtered_subtable) > 0) { + new_dvobject$drugs[[name]] <- filtered_subtable + } + } + } + } + + # --- 2. Filter the `salts`, `products` data.frames --- + for (name in c("salts", "products")) { + if (NROW(dvobject[[name]]) > 0) { + message(paste("Subsetting", name, "...")) + filtered_subtable <- dvobject[[name]] %>% + dplyr::filter(.data$drugbank_id %in% drug_ids) + if (NROW(filtered_subtable) > 0) { + new_dvobject[[name]] <- filtered_subtable + } + } + } + + # --- 3. Filter the drugs`references` list --- + if (!is.null(dvobject$references) && !is.null(dvobject$references$drugs)) { + message("Subsetting drugs `references` list...") + new_dvobject$references$drugs <- list() + for (name in names(dvobject$references$drugs)) { + sub_table <- dvobject$references$drugs[[name]] + if (is.data.frame(sub_table) && "drugbank_id" %in% names(sub_table)) { + filtered_subtable <- sub_table %>% + dplyr::filter(.data$drugbank_id %in% drug_ids) + if (NROW(filtered_subtable) > 0) { + new_dvobject$references$drugs[[name]] <- filtered_subtable + } + } + } + } + + # --- 4. 
Filter the complex, multi-level `cett` List --- + if (!is.null(dvobject$cett)) { + message("Subsetting complex `cett` list...") + new_dvobject$cett <- list() + + # Apply the helper to each component within cett + for (cett_name in c("carriers", "enzymes", "targets", "transporters")) { + component <- subset_cett_component( + component = dvobject$cett[[cett_name]], + component_name = cett_name, + drug_ids = drug_ids) + + if (length(component) > 0) { + new_dvobject$cett[[cett_name]] <- component + } + } + } + + # --- 5. Filter the CETT`references` list --- + if (!is.null(dvobject$references)) { + for (cett_name in c("carriers", "enzymes", "targets", "transporters")) { + if ((length(dvobject$references[[cett_name]]) > 0) && + (length(new_dvobject$cett[[cett_name]]) > 0)) { + message("Subsetting ", cett_name ," references list...") + cett_references <- list() + # The name of the intermediate ID, e.g., "carrier_id", "target_id" + intermediate_id_col <- paste0(sub("s$", "", cett_name), "_id") + + for (name in names(dvobject$references[[cett_name]])) { + sub_table <- dvobject$references[[cett_name]][[name]] + if (is.data.frame(sub_table) && (intermediate_id_col %in% names(sub_table))) { + filtered_subtable <- sub_table %>% + dplyr::filter(.data[[intermediate_id_col]] %in% new_dvobject$cett[[cett_name]][["general_information"]][[intermediate_id_col]]) + if (NROW(filtered_subtable) > 0) { + new_dvobject$references[[cett_name]][[name]] <- filtered_subtable + } + } + } + } + } + } + + attr(new_dvobject, "original_db_info") <- attr(dvobject, "original_db_info") + class(new_dvobject) <- "dvobject" + + # --- Final Step: Preserve original object's attributes --- + #attributes(new_dvobject) <- attributes(dvobject) + message("Subsetting complete.") + } + + new_dvobject +} + + +#' Subset an OnSIDES dvobject by a vector of RxNorm Ingredient IDs (Schema-Aware) +#' +#' Intelligently filters an OnSIDES dvobject by cascading filters through the +#' relational tables, ensuring the final 
subset is self-consistent. +#' +#' @param dvobject A dvobject from `parseOnSIDES()`. +#' @param ingredient_ids A character vector of RxNorm CUIs (ingredients) to keep. +#' +#' @export +#' @importFrom dplyr filter pull .data +#' +#' @return A new, smaller dvobject with the same structure. +#' @family utility +subset_onsides_dvobject <- function(dvobject, ingredient_ids) { + new_dvobject <- NULL + + if ((length(ingredient_ids) == 0) || (sum(nchar(ingredient_ids)) == 0)) { + warning("`ingredient_ids` is empty. Returning NULL") + } else { + new_dvobject <- init_dvobject() + + # --- 1. Get the cascading set of keys --- + message("Subsetting OnSIDES: Identifying all related keys...") + # Find all products containing our target ingredients + relevant_product_ids <- dvobject$vocab_rxnorm_ingredient_to_product %>% + dplyr::filter(.data$ingredient_id %in% ingredient_ids) %>% + dplyr::pull(.data$product_id) %>% + unique() + + # Find all labels associated with those products + relevant_label_ids <- dvobject$product_to_rxnorm %>% + dplyr::filter(.data$rxnorm_product_id %in% relevant_product_ids) %>% + dplyr::pull(.data$label_id) %>% + unique() + + # --- 2. Filter the main data tables --- + message("Filtering main OnSIDES data tables...") + product_adverse_effect <- dvobject$product_adverse_effect %>% + dplyr::filter(.data$product_label_id %in% relevant_label_ids) + + if (NROW(product_adverse_effect) > 0) { + new_dvobject$product_adverse_effect <- product_adverse_effect + } + + if ("high_confidence" %in% names(dvobject)) { + high_confidence <- dvobject$high_confidence %>% + dplyr::filter(.data$ingredient_id %in% ingredient_ids) + + if (NROW(high_confidence) > 0) { + new_dvobject$high_confidence <- high_confidence + } + } + + # --- 3. 
Filter the "bridge" and vocabulary tables to keep the subset lean --- + message("Filtering vocabulary and mapping tables...") + product_label <- dvobject$product_label %>% + dplyr::filter(.data$label_id %in% relevant_label_ids) + + if (NROW(product_label) > 0) { + new_dvobject$product_label <- product_label + } + + product_to_rxnorm <- dvobject$product_to_rxnorm %>% + dplyr::filter(.data$label_id %in% relevant_label_ids) + + if (NROW(product_to_rxnorm) > 0) { + new_dvobject$product_to_rxnorm <- product_to_rxnorm + } + + vocab_rxnorm_ingredient_to_product <- dvobject$vocab_rxnorm_ingredient_to_product %>% + dplyr::filter(.data$ingredient_id %in% ingredient_ids) + + if (NROW(vocab_rxnorm_ingredient_to_product) > 0) { + new_dvobject$vocab_rxnorm_ingredient_to_product <- vocab_rxnorm_ingredient_to_product + } + + # Find all MedDRA effects that are actually present in our subset + relevant_meddra_ids <- new_dvobject$product_adverse_effect$effect_meddra_id %>% unique() + + vocab_meddra_adverse_effect <- dvobject$vocab_meddra_adverse_effect %>% + dplyr::filter(.data$meddra_id %in% relevant_meddra_ids) + + if (NROW(vocab_meddra_adverse_effect) > 0) { + new_dvobject$vocab_meddra_adverse_effect <- vocab_meddra_adverse_effect + } + + vocab_rxnorm_ingredient <- dvobject$vocab_rxnorm_ingredient %>% + dplyr::filter(.data$rxnorm_id %in% ingredient_ids) + + if (NROW(vocab_rxnorm_ingredient) > 0) { + new_dvobject$vocab_rxnorm_ingredient <- vocab_rxnorm_ingredient + } + + vocab_rxnorm_product <- dvobject$vocab_rxnorm_product %>% + dplyr::filter(.data$rxnorm_id %in% relevant_product_ids) + + if (NROW(vocab_rxnorm_product) > 0) { + new_dvobject$vocab_rxnorm_product <- vocab_rxnorm_product + } + + # Preserve attributes and return + attr(new_dvobject, "original_db_info") <- attr(dvobject, "original_db_info") + class(new_dvobject) <- "dvobject" + } + + new_dvobject +} diff --git a/README.Rmd b/README.Rmd index 4cde3902..59eb4fb7 100644 --- a/README.Rmd +++ b/README.Rmd @@ -21,28 +21,33 
@@ library(ggplot2) [](https://www.repostatus.org/#active) [](https://lifecycle.r-lib.org/articles/stages.html) [](https://cran.r-project.org/package=dbparser) -[](https://www.rdocumentation.org/packages/dbparser) +[](https://www.rdocumentation.org/packages/dbparser) [](https://bestpractices.coreinfrastructure.org/projects/3311) -[](https://github.com/ropensci/software-review/issues/347) +[](https://github.com/ropensci/software-review/issues/347) ## Overview Drugs databases vary too much in their formats and structures which making related data analysis not a very easy job and requires a lot of efforts to work on only -two databases together such as [DrugBank](https://go.drugbank.com/) and [KEGG](https://www.genome.jp/kegg/). +two databases together such as [DrugBank](https://go.drugbank.com/), [OnSIDES](https://onsidesdb.org/), and [TWOSIDES](https://tatonettilab.org/resources/nsides/). -Hence, `dbparser` package aims to parse different public drugs databases as [DrugBank](https://go.drugbank.com/) or [KEGG](https://www.genome.jp/kegg/) into +Hence, `dbparser` package aims to parse different public drugs databases into a single and unified format R object called `dvobject` (stands for drugverse object). +With recent updates, `dbparser` has evolved into an **integration engine**, allowing you to merge mechanistic data (DrugBank) with real-world phenotypic data (OnSIDES) and drug-drug interaction risks (TWOSIDES). + That should help in: - working with single data object and not multiple databases in different formats, - using R analysis capabilities easily on drugs data, - ease of transferring data between researchers after performing required data -analysis or `dvobject` and storing results in the same object in a very easy manner +analysis or `dvobject` and storing results in the same object in a very easy manner. ### dvobject Structure `dvobject` introduces a unified and compressed format of drugs data. 
-It is an R list object that contains one or more of the following sub-lists: +It is an R list object. + +**For a single database (e.g., DrugBank):** +It contains one or more of the following sub-lists: - **drugs**: list of data.frames that contain drugs information (i.e. synonyms, classifications, ...) and it is the only mandatory list - **salts**: data.frame contains drugs salts information @@ -50,10 +55,19 @@ It is an R list object that contains one or more of the following sub-lists: - **references**: data.frame of articles, links and textbooks about drugs or CETT data - **cett**: list of data.frames contain targets, enzymes, carriers and transporters information +**For a merged database (Integrated Pharmacovigilance):** +When databases are merged using `merge_drugbank_onsides` or `merge_drugbank_twosides`, the `dvobject` becomes a nested structure containing: + +- **drugbank**: The mechanistic hub. +- **onsides**: The side-effect data (from FDA labels). +- **twosides**: The drug-drug interaction data. +- **integrated_data**: Enriched tables that bridge the databases (e.g., linking DrugBank IDs to OnSIDES adverse events). +- **metadata**: Detailed provenance for all contained datasets. + ## Drug Databases Parsers are available for the following databases (it is in progress list) -### DrugBank +### 1. DrugBank [DrugBank](https://go.drugbank.com/) database is a comprehensive, freely accessible, online database containing information on drugs and drug @@ -72,7 +86,7 @@ The `dbparser` package parses the DrugBank XML database into `R` tibbles that ca If you are waiting for access to the DrugBank database, or do not intend to do a deep dive with the data, you may wish to use the `dbdataset` -[package](https://interstellar-consultation-services.github.io/dbdataset/), which contains +[package](https://interstellar-egypt.github.io/dbdataset/), which contains the DrugBank database already parsed into `dvobject`. 
Note that this is a large package that exceeds the limit set by CRAN. It is only available on GitHub. @@ -81,6 +95,42 @@ successfully. If you find errors with these versions or any other version please submit an issue [here](https://github.com/ropensci/dbparser/issues). +### 2. OnSIDES (The Phenotype) +[OnSIDES](https://onsidesdb.org/) provides adverse drug events extracted from thousands of FDA drug labels using machine learning. +* **Parser:** `parseOnSIDES()` +* **Input:** Directory containing OnSIDES CSV files. + +### 3. TWOSIDES (Polypharmacy) +[TWOSIDES](https://tatonettilab.org/resources/nsides/) provides data on drug-drug interactions and the adverse events that arise when two drugs are taken together. +* **Parser:** `parseTWOSIDES()` +* **Input:** The `TWOSIDES.csv.gz` file. + +## Quick Start: Integration Pipeline + +The power of `dbparser` lies in its ability to chain parsers and mergers together. Here is how you can build a complete pharmacovigilance dataset: + +```r +library(dbparser) +library(dplyr) + +# 1. Parse the raw databases +drugbank_db <- parseDrugBank("data/drugbank.xml") +onsides_db <- parseOnSIDES("data/onsides/") +twosides_db <- parseTWOSIDES("data/TWOSIDES.csv.gz") + +# 2. Build the Integrated Knowledge Graph +# DrugBank serves as the hub. We chain the merges. +final_db <- drugbank_db %>% + merge_drugbank_onsides(onsides_db) %>% + merge_drugbank_twosides(twosides_db) + +# 3. Analyze Results +# Example: Accessing the enriched drug-drug interaction table +head(final_db$integrated_data$drug_drug_interactions) +``` + +For a detailed case study, please refer to the [Integrated Pharmacovigilance Vignette](https://docs.ropensci.org/dbparser/articles/drugbank_nside.html). 
+ ## Installation You can install the released version of dbparser from diff --git a/README.md b/README.md index 36f84eef..85d24fd5 100644 --- a/README.md +++ b/README.md @@ -12,23 +12,29 @@ developed.](https://www.repostatus.org/badges/latest/active.svg)](https://www.re stable](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) [](https://cran.r-project.org/package=dbparser) -[](https://www.rdocumentation.org/packages/dbparser) +[](https://www.rdocumentation.org/packages/dbparser) [](https://bestpractices.coreinfrastructure.org/projects/3311) -[](https://github.com/ropensci/software-review/issues/347) +[](https://github.com/ropensci/software-review/issues/347) ## Overview Drugs databases vary too much in their formats and structures which making related data analysis not a very easy job and requires a lot of efforts to work on only two databases together such as -[DrugBank](https://go.drugbank.com/) and -[KEGG](https://www.genome.jp/kegg/). +[DrugBank](https://go.drugbank.com/), [OnSIDES](https://onsidesdb.org/), +and [TWOSIDES](https://tatonettilab.org/resources/nsides/). Hence, `dbparser` package aims to parse different public drugs databases -as [DrugBank](https://go.drugbank.com/) or -[KEGG](https://www.genome.jp/kegg/) into single and unified format R -object called `dvobject` (stands for drugverse object). +into a single and unified format R object called `dvobject` (stands for +drugverse object). + +With recent updates, `dbparser` has evolved into an **integration +engine**, allowing you to merge mechanistic data (DrugBank) with +real-world phenotypic data (OnSIDES) and drug-drug interaction risks +(TWOSIDES). 
That should help in: @@ -37,13 +43,15 @@ That should help in: - using R analysis capabilities easily on drugs data, - ease of transferring data between researchers after performing required data analysis or `dvobject` and storing results in the same - object in a very easy manner + object in a very easy manner. ### dvobject Structure `dvobject` introduces a unified and compressed format of drugs data. It -is an R list object that contains one or more of the following -sub-lists: +is an R list object. + +**For a single database (e.g., DrugBank):** It contains one or more of +the following sub-lists: - **drugs**: list of data.frames that contain drugs information (i.e. synonyms, classifications, …) and it is the only mandatory list @@ -55,12 +63,23 @@ sub-lists: - **cett**: list of data.frames contain targets, enzymes, carriers and transporters information +**For a merged database (Integrated Pharmacovigilance):** When databases +are merged using `merge_drugbank_onsides` or `merge_drugbank_twosides`, +the `dvobject` becomes a nested structure containing: + +- **drugbank**: The mechanistic hub. +- **onsides**: The side-effect data (from FDA labels). +- **twosides**: The drug-drug interaction data. +- **integrated_data**: Enriched tables that bridge the databases (e.g., + linking DrugBank IDs to OnSIDES adverse events). +- **metadata**: Detailed provenance for all contained datasets. + ## Drug Databases Parsers are available for the following databases (it is in progress list) -### DrugBank +### 1. DrugBank [DrugBank](https://go.drugbank.com/) database is a comprehensive, freely accessible, online database containing information on drugs and drug @@ -83,16 +102,59 @@ more details. If you are waiting for access to the DrugBank database, or do not intend to do a deep dive with the data, you may wish to use the `dbdataset` -[package](https://interstellar-consultation-services.github.io/dbdataset/), -which contains the DrugBank database already parsed into `dvobject`. 
-Note that this is a large package that exceeds the limit set by CRAN. It -is only available on GitHub. +[package](https://interstellar-egypt.github.io/dbdataset/), which +contains the DrugBank database already parsed into `dvobject`. Note that +this is a large package that exceeds the limit set by CRAN. It is only +available on GitHub. `dbparser` is tested against DrugBank versions *5.1.0* through *5.1.12* successfully. If you find errors with these versions or any other version please submit an issue [here](https://github.com/ropensci/dbparser/issues). +### 2. OnSIDES (The Phenotype) + +[OnSIDES](https://onsidesdb.org/) provides adverse drug events extracted +from thousands of FDA drug labels using machine learning. \* **Parser:** +`parseOnSIDES()` \* **Input:** Directory containing OnSIDES CSV files. + +### 3. TWOSIDES (Polypharmacy) + +[TWOSIDES](https://tatonettilab.org/resources/nsides/) provides data on +drug-drug interactions and the adverse events that arise when two drugs +are taken together. \* **Parser:** `parseTWOSIDES()` \* **Input:** The +`TWOSIDES.csv.gz` file. + +## Quick Start: Integration Pipeline + +The power of `dbparser` lies in its ability to chain parsers and mergers +together. Here is how you can build a complete pharmacovigilance +dataset: + +``` r +library(dbparser) +library(dplyr) + +# 1. Parse the raw databases +drugbank_db <- parseDrugBank("data/drugbank.xml") +onsides_db <- parseOnSIDES("data/onsides/") +twosides_db <- parseTWOSIDES("data/TWOSIDES.csv.gz") + +# 2. Build the Integrated Knowledge Graph +# DrugBank serves as the hub. We chain the merges. +final_db <- drugbank_db %>% + merge_drugbank_onsides(onsides_db) %>% + merge_drugbank_twosides(twosides_db) + +# 3. 
Analyze Results +# Example: Accessing the enriched drug-drug interaction table +head(final_db$integrated_data$drug_drug_interactions) +``` + +For a detailed case study, please refer to the [Integrated +Pharmacovigilance +Vignette](https://docs.ropensci.org/dbparser/articles/drugbank_nside.html). + ## Installation You can install the released version of dbparser from @@ -134,7 +196,7 @@ citation("dbparser") #> To cite dbparser in publications use: #> #> Mohammed Ali, Ali Ezzat (). dbparser: DrugBank Database XML Parser. -#> R package version 2.0.3. +#> R package version 2.2.0. #> #> A BibTeX entry for LaTeX users is #> @@ -142,7 +204,7 @@ citation("dbparser") #> title = {DrugBank Database XML Parser}, #> author = {Mohammed Ali and Ali Ezzat}, #> organization = {Interstellar for Consultinc inc.}, -#> note = {R package version 2.0.3}, +#> note = {R package version 2.2.0}, #> url = {https://CRAN.R-project.org/package=dbparser}, #> } ``` diff --git a/_pkgdown.yml b/_pkgdown.yml index 7cbf2dcc..d177f4da 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,17 +1,19 @@ +template: + bootstrap: 5 + bootswatch: flatly + +url: https://docs.ropensci.org/dbparser + navbar: + bg: primary structure: - left: - - home - - intro - - reference - - articles - - tutorials - - news - right: github + left: [intro, reference, articles, news] + right: [search, github] components: home: icon: fas fa-home fa-lg href: index.html + aria-label: "Home" reference: text: Reference href: reference/index.html @@ -21,6 +23,10 @@ navbar: news: text: Changelog href: news/index.html + github: + icon: fab fa-github + href: https://github.com/ropensci/dbparser + aria-label: "GitHub" reference: - title: "Drugs Databases Parser" @@ -31,6 +37,10 @@ reference: desc: "Different functions to handle dvobject" contents: - has_concept("utility") +- title: "Merge functions" + desc: "Different functions to merge different types of dvobjects" + contents: + - has_concept("mergers") authors: Mohammed Ali: diff --git 
a/cran-comments.md b/cran-comments.md index a7291b70..bcc7a9ad 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,22 +1,16 @@ ## Test environments -### local Windows 10(x86_64-w64-mingw32), 4.3.2 +### local Windows 11, 4.5.1 0 errors | 0 warnings | 0 note ### Windows Server 2022, R-devel, 64 bit -0 errors | 0 warnings | 1 note - -``` Found the following files/directories: - 'lastMiKTeXException'``` +0 errors | 0 warnings | 0 note ### Ubuntu Linux 20.04.1 LTS, R-release, GCC -0 errors | 0 warnings | 0 not +0 errors | 0 warnings | 0 note ### Fedora Linux, R-devel, clang, gfortran -0 errors | 0 warnings | 1 note - -```* checking HTML version of manual ... NOTE -Skipping checking HTML validation: no command 'tidy' found``` +0 errors | 0 warnings | 0 note ### win-builder (R version 4.4.0 RC (2024-04-16 r86451 ucrt)) 0 errors | 0 warnings | 0 note @@ -27,3 +21,10 @@ We checked 0 reverse dependencies, comparing R CMD check results across CRAN and * We saw 0 new problems * We failed to check 0 packages + +### RHub + +rhub::rc_submit() # multiple environments are used for testing + +### URL checks +- DrugBank URLs (403): Valid URLs, site blocks automated requests diff --git a/docs/404.html b/docs/404.html index f0b3f1a5..4e29c6ab 100644 --- a/docs/404.html +++ b/docs/404.html @@ -4,116 +4,85 @@ - +