diff --git a/.github/workflows/bookdown.yml b/.github/workflows/bookdown.yml index 3b99b25..00b7642 100644 --- a/.github/workflows/bookdown.yml +++ b/.github/workflows/bookdown.yml @@ -1,5 +1,4 @@ name: Build Bookdown - on: push: branches: @@ -14,7 +13,6 @@ permissions: jobs: build: runs-on: macos-latest - steps: - name: Checkout repository uses: actions/checkout@v3 @@ -22,7 +20,7 @@ jobs: - name: Set up R uses: r-lib/actions/setup-r@v2 with: - r-version: "4.4.0" + r-version: "4.5.0" - name: Install system dependencies run: | @@ -31,17 +29,19 @@ jobs: - name: Install CRAN & Bioconductor packages run: | - # Install remotes if missing Rscript -e 'if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' - # Install CRAN dependencies from DESCRIPTION - Rscript -e 'remotes::install_deps(dependencies = TRUE, upgrade = "never")' - - # Install BiocManager if missing + # Install BiocManager and Bioconductor packages FIRST + # so remotes::install_deps() doesn't warn about missing deps Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")' + + Rscript -e 'BiocManager::install(c("EnhancedVolcano","DESeq2","edgeR","RUVSeq","piano","pcaMethods","impute","imputeLCMD","GEOquery"), ask = FALSE)' + + # Pin factoextra + Rscript -e 'remotes::install_version("factoextra", version = "1.0.7", repos = "https://cran.rstudio.com")' - # Install Bioconductor packages - Rscript -e 'BiocManager::install(c("EnhancedVolcano","DESeq2","edgeR","RUVSeq","piano","pcaMethods","impute", "imputeLCMD"), ask = FALSE)' + # Now install remaining CRAN deps from DESCRIPTION + Rscript -e 'remotes::install_deps(dependencies = TRUE, upgrade = "never")' - name: Build bookdown run: | @@ -61,4 +61,4 @@ jobs: with: github_token: ${{ secrets.GITHUB_TOKEN }} publish_dir: ./book - force_orphan: true + force_orphan: true \ No newline at end of file diff --git a/.github/workflows/test_bookdown.yml b/.github/workflows/test_bookdown.yml index 
24314e3..77d0742 100644 --- a/.github/workflows/test_bookdown.yml +++ b/.github/workflows/test_bookdown.yml @@ -27,7 +27,7 @@ jobs: - name: Set up R uses: r-lib/actions/setup-r@v2 with: - r-version: "4.4.0" + r-version: "4.5.0" - name: Install system dependencies run: | @@ -36,17 +36,19 @@ jobs: - name: Install CRAN & Bioconductor packages run: | - # Install remotes if missing Rscript -e 'if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")' - # Install CRAN dependencies from DESCRIPTION - Rscript -e 'remotes::install_deps(dependencies = TRUE, upgrade = "never")' - - # Install BiocManager if missing + # Install BiocManager and Bioconductor packages FIRST + # so remotes::install_deps() doesn't warn about missing deps Rscript -e 'if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")' - # Install Bioconductor packages - Rscript -e 'BiocManager::install(c("EnhancedVolcano","DESeq2","edgeR","RUVSeq","piano","pcaMethods","impute", "imputeLCMD"), ask = FALSE)' + Rscript -e 'BiocManager::install(c("EnhancedVolcano","DESeq2","edgeR","RUVSeq","piano","pcaMethods","impute","imputeLCMD","GEOquery"), ask = FALSE)' + + # Pin factoextra + Rscript -e 'remotes::install_version("factoextra", version = "1.0.7", repos = "https://cran.rstudio.com")' + + # Now install remaining CRAN deps from DESCRIPTION + Rscript -e 'remotes::install_deps(dependencies = TRUE, upgrade = "never")' - name: Build bookdown run: | diff --git a/.gitignore b/.gitignore index 5b6a065..79a894b 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +.DS_Store \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7f4d997 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,9 @@ +repos: + - repo: local + hooks: + - id: namer + name: Name Rmd chunks with namer + language: system + entry: Rscript -e 'files <- list.files(".", pattern="[.][Rr]md$", 
recursive=TRUE, full.names=TRUE); files <- files[!grepl("(^|/)book/|(^|/)_book/", files)]; for (f in files) namer::name_chunks(f); cat("Processed", length(files), "Rmd files\n")' + pass_filenames: false + always_run: true \ No newline at end of file diff --git a/Chapter_1/.DS_Store b/Chapter_1/.DS_Store deleted file mode 100644 index cec1077..0000000 Binary files a/Chapter_1/.DS_Store and /dev/null differ diff --git a/Chapter_1/01-Chapter1.Rmd b/Chapter_1/01-Chapter1.Rmd index 66c3643..e1af6d7 100644 --- a/Chapter_1/01-Chapter1.Rmd +++ b/Chapter_1/01-Chapter1.Rmd @@ -30,7 +30,7 @@ Proper data management is of utmost importance while leading data analyses withi The FAIR principles describe a framework for data management and stewardship aimed at increasing the value of data by enabling sharing and reuse. These principles were originally developed from discussions during the [Jointly Designing a Data FAIRport](https://www.lorentzcenter.nl/jointly-designing-a-data-fairport.html) meeting at the Lorentz Center in Leiden, The Netherlands in 2014, which brought together stakeholders to discuss the creation of an environment for virtual computational science. The resulting principles are technology agnostic, discipline independent, community driven, and internationally adopted. Below is a schematic providing an overview of this guiding principle: -```{r, echo=FALSE, fig.height=3.5, fig.width=3.5, fig.align='center'} +```{r 01-Chapter1-1, echo=FALSE, fig.height=3.5, fig.width=3.5, fig.align='center' } knitr::include_graphics("Chapter_1/Module1_1_Input/Module1_1_Image1.png") ``` @@ -376,7 +376,7 @@ This training module focuses on providing an example of how to organize and uplo ## The Dataverse Project Dataverse, organized through [The Dataverse Project](https://dataverse.org/), is a popular repository option that allows for upload of most types of material, without any stringent requirements. 
The Dataverse organization also provides ample resources on how to organize, upload, and share data through Dataverse. These resources include very thorough, readable, and user guides and best practices. -```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-2, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image1.png") ``` *Screenshot of the main page of [The Dataverse Project](https://dataverse.org/)* @@ -391,13 +391,13 @@ Remember how we pointed out that a Dataverse is similar to a folder system on a As an example, using the UNC Dataverse, here we can see various sub-Dataverses that have been created as repositories for specific projects or types of data. -```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-3, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image2.png") ``` As another example looking within a specific Dataverse, here we can see the Dataverse that hosts datasets and publications for Dr. Julia Rager's lab, the [Ragerlab-Dataverse](https://dataverse.unc.edu/dataverse/ragerlab). -```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-4, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image3.png") ``` @@ -423,7 +423,7 @@ Before uploading your data to any data repository, it is important to structure **TAME 2.0 Module 1.1 FAIR Data Management Practices** and **TAME 2.0 Module 1.4 Data Wrangling in Excel** are also helpful resources to reference when thinking about organizing your data. 
A general example of an organized, long format dataset in Excel in provided below: -```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-5, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image4.png") ``` @@ -479,7 +479,7 @@ First, keep in mind, depending on the specific repository you are using, you may Generally, a metadata file consists of a set of descriptors for each variable in the data. If you are uploading data that contains many covariates or descriptive variables, it is essential that you provide a metadata file that describes these covariates. Both a description of the variable as well as any specific levels of any categorical or factor type variables. From the dataset presented previously, here we present an example of an associated metadata file: -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-6, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image5.png") ``` @@ -499,7 +499,7 @@ knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image5.png") Now, let's review how to actually create a Dataverse. First, navigate to the parent Dataverse that you would like to use as your primary host website. For example, our group uses the [UNC Dataverse](https://dataverse.unc.edu/). If you do not already have one, create a username and login. Then, from the home Dataverse page, click "Add Data" and select "New Dataverse". 
-```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-7, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image6.png") ``` @@ -522,7 +522,7 @@ Creating a dataset creates a page for your data containing information about tha + Navigate to the Dataverse page under which your dataset will live + Click "Add Data" and then select "New Dataset" -```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-8, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_2_Input/Module1_2_Image7.png") ``` @@ -567,7 +567,7 @@ This training module serves a launch pad for getting acclimated with Github and ## Creating an Account First, users must create their own accounts within github to start uploading/sharing code. To do this, navigate to [github.com](github.com), click "Sign Up", and follow the on screen instructions. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-9, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image10.png") ``` @@ -577,12 +577,12 @@ knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image10.png") A repository, also known as a "repo", is similar to a project folder that will contain all code pertaining to a specific project (which can be used for specific research programs, grants, or manuscripts, as examples). A repository can be set to public or private. If a repo is initially set to private to keep findings confidential prior to publication, it can always be updated to public once findings are ready for public dissemination. Multiple people can be allowed to work on a project together within a single repository. 
To access the repositories that are currently available to you through your user account, click the circle in top right-hand corner and click "Your repositories". -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-10, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image11.png") ``` To create a new repository, click on the green button that says "New". -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-11, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image12.png") ``` @@ -591,7 +591,7 @@ Then give your repository a descriptive name. We often edit the repo titles to m For more information, visit Github's [Create a repo](https://docs.github.com/en/get-started/quickstart/create-a-repo) documentation. Then click "Add a README file" to initiate the README file, which is important to continually edit to provide analysis-specific background information, and any additional information that would be helpful during and after code is drafted to better facilitate tracking information and project details. *We provide further details surrounding specific information that can be included within the README file below.* -```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-12, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image13.png") ``` @@ -602,7 +602,7 @@ knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image13.png") The simplest way to upload code is to first navigate to the repository that you would like to upload your code/associated files to. Note that this could represent a repo that you created or that someone granted you access to. Click “Add file” then click “Upload files”. 
Drag and drop your file containing your script into github and click “Commit changes”. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-13, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image1.png") ``` @@ -614,12 +614,12 @@ A more advanced way to upload code is by using the command line, which allows a To keep the repository organized, it might be necessary to create a new folder (like the folder labeled “1.1. Summary Statistics” in the above screenshot). Files can be grouped into these folders based on the type of analysis. To do so, click on the new file and then click on the pencil icon next to the "Blame" button. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-14, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image2.png") ``` Click on the box that contains the title of the file. Write the title of your new folder and then end with a forward slash (/). In the screenshot below, we're creating a new folder entitled "New Folder". Click “Commit changes” and your file should now be in a new folder. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-15, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image3.png") ``` @@ -629,7 +629,7 @@ knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image3.png") Saving iterations of code can save valuable time later as analyses are constantly being updated and edited. If your code undergoes substantial changes, (e.g., adding/ removing steps or if there’s code that is likely to be beneficial later on, but is no longer relevant to the current analysis), it is helpful to save that version in Github for future reference. To do so, create a subfolder named “Archive” and move the old file into it. 
If you have multiple versions of a file with the same name, add the current date to prevent the file from being overwritten later on as seen in the screenshot below. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-16, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image4.png") ``` @@ -643,7 +643,7 @@ Once the old file version has been archived, now upload the most recent version ## Updating Repository Titles and Structure to Support a Manuscript If the code is for a manuscript, it's helpful to include the table or figure name it pertains to in the manuscript in parentheses. For example, "Baseline Clusters (Figure 3)". This allows viewers to find find the code for each table or figure faster. -```{r, echo=FALSE, fig.width=6, fig.height=7, fig.align='center'} +```{r 01-Chapter1-17, echo=FALSE, fig.width=6, fig.height=7, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image5.png") ``` @@ -653,12 +653,12 @@ knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image5.png") A README.md file is used to describe the overall aims and purpose of the analyses in the repository or a folder within a repository. It is often the first file that someone will look at in a repo/folder, so it is important to include information that would be valuable to an outsider trying to make use of the work. To add a README.md file, click “Add file” and then “Create new file”. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-18, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image6.png") ``` Name your file “README.md”. 
-```{r, echo=FALSE, fig.width=6, fig.height=7, fig.align='center'} +```{r 01-Chapter1-19, echo=FALSE, fig.width=6, fig.height=7, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image7.png") ``` @@ -676,7 +676,7 @@ The final README.md file for the **OVERALL** repository for manuscript submissio + Include both the goal of the analysis and the methodology used (ie. Using chi square tests to determine if there are statistically significant differences across demographic groups) + If the code was written in the software Jupyter (ie. has the extension .ipynb not .R or .Rmd), NBViewer is a website that can render jupyter notebooks (files). This is helpful, because sometimes the files take too long to render, so link the repository from the NB viewer website. + Go to [nbviewer.org](nbviewer.org) --> type in the name of the repository --> copy the url and add it to the README.md file -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-20, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image8.png") ``` @@ -686,7 +686,7 @@ The final README.md file for the a subfolder within a repository should look som + Brief description of each file + Include both the goal of the analysis and the methodology used + Table or Figure name in the corresponding manuscript (if applicable) -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-21, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image9.png") ``` @@ -712,32 +712,32 @@ Github is a useful platform for managing and facilitating code tracking performe When creating a repository on Github, it automatically creates a default branch entitled "main". It's possible to create a new **branch** which allows a programmer to make changes to files in a repository in isolation from the main branch. 
This is beneficial, because the same file can be compared across branches, potentially created by different scientists, and merged together to reflect those changes. **Note:** In order for this to work the file in main branch has to have the same name and the file in the newly created branch. Let's start by creating a new branch. First, navigate to a repository, select "main" and then "View all branches". -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-22, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image14.png") ``` Click "New branch", give your branch a title, and click "Create new branch". In the screenshot, you'll see the new branch entitled "jr-changes". -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-23, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image15.png") ``` As a new collaborator interested in comparing and merging code changes to a file, click on the new branch that was just created. Based on the screenshot, that means click "jr-changes". -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-24, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image16.png") ``` After uploading the file(s) to this branch, you'll see a notification that this branch is now a certain number of commits ahead of the main branch. A **commit** records the number of changes to files in a branch. Based on the screenshot, "jr-changes" is now 2 commits ahead of "main". 
-```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-25, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image17.png") ``` Click on "2 commits ahead" and scroll down to compare versions between the "main" and "jr-changes" branches. A pull request will need to be created. A **pull request** allows other collaborators to see changes made to a file within a branch. These proposed changes can be discussed and amended before merging them into the main branch. For more information, visit Github's [branches](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-branches), [pull requests](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) and [comparing branches in pull requests](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-comparing-branches-in-pull-requests) documentation. -```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-26, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image18.png") ``` Go ahead and click on "Create pull request". Click on "Create pull request" again on the next screen. Select "Merge pull request" and then "Confirm merge". 
-```{r, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} +```{r 01-Chapter1-27, echo=FALSE, fig.width=2, fig.height=3, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_3_Input/Module1_3_Image19.png") ``` @@ -786,7 +786,7 @@ Open Microsoft Excel and prior to **ANY** edits, click “File” --> “Save As Let's first view what the dataset currently looks like: -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-28, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image1.png") ``` @@ -819,7 +819,7 @@ Before we can begin organizing the data, we need to remove the entirely blank ro + **Excel Trick #2:** An easier way to remove blank rows and cells for larger datasets, includes clicking "Find & Select"--> "Special" --> "Blanks" --> click "OK" to select all blank rows and cells. Click "Delete" within the home tab --> "Delete sheet rows". After removing the blank rows, the file should look like the screenshot below. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-29, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image2.png") ``` @@ -844,7 +844,7 @@ Metadata explains what each column represents in the dataset. Metadata is now a + Then relabel the original data tab as “XXXX_DATA” (ie., “Allostatic_DATA). + Within the metadata tab, create three columns: the first, "Column Identifier", contains each of the column names found in the data tab; the second, "Code", contains the individual variable/ abbreviation for each column identifier; the third, "Description" contains additional information and definitions for abbreviations. 
-```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-30, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image3.png") ``` @@ -868,7 +868,7 @@ For this dataset, the following variables were edited: **Excel Trick:** To change cells that contain the same data simultaneously, navigate to "Edit", click "Find", and then "Replace". Once the categorical data have been abbreviated, add those abbreviations to the metadata and describe what they symbolize. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-31, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image4.png") ``` @@ -888,7 +888,7 @@ Analysis-specific subjects are created to give an ordinal subject number to each + Relabel the subject number/identifier column as “Original_Subject_Number” and create an ordinal subject number column labeled “Subject_Number”. R reads in spaces between words as periods, therefore it’s common practice to replace spaces with underscores when doing data analysis in R. Avoid using dashes in column names or anywhere else in the dataset. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-32, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image5.png") ``` @@ -905,7 +905,7 @@ In this case, this dataset contains dashes and Greek letters within some of the These data will likely be shared with collaborators, uploaded onto data deposition websites, and used as supporting information in published manuscripts. For these purposes, it is nice to format data in Excel such that it is visually appealing and easy to digest. 
For example, here, it is nice to bold column identifiers and center the data, as shown below: -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-33, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image6.png") ``` @@ -920,7 +920,7 @@ The subject identifier column labeled, “Group_Subject_No”, combines the subj + Copy the entire column and paste only the values in the second column by navigating to the drop down arrow next to "Paste" and click "Paste Values". + Label the second column "Group_Subject_No" and delete the first column. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-34, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image7.png") ``` @@ -935,14 +935,14 @@ This step was not completed for this current data, since it had a smaller size a A wide format contains values that **DO NOT** repeat the subject identifier column. For this dataset, each subject has one row containing all of its data, therefore the subject identifier occurs once in the dataset. **Wide Format** -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-35, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image8.png") ``` A long format contains values that **DO** repeat the subject identifier column. For this dataset, that means a new column was created entitled "Variable" containing all the mediator names and a column entitled "Value" containing all their corresponding values. In the screenshot, an additional column, "Category", was added to help with the categorization of mediators in R analyses. 
**Long Format** -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-36, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image9.png") ``` @@ -957,23 +957,23 @@ To do this, a power query in Excel will be used. Note: If you are working on a M 2. Click the tab at the top that says "Data". Then click "Get Data (Power Query)" at the far left. 3. It will ask you to choose a data source. Click "Blank table" in the bottom row. 4. Paste the data into the table. (Hint: Use the shortcut Ctrl + "v"). At this point, your screen should look like the screenshot below. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-37, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image10.png") ``` 5. Click "Use first row as headers" and then click "Next" in the bottom right hand corner. 6. Select all the columns with biomarker names. That should be the column "Cortisol" through the end. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-38, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image11.png") ``` 7. Click the "Transform" button in the upper left hand corner. Then click "Unpivot columns" in the middle of the pane. The final result should look like the sceenshot below with all the biomarkers now in one column entitled "Attribute" and their corresponding values in another column entitled "Value". -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-39, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image12.png") ``` 8. To save this, go back to the "Home" tab and click "Close & load". You should see something similar to the screenshot below. 
-```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-40, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image13.png") ``` @@ -982,7 +982,7 @@ knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image13.png") 11. Now the "Category" column can be created to identify the types of biomarkers in the dataset. The allostatic load (AL) biomarkers denoted in the "Category" column include the variables Cortisol, CRP, Fibrinogen, Hba1c, HDL, and Noradrenaline. The rest of the variables were labeled as cytokines. Additionally, we can make this data more closely resemble the final long format screenshot by bolding the headers, centering all the data, etc. We have successfully wrangled our data and the final dataset now looks like this: -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-41, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image14.png") ``` @@ -992,17 +992,17 @@ knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image14.png") A PivotTable is a tool in Excel used to summarize numerical data. It’s called a pivot table, because it pivots or changes how the data is displayed to make statistical inferences. This can be useful for generating initial summary-level statistics to guage the distribution of data. To create a PivotTable, start by selecting all of the data. (Hint: Try using the keyboard shortcut mentioned above.) Click "Insert" tab on the upper left-hand side, click "PivotTable", and click "OK". The new PivotTable should be available in a new sheet as seen in the screenshot below. 
-```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-42, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image15.png") ``` A PivotTable will be constructed based on the column headers that can be dragged into the PivotTable fields located on the right-hand side. For example, what if we were interested in determining if there were differences in average expression between non-smokers and cigarette smokers in each category of biomarkers? As seen below, drag the "Group" variable under the "Rows" field and drag the "Value" variable under the "Values" field. -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-43, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image16.png") ``` Notice that it automatically calculates the sum of the expression values for each group. To change the function to average, click the "i" icon and select "Average". The output should mirror what's below with non-smokers having an average expression that's more than double that of cigarette smokers. 
-```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 01-Chapter1-44, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_1/Module1_4_Input/Module1_4_Image17.png") ``` diff --git a/Chapter_2/.DS_Store b/Chapter_2/.DS_Store deleted file mode 100644 index 0e10a95..0000000 Binary files a/Chapter_2/.DS_Store and /dev/null differ diff --git a/Chapter_2/02-Chapter2.Rmd b/Chapter_2/02-Chapter2.Rmd index 4420a4a..fdf734c 100644 --- a/Chapter_2/02-Chapter2.Rmd +++ b/Chapter_2/02-Chapter2.Rmd @@ -51,7 +51,7 @@ To download RStudio: ### RStudio Orientation Here is a screenshot demonstrating what the RStudio desktop app looks like: -```{r, echo=FALSE, fig.align = "center"} +```{r 02-Chapter2-1, echo=FALSE, fig.align = "center" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image1.png") ``` @@ -67,7 +67,7 @@ The default RStudio layout has four main panes (numbered above in the blue boxes 4. **Help:** where help pages will appear for packages and functions (see below for further instructions on the help option) Under "Tools" → "Global Options," RStudio panes can be customized to appear in different configurations or with different color themes. A number of other options can also be changed. For example, you can choose to have colors highlighted the color they appear or rainbow colored parentheses that can help you visualize nested code. -```{r, echo=FALSE, fig.align = "center"} +```{r 02-Chapter2-2, echo=FALSE, fig.align = "center" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image2.png") ``` @@ -90,7 +90,7 @@ More information on these packages, as well as many others, is included througho R packages often do not need to be downloaded from a website. Instead, you can install packages and load them through running script in R. Note that you only need to install packages one time, but packages must be loaded each time you start a new R session. 
-```{r, eval=FALSE, echo=TRUE} +```{r 02-Chapter2-3, eval=FALSE, echo=TRUE} # Install the package install.packages(“tidyverse”) @@ -108,21 +108,21 @@ Many packages also exist as part of the baseline configuration of an R working e You can learn more about a function by typing one question mark before the name of the function, which will bring up documentation in the Help tab of the Viewer window. Importantly, this documentation includes a description of the different arguments that can be passed to the function and examples for how to use the function. -```{r, eval=FALSE} +```{r 02-Chapter2-4, eval=FALSE} ?install.packages ``` -```{r, echo=FALSE, fig.align = "center", out.width = "400px"} +```{r 02-Chapter2-5, echo=FALSE, fig.align = "center", out.width = "400px" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image3.png") ``` You can learn more about a package by typing two question marks before the name of the package. This will bring up vignettes and help pages associated with that package. -```{r, eval=FALSE} +```{r 02-Chapter2-6, eval=FALSE} ??tidyverse ``` -```{r, echo=FALSE, fig.align = "center", out.width = "400px"} +```{r 02-Chapter2-7, echo=FALSE, fig.align = "center", out.width = "400px" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image4.png") ``` @@ -151,7 +151,7 @@ Data are stored in data structures. There are many different data structures in + **Data frames:** similar to a matrix but can contain different data types and additional attributes such as row names (and is one of the most common data structures in environmental health research). Tibbles are a stricter type of data frame implemented in the *tidyverse* package. + **Lists:** a special type of vector that acts as a container – other data structures can be stored within the list, and lists can contain other lists. Lists can contain elements that are different data structures. 
-```{r, echo=FALSE, fig.align = "center"} +```{r 02-Chapter2-8, echo=FALSE, fig.align = "center"} knitr::include_graphics("Chapter_2/Module2_1_Input/Image5.png") ``` @@ -160,22 +160,22 @@ knitr::include_graphics("Chapter_2/Module2_1_Input/Image5.png") R code is written line by line. It may take just one line or many lines of code for one step to be executed, depending on the number of arguments to the function you are using. R code is executed (run) by selecting the line(s) of code to run and pressing return/enter (or a keyboard shortcut), or by clicking "Run" in the upper right corner of the script. A very simple example of running code is as follows: -```{r} +```{r 02-Chapter2-9} 3 + 4 ``` We can see that when we ran our code, the answer was returned. But what if we want to store that answer? We can assign that number to a variable named `x` using the assignment operator `<-`: -```{r} +```{r 02-Chapter2-10} x <- 3 + 4 ``` Then, if we run a line of code with our variable, we will get that value: -```{r} +```{r 02-Chapter2-11} x ``` The assignment operator can also be used to assign values to any of the data structures discussed above, such as vectors and data frames, as shown here: -```{r} +```{r 02-Chapter2-12} # Creating a vector of values called my_values my_values <- c(7, 3, 8, 9) @@ -194,7 +194,7 @@ my_df You may have noticed in the code chunks above that there were `#` followed by phrases describing the code. R allows for scripts to contain non-code elements, called comments, that will not be run or interpreted. Comments are useful to help make code more interpretable for others or to add reminders of what and why parts of code may have been written. To make a comment, simply use a `#` followed by the comment. A `#` only comments out a single line of code. In other words, only that line will be commented and therefore not be run, but lines directly above/below it will still be run: -```{r} +```{r 02-Chapter2-13} # This is an R comment! 
``` @@ -206,13 +206,13 @@ RStudio will autofill function names and object names as you type, which can sav For example, let's say we instead named our example data frame something much longer, and we had two data frames with similar names. If we start typing in `my_` and pause our typing, all of the objects that start with that name will appear as options in a list. To select which one to autofill, navigate down the list and click return/enter. -```{r} +```{r 02-Chapter2-14} my_df_with_really_long_name <- data.frame(values = my_values, color = c("Blue", "Red", "Yellow", "Purple")) my_df_with_really_long_name_2 <- data.frame(values = my_values, color = c("Green", "Teal", "Magenta", "Orange")) ``` -```{r, echo=FALSE, fig.align = "center"} +```{r 02-Chapter2-15, echo=FALSE, fig.align = "center"} knitr::include_graphics("Chapter_2/Module2_1_Input/Image6.png") ``` @@ -228,7 +228,7 @@ getwd() To set or change the location of your working directory, run the following: -```{r, eval=FALSE, echo=TRUE} +```{r 02-Chapter2-16, eval=FALSE, echo=TRUE} setwd("/file path to where your input files are") ``` @@ -236,7 +236,7 @@ Note that macOS file paths use `/` to separate folders, whereas PC file paths us You can easily find the file path to your desired working directory by navigating to "Session", then "Set Working Directory", and "Choose Directory": -```{r, echo=FALSE, out.width = "500px", fig.align = "center"} +```{r 02-Chapter2-17, echo=FALSE, out.width = "500px", fig.align = "center" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image7.png") ``` @@ -244,7 +244,7 @@ In the popup box, navigate to the folder you want to set as your working directo Within your working directory, you can make sub-folders to keep your analyses organized. 
Here is an example folder hierarchy: -```{r, echo=FALSE, out.width = "300px", fig.align = "center"} +```{r 02-Chapter2-18, echo=FALSE, out.width = "300px", fig.align = "center" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image8.png") ``` @@ -260,7 +260,7 @@ How you set up your folder hierarchy is highly dependent on your specific analys Creating projects allows you to store your progress (open script, global environment) for one project in an R Project File. This facilitates quick transitions between multiple projects. Find detailed information about how to set up projects [here](https://support.posit.co/hc/en-us/articles/200526207-Using-RStudio-Projects). -```{r, echo=FALSE, fig.align = "center"} +```{r 02-Chapter2-19, echo=FALSE, fig.align = "center" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image9.png") ``` @@ -272,7 +272,7 @@ Other datatypes such as SAS data files or large .csv files may require different Below, we will demonstrate how to read in .csv and .txt files: -```{r} +```{r 02-Chapter2-20} # Read in the .csv data that's located in our working directory csv.dataset <- read.csv("Chapter_2/Module2_1_Input/Module2_1_InputData1.csv") @@ -288,32 +288,32 @@ After data have been loaded into R, or created within R, you will likely want to Datasets can be viewed in their entirety, or datasets can be subsetted to quickly look at part of the data. Here's some example script to view just the beginnings of a dataframe using the `head()` function: -```{r} +```{r 02-Chapter2-21} head(csv.dataset) ``` Here, you can see that this automatically brings up a view of the first five rows of the dataframe. Another way to view the first five rows of a dataframe is to run the following: -```{r} +```{r 02-Chapter2-22} csv.dataset[1:5,] ``` This brings us to an important concept - indexing! Brackets are used in R to index. Within the bracket, the first argument represents the row numbers, and the second argument represents the column numbers. 
A colon between two numbers means to select all of the columns in between the left and right numbers. The above line of code told R to select rows 1 to 5, and, by leaving the column argument blank, all of the columns. Expanding on this, to view the first 5 rows and 2 columns, we can run the following: -```{r} +```{r 02-Chapter2-23} csv.dataset[1:5, 1:2] ``` For another example: What if we want to only view the first and third row, and first and fourth column? We can use a vector within the index to do this: -```{r} +```{r 02-Chapter2-24} csv.dataset[c(1, 3), c(1, 4)] ``` To view the entire dataset, use the `View()` function: -```{r, eval=FALSE, echo=TRUE} +```{r 02-Chapter2-25, eval=FALSE, echo=TRUE} View(csv.dataset) ``` @@ -323,7 +323,7 @@ Another way to view a dataset is to just click on the name of the data in the en As discussed above, there are a number of different data structures and types that can be used in R. Here, we will demonstrate functions that can be used to identify data structures and types within R objects. The `glimpse()` function, which is part of the *tidyverse* package, is helpful because it allows us to see an overview of our column names and the types of data contained within those columns. -```{r message = FALSE} +```{r 02-Chapter2-26, message = FALSE} # Load tidyverse package library(tidyverse) @@ -332,7 +332,7 @@ glimpse(csv.dataset) Here, we see that our `Sample` column is a character column, while the rest are integers. The `class()` function is also helpful for understanding objects in our global environment: -```{r} +```{r 02-Chapter2-27} # What class (data structure) is our object? class(csv.dataset) @@ -349,12 +349,12 @@ Now that we have these datasets saved as dataframes, we can use these as example There are many ways to export data in R. Data can be written out into a .csv file, tab delimited .txt file, or RData file, for example. 
There are also many functions within packages that write out specific datasets generated by that package. To write out to a .csv file: -```{r, eval=F} +```{r 02-Chapter2-28, eval=F} write.csv(csv.dataset, "Module2_1_SameCSVFileNowOut.csv") ``` To write out a .txt tab delimited file: -```{r, eval=F} +```{r 02-Chapter2-29, eval=F} write.table(txt.dataset, "Module2_1_SameTXTFileNowOut.txt") ``` @@ -383,7 +383,7 @@ load("entire_workspace.RData") ## Code Troubleshooting Learning how to code is an iterative, exploratory process. The secret to coding is to... -```{r, echo=FALSE, fig.align = "center"} +```{r 02-Chapter2-30, echo=FALSE, fig.align = "center" } knitr::include_graphics("Chapter_2/Module2_1_Input/Image10.png") ``` @@ -470,7 +470,7 @@ In the following sections, we will be addressing these questions. Keep in mind t Two of the most common scripting file types applicable to the R language are .R (normal R files) and .Rmd (R Markdown). Normal R files appear as plain text and can be used for running any normal R code. R Markdown files are used for more intensive documentation of code and allow for a combination of code, non-code text explaining the code, and viewing of code output, tables, and figures that are rendered together into an output file (typically .html, although other formats such as .pdf are also offered). For example, TAME is coded using R Markdown, which allows us to include blocks of non-code text, hyperlinks, annotated code, schematics, and output figures all in one place. We highly encourage the use of R Markdown as the default scripting file type for R-based projects because it produces a polished final document that is easy for others to follow, whereas .R files are more appropriate for short, one-off analyses and writing in-depth functions and packages. However, code executed in normal .R files and R Markdown will produce the same results, and ultimately, which file type to use is personal preference. 
See below for screenshots that demonstrate some of the stylistic differences between .R, .Rmd, and .Rmd knitted to HTML format: -```{r out.width = "1000px", echo = FALSE, fig.align = 'center'} +```{r 02-Chapter2-31, out.width = "1000px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_2/Module2_2_Input/Image1.png") ``` @@ -507,7 +507,7 @@ Once your script is created and named, it is generally recommended to include a In R, it is common to include multiple `#`, the comment operator, or a `#` followed by another special character, to start and end a block of coding annotation or the script header. An example of this in an .R file is shown below: -```{r} +```{r 02-Chapter2-32} ######################################################################## ######################################################################## ### Script Longer Title @@ -523,7 +523,7 @@ In R, it is common to include multiple `#`, the comment operator, or a `#` follo This block of comment operators is common in .R but not .Rmd files because .Rmd files have their own specific type of header, known as the [YAML](https://zsmith27.github.io/rmarkdown_crash-course/lesson-4-yaml-headers.html), which contains the title, author, date, and formatting outputs for the .Rmd file: -```{r out.width = "300px", echo = FALSE, fig.align = 'center'} +```{r 02-Chapter2-33, out.width = "300px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_2/Module2_2_Input/Image2.png") ``` @@ -539,7 +539,7 @@ Annotations are notes embedded within your code as comments that will not be run In general, annotations will be short sentences that describe what your code does or why you are executing that specific code. This can be helpful when you are defining a covariate a specific way, performing a specific analytical technique, or just generally explaining why you are doing what you're doing. 
-```{r, eval=F} +```{r 02-Chapter2-34, eval=F} # Performing logistic regression to assess association between xyz and abc # Regression confounders: V1, V2, V3 ... @@ -552,7 +552,7 @@ xyz.regression.output = glm(xyz ~ abc + V1 + V2 + V3, family=binomial(), data=ex Another common approach to annotations is to use mid-script type headings to separate out the script into various sections. For example, you might want to create distinct sections for "Loading Packages, Data, and Setup", "Covariate Definition", "Correlation Analysis", "Regression Analysis", etc. This can help you, and others, reading your script, to navigate the script more easily. It also can be more visually pleasing to see the script split up into multiple sections as opposed to one giant chunk of code interspersed with comments. Similar to above, the following example is specific to .R files. For .Rmd files, sub headers can be created by increasing the number of `#` before the header. -```{r, eval=F} +```{r 02-Chapter2-35, eval=F} ########################################################################### ########################################################################### @@ -582,7 +582,7 @@ xyz.regression.output = glm(xyz ~ abc + V1 + V2 + V3, family=binomial(), data=ex #### Quick, short comments and annotations While it is important to provide descriptive annotations, not every one needs to be a sentence or longer. As stated previously, it is not necessary to comment every single line. Here is an example of very brief commenting: -```{r, eval=F} +```{r 02-Chapter2-36, eval=F } # Loading necessary packages @@ -597,7 +597,7 @@ In the example above, we can see that these short comments clearly convey what t Coding style is often a contentious topic! There are MANY styles of coding, and no two coders have the same exact style, even if they are following the same reference. 
Here, we will provide some guides to coding style and go over some of the basic, general tips for making your code readable and efficient. Here is an example showing how you can use spacing to align variable assignment: -```{r, eval=F} +```{r 02-Chapter2-37, eval=F} # Example of using spacing for alignment of variable assignment @@ -613,7 +613,7 @@ For spacing around certain symbols and operators: + Include a space on either side of symbols such as `<` + The first (opening) curly brace should not be on its own line, but the second (closing) should -```{r, eval = F} +```{r 02-Chapter2-38, eval = F} # Example of poor style if(Longer_variable_name_x 25 & MEdu == 3) ``` Additionally, we can subset and select specific columns we would like to keep, using the `select` argument within the `subset()` function: -```{r} +```{r 02-Chapter2-51} # Filtering for subjects whose BMI is less than 22 or greater than 27 # Also selecting the BMI, maternal age, and maternal education columns subset.data5 <- subset(full.data, BMI < 22 | BMI > 27, select = subset.columns) @@ -881,17 +881,17 @@ Melting and casting refers to the conversion of data to "long" or "wide" form as Here, we'll illustrate some example script to melt and cast data using the [*reshape2*](https://www.rdocumentation.org/packages/reshape2/versions/1.4.4) package. Let's first install and load the `reshape2` package: -```{r message = FALSE} +```{r 02-Chapter2-52, message = FALSE} if (!requireNamespace("reshape2")) install.packages("reshape2"); ``` -```{r} +```{r 02-Chapter2-53} library(reshape2) ``` Using the fully merged dataframe, let's remind ourselves what these data look like in the current dataframe format: -```{r} +```{r 02-Chapter2-54} head(full.data) ``` @@ -899,7 +899,7 @@ head(full.data) These data are represented by single subject identifiers listed as unique IDs per row, with associated environmental measures and demographic data organized across the columns. 
Thus, this dataframe is currently in **wide (also known as casted)** format. Let's convert this dataframe to **long (also known as melted)** format. Here, we will specify that we want a row for each unique sample ID + variable measure pair by using `id = "ID"`: -```{r} +```{r 02-Chapter2-55} full.melted <- melt(full.data, id = "ID") # Viewing this new dataframe @@ -909,17 +909,17 @@ head(full.melted) You can see here that each measure that was originally contained as a unique column has been reoriented, such that the original column header is now listed throughout the second column labeled `variable`. Then, the third column contains the value of this variable. Let's see an example view of the middle of this new dataframe: -```{r} +```{r 02-Chapter2-56} full.melted[1100:1110,1:3] ``` Here, we can see a different variable (DWAs) now being listed. This continues throughout the entire dataframe, which has the following dimensions: -```{r} +```{r 02-Chapter2-57} dim(full.melted) ``` Let's now re-cast this dataframe back into wide format using the `dcast()` function. Here, we are telling the `dcast()` function to give us a sample (ID) for every variable in the column labeled `variable`. The column names from the variable column and corresponding values from the value column are then used to fill in the dataset: -```{r} +```{r 02-Chapter2-58} full.cast <- dcast(full.melted, ID ~ variable) head(full.cast) ``` @@ -927,14 +927,14 @@ head(full.cast) Here, we can see that this dataframe is back in its original casted (or wide) format. Now that we're familiar with some base R functions to reshape our data, let's answer our original question: What is the average urinary chromium concentration for each maternal education level? 
Although it is not necessary to calculate the average, we could first subset our data frame to only include the two columns we are interested in (MEdu and UCr): -```{r} +```{r 02-Chapter2-59} subset.data6 <- full.data[,c("MEdu", "UCr")] head(subset.data6) ``` Next, we will make a new data frame for each maternal education level: -```{r} +```{r 02-Chapter2-60} # Creating new data frames based on maternal education category data.matedu.1 <- subset(subset.data6, MEdu == 1) data.matedu.2 <- subset(subset.data6, MEdu == 2) @@ -945,7 +945,7 @@ head(data.matedu.1) ``` Last, we can calculate the average urinary chromium concentration using each of our data frames: -```{r} +```{r 02-Chapter2-61} mean(data.matedu.1$UCr) mean(data.matedu.2$UCr) mean(data.matedu.3$UCr) @@ -983,13 +983,13 @@ Here, we will carry out all the of the same data organization exercises demonstr ### Downloading and Loading the Tidyverse Package If you don't have *tidyverse* already installed, you will need to install it using: -```{r message = FALSE} +```{r 02-Chapter2-62, message = FALSE} if(!require(tidyverse)) install.packages("tidyverse") ``` And then load the *tidyverse* package using: -```{r} +```{r 02-Chapter2-63} library(tidyverse) ``` @@ -998,14 +998,14 @@ Note that by loading the *tidyverse* package, you are also loading all of the pa ### Merging Data Using Tidyverse Syntax To merge the same example dataframes using *tidyverse*, you can run the following script: -```{r} +```{r 02-Chapter2-64} full.data.tidy <- inner_join(demographic_data, chemical_data, by = "ID") head(full.data.tidy) ``` Note that you can still merge dataframes that have different ID column names with the argument `by = c("ID.x", "ID.y")`. 
*tidyverse* also has other `join` functions, shown in the graphic below ([source](https://tavareshugo.github.io/r-intro-tidyverse-gapminder/08-joins/index.html)): -```{r echo = FALSE, out.width = "400px", fig.align = "center"} +```{r 02-Chapter2-65, echo = FALSE, out.width = "400px", fig.align = "center"} knitr::include_graphics("Chapter_2/Module2_3_Input/Image1.svg") ``` @@ -1021,7 +1021,7 @@ knitr::include_graphics("Chapter_2/Module2_3_Input/Image1.svg") One of the most important elements of Tidyverse syntax is use of the pipe operator (`%>%`). The pipe operator can be used to chain multiple functions together. It takes the object (typically a dataframe) to the left of the pipe operator and passes it to the function to the right of the pipe operator. Multiple pipes can be used in chain to execute multiple data cleaning steps without the need for intermediate dataframes. The pipe operator can be used to pass data to functions within all of the Tidyverse universe packages, not just the functions demonstrated here. Below, we can see the same code executed above, but this time with the pipe operator. The `demographic_data` dataframe is passed to `inner_join()` as the first argument to that function, with the following arguments remaining the same. -```{r} +```{r 02-Chapter2-66} full.data.tidy2 <- demographic_data %>% inner_join(chemical_data, by = "ID") @@ -1029,7 +1029,7 @@ head(full.data.tidy2) ``` Because the pipe operator is often used in a chain, it is best practice to start a new line after each pipe operator, with the new lines of code indented. This makes code with multiple piped steps easier to follow. However, if just one function is being executed, the pipe operator can be used on the same line as the input and function or omitted altogether (as shown in the previous two code chunks). 
Here is an example of placing the function to the right of the pipe operator on a new line, with placeholder functions shown as additional steps: -```{r eval = FALSE} +```{r 02-Chapter2-67, eval = FALSE} full.data.tidy3 <- demographic_data %>% inner_join(chemical_data, by = "ID") %>% additional_function_1() %>% @@ -1041,7 +1041,7 @@ full.data.tidy3 <- demographic_data %>% #### Column-wise functions The `select()` function is used to subset columns in Tidyverse. Here, we can use our previously defined vector `subset.columns` in the `select()` function to keep only the columns in our `subset.columns` vector. The `all_of()` function tells the `select()` to keep all of the columns that match elements of the `subset.columns` vector. -```{r} +```{r 02-Chapter2-68} subset.tidy1 <- full.data.tidy %>% select(all_of(subset.columns)) @@ -1049,7 +1049,7 @@ head(subset.tidy1) ``` There are many different ways that `select()` can be used. See below for some examples using dummy variable names: -```{r eval = FALSE} +```{r 02-Chapter2-69, eval = FALSE} # Select specific ranges in the dataframe data <- data %>% select(start_column_1:end_column_1) @@ -1063,7 +1063,7 @@ data <- data %>% ``` To select columns that have names that contain specific strings, you can use functions such as `starts_with()`, `ends_with()`, and `contains()`. These functions allow you to ignore the case of the strings with `ignore.case = TRUE`. These arguments can be combined with specific column names and other selection ranges. -```{r eval = FALSE} +```{r 02-Chapter2-70, eval = FALSE} data <- data %>% select(starts_with("starting_string")) @@ -1072,7 +1072,7 @@ data <- data %>% ``` To remove columns using tidyverse, you can use similar code, but include a `-` sign before the argument defining the columns. 
-```{r} +```{r 02-Chapter2-71} # Removing columns subset.tidy2 <- full.data.tidy %>% select(-all_of(subset.columns)) @@ -1085,7 +1085,7 @@ head(subset.tidy2) The `slice()` function can be used to keep or remove a certain number of rows based on their position within the dataframe. For example, we can retain only the first 100 rows using the following code: -```{r} +```{r 02-Chapter2-72} subset.tidy3 <- full.data.tidy %>% slice(1:100) @@ -1093,7 +1093,7 @@ dim(subset.tidy3) ``` Or, we can remove the first 100 rows: -```{r} +```{r 02-Chapter2-73} subset.tidy4 <- full.data.tidy %>% slice(-c(1:100)) @@ -1103,7 +1103,7 @@ dim(subset.tidy4) The related functions `slice_min()` and `slice_max()` can be used to select rows with the smallest or largest values of a variable. The `filter()` function can be used to keep or remove specific rows based on conditional statements. For example, we can keep only rows where BMI is greater than 25 and age is greater than 31: -```{r} +```{r 02-Chapter2-74} subset.tidy5 <- full.data.tidy %>% filter(BMI > 25 & MAge > 31) @@ -1113,7 +1113,7 @@ dim(subset.tidy5) #### Combining column and row-wise functions Now, we can see how Tidyverse makes it easy to chain together multiple data manipulation steps. Here, we first filter rows based on values for BMI and age, then we select our columns of interest: -```{r} +```{r 02-Chapter2-75} subset.tidy6 <- full.data.tidy %>% filter(BMI > 25 & MAge > 31) %>% select(BMI, MAge, MEdu) @@ -1126,7 +1126,7 @@ head(subset.tidy6) To melt and cast data in Tidyverse, you can use the pivot functions (i.e., `pivot_longer()` or `pivot_wider()`). The first argument in the `pivot_longer()` function specifies which columns should be pivoted. This can be specified with either positive or negative selection - i.e., naming columns to pivot with a vector or range or naming columns not to pivot with a `-` sign. 
Here, we are telling the function to pivot all of the columns except the ID column, which we need to keep to be able to trace back which values came from which subject. The `names_to =` argument allows you to set what you want to name the column that stores the variable names (the column names in wide format). The `values_to =` argument allows you to set what you want to name the column that stores the values. We almost always call these columns "var" and "value", respectively, but you can name them anything that makes sense for your dataset. -```{r} +```{r 02-Chapter2-76} full.pivotlong <- full.data.tidy %>% pivot_longer(-ID, names_to = "var", values_to = "value") @@ -1134,7 +1134,7 @@ head(full.pivotlong, 15) ``` To pivot our data back to wide format, we can use `pivot_wider()`, which will pull the column names from the column specified in the `names_from =` argument and the corresponding values from the column specified in the `values_from = ` argument. -```{r} +```{r 02-Chapter2-77} full.pivotwide <- full.pivotlong %>% pivot_wider(names_from = "var", values_from = "value") @@ -1144,7 +1144,7 @@ head(full.pivotwide) Now that we're familiar with some *tidyverse* functions to reshape our data, let's answer our original question: What is the average urinary Chromium concentration for each maternal education level? We can use the `group_by()` function to group our dataset by education class, then the summarize function to calculate the mean of our variable of interest within each class. Note how much shorter and more efficient this code is than the code we used to calculate the same values using base R! -```{r} +```{r 02-Chapter2-78} full.data %>% group_by(MEdu) %>% summarize(Avg_UCr = mean(UCr)) @@ -1202,7 +1202,7 @@ We will demonstrate how this analysis can be approached using for loops, functio If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you. 
We will be using the *tidyverse* package for data manipulation steps and the [*rstatix*](https://github.com/kassambara/rstatix) package for statistical tests, as it provides pipe friendly adaptations of the base R statistical tests and returns results in a dataframe rather than a list format, making results easier to access. This brings up an important aspect of coding efficiency - sometimes, there is already a package that has been designed with functions to help you execute your desired analysis in an efficient way, so you don't need to write custom functions yourself! So, don't forget to explore packages relevant to your analysis before spending a lot of time developing custom solutions (although, sometimes this is necessary). -```{r message = FALSE} +```{r 02-Chapter2-79, message = FALSE} if (!requireNamespace("tidyverse")) install.packages("tidyverse") if (!requireNamespace("rstatix")) @@ -1211,14 +1211,14 @@ if (!requireNamespace("rstatix")) #### Loading required packages -```{r message = FALSE} +```{r 02-Chapter2-80, message = FALSE} library(tidyverse) library(rstatix) ``` #### Setting your working directory -```{r eval = FALSE} +```{r 02-Chapter2-81, eval = FALSE} setwd("/file path to where your input files are") ``` @@ -1226,7 +1226,7 @@ setwd("/file path to where your input files are") The first example dataset contains subject demographic data, and the second dataset contains corresponding chemical data. Familiarize yourself with these data used previously in **TAME 2.0 Module 2.3 Data Manipulation and Reshaping**. -```{r} +```{r 02-Chapter2-82} # Load the demographic data demographic_data <- read.csv("Chapter_2/Module2_4_Input/Module2_4_InputData1.csv") @@ -1243,7 +1243,7 @@ head(chemical_data) #### Preparing the example dataset For ease of analysis, we will merge these two datasets before proceeding. 
-```{r} +```{r 02-Chapter2-83} # Merging data full_data <- inner_join(demographic_data, chemical_data, by = "ID") @@ -1252,7 +1252,7 @@ head(full_data) ``` Continuous demographic variables, like BMI, are often dichotomized (or converted to a categorical variable with two categories representing higher vs. lower values) to increase statistical power in analyses. This is particularly important for clinical data that tend to have smaller sample sizes. In our initial dataframe, BMI is a continuous or numeric variable; however, our questions require us to dichotomize BMI. We can use the following code, which relies on if/else logic (see **TAME 2.0 Module 2.3 Data Manipulation and Reshaping** for more information) to generate a new column representing our dichotomized BMI variable for our first environmental health question. -```{r} +```{r 02-Chapter2-84} # Adding dichotomized BMI column full_data <- full_data %>% mutate(Dichotomized_BMI = ifelse(BMI < 25, "Normal", "Overweight")) @@ -1270,7 +1270,7 @@ We can see that we now have created a new column entitled `Dichotomized_BMI` tha We will start with loops. There are three main types of loops in R: `for`, `while`, and `repeat`. We will focus on `for` loops in this module, but for more in-depth information on loops, including the additional types of loops, see [here](https://intro2r.com/loops.html). Before applying loops to our data, let's discuss how `for` loops work. The basic structure of a `for` loop is shown here: -```{r} +```{r 02-Chapter2-85} # Basic structure of a for loop for (i in 1:4){ print(i) @@ -1278,7 +1278,7 @@ for (i in 1:4){ ``` `for` loops always start with `for` followed by a statement in parentheses. The argument in the parentheses tells R how to iterate (or repeat) through the code in the curly brackets. Here, we are telling R to iterate through the code in curly brackets 4 times. Each time we told R to print the value of our iterator, or `i`, which has a value of 1, 2, 3, and then 4. 
Loops can also iterate through columns in a dataset. For example, we can use a `for` loop to print the ages of each subject: -```{r} +```{r 02-Chapter2-86} # Creating a smaller dataframe for our loop example full_data_subset <- full_data[1:6, ] @@ -1298,7 +1298,7 @@ Now that we know how a `for` loop works, how can we apply this approach to deter Because our data are normally distributed and there are two groups that we are comparing, we will use a t-test applied to each metal measured in drinking water. Testing for assumptions is outside the scope of this module, but see **TAME 2.0 Module 3.3 Normality Tests and Data Transformation** for more information on this topic. Running a t-test in R is very simple, which we can demonstrate by running a t-test on the drinking water arsenic data: -```{r} +```{r 02-Chapter2-87} # Running t-test and storing results in t_test_res t_test_res <- full_data %>% t_test(DWAs ~ Dichotomized_BMI) @@ -1317,7 +1317,7 @@ Let's break down the steps of our `for` loop before executing the code. 3. Third, we will actually run our for loop. This will tell R: for each variable in our `vars_of_interest` vector, run a t-test with that variable (and store the results in a temporary dataframe called "res"), then add those results to our final results dataframe. A row will be added to the results dataframe each time R iterates through a new variable, resulting in a dataframe that stores the results of all of our t-tests. -```{r} +```{r 02-Chapter2-88} # Defining variables (columns) we want to run a t-test on vars_of_interest <- c("DWAs", "DWCd", "DWCr") @@ -1359,7 +1359,7 @@ Note the use of the code `as.formula(paste0(i, "~ Dichotomized_BMI"))`. Let's ta Many statistical test functions and regression functions require one argument to be a formula, which is typically formatted as `y ~ x`, where y is the dependent variable of interest and x is an independent variable. 
For some functions, additional variables can be included on the right side of the formula to represent covariates (additional variables of interest). The function `as.formula()` returns the argument in parentheses in formula format so that it can be correctly passed to other functions. We can demonstrate that here by assigning a dummy variable `j` the character string `var1`: -```{r} +```{r 02-Chapter2-89} # Assigning variable j <- "var1" @@ -1368,12 +1368,12 @@ as.formula(paste(j, " ~ Dichotomized_BMI", sep = "")) ``` We can use the `paste()` function to combine strings of characters. The paste function takes each argument (as many arguments as is needed) and pastes them together into one character string, with the separator between arguments set by the `sep = ` argument. When our y variable is changing with each iteration of our for loop, we can use the `paste()` function to write our formula correctly by telling the function to paste the variable `i`, followed by the rest of our formula, which stays the same for each iteration of the loop. Let's examine the output of just the `paste()` part of our code: -```{r} +```{r 02-Chapter2-90} paste(j, " ~ Dichotomized_BMI", sep = "") ``` The `paste()` function is very flexible and can be useful in many other settings when you need to create one character string from arguments from different sources! Notice that the output looks different from the output of `as.formula()`. There is a returned index (`[1]`), and there are quotes around the character string. 
The last function we will highlight here is the `noquote()` function, which can be helpful if you'd like a string without quotes: -```{r} +```{r 02-Chapter2-91} noquote(paste(j, " ~ Dichotomized_BMI", sep = "")) ``` @@ -1387,7 +1387,7 @@ Next, we will learn about functions and apply them to our dataset to answer our Functions are useful when you want to execute a block of code organized together to perform one specific task, and you want to be able to change parameters for that task easily rather than having to copy and paste code over and over that largely stays the same but might have small modifications in certain arguments. The basic structure of a function is as follows: -```{r eval = FALSE} +```{r 02-Chapter2-92, eval = FALSE} function_name <- function(parameter_1, parameter_2...){ # Function body (where the code goes) @@ -1402,7 +1402,7 @@ A function requires you to name it as we did with `function_name`. In parenthese When writing your own functions, it is important to describe the purpose of the function, its input, its parameters, and its output so that others can understand what your functions does and how to use it. This can be defined either in text above a code chunk if you are using R Markdown or as comments within the code itself. We'll start with a simple function. Let's say we want to convert temperatures from Fahrenheit to Celsius. We can write a function that takes the temperature in Fahrenheit and converts it to Celsius. Note that we have given our parameters descriptive names (`fahrenheit_temperature`, `celsius_temperature`), which makes our code more readable than if we assigned them dummy names such as x and y. -```{r} +```{r 02-Chapter2-93} # Function to convert temperatures in Fahrenheit to Celsius ## Parameters: temperature in Fahrenheit (input) ## Output: temperature in Celsius @@ -1417,7 +1417,7 @@ fahrenheit_to_celsius <- function(fahrenheit_temperature){ Notice that the above code block was run, but there isn't an output. 
Rather, running the code assigns the function code to that function. When you run code defining a function, that function will appear in your Global Environment under the "Functions" section. We can see the output of the function by providing an input value. Let's start by converting 41 degrees Fahrenheit to Celsius: -```{r} +```{r 02-Chapter2-94} # Calling the function # Here, 41 is the `fahrenheit_temperature` in the function fahrenheit_to_celsius(41) @@ -1425,7 +1425,7 @@ fahrenheit_to_celsius(41) 41 degrees Fahrenheit is equivalent to 5 degrees Celsius. We can also have the function convert a vector of values. -```{r} +```{r 02-Chapter2-95} # Defining vector of temperatures vector_of_temperatures <- c(81,74,23,65) @@ -1435,7 +1435,7 @@ fahrenheit_to_celsius(vector_of_temperatures) Before getting back to answer our environmental health related questions, let's look at one more example of a function. This time we'll create a function that can calculate the circumference of a circle based on its radius in inches. Here you can also see a different style of commenting to describe the function's purpose, inputs, and outputs. -```{r} +```{r 02-Chapter2-96} circle_circumference <- function(radius){ # Calculating a circle's circumference based on the radius inches @@ -1457,7 +1457,7 @@ circle_circumference(3) So, if a circle had a radius of 3 inches, its circumference would be ~19 inches. What if we were interested in seeing the diameter to double check our code? 
-```{r error = TRUE, suppress_error_alert = TRUE} +```{r 02-Chapter2-97, error = TRUE, suppress_error_alert = TRUE} diameter ``` @@ -1473,7 +1473,7 @@ We can adapt our previous `for` loop code into a function that will take differe + Changing the BMI cutoff from a number (in our previous code) to our parameter name that specifies the cutoff + Changing the group names for assigning category (in our previous code) to our parameter names -```{r} +```{r 02-Chapter2-98} # Function to dichotomize BMI into different categories and return results of t-test on drinking water metals between dichotomized groups ## Parameters: @@ -1514,7 +1514,7 @@ bmi_DW_ttest <- function(input_data, bmi_cutoff, lower_group_name, upper_group_n ``` For the first example of using the function, we have included the name of each argument for clarity, but this isn't necessary *if* you pass in the arguments *in the order they were defined when writing the function*. -```{r} +```{r 02-Chapter2-99} # Defining variables (columns) we want to run a t-test on vars_of_interest <- c("DWAs", "DWCd", "DWCr") @@ -1524,7 +1524,7 @@ bmi_DW_ttest(input_data = full_data, bmi_cutoff = 25, lower_group_name = "Normal ``` Here, we can see the same results as above in the **Loops** section. We can next apply the function to answer our additional environmental health questions: -```{r} +```{r 02-Chapter2-100} # Apply function for underweight vs. non-underweight (bmi_cutoff = 18.5) bmi_DW_ttest(full_data, 18.5, "Underweight", "Non-Underweight", vars_of_interest) @@ -1551,7 +1551,7 @@ In the last section of this module, we will demonstrate how to use list operatio ## List operations Lists are a data type in R that can store other data types (including lists, to make nested lists). This allows you to store multiple dataframes in one object and apply the same functions to each dataframe in the list. Lists can also be helpful for storing the results of a function if you would like to be able to access multiple outputs. 
For example, if we return to our example of a function that calculates the circumference of a circle, we can store both the diameter and circumference as list objects. The function will then return a list containing both of these values when called. -```{r} +```{r 02-Chapter2-101} # Adding list element to our function circle_circumference_4 <- function(radius){ # Calculating a circle's circumference and diameter based on the radius in inches @@ -1577,7 +1577,7 @@ circle_circumference_4(10) ``` We can also call the results individually using the following code: -```{r} +```{r 02-Chapter2-102} # Storing results of function circle_10 <- circle_circumference_4(10) @@ -1599,7 +1599,7 @@ circle_10[2] ``` In the context of our dataset, we can use list operations to clean up and combine our results from all three BMI stratification approaches. This is often necessary to prepare data to share with collaborators or for supplementary tables in a manuscript. Let's revisit our code for producing our statistical results, this time assigning our results to a dataframe rather than viewing them. 
-```{r} +```{r 02-Chapter2-103} # Defining variables (columns) we want to run a t-test on vars_of_interest <- c("DWAs", "DWCd", "DWCr") @@ -1624,7 +1624,7 @@ For publication purposes, let's say we want to make the following formatting cha + Collapse all of our data into one final dataframe We can first write a function to execute these cleaning steps: -```{r} +```{r 02-Chapter2-104} # Function to clean results dataframes ## Parameters: @@ -1650,7 +1650,7 @@ data_cleaning <- function(input_data) { ``` Then, we can make a list of our dataframes to clean and apply: -```{r} +```{r 02-Chapter2-105} # Making list of dataframes t_test_res_list <- list(norm_vs_overweight, under_vs_nonunderweight, nonobese_vs_obese) @@ -1659,7 +1659,7 @@ head(t_test_res_list) ``` And we can apply the cleaning function to each of the dataframes using the `lapply()` function, which takes a list as the first argument and the function to apply to each list element as the second argument: -```{r} +```{r 02-Chapter2-106} # Applying cleaning function t_test_res_list_cleaned <- lapply(t_test_res_list, data_cleaning) @@ -1668,7 +1668,7 @@ head(t_test_res_list_cleaned) ``` Last, we can collapse our list down into one dataframe using the `do.call()` and `rbind.data.frame()` functions, which together, take the elements of the list and collapse them into a dataframe by binding the rows together: -```{r} +```{r 02-Chapter2-107} t_test_res_cleaned <- do.call(rbind.data.frame, t_test_res_list_cleaned) # Viewing final dataframe @@ -1681,7 +1681,7 @@ The above example is just that - an example to demonstrate the mechanics of usin 2. Bind all three dataframes together, then execute the cleaning steps. 
We will demonstrate #2 below: -```{r} +```{r 02-Chapter2-108} # Start by binding the rows of each of the results dataframes t_test_res_cleaned_2 <- bind_rows(norm_vs_overweight, under_vs_nonunderweight, nonobese_vs_obese) %>% diff --git a/Chapter_3/.DS_Store b/Chapter_3/.DS_Store deleted file mode 100644 index d85a296..0000000 Binary files a/Chapter_3/.DS_Store and /dev/null differ diff --git a/Chapter_3/03-Chapter3.Rmd b/Chapter_3/03-Chapter3.Rmd index b0e55d7..d9c636d 100644 --- a/Chapter_3/03-Chapter3.Rmd +++ b/Chapter_3/03-Chapter3.Rmd @@ -56,7 +56,7 @@ For additional resources on *ggplot2* see [ggplot2 Posit Documentation](https:// ### Script Preparations #### Cleaning the global environment -```{r} +```{r 03-Chapter3-1} rm(list=ls()) ``` @@ -72,7 +72,7 @@ if (!requireNamespace("pheatmap")) ``` #### Loading R packages required for this session -```{r echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} +```{r 03-Chapter3-2, echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} library(tidyverse) library(GGally) library(corrplot) @@ -81,13 +81,13 @@ library(pheatmap) ``` #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 03-Chapter3-3, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` #### Importing example dataset Then let's read in our example dataset. As mentioned in the introduction, this example dataset represents chemical measurements across 12 different biomass burn scenarios representing potential wildfire events. Let's upload and view these data: -```{r} +```{r 03-Chapter3-4} # Load the data smoke_data <- read.csv("Chapter_3/Module3_1_Input/Module3_1_InputData.csv") @@ -109,7 +109,7 @@ We can create a **density plot** to answer the first question. Similar to a hist In this example of a density plot, we'll visualize the distributions of chemical concentration data on the x axis. A density plot automatically displays where values are concentrated on the y axis. 
Additionally, we'll want to have multiple density plots within the same figure for each biomass burn condition. Before the data can be visualized, it needs to be converted from a wide to long format. This is because we need to have variable or column names entitled `Chemical_Concentration` and `Biomass_Burn_Condition` that can be placed into `ggplot()`. For review on converting between long and wide formats and using other tidyverse tools, see **TAME 2.0 Module 2.3 Data Manipulation & Reshaping**. -```{r} +```{r 03-Chapter3-5} longer_smoke_data = pivot_longer(smoke_data, cols = 4:13, names_to = "Biomass_Burn_Condition", values_to = "Chemical_Concentration") @@ -123,7 +123,7 @@ A data preparation method that is commonly used to convert values into those tha For this example, we will normalize the chemical concentration dataset using a basic scaling and centering procedure using the base R function, `scale()`. This algorithm results in the normalization of a dataset using the mean value and standard deviation. This scaling step will convert chemical concentration values in our dataset into normalized values across samples, such that each chemical's concentration distributions are more easily comparable between the different biomass burn conditions. For more information on the `scale()` function, see its associated [RDocumentation](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/scale) and helpful tutorial on [Implementing the scale() function in R](https://www.journaldev.com/47818/r-scale-function). -```{r} +```{r 03-Chapter3-6} scaled_longer_smoke_data = longer_smoke_data %>% # scaling within each chemical group_by(Chemical) %>% @@ -140,7 +140,7 @@ Now that we have our dataset formatted, let's plot it. 
## Density Plot Visualization The following code can be used to generate a density plot: -```{r fig.align = "center"} +```{r 03-Chapter3-7, fig.align = "center"} ggplot(scaled_longer_smoke_data, aes(x = Scaled_Chemical_Concentration, color = Biomass_Burn_Condition)) + geom_density() ``` @@ -160,7 +160,7 @@ ggplot(scaled_longer_smoke_data, aes(x = Scaled_Chemical_Concentration, color = A **boxplot** can also be used to answer our first environmental health question: **How do the distributions of the chemical concentration data differ based on each biomass burn scenario?**. A boxplot also displays a data's distribution, but it incorporates a visualization of a five number summary (i.e., minimum, first quartile, median, third quartile, and maximum). Any outliers are displayed as dots. For this example, let's have `Scaled_Chemical_Concentration` on the x axis and `Biomass_Burn_Condition` on the y axis. The `scaled_longer_smoke_data` dataframe is the format we need, so we'll use that for plotting. -```{r fig.align = "center"} +```{r 03-Chapter3-8, fig.align = "center"} ggplot(scaled_longer_smoke_data, aes(x = Scaled_Chemical_Concentration, y = Biomass_Burn_Condition, color = Biomass_Burn_Condition)) + geom_boxplot() @@ -183,7 +183,7 @@ Let's turn our attention to the second environmental health question: **Are ther *GGally* is a package that serves as an extension of *ggplot2*, the baseline R plotting system based on the grammar of graphics. GGally is very useful for creating plots that compare groups or features within a dataset, among many other utilities. Here we will demonstrate the `ggpairs()` function within *GGally* using the scaled chemistry dataset. This function will produce an image that shows correlation values between biomass burn sample pairs and also illustrates the overall distributions of values in the samples. 
For more information on *GGally*, see its associated [RDocumentation](https://www.rdocumentation.org/packages/GGally/versions/1.5.0) and [example helpful tutorial](http://www.sthda.com/english/wiki/ggally-r-package-extension-to-ggplot2-for-correlation-matrix-and-survival-plots-r-software-and-data-visualization). *GGally* requires a wide dataframe with ids (i.e.,`Chemical`) as the rows and the variables that will be compared to each other (i.e.,`Biomass_Burn_Condition`) as the columns. Let's create that dataframe. -```{r} +```{r 03-Chapter3-9} # first selecting the chemical, biomass burn condition, and # the scaled chemical concentration columns wide_scaled_data = scaled_longer_smoke_data %>% @@ -196,7 +196,7 @@ head(wide_scaled_data) ``` By default, `ggpairs()` displays Pearson's correlations. To show Spearman's correlations takes more nuance, but can be done using the code that has been commented out below. -```{r fig.align = "center", fig.width = 15, fig.height = 15} +```{r 03-Chapter3-10, fig.align = "center", fig.width = 15, fig.height = 15} # ggpairs with Pearson's correlations wide_scaled_data = data.frame(as.matrix(wide_scaled_data)) @@ -225,7 +225,7 @@ The upper right portion displays the correlation values, where a value less than ::: We can visualize correlations another way using the other function from *GGally*, `ggcorr()`, which visualizes each correlation as a square. Note that this function calculates Pearson's correlations by default. However, this can be changed using the `method` parameter shown in the code commented out below. -```{r fig.align = "center", fig.width = 10, fig.height = 7} +```{r 03-Chapter3-11, fig.align = "center", fig.width = 10, fig.height = 7} # Pearson's correlations ggcorr(wide_scaled_data) @@ -234,7 +234,7 @@ ggcorr(wide_scaled_data) ``` We'll visualize correlations between each of the groups using one more figure using the `corrplot()` function from the *corrplot* package. 
-```{r fig.align = "center"} +```{r 03-Chapter3-12, fig.align = "center"} # Need to supply corrplot with a correlation matrix, here, using the 'cor' function corrplot(cor(wide_scaled_data)) ``` @@ -252,7 +252,7 @@ Last, we'll turn our attention to answering the final environmental health quest For this example, we can plot `Biomass_Burn_Condition` and `Chemical.Category` on the axes and fill in the values with `Scaled_Chemical_Concentration`. When generating heatmaps, scaled values are often used to better distinguish patterns between groups/samples. In this example, we also plan to display the median scaled concentration value within the heatmap as an additional layer of helpful information to aid in interpretation. To do so, we'll need to take the median chemical concentration for each biomass burn condition within each chemical category. However, since we want `ggplot()` to visualize the median scaled values with the color of the tiles this step was already necessary. -```{r} +```{r 03-Chapter3-13} # We'll find the median value and add that data to the dataframe as an additional column heatmap_df = scaled_longer_smoke_data %>% group_by(Biomass_Burn_Condition, Chemical.Category) %>% @@ -262,7 +262,7 @@ head(heatmap_df) ``` Now we can plot the data and add the `Median_Scaled_Concentration` to the figure using `geom_text()`. Note that specifying the original `Scaled_Chemical_Concentration` in the **fill** parameter will NOT give you the same heatmap as specifying the median values in `ggplot()`. 
-```{r fig.align = "center", fig.width = 12, fig.height= 5} +```{r 03-Chapter3-14, fig.align = "center", fig.width = 12, fig.height= 5} ggplot(data = heatmap_df, aes(x = Chemical.Category, y = Biomass_Burn_Condition, fill = Median_Scaled_Concentration)) + geom_tile() + # function used to specify a heatmap for ggplot @@ -279,7 +279,7 @@ ggplot(data = heatmap_df, aes(x = Chemical.Category, y = Biomass_Burn_Condition, ::: This same heatmap can be achieved another way using the `pheatmap()` function from the *pheatmap* package. Using this function requires us to use a wide dataset, which we need to create. It will contain `Chemical.Category`, `Biomass_Burn_Condition` and `Scaled_Chemical_Concentration`. -```{r, message=FALSE} +```{r 03-Chapter3-15, message=FALSE} heatmap_df2 = scaled_longer_smoke_data %>% group_by(Biomass_Burn_Condition, Chemical.Category) %>% # using the summarize function instead of mutate function as was done previously since we only need the median values now @@ -294,7 +294,7 @@ head(heatmap_df2) ``` Now let's generate the same heatmap this time using the `pheatmap()` function: -```{r fig.align = "center"} +```{r 03-Chapter3-16, fig.align = "center"} pheatmap(heatmap_df2, # removing the clustering option from both rows and columns cluster_rows = FALSE, cluster_cols = FALSE, @@ -324,7 +324,7 @@ For additional figures available and to view aspects of figures that can be chan **Hint 2**: Use the function `facet_wrap()` within `ggplot()` to separate the heatmaps by `Burn_Condition`. ::: -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 03-Chapter3-17, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_3/Module3_1_Input/Module3_1_Image1.png") ``` @@ -369,20 +369,20 @@ Keys to successful data visualizations: Color can be used to visualize a variable. There are three ways to categorize color schemes - sequential, diverging, and qualitative. 
Below, definitions are provided for each along with example figures that we've previously published that illustrate each color scheme. In addition, figure titles and captions are also provided for context. Note that some of these figures have been simplified from what was originally published to show more streamlined examples for TAME. - **Sequential**: intended for ordered categorical data (i.e., disease severity, Likert scale, quintiles). The choropleth map below is from [Winker, Payton et al.](https://doi.org/10.3389/fpubh.2024.1339700). -```{r, echo=FALSE, out.width = "65%", fig.align='center'} +```{r 03-Chapter3-18, echo=FALSE, out.width = "65%", fig.align='center'} knitr::include_graphics("Chapter_3/Module3_2_Input/Module3_2_Image1.png") ```
**Figure 1. Geospatial distribution of the risk of future wildfire events across North Carolina.** Census tracts in North Carolina were binned into quintiles based on Wildfire Hazard Potential (WHP) with 1 (pale orange) having the lowest risk and 5 (dark red) having the highest risk. Figure regenerated here in alignment with its published [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
- **Diverging**: intended to emphasize continuous data at extremes of the data range (typically using darker colors) and mid-range values (typically using lighter colors). This color scheme is ideal for charts like heatmaps. The heatmap below is from [Payton, Perryman et al.](https://doi.org/10.1152/ajplung.00299.2021). -```{r, echo=FALSE, out.width = "90%", fig.align='center'} +```{r 03-Chapter3-19, echo=FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics("Chapter_3/Module3_2_Input/Module3_2_Image2.png") ```
**Figure 2. Individual cytokine expression levels across all subjects.** Cytokine concentrations were derived from nasal lavage fluid samples. On the x axis, subjects were ordered first according to tobacco use status, starting with non-smokers then cigarette smokers and e-cigarette users. Within tobacco use groups, subjects are ordered from lowest to highest average cytokine concentration from left to right. Within each cluster shown on the y axis, cytokines are ordered from lowest to highest average cytokine concentration from bottom to top. Figure regenerated here in alignment with its published [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
- **Qualitative**: intended for nominal categorical data to visualize clear differences between groups (i.e., soil types and exposure groups). The dendrogram below is from [Koval et al.](https://doi.org/10.1038/s41370-022-00451-8). -```{r, echo=FALSE, out.width = "75%", fig.align='center'} +```{r 03-Chapter3-20, echo=FALSE, out.width = "75%", fig.align='center'} knitr::include_graphics("Chapter_3/Module3_2_Input/Module3_2_Image3.png") ```
**Figure 3. Translating chemical use inventory data to inform human exposure patterning.** Groups A-I illustrate the identified clusters of exposure source categories. Figure regenerated here in alignment with its published [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
@@ -401,13 +401,13 @@ In this module, *ggplot2*, R's data visualization package will be used to walk t ### Script Preparations #### Cleaning the global environment -```{r, clear_ env, echo=TRUE, eval=TRUE} +```{r 03-Chapter3-21, clear_env, echo=TRUE, eval=TRUE} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, install_libs2, echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} +```{r 03-Chapter3-22, install_libs2, echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} if (!requireNamespace("MetBrewer")) install.packages("MetBrewer"); if (!requireNamespace("RColorBrewer")) @@ -419,7 +419,7 @@ if (!requireNamespace("cowplot")) ``` #### Loading required R packages -```{r, echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} +```{r 03-Chapter3-23, echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} library(tidyverse) library(MetBrewer) library(RColorBrewer) @@ -428,13 +428,13 @@ library(cowplot) ``` #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 03-Chapter3-24, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` #### Importing example dataset Let's now read in our example dataset. As mentioned in the introduction, this example dataset represents chemical measurements across 12 different biomass burn scenarios, representing chemicals emitted during potential wildfire events. Let's upload and view these data: -```{r} +```{r 03-Chapter3-25} # Load the data smoke_data <- read.csv("Chapter_3/Module3_2_Input/Module3_2_InputData.csv") @@ -453,7 +453,7 @@ This training module was specifically developed to answer the following environm #### Formatting dataframes for downstream visualization code First, format the dataframe by changing it from a wide to long format and normalizing the chemical concentration data. 
For more details on this data reshaping visit **TAME 2.0 Module 2.3 Data Manipulation & Reshaping**. -```{r} +```{r 03-Chapter3-26} scaled_longer_smoke_data = pivot_longer(smoke_data, cols = 4:13, names_to = "Biomass_Burn_Condition", values_to = "Chemical_Concentration") %>% # scaling within each chemical @@ -469,7 +469,7 @@ head(scaled_longer_smoke_data) As we did in the previous module, a boxplot will be constructed to answer the first environmental heath question: **How do the distributions of the chemical concentration data differ based on each biomass burn scenario?**. Let's remind ourselves of the original figure from the previous module. -```{r fig.align = "center", echo = FALSE, fig.width = 7, fig.height = 5} +```{r 03-Chapter3-27, fig.align = "center", echo = FALSE, fig.width = 7, fig.height = 5} ggplot(data = scaled_longer_smoke_data, aes(x = Scaled_Chemical_Concentration, color = Biomass_Burn_Condition)) + geom_boxplot() ``` @@ -497,7 +497,7 @@ Many journals now require that authors report every single value when making dat ::: Let's start with addressing **#1: Legibility of Axis Text**. The legend title and axis titles can easily be changed with `ggplot()`, so that will be done later. To remove the underscore from the `Biomass_Burn_Condition` column, we can use the function `gsub()`, which will replace all of the underscores with spaces, resulting in a cleaner-looking graph. -```{r} +```{r 03-Chapter3-28} # First adding spaces between the biomass burn conditions scaled_longer_smoke_data = scaled_longer_smoke_data %>% mutate(Biomass_Burn_Condition = gsub("_", " ", Biomass_Burn_Condition)) @@ -510,7 +510,7 @@ head(scaled_longer_smoke_data) After calculating the median scaled chemical concentration for each biomass burn condition, the new dataframe will be arranged from lowest to highest median scaled concentration from the top of the dataframe to the bottom. This order will be saved in a vector, `median_biomass_order`. 
Although the biomass burn conditions are saved from lowest to highest concentration, `ggplot()` will plot them in reverse order with the highest concentration at the top and the lowest at the bottom of the y axis. Axis reordering can also be accomplished using `reorder` within the `ggplot()` function as described [here](https://guslipkin.medium.com/reordering-bar-and-column-charts-with-ggplot2-in-r-435fad1c643e) and [here](https://r-graph-gallery.com/267-reorder-a-variable-in-ggplot2.html). -```{r} +```{r 03-Chapter3-29} median_biomass = scaled_longer_smoke_data %>% group_by(Biomass_Burn_Condition) %>% summarize(Median_Concentration = median(Scaled_Chemical_Concentration)) %>% @@ -524,7 +524,7 @@ median_biomass_order = median_biomass$Biomass_Burn_Condition ``` -```{r} +```{r 03-Chapter3-30} # Putting into factor to organize the burn conditions scaled_longer_smoke_data$Biomass_Burn_Condition = factor(scaled_longer_smoke_data$Biomass_Burn_Condition, levels = median_biomass_order) @@ -534,14 +534,14 @@ head(scaled_longer_smoke_data) ``` Now that the dataframe has been finalized, we can plot the new boxplot. The final revision, **#3: Making Use of Color**, will be addressed with `ggplot()`. However, a palette can be chosen from the *MetBrewer* package. -```{r} +```{r 03-Chapter3-31} # Choosing the "Jurarez" palette from the `MetBrewer` package # `n = 12`, since there are 12 biomass burn conditions juarez_colors = met.brewer(name = "Juarez", n = 12)[1:12] ``` **#4. Show all data points when possible** will also be addressed with `ggplot()` by simply using `geom_point()`. 
-```{r fig.align = "center", out.width = "75%", out.height = "75%"} +```{r 03-Chapter3-32, fig.align = "center", out.width = "75%", out.height = "75%"} FigureX1 = ggplot(scaled_longer_smoke_data, aes(x = Scaled_Chemical_Concentration, y = Biomass_Burn_Condition, color = Biomass_Burn_Condition)) + geom_boxplot() + @@ -581,7 +581,7 @@ Other aspects of the figure were changed in the latest version, but those are mi ## Creating an Improved Heatmap Visualization We'll use a heatmap to answer the second environmental health question: **Which classes of chemicals show the highest concentrations across the evaluated biomass burn conditions?** Let's view the original heatmap from the previous module and find aspects of it that can be improved. -```{r fig.align = "center", fig.width = 10, fig.height= 5} +```{r 03-Chapter3-33, fig.align = "center", fig.width = 10, fig.height= 5} # Changing the biomass condition variable back to a character from a factor scaled_longer_smoke_data$Biomass_Burn_Condition = as.character(scaled_longer_smoke_data$Biomass_Burn_Condition) @@ -615,7 +615,7 @@ Notice that in the boxplot we used a qualitative palette, which is best for crea **#1: Legibility of Text** can be addressed in `ggplot()` and so can **#2: Reordering the heatmap**. `Biomass_Burn_Condition` has already been reordered and put into a factor, but we need to do the same with `Chemical.Category`. Similar to before, median scaled chemical concentration for each chemical category will be calculated. However, this time the new dataframe will be arranged from highest to lowest median scaled concentration from the top of the dataframe to the bottom. `ggplot()` will plot them in the SAME order with the highest concentration on the left side and the lowest on the right side of the figure. 
-```{r} +```{r 03-Chapter3-34} # Order the chemical category by the median scaled chemical concentration median_chemical = scaled_longer_smoke_data %>% group_by(Chemical.Category) %>% @@ -628,7 +628,7 @@ head(median_chemical) median_chemical_order = median_chemical$Chemical.Category ``` -```{r} +```{r 03-Chapter3-35} # Putting into factor to organize the chemical categories scaled_longer_smoke_data$Chemical.Category = factor(scaled_longer_smoke_data$Chemical.Category, levels = median_chemical_order) @@ -642,14 +642,14 @@ head(scaled_longer_smoke_data) ``` Now that the dataframe has been finalized, we can plot the new boxplot. The final revision, **#3: Making Use of Color**, will be addressed with `ggplot()`. Here a palette is chosen from the *RColorBrewer* package. -```{r} +```{r 03-Chapter3-36} # Only needed to choose 2 colors for 'low' and 'high' in the heatmap # `n = 8` in the code to generate more colors that can be chosen from rcolorbrewer_colors = brewer.pal(n = 8, name = 'Accent') ``` -```{r fig.align = "center", fig.width = 10, fig.height = 4} +```{r 03-Chapter3-37, fig.align = "center", fig.width = 10, fig.height = 4} FigureX2 = ggplot(data = scaled_longer_smoke_data, aes(x = Chemical.Category, y = Biomass_Burn_Condition, fill = Median_Scaled_Concentration)) + geom_tile(color = 'white') + # adds white space between the tiles @@ -684,7 +684,7 @@ An appropriate title for this figure could be: It would be helpful if there was a way to group these chemical profiles based on similarity and that's where the `pheatmap()` function can be helpful when it can be difficult to spot those patterns using visual inspection alone. Just for fun, let's briefly visualize a hierarchical clustering heatmap, which will be used to group both the biomass burn conditions and chemical categories based on their chemical concentrations. 
In this module, we'll focus only on the `pheatmap()` visualization, but more information on hierarchical clustering can be found in **Module 5.5 Unsupervised Machine Learning II: Additional Clustering Applications**. As we showed in the previous module, this function requires a wide dataframe which we'll need to create. It will contain `Chemical.Category`, `Biomass_Burn_Condition` and `Scaled_Chemical_Concentration`. -```{r, message=FALSE} +```{r 03-Chapter3-38, message=FALSE} heatmap_df2 = scaled_longer_smoke_data %>% group_by(Biomass_Burn_Condition, Chemical.Category) %>% # using the summarize function instead of mutate function as was done previously since we only need the median values now @@ -699,7 +699,7 @@ head(heatmap_df2) ``` Now let's generate the same heatmap this time using the `pheatmap()` function: -```{r fig.align = "center"} +```{r 03-Chapter3-39, fig.align = "center"} # creating a color palette blue_pink_palette = colorRampPalette(c(rcolorbrewer_colors[5], rcolorbrewer_colors[6])) @@ -723,7 +723,7 @@ By incorporating the dendrogram into the visualization, it's easier to see We can combine figures using the `plot_grid()` function from the *cowplot* package. For additional information on the `plot_grid()` function and parameters that can be changed see [Arranging Plots in a Grid](https://wilkelab.org/cowplot/articles/plot_grid.html). Other packages that have figure combining capabilities include the *[patchwork](https://patchwork.data-imaginist.com/)* package and the [`grid_arrange()`](https://cran.r-project.org/web/packages/gridExtra/vignettes/arrangeGrob.html) function from the *gridExtra* package. Figures can also be combined after they're exported from R using other applications like MS PowerPoint and Adobe PDF. 
-```{r fig.align = "center", fig.width = 20, fig.height = 6, fig.retina= 3} +```{r 03-Chapter3-40, fig.align = "center", fig.width = 20, fig.height = 6, fig.retina= 3 } FigureX = plot_grid(FigureX1, FigureX2, # Adding labels, changing size their size and position labels = "AUTO", label_size = 15, label_x = 0.04, @@ -765,7 +765,7 @@ Replicate the figure below! The heatmap is the same as the "Test Your Knowledge" **Hint**: To view additional aspects of figures that can be changed in *ggplot2* check out this [GGPlot2 Cheat Sheet](https://www.maths.usyd.edu.au/u/UG/SM/STAT3022/r/current/Misc/data-visualization-2.1.pdf). It might come in handy! ::: -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 03-Chapter3-41, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_3/Module3_2_Input/Module3_2_Image4.png") ``` @@ -806,14 +806,14 @@ In the current example dataset, chemical exposure profiles were obtained from th ### Script Preparations #### Cleaning the global environment -```{r} +```{r 03-Chapter3-42} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r message = FALSE} +```{r 03-Chapter3-43, message = FALSE} if (!requireNamespace("openxlsx")) install.packages("openxlsx"); if (!requireNamespace("tidyverse")) @@ -823,19 +823,19 @@ if (!requireNamespace("ggpubr")) ``` #### Loading R packages required for this session -```{r message = FALSE} +```{r 03-Chapter3-44, message = FALSE} library(openxlsx) # for importing data library(tidyverse) # for manipulating and plotting data library(ggpubr) # for making Q-Q plots with ggplot ``` #### Set your working directory -```{r eval = FALSE} +```{r 03-Chapter3-45, eval = FALSE} setwd("/filepath to where your input files are") ``` #### Importing example dataset -```{r message = FALSE} +```{r 03-Chapter3-46, message = FALSE} # 
Import data wrist_data <- read.xlsx("Chapter_3/Module3_3_Input/Module3_3_InputData.xlsx") @@ -871,7 +871,7 @@ Before answering these questions, let's define normality and how to test for it ## What is a Normal Distribution? A normal distribution is a distribution of data in which values are distributed roughly symmetrically out from the mean such that 68.3% of values fall within one standard deviation of the mean, 95.4% of values fall within 2 standard deviations of the mean, and 99.7% of values fall within three standard deviations of the mean. -```{r out.width = "800px", echo = FALSE, fig.align = 'center'} +```{r 03-Chapter3-47, out.width = "800px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_3/Module3_3_Input/Module3_3_Image1.png") ```
Figure Credit: D Wells, CC BY-SA 4.0, via Wikimedia Commons
@@ -887,12 +887,12 @@ We can begin by assessing the normality of our data through plots. For example, ### Histograms Let's start with visualizing the distribution of the participant's ages using the `hist()` function that is part of base R. -```{r fig.align = 'center'} +```{r 03-Chapter3-48, fig.align = 'center'} hist(wrist_data$Age) ``` We can edit some of the parameters to improve this basic histogram visualization. For example, we can decrease the size of each bin using the breaks parameter: -```{r fig.align = 'center'} +```{r 03-Chapter3-49, fig.align = 'center'} hist(wrist_data$Age, breaks = 10) ``` @@ -900,7 +900,7 @@ The `hist()` function is useful for plotting single distributions, but what if w First, we'll pivot our data to longer to prepare for plotting. Then, we'll make our plot. We can use the `theme_set()` function to set a default graphing theme for the rest of the script. A graphing theme represents a set of default formatting parameters (mostly colors) that ggplot will use to make your graphs. `theme_bw()` is a basic theme that includes a white background for the plot and dark grey axis text and minor axis lines. The theme that you use is a matter of personal preference. For more on the different themes available through *ggplot2*, see [here](https://ggplot2.tidyverse.org/reference/ggtheme.html). -```{r message = FALSE, fig.align = 'center'} +```{r 03-Chapter3-50, message = FALSE, fig.align = 'center'} # Pivot data longer to prepare for plotting wrist_data_long <- wrist_data %>% pivot_longer(!S_ID, names_to = "variable", values_to = "value") @@ -920,7 +920,7 @@ From these histograms, we can see that our chemical variables do not appear to b ### Q-Q Plots Q-Q (quantile-quantile) plots are another way to visually assess normality. Similar to the histogram above, we can create a single Q-Q plot for the age variable using base R functions. 
Normal Q-Q plots (Q-Q plots where the theoretical quantiles are based on a normal distribution) have theoretical quantiles on the x-axis and sample quantiles, representing the distribution of the variable of interest from the dataset, on the y-axis. If the variable of interest is normally distributed, the points on the graph will fall along the reference line. -```{r fig.align = 'center'} +```{r 03-Chapter3-51, fig.align = 'center'} # Plot points qqnorm(wrist_data$Age) @@ -930,7 +930,7 @@ qqline(wrist_data$Age) Small variations from the reference line, as seen above, are to be expected for the most extreme values. Overall, we can see that the age data are relatively normally distributed, as the points fall along the reference line. To make a figure panel with Q-Q plots for all of our variables of interest, we can use the `ggqqplot()` function within the *[ggpubr](https://rpkgs.datanovia.com/ggpubr/)* package. This function generates Q-Q plots and has arguments that are similar to *ggplot2*. -```{r fig.align = 'center'} +```{r 03-Chapter3-52, fig.align = 'center'} ggqqplot(wrist_data_long, x = "value", facet.by = "variable", ggtheme = theme_bw(), scales = "free") ``` With this figure panel, we can see that the chemical data have very noticeable deviations from the reference, suggesting non-normal distributions. @@ -946,7 +946,7 @@ Next, we will implement a quantitative approach to assessing normality, based on ### Single Variable Normality Assessment We will use the Shapiro-Wilk test to quantitatively assess whether our data distribution is normal, again looking at the age data. This test can be carried out simply using the `shapiro.test()` function from the base R stats package. When using this test and interpreting its results, it is important to remember that the null hypothesis is that the sample distribution is normal, and a significant p-value means the distribution is non-normal. 
-```{r} +```{r 03-Chapter3-53} shapiro.test(wrist_data$Age) ``` This test resulted in a p-value of 0.8143, so we cannot reject the null hypothesis (that data are normally distributed). This means that we can assume that age is normally distributed, which is consistent with our visualizations above. @@ -954,7 +954,7 @@ This test resulted in a p-value of 0.8143, so we cannot reject the null hypothes ### Multiple Variable Normality Assessment With a large dataset containing many variables of interest (e.g., our example data with multiple chemicals), it is more efficient to test each column for normality and then store those results in a dataframe. We can use the base R function `apply()` to apply the Shapiro Wilk test over all of the numeric columns of our dataframe. This function generates a list of results, with a list element for each variable tested. There are also other ways that you could iterate through each of your columns, such as a `for` loop or a function as discussed in **TAME 2.0 Module 2.4 Improving Coding Efficiencies**. -```{r} +```{r 03-Chapter3-54} # Apply Shapiro Wilk test shapiro_res <- apply(wrist_data %>% select(-S_ID), 2, shapiro.test) @@ -963,7 +963,7 @@ glimpse(shapiro_res[1:3]) ``` We can then convert those list results into a dataframe. Each variable is now in a row, with columns describing outputs of the statistical test. -```{r} +```{r 03-Chapter3-55} # Create results dataframe shapiro_res <- do.call(rbind.data.frame, shapiro_res) @@ -972,7 +972,7 @@ shapiro_res ``` Finally, we can clean up our results dataframe and add a column that will quickly tell us whether our variables are normally or non-normally distributed based on the Shapiro-Wilk normality test results. 
-```{r} +```{r 03-Chapter3-56} # Clean dataframe shapiro_res <- shapiro_res %>% @@ -1023,7 +1023,7 @@ There are a number of approaches that can be used to change the range and/or dis When data are non-normally distributed, such as with the chemical concentrations in our example dataset, it may be desirable to transform the data so that the distribution becomes closer to a normal distribution, particularly if there are only parametric tests available to test your hypothesis. A common transformation used in environmental health research is log~2~ transformation, in which data are transformed by taking the log~2~ of each value in the dataframe. Let's log~2~ transform our chemical data and examine the resulting histograms and Q-Q plots to qualitatively assess whether data appear more normal following transformation. We will apply a pseudo-log~2~ transformation, where we will add 1 to each value before log~2~ transforming so that all resulting values are positive and any zeroes in the dataframe do not return -Inf. -```{r fig.align = 'center'} +```{r 03-Chapter3-57, fig.align = 'center'} # Apply psuedo log2 (pslog2) transformation to chemical data wrist_data_pslog2 <- wrist_data %>% mutate(across(DEP:TOTM, ~ log2(.x + 1))) @@ -1039,13 +1039,13 @@ ggplot(wrist_data_pslog2_long, aes(value)) + labs(y = "# of Observations", x = "Value") ``` -```{r fig.align = 'center'} +```{r 03-Chapter3-58, fig.align = 'center'} # Make a figure panel of Q-Q plots ggqqplot(wrist_data_pslog2_long, x = "value", facet.by = "variable", ggtheme = theme_bw(), scales = "free") ``` Both the histograms and the Q-Q plots demonstrate that our log~2~ transformed data are more normally distributed than the raw data graphed above. Let's apply the Shapiro-Wilk test to our log~2~ transformed data to determine if the chemical distributions are normally distributed. 
-```{r} +```{r 03-Chapter3-59} # Apply Shapiro Wilk test shapiro_res_pslog2 <- apply(wrist_data_pslog2 %>% select(-S_ID), 2, shapiro.test) @@ -1066,7 +1066,7 @@ shapiro_res_pslog2 ``` The results from the Shapiro-Wilk test demonstrate that the log~2~ chemical concentration data are more normally distributed than the raw data. Overall, the p-values, even for the chemicals that are still non-normally distributed, are much higher, and only 2 out of the 8 chemicals are non-normally distributed by the Shapiro-Wilk test. We can also calculate average p-values across all variables for our raw and log~2~ transformed data to further demonstrate this point. -```{r} +```{r 03-Chapter3-60} # Calculate the mean Shapiro-Wilk p-value for the raw chemical data mean(shapiro_res$p.value) @@ -1152,13 +1152,13 @@ These statistical tests are very simple, with more extensive examples and associ ### Script Preparations #### Cleaning the global environment -```{r} +```{r 03-Chapter3-61} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, results=FALSE, message=FALSE} +```{r 03-Chapter3-62, results=FALSE, message=FALSE} if (!requireNamespace("tidyverse")) install.packages("tidyverse"); if (!requireNamespace("car")) @@ -1170,7 +1170,7 @@ if(!requireNamespace("effects")) ``` #### Loading R packages required for this session -```{r, results=FALSE, message=FALSE} +```{r 03-Chapter3-63, results=FALSE, message=FALSE} library(tidyverse) # all tidyverse packages, including dplyr and ggplot2 library(car) # package for statistical tests library(ggpubr) # ggplot2 based plots @@ -1178,20 +1178,20 @@ library(effects) # for linear modeling ``` #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 03-Chapter3-64, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` #### Importing example datasets Let's read in our 
example dataset. Note that these data are similar to those used previously, except that demographic and chemical measurement data were previously merged, and a few additional columns of subject information/demographics were added to serve as more thorough examples of data for use in this training module. -```{r} +```{r 03-Chapter3-65} # Loading data full.data <- read.csv("Chapter_3/Module3_4_Input/Module3_4_InputData.csv") ``` Let's view the top of the first 9 columns of data in this dataframe: -```{r} +```{r 03-Chapter3-66} full.data[1:10,1:9] ``` @@ -1208,7 +1208,7 @@ These represent the subject information/demographic data, which include the fol + `Smoker3`: "Never", "Former", or "Current" smoking status Let's now view the remaining columns (columns 10-15) in this dataframe: -```{r} +```{r 03-Chapter3-67} full.data[1:10,10:15] ``` @@ -1252,12 +1252,12 @@ As discussed in the previous module, there are a few ways to evaluate the normal Let's start with the first approach based on data visualizations. In this module, we'll primarily be generating figures using the ***ggpubr*** package which is specifically designed to generate ggplot2-based figures using more streamlined coding syntax. In addition, this package has statistical parameters for plotting that are useful for basic statistical analysis, especially for people with introductory experience to plotting in R. For further documentation on *ggpubr*, click [here](https://jtr13.github.io/cc20/brief-introduction-and-tutorial-of-ggpubr-package.html). 
Let's begin with a [histogram](https://en.wikipedia.org/wiki/Histogram) to view the distribution of BMI data using the `gghistogram()` function from the *ggpubr* package: -```{r fig.width=5, fig.height=4, fig.align = 'center'} +```{r 03-Chapter3-68, fig.width=5, fig.height=4, fig.align = 'center'} gghistogram(data = full.data, x = "BMI", bins = 20) ``` Let's also view the [Q–Q (quantile-quantile) plot](https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot) using the `ggqqplot()` function also from the *ggpubr* package: -```{r fig.width=5, fig.height=5, fig.align = 'center'} +```{r 03-Chapter3-69, fig.width=5, fig.height=5, fig.align = 'center'} ggqqplot(full.data$BMI, ylab = "BMI") ``` @@ -1266,14 +1266,14 @@ From these visualizations, the BMI variable appears to be normally distributed,
Let's now implement the second approach based on statistical tests for normality. Here, let's use the [Shapiro-Wilk test](https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test) as an example, again looking at the BMI data. -```{r} +```{r 03-Chapter3-70} shapiro.test(full.data$BMI) ``` This test resulted in a p-value of 0.3773, so we cannot reject the null hypothesis (that the BMI data are normally distributed). These findings support the assumption that these data are normally distributed. Next, we'll assess homogeneity of variance using the Levene's test. This will be done using the `leveneTest()`function from the *car* package: -```{r} +```{r 03-Chapter3-71} # First convert the smoker variable to a factor full.data$Smoker = factor(full.data$Smoker, levels = c("NS", "S")) leveneTest(BMI ~ Smoker, data = full.data) @@ -1288,14 +1288,14 @@ T-tests are commonly used to test for a significant difference between the means We will specifically implement a two sample t-test (or independent samples t-test). Let’s first visualize the BMI data across these two groups using boxplots: -```{r fig.width=5, fig.height=4, fig.align = 'center'} +```{r 03-Chapter3-72, fig.width=5, fig.height=4, fig.align = 'center'} ggboxplot(data = full.data, x = "Smoker", y = "BMI") ``` From this plot, it looks like non-smokers (labeled "NS") *may* have significantly higher BMI than smokers (labeled "S"), though we need statistical evaluation of these data to more thoroughly evaluate this potential data trend. It is easy to perform a t-test on these data using the `t.test()` function from the base R stats package: -```{r} +```{r 03-Chapter3-73} t.test(data = full.data, BMI ~ Smoker) ``` @@ -1309,7 +1309,7 @@ t.test(data = full.data, BMI ~ Smoker) ::: It's also helpful to save these results into a variable within the R global environment, which then allows us to access specific output values and extract them more easily for our records. 
For example, we can run the following to specifically extract the resulting p-value from this test: -```{r fig.align = 'center'} +```{r 03-Chapter3-74, fig.align = 'center'} ttest.res <- t.test(data = full.data, BMI ~ Smoker) # making a list in the R global environment with the statistical results signif(ttest.res$p.value, 2) # pulling the p-value and using the `signif` function to round to 2 significant figures ``` @@ -1320,20 +1320,20 @@ signif(ttest.res$p.value, 2) # pulling the p-value and using the `signif` functi Analysis of Variance (ANOVA) is a statistical method that can be used to compare means across three or more groups in normally distributed data. To demonstrate an ANOVA test on this dataset, let's answer **Environmental Health Question 2**: Are there statistically significant differences in BMI between current, former, and never smokers? To do this we'll use the `Smoker3` variable from our dataset. Let's again start by viewing these data distributions using a boxplot: -```{r fig.align = 'center'} +```{r 03-Chapter3-75, fig.align = 'center'} ggboxplot(data = full.data, x = "Smoker3", y = "BMI") ``` From this cursory review of the data, it looks like the current smokers likely demonstrate significantly different BMI measures than the former and never smokers, though we need statistical tests to verify this potential trend. We also require statistical tests to evaluate potential differences (or lack of differences) between former and never smokers. 
Let’s now run the ANOVA to compare BMI between smoking groups, using the `aov()` function to fit an ANOVA model: -```{r} +```{r 03-Chapter3-76} smoker_anova = aov(data = full.data, BMI ~ Smoker3) smoker_anova ``` We need to extract the typical ANOVA results table using either the `summary()` or `anova()` function on the resulting fitted object: -```{r} +```{r 03-Chapter3-77} anova(smoker_anova) ``` @@ -1350,7 +1350,7 @@ This table outputs a lot of information, including the `F value` referring to th ::: Let's run a Tukey's post hoc test using the `TukeyHSD()` function in base R to determine which of the current, former, and never smokers have significant differences in BMI: -```{r} +```{r 03-Chapter3-78} smoker_tukey = TukeyHSD(smoker_anova) smoker_tukey ``` @@ -1358,7 +1358,7 @@ smoker_tukey Although the above Tukey object contains a column `p adj`, those are the raw unadjusted p values. It is common practice to adjust p values from multiple comparisons to prevent the reporting of false positives or reporting of a significant difference that doesn't actually exist ([Feise, 2002](https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/1471-2288-2-8#:~:text=Thus%2C%20the%20main%20benefit%20of,exists%20%5B10%E2%80%9321%5D.)). There are a couple of different methods that are used to adjust p values including the Bonferroni and the Benjamini & Hochberg approaches. For this example, we'll use the `p.adjust()` function to obtain the Benjamini & Hochberg adjusted p values. 
Check out the associated [RDocumentation](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/p.adjust) to discover other methods that can be used to adjust p values using the `p.adjust()` function: -```{r} +```{r 03-Chapter3-79} # First converting the Tukey object into a dataframe smoker_tukey_df = data.frame(smoker_tukey$Smoker3) %>% # renaming the `p adj` to `P Value` for clarity @@ -1388,7 +1388,7 @@ We will first visualize the data and run a simple correlation analysis to evalua Plotting the variables against one another and adding a linear regression line using the function `ggscatter()` from the *ggpubr* package: -```{r fig.align = 'center'} +```{r 03-Chapter3-80, fig.align = 'center'} ggscatter(full.data, x = "BMI", y = "BW", # Adding a linear line with 95% confidence intervals as the shaded region add = "reg.line", conf.int = TRUE, @@ -1399,7 +1399,7 @@ ggscatter(full.data, x = "BMI", y = "BW", ``` We can also run a basic correlation analysis between these two variables using the `cor.test()` function. This function uses the Pearson's correlation test as default, which we can implement here due to the previously discussed assumption of normality for this dataset. Note that other tests are needed in instances when data are not normally distributed (e.g., Spearman Rank). This function is used here to extract the Pearson's correlation coefficient and p-value (which also appear above in the upper left corner of the graph): -```{r} +```{r 03-Chapter3-81} cor.res <- cor.test(full.data$BW, full.data$BMI) signif(cor.res$estimate, 2) signif(cor.res$p.value, 2) @@ -1408,7 +1408,7 @@ signif(cor.res$p.value, 2) Together, it looks like there may be an association between BW and BMI, based on these correlation results, demonstrating a significant p-value of 0.0004. 
To test this further, let’s run a linear regression analysis using the `lm()` function, using BMI (X) as the independent variable and BW as the dependent variable (Y): -```{r} +```{r 03-Chapter3-82} crude_lm <- lm(data = full.data, BW ~ BMI) summary(crude_lm) # viewing the results summary ``` @@ -1424,12 +1424,12 @@ summary(crude_lm) # viewing the results summary Additionally, we can derive confidence intervals for the BMI estimate using: -```{r} +```{r 03-Chapter3-83} confint(crude_lm)["BMI",] ``` Notice that the r-squared (R^2^) value in regression output is the squared value of the previously calculated correlation coefficient (R). -```{r} +```{r 03-Chapter3-84} signif(sqrt(summary(crude_lm)$r.squared), 2) ``` @@ -1437,7 +1437,7 @@ signif(sqrt(summary(crude_lm)$r.squared), 2) In epidemiological studies, the potential influence of confounders is considered by including important covariates within the final regression model. Let's go ahead and investigate **Environmental Health Question 4**: Are maternal age and gestational age considered potential covariates in the relationship between maternal BMI and birth weight? We can do that by adding those variables to the linear model. -```{r} +```{r 03-Chapter3-85} adjusted_lm = lm(data = full.data, BW ~ BMI + MAge + GA) summary(adjusted_lm) ``` @@ -1447,7 +1447,7 @@ summary(adjusted_lm) Let's further visualize these regression modeling results by adding a regression line to the original scatterplot. Before doing so, we'll use the `effect()` function from the *effects* package to make estimated predictions of birth weight values for the crude and adjusted linear models. The crude model only has BMI as the independent variable, while the adjusted model includes BMI, maternal age, and gestational age as independent variables. 
This function creates a table that contains 5 columns: fitted values for BMI (`BMI`), predictor values (`fit`), standard errors of the predictions (`se`), lower confidence limits (`lower`), and upper confidence limits (`upper`). An additional column, `Model`, was added to specify whether the values correspond to the crude or adjusted model. For additional information on visualizing adjusted linear models, see [Plotting Adjusted Associations in R](https://nickmichalak.com/post/2019-02-13-plotting-adjusted-associations-in-r/plotting-adjusted-associations-in-r/). -```{r} +```{r 03-Chapter3-86} crude_lm_predtable = data.frame(effect(term = "BMI", mod = crude_lm), Model = "Crude") adjusted_lm_predtable = data.frame(effect(term = "BMI", mod = adjusted_lm), Model = "Adjusted") @@ -1456,7 +1456,7 @@ crude_lm_predtable ``` Now we can plot each linear model and their corresponding 95% confidence intervals (CI). It's easier to visualize this using *ggplot2* instead of *ggubr* so that's what we'll use: -```{r fig.align = 'center'} +```{r 03-Chapter3-87, fig.align = 'center'} options(repr.plot.width=9, repr.plot.height=6) # changing dimensions of the entire figure ggplot(full.data, aes(x = BMI, y = BW)) + geom_point() + @@ -1485,7 +1485,7 @@ ggplot(full.data, aes(x = BMI, y = BW)) + To carry out a logistic regression, we need to evaluate one continuous variable (here, we select gestational age, using the `GA` variable) and one dichotomous variable (here, we select smoking status, using the `Smoker` variable) to evaluate **Environmental Health Question 5**: Are there statistically significant differences in gestational age based on whether a subject is a non-smoker or a smoker? Because smoking status is a dichotomous variable, we will use logistic regression to look at this relationship. 
Let's first visualize these data using boxplots for the dichotomous smoker dataset: -```{r fig.width=5, fig.height=4, fig.align = 'center'} +```{r 03-Chapter3-88, fig.width=5, fig.height=4, fig.align = 'center'} ggboxplot(data = full.data, x = "Smoker", y = "GA") ``` @@ -1494,7 +1494,7 @@ With this visualization, it's difficult to tell whether or not there are signifi
Let's now run the statistical analysis, using logistic regression modeling: -```{r} +```{r 03-Chapter3-89} # Before running the model, "Smoker", needs to be binarized to 0's or 1's for the glm function glm_data = full.data %>% mutate(Smoker = ifelse(Smoker == "NS", 0,1)) @@ -1507,7 +1507,7 @@ summary(log.res) # viewing the results ``` Similar to the regression modeling analysis, we can also derive confidence intervals: -```{r} +```{r 03-Chapter3-90} confint(log.res)["GA",] ``` @@ -1531,23 +1531,23 @@ The number of samples or subjects (*n*) considered to be sufficiently large enou For this example, we are interested in evaluating the potential relationship between two categorical variables: smoking status (using the `Smoker` variable) and categorical BMI group (using the `BMIcat` variable) to address **Environmental Health Question 6**: Is there a relationship between smoking status and BMI? To run these categorical statistical tests, let's first create and view a 2-way contingency table describing the frequencies of observations across the categorical BMI and smoking groups: -```{r} +```{r 03-Chapter3-91} ContingencyTable <- with(full.data, table(BMIcat, Smoker)) ContingencyTable ``` Now let's run the Chi-squared test on this table: -```{r} +```{r 03-Chapter3-92} chisq.test(ContingencyTable) ``` Note that we can also run the Chi-squared test using the following code, without having to generate the contingency table: -```{r warning = FALSE} +```{r 03-Chapter3-93, warning = FALSE} chisq.test(full.data$BMIcat, full.data$Smoker) ``` Or: -```{r warning = FALSE} +```{r 03-Chapter3-94, warning = FALSE} with(full.data, chisq.test(BMIcat, Smoker)) ``` @@ -1562,7 +1562,7 @@ Note that these all produce the same results. *With this, we can answer **Enviro
We can also run a Fisher's Exact Test when considering sample sizes. We won't run this here due to computing time, but here is some example code for your records: -```{r} +```{r 03-Chapter3-95} #With small sample sizes, can use Fisher's Exact Test #fisher.test(full.data$BMI, full.data$Smoker) ``` @@ -1579,4 +1579,4 @@ Test Your Knowledge :::tyk 1. If we're interested in investigating if there are significant differences in birth weight based on maternal education level, which statistical test should you use? 2. Is that relationship considered to be statistically significant and how can we visualize the distributions of these groups? -::: \ No newline at end of file +::: diff --git a/Chapter_4/.DS_Store b/Chapter_4/.DS_Store deleted file mode 100644 index 1392f74..0000000 Binary files a/Chapter_4/.DS_Store and /dev/null differ diff --git a/Chapter_4/04-Chapter4.Rmd b/Chapter_4/04-Chapter4.Rmd index 667cc12..64a60ea 100644 --- a/Chapter_4/04-Chapter4.Rmd +++ b/Chapter_4/04-Chapter4.Rmd @@ -24,7 +24,7 @@ Biological replicates are the preferred unit of statistical comparison because t The final "N" that you report should reflect your biological replicates, or independent experiments. What constitutes an independent experiment or biological replicate is highly field-, lab-, organism-, and endpoint-dependent, so make sure to discuss this within your research group in the experiment planning phase and again before your analysis begins. No matter what you choose, ensure that when you report your results, you are transparent about what your biological replicates are. 
For example, the below diagram (adapted from [BitesizeBio](https://bitesizebio.com/47982/n-number-cell-lines/)) illustrates different ways of defining replicates in experiments with cell lines: -```{r, echo = FALSE, fig.align = "center", out.width = "650px"} +```{r 04-Chapter4-1, echo = FALSE, fig.align = "center", out.width = "650px" } knitr::include_graphics("Chapter_4/Module4_1_Input/Module4_1_Image1.png") ``` @@ -39,7 +39,7 @@ Also note that to perform statistical analyses, an N of at least 3 biological re Technical replicates are repeated measurements on the same sample or biological source, demonstrating the variation underlying protocols, equipment, and sample handling. In environmental health research, there can be technical replicates separately related to either the experimental design or the downstream analyses. Technical replicates related to experimental design refer to the chemical exposure for cell-based (*in vitro*) experiments, where there may be multiple wells of cells from the same passage or human/mouse exposed to the same treatment. Technical replicates related to downstream analyses refer to the endpoints that are measured after chemical exposure in each sample. To illustrate this, consider an experiment where cells from four unique human donors (D1-D4) are grown in cell culture plates, and then three wells of cells from each donor are exposed to a chemical treatment (Tx) or a vehicle control (Ctrl). The plate layout might look something like this, with technical replicates related to experimental design, i.e. 
chemical exposure, in the same color: -```{r, echo = FALSE, fig.align = "center", out.width = "500px"} +```{r 04-Chapter4-2, echo = FALSE, fig.align = "center", out.width = "500px" } knitr::include_graphics("Chapter_4/Module4_1_Input/Module4_1_Image2.png") ``` @@ -47,7 +47,7 @@ For this experiment, we have four biological replicates (the four donors) and th

Following the exposure of the cells to a chemical of interest, the media is collected from each well and assayed using a plate reader assay for concentrations of a marker of inflammation. For each sample collected (from each well), there are three technical replicates used to measure the concentration of the inflammatory marker. The purpose of these technical replicates is to capture potential unintended well-to-well variation in the plate reader assay. The plate layout might look something like this, ***with the letter and number in each well of the plate layout representing the well in the exposure plate layout that the media sample being assayed came from***: -```{r, echo = FALSE, fig.align = "center", out.width = "800px"} +```{r 04-Chapter4-3, echo = FALSE, fig.align = "center", out.width = "800px" } knitr::include_graphics("Chapter_4/Module4_1_Input/Module4_1_Image3.png") ``` @@ -82,7 +82,7 @@ In this chapter, we will be using an example dataset derived from an *in vitro*, In this experiment, primary human bronchial epithelial cells (HBECs) from sixteen different donors were exposed to the gas acrolein, which is emitted from the combustion of fossil fuels, tobacco, wood, and plastic. Inhalation exposure to acrolein is associated with airway inflammation, and this study aimed to understand how exposure to acrolein changes secretion of markers of inflammation. Prior to experimentation, the HBECs were grown on a permeable membrane support for 24 days with air on one side and liquid media on the other side, allowing them to differentiate into a form that is very similar to what is found in the human body.
The cells were then exposed for 2 hours to 0 (filtered air), 0.6, 1, 2, or 4 ppm acrolein, with two technical replicate wells from each donor per dose. Twenty-four hours later, the media was collected, and concentrations of inflammatory markers were measured using an [enzyme-linked immunosorbent assay (ELISA)](https://www.thermofisher.com/us/en/home/life-science/protein-biology/protein-biology-learning-center/protein-biology-resource-library/pierce-protein-methods/overview-elisa.html). -```{r, echo = FALSE, fig.align = "center", out.width = "900px"} +```{r 04-Chapter4-4, echo = FALSE, fig.align = "center", out.width = "900px" } knitr::include_graphics("Chapter_4/Module4_1_Input/Module4_1_Image4.png") ``` @@ -92,7 +92,7 @@ Note that this is a matched experimental design because cells from every donor w Next, let's familiarize ourselves with the data that resulted from this experiment. There are two input data files, one that contains cytokine concentration data and one that contains demographic information about the donors: -```{r, echo = FALSE, fig.align = "center", out.width = "900px"} +```{r 04-Chapter4-5, echo = FALSE, fig.align = "center", out.width = "900px" } knitr::include_graphics("Chapter_4/Module4_1_Input/Module4_1_Image5.png") ``` @@ -137,7 +137,7 @@ This training module was specifically developed to answer the following environm ## Data Import First, we need to import our data. Data can be imported into R from many different file formats, including .csv (as demonstrated in previous chapters), .txt, .xlsx, and .pdf. Often, data are formatted in Excel prior to import, and the [*openxlsx*](https://ycphs.github.io/openxlsx/) package provides helpful functions that allow the user to import data from Excel, create workbooks for storing results generated in R, and export data from R to Excel workbooks. Below, we will use the `read.xlsx()` function to import our data directly from Excel. 
Other useful packages include [*pdftools*](https://github.com/ropensci/pdftools) (PDF import), [*tm*](https://cran.r-project.org/web/packages/tm/vignettes/tm.pdf) (text mining of PDFs), and [*plater*](https://cran.r-project.org/web/packages/plater/vignettes/plater-basics.html) (plate reader formatted data import). -```{r, echo = FALSE, fig.align = "center", out.width = "850px"} +```{r 04-Chapter4-6, echo = FALSE, fig.align = "center", out.width = "850px" } knitr::include_graphics("Chapter_4/Module4_2_Input/Module4_2_Image1.png") ``` @@ -146,13 +146,13 @@ knitr::include_graphics("Chapter_4/Module4_2_Input/Module4_2_Image1.png") #### Set working directory In preparation, first let's set our working directory to the folder path that contains our input files: -```{r eval = FALSE} +```{r 04-Chapter4-7, eval = FALSE} setwd("/filepath to where your input files are") ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, install__libs, echo=TRUE, eval=FALSE, warning=FALSE, results='hide', message=FALSE} +```{r 04-Chapter4-8, install__libs, echo=TRUE, eval=FALSE, warning=FALSE, results='hide', message=FALSE} if (!requireNamespace("table1")) install.packages("table1"); if (!requireNamespace("vtable")) @@ -167,7 +167,7 @@ BiocManager::install("imputeLCMD") #### Load required packages And load required packages: -```{r message = FALSE} +```{r 04-Chapter4-9, message = FALSE} library(openxlsx) # for importing Excel files library(DT) # for easier viewing of data tables library(tidyverse) # for data cleaning and graphing @@ -180,7 +180,7 @@ library(ggpubr) # for making Q-Q plots with ggplot #### Import example datasets Next, let's read in our example datasets: -```{r} +```{r 04-Chapter4-10} biomarker_data <- read.xlsx("Chapter_4/Module4_2_Input/Module4_2_InputData1.xlsx") demographic_data <- 
read.xlsx("Chapter_4/Module4_2_Input/Module4_2_InputData2.xlsx") ``` @@ -188,12 +188,12 @@ demographic_data <- read.xlsx("Chapter_4/Module4_2_Input/Module4_2_InputData2.xl #### View example datasets First, let's preview our example data. Using the `datatable()` function from the *DT* package allows us to interactively scroll through our biomarker data. -```{r} +```{r 04-Chapter4-11} datatable(biomarker_data) ``` We can see that our biomarker data are arranged with samples in rows and sample information and biomarker measurements in the columns. -```{r} +```{r 04-Chapter4-12} datatable(demographic_data) ``` @@ -204,7 +204,7 @@ Our demographic data provide information about the donors that our cells came fr ## Handling Missing Values Next, we will investigate whether we have missing values and which variables and donors have missing values. -```{r} +```{r 04-Chapter4-13} # Calculate the total number of NAs per variable biomarker_data %>% summarise(across(IL1B:VEGF, ~sum(is.na(.)))) @@ -256,7 +256,7 @@ If you do impute missing values, make sure to include both your raw and imputed Before imputing our data, it is a good idea to implement a background filter that checks to see if a certain percentage of values for each variable are missing. For variables with a very high percentage of missing values, imputation can be unreliable because there is not enough information for the imputation algorithm to reference. The threshold for what this percentage should be can vary by study design and the extent to which your data are subset into groups that may have differing biomarker profiles; however, a common threshold we frequently use is to remove variables with missing data for 25% or more of samples. 
We can use the following code to calculate the percentage values missing for each endpoint: -```{r} +```{r 04-Chapter4-14} biomarker_data %>% summarise(across(IL1B:VEGF, ~sum(is.na(.))/nrow(biomarker_data)*100)) ``` @@ -264,7 +264,7 @@ Here, we can see that only about 3-4% of values are missing for our variables with missing data, so we will proceed to imputation with our dataset as-is. We will impute values using QRILC, which pulls from the left side of the data distribution (the lower values) to impute missing values. We will write a function that will apply QRILC imputation to our dataframe. This function takes a dataframe with missing values as input and returns a dataframe with QRILC imputed values in place of NAs as output. -```{r} +```{r 04-Chapter4-15} QRILC_imputation = function(df){ # Normalize data before applying QRILC per QRILC documentation ## Select only numeric columns, pseudo log2 transform, and convert to a matrix @@ -292,7 +292,7 @@ QRILC_imputation = function(df){ ``` Now we can apply the `QRILC_imputation()` function to our dataframe. We use the function `set.seed()` to ensure that the QRILC function generates the same numbers each time we run the script. For more on setting seeds, see [here](https://www.statology.org/set-seed-in-r/). -```{r} +```{r 04-Chapter4-16} # Set random seed to ensure reproducibility in results set.seed(1104) @@ -304,7 +304,7 @@ biomarker_data_imp <- QRILC_imputation(biomarker_data) ## Averaging Replicates The last step we need to take before our data are ready for analysis is averaging the two technical replicates for each donor and dose. We will do this by creating an ID column that represents the donor and dose together and using that column to group and average the data. This results in a dataframe where our rows contain data representing each biological replicate exposed to each of the five concentrations of acrolein.
-```{r} +```{r 04-Chapter4-17} biomarker_data_imp_avg <- biomarker_data_imp %>% # Create an ID column that represents the donor and dose @@ -332,7 +332,7 @@ Generating descriptive statistics (e.g., mean, median, mode, range, standard dev ### Method #1 - Tidyverse and Basic Functions The mean, or average of data points, is one of the most commonly reported summary statistics and is often reported as mean ± standard deviation to demonstrate the spread in the data. Here, we will make a table of mean ± standard deviation for each of our biomarkers across each of the dose groups using *tidyverse* functions. -```{r} +```{r 04-Chapter4-18} # Calculate means biomarker_group_means <- biomarker_data_imp_avg %>% group_by(Dose) %>% @@ -343,7 +343,7 @@ datatable(biomarker_group_means) ``` You'll notice that there are a lot of decimal places in our calculated means, while in our original data, there are only two decimal places. We can add a step to round the data to our above code chunk to produce cleaner results. -```{r} +```{r 04-Chapter4-19} # Calculate means biomarker_group_means <- biomarker_data_imp_avg %>% group_by(Dose) %>% @@ -364,7 +364,7 @@ datatable(biomarker_group_means) ::: We can use very similar code to calculate our standard deviations: -```{r} +```{r 04-Chapter4-20} # Calculate means biomarker_group_sds <- biomarker_data_imp_avg %>% group_by(Dose) %>% @@ -384,7 +384,7 @@ Now we've calculated both the means and standard deviations! However, these are 5. Pivot the dataframe wider so that the dataframe resembles what we started with for the means and standard deviations. First, we'll pivot each dataframe to a long format and create a variable that represents each unique row. 
-```{r} +```{r 04-Chapter4-21} # Pivot dataframes longer and create variable column for each row biomarker_group_means_long <- pivot_longer(biomarker_group_means, !Dose, names_to = "variable", values_to = "mean") %>% @@ -400,7 +400,7 @@ datatable(biomarker_group_means_long) ``` Next, we will join the mean and standard deviation datasets. Notice that we are only joining the `Dose_variable` and `sd` columns from the standard deviation dataframe to prevent duplicate columns (`Dose`, `variable`) from being included. -```{r} +```{r 04-Chapter4-22} # Merge the dataframes by row biomarker_group_summstats <- left_join(biomarker_group_means_long, biomarker_group_sds_long %>% select(c(Dose_variable, sd)), @@ -411,7 +411,7 @@ datatable(biomarker_group_summstats) ``` Then, we can unite the mean and standard deviation columns and add the ± symbol between them by storing that character as a variable and pasting that variable in our `paste()` function. -```{r} +```{r 04-Chapter4-23} # Store plus/minus character plusminus <-"\u00b1" Encoding(plusminus)<-"UTF-8" @@ -425,7 +425,7 @@ datatable(biomarker_group_summstats) ``` Last, we can pivot the dataframe wider to revert it to its original layout, which is easier to read. -```{r} +```{r 04-Chapter4-24} # Pivot dataframe wider biomarker_group_summstats <- biomarker_group_summstats %>% @@ -444,7 +444,7 @@ These data are now in a publication-ready format that can be exported to a .txt, ### Method #2 - Applying a List of Functions Calculating our mean and standard deviation separately using *tidyverse* wasn't too difficult, but what if we want to calculate other descriptive statistics, such as minimum, median, and maximum? We could use the above approach, but we would need to make a separate dataframe for each and then merge them all together. 
Instead, we can use the `map_dfr()` function from the *purrr* package, which is also part of *tidyverse.* This function takes a list of functions you want to apply to your data and applies these functions over specified columns in the data. Let's see how it works: -```{r} +```{r 04-Chapter4-25} # Define summary functions summary_functs <- lst(min, median, mean, max, sd) @@ -467,24 +467,24 @@ There are also packages that have been developed for specifically making summary #### Table1 The *table1* package makes summary tables using the function `table1()`, which takes the columns that you want in the rows of the table on the left side of the first argument, followed by `|` and then the grouping variable. The output table can be customized in a number of ways, including what summary statistics are output and whether or not statistical comparisons are run between groups (see package vignette for more details). -```{r} +```{r 04-Chapter4-26} # Get names of all of the columns to include in the table paste(names(biomarker_data_imp_avg %>% select(IL1B:VEGF)), collapse=" + ") ``` -```{r eval = FALSE} +```{r 04-Chapter4-27, eval = FALSE} # Make the table table1(~ IL1B + IL6 + IL8 + IL10 + TNFa + VEGF | Dose, data = biomarker_data_imp_avg) ``` -```{r, echo = FALSE, fig.align = "center", out.width = "850px"} +```{r 04-Chapter4-28, echo = FALSE, fig.align = "center", out.width = "850px" } knitr::include_graphics("Chapter_4/Module4_2_Input/Module4_2_Image2.png") ``` #### Vtable The *vtable* package includes the function `st()`, which can also be used to make HTML tables (and other output formats; see `out` argument). 
For example: -```{r} +```{r 04-Chapter4-29} # HTML output st(biomarker_data_imp_avg, group = 'Dose') @@ -501,7 +501,7 @@ Similar to *table1*, see the package vignette for detailed information about how The last step we will take before beginning to test our data for statistical differences between groups (in the next module) is to understand our data's distribution through normality assessment. This will inform which statistical tests we will perform on our data. For more detail on normality testing, including detailed explanations of each type of normality assessment and explanations of the code underlying the following graphs and tables, see **TAME 2.0 Module 3.3 Normality Tests and Data Transformations**. We'll start by looking at histograms of our data for qualitative normality assessment: -```{r message = FALSE, fig.align = 'center'} +```{r 04-Chapter4-30, message = FALSE, fig.align = 'center'} # Set theme theme_set(theme_bw()) @@ -519,14 +519,14 @@ ggplot(biomarker_data_imp_avg_long, aes(value)) + From these histograms, we can see that IL-1$\beta$ appears to be normally distributed, while the other endpoints do not appear to be normally distributed. We can also use Q-Q plots to assess normality qualitatively: -```{r fig.align = 'center'} +```{r 04-Chapter4-31, fig.align = 'center'} ggqqplot(biomarker_data_imp_avg_long, x = "value", facet.by = "variable", ggtheme = theme_bw(), scales = "free") ``` With this figure panel, we can see that most of the variables have very noticeable deviations from the reference, suggesting non-normal distributions. To assess normality quantitatively, we can use the Shapiro-Wilk test. Note that the null hypothesis is that the sample distribution is normal, and a significant p-value means the distribution is non-normal. 
-```{r} +```{r 04-Chapter4-32} # Apply Shapiro Wilk test to dataframe shapiro_res <- apply(biomarker_data_imp_avg %>% select(IL1B:VEGF), 2, shapiro.test) @@ -558,14 +558,14 @@ datatable(shapiro_res) ### Log~2~ Transforming and Re-Assessing Normality Log~2~ transformation is a common transformation used in environmental health research and can move data closer to a normal distribution. For more on data transformation, see **TAME 2.0 Module 3.3 Normality Tests and Data Transformations**. We will pseudo-log~2~ transform our data, which adds a 1 to each value before log~2~ transformation and ensures that resulting values are positive real numbers. Let's see if the log~2~ data are more normally distributed than the raw data. -```{r} +```{r 04-Chapter4-33} # Apply log2 transformation to data biomarker_data_imp_avg_log2 <- biomarker_data_imp_avg %>% mutate(across(IL1B:VEGF, ~ log2(.x + 1))) ``` Make histogram panel: -```{r fig.align = 'center'} +```{r 04-Chapter4-34, fig.align = 'center'} # Pivot data longer and make figure panel of histograms biomarker_data_imp_avg_log2_long <- biomarker_data_imp_avg_log2 %>% pivot_longer(-c(Donor, Dose), names_to = "variable", values_to = "value") @@ -578,12 +578,12 @@ ggplot(biomarker_data_imp_avg_log2_long, aes(value)) + ``` Make Q-Q plot panel: -```{r fig.align = 'center'} +```{r 04-Chapter4-35, fig.align = 'center'} ggqqplot(biomarker_data_imp_avg_log2_long, x = "value", facet.by = "variable", ggtheme = theme_bw(), scales = "free") ``` Run Shapiro-Wilk test: -```{r} +```{r 04-Chapter4-36} # Apply Shapiro Wilk test shapiro_res_log2 <- apply(biomarker_data_imp_avg_log2 %>% select(IL1B:VEGF), 2, shapiro.test) @@ -669,12 +669,12 @@ The following example is based on extracting data from PDFs generated by Nanopar For this example, we will be extracting data from 5 PDFs that are identically formatted but contain information unique to each sample. 
The samples represent particles isolated from epithelial cell media following an experiment where cells were exposed to four different environmental chemicals (labeled "A", "B", "C", and "D") or a vehicle control (labeled "Ctrl"). Here is what a full view of one of the PDFs looks like, with values we want to extract highlighted in yellow: -```{r echo = FALSE, out.width = "850px", fig.align = "center"} +```{r 04-Chapter4-37, echo = FALSE, out.width = "850px", fig.align = "center"} knitr::include_graphics("Chapter_4/Module4_3_Input/Module4_3_Image1.png") ``` Our goal is to extract these values and end up with a dataframe that looks like this, with each sample in a row and each variable in a column: -```{r echo = FALSE, message = FALSE} +```{r 04-Chapter4-38, echo = FALSE, message = FALSE} # Loading packages library(tidyverse) library(openxlsx) @@ -694,7 +694,7 @@ datatable(ending_data) ``` If your files are not already named in a way that reflects unique sample information, such as the date of the experiment or sample ID, update your file names to contain this information before proceeding with the script. Here are the names for the example PDF files: -```{r, out.width = "400px", echo = FALSE, fig.align = 'center'} +```{r 04-Chapter4-39, out.width = "400px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_4/Module4_3_Input/Module4_3_Image2.png") ``` @@ -706,7 +706,7 @@ knitr::include_graphics("Chapter_4/Module4_3_Input/Module4_3_Image2.png") If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you. We will be using the *pdftools* and *tm* packages to extract text from the PDF. And instead of using `head()` to preview dataframes, we will be using the function `datatable()` from the *DT* package. 
This function produces interactive tables and generates better formatting for viewing dataframes that have long character strings (like the ones we will be viewing in this section). -```{r eval = FALSE} +```{r 04-Chapter4-40, eval = FALSE} if (!requireNamespace("pdftools")) install.packages("pdftools") if (!requireNamespace("tm")) @@ -718,7 +718,7 @@ if (!requireNamespace("janitor")) ``` Next, load the packages. -```{r warning = FALSE, message = FALSE} +```{r 04-Chapter4-41, warning = FALSE, message = FALSE} library(tidyverse) library(pdftools) library(tm) @@ -729,22 +729,22 @@ library(janitor) #### Initial data import from PDF files The following code stores the file names of all of the files in your directory that end in .pdf. To ensure that only PDFs of interest are imported, consider making a subfolder within your directory containing only the PDF extraction script file and the PDFs you want to extract data from. -```{r} +```{r 04-Chapter4-42} pdf_list <- list.files(path = "./Chapter_4/Module4_3_Input", pattern = "488.pdf$") ``` We can see that each of our file names are now contained in the list. -```{r} +```{r 04-Chapter4-43} head(pdf_list) ``` Next, we need to make a dataframe to store the extracted data. The `PDF Identifier` column will store the file name, and the `Text` column will store extracted text from the PDF. -```{r} +```{r 04-Chapter4-44} pdf_raw <- data.frame("PDF Identifier" = c(), "Text" = c()) ``` The following code uses a `for` loop to loop through each file (as stored in the pdf_list vector) and extract the text from the PDF. Sometimes this code generates duplicates, so we will also remove the duplicates with `distinct()`. 
-```{r message = FALSE, warning = FALSE} +```{r 04-Chapter4-45, message = FALSE, warning = FALSE} for (i in 1:length(pdf_list)){ # Iterating through each pdf file and separating each line of text @@ -766,7 +766,7 @@ pdf_raw <- pdf_raw %>% ``` The new dataframe contains the data from all of the PDFs, with the `PDF Identifier` column containing the name of the input PDF file that corresponds to the text in the column next to it. -```{r} +```{r 04-Chapter4-46} datatable(pdf_raw) ``` @@ -778,7 +778,7 @@ Specific variables of interest can be extracted from the `pdf_raw` dataframe by It is important to note that there can be different numbers of spaces in each row and after each semicolon, which will change the `sep` argument for each variable. For example, there are a different number of spaces after the semicolon for "Dilution Factor" than there are for "Concentration" (see above PDF screen shot for reference). We will work through an example for the first variable of interest, dilution factor, in detail. First, we can see what the dataframe looks like when we just filter rows based on keeping only rows that contain the string "Dilution Factor" in the text column using the `grepl()` function. -```{r} +```{r 04-Chapter4-47} dilution_factor_df <- pdf_raw %>% filter(grepl("Dilution Factor", Text)) @@ -786,13 +786,13 @@ datatable(dilution_factor_df) ``` The value we are trying to extract is at the end of a long character string. We will want to use the tidyverse function `separate()` to isolate those values, but we need to know what part of the character string will separate the dilution factor values from the rest of the text. To determine this, we can call just one of the data cells and copy the semicolon and following spaces for use in the `separate()` function. -```{r} +```{r 04-Chapter4-48} # Return the value in the first row and second column. 
dilution_factor_df[1,2] ``` Building on top of the previous code, we can now separate the dilution factor value from the rest of the text in the string. The `separate()` function takes an input data column and separates it into two or more columns based on the character passed to the separation argument. Here, everything before the separation string is discarded by setting the first new column to NA. Everything after the separation string will be stored in a new column called `Dilution Factor`. The starting `Text` column is removed by default. -```{r} +```{r 04-Chapter4-49} dilution_factor_df <- pdf_raw %>% filter(grepl("Dilution Factor", Text)) %>% separate(Text, into = c(NA, "Dilution Factor"), sep = ": ") @@ -801,7 +801,7 @@ datatable(dilution_factor_df) ``` For the "Original Concentration" variable, we filter rows by the string "pH" because the word concentration is found in multiple locations in the document. -```{r} +```{r 04-Chapter4-50} concentration_df = pdf_raw %>% filter(grepl("pH", Text)) %>% separate(Text, c(NA, "Concentration"), sep = ": ") @@ -810,7 +810,7 @@ datatable(concentration_df) ``` With the dilution factor variable, there were no additional characters after the value of interest, but here, "Particles / mL" remains and needs to be removed so that the data can be used in downstream analyses. We can add an additional cleaning step to remove "Particles / mL" from the data and add the units to the column title. `sep = " P"` refers to the space before and first letter of the string to be removed. -```{r} +```{r 04-Chapter4-51} concentration_df = pdf_raw %>% filter(grepl("pH", Text)) %>% separate(Text, c(NA, "Concentration"), sep = ": ") %>% @@ -820,7 +820,7 @@ datatable(concentration_df) ``` Next, we want to extract size distribution data from the lower table.
Note that the space in the first `separate()` function comes from the space between the "Number" and "Concentration" column in the string, and the space in the second `separate()` function comes from the space between the variable name and the number of interest. We can also convert values to numeric since they are currently stored as characters. -```{r} +```{r 04-Chapter4-52} size_distribution_df = pdf_raw %>% filter(grepl("X10", Text)| grepl("X50 ", Text)| grepl("X90", Text) | grepl("Mean", Text)| grepl("StdDev", Text)) %>% separate(Text, c("Text", NA), sep = " ") %>% @@ -834,7 +834,7 @@ datatable(size_distribution_df) ### Creating the final dataframe Now that we have created dataframes for all of the variables that we are interested in, we can join them together into one final dataframe. -```{r} +```{r 04-Chapter4-53} # Make list of all dataframes to include all_variables <- list(dilution_factor_df, concentration_df, size_distribution_df) @@ -848,7 +848,7 @@ datatable(full_df) ``` For easier downstream analysis, the last step is to separate the `PDF Identifier` column into an informative sample ID that matches up with other experimental data. -```{r} +```{r 04-Chapter4-54} final_df <- full_df %>% separate('PDF Identifier', # Split sample identifier column into new columns, retaining the original column @@ -860,7 +860,7 @@ datatable(final_df) ``` Let's make a graph to help us answer Environmental Health Question 1. -```{r message = FALSE} +```{r 04-Chapter4-55, message = FALSE} theme_set(theme_bw()) data_for_graphing <- final_df %>% @@ -893,12 +893,12 @@ The above workflow is useful if you just want to extract a few specific values f The following example is based on extracting dataframes from a long PDF containing many individual data tables. This particular PDF came from the NIH's BioLINCC Repository and details variables that researchers can request from the repository. 
Variables are part of larger datasets that contain many variables, with each dataset in a separate table. All of the tables are stored in one PDF file, and some of the tables are longer than one page (this will become relevant later on!). Similar to the first PDF workflow, remember that this is a specific example intended to demonstrate how to work through extracting data from PDFs. Modifications will need to be made for differently formatted PDFs. Here is what the first three pages of our 75-page starting PDF look like: -```{r echo = FALSE, out.width = "850px", fig.align = "center"} +```{r 04-Chapter4-56, echo = FALSE, out.width = "850px", fig.align = "center"} knitr::include_graphics("Chapter_4/Module4_3_Input/Module4_3_Image3.png") ``` If we zoom in a bit more on the first page, we can see that the dataset name is defined in bold above each table. This formatting is consistent throughout the PDF. -```{r echo = FALSE, out.width = "850px", fig.align = "center"} +```{r 04-Chapter4-57, echo = FALSE, out.width = "850px", fig.align = "center"} knitr::include_graphics("Chapter_4/Module4_3_Input/Module4_3_Image4.png") ``` @@ -910,7 +910,7 @@ The zoomed in view also allows us to see the columns and their contents more cle - `Label`: A description of the variable and values associated with the variable. After extracting the data, we want to end up with a dataframe that contains all of the variables, their corresponding columns, and a column that indicates which dataset the variable is associated with: -```{r echo = FALSE} +```{r 04-Chapter4-58, echo = FALSE} biolincc_final <- read.xlsx("Chapter_4/Module4_3_Input/Module4_3_InputData3.xlsx") %>% clean_names() @@ -923,7 +923,7 @@ datatable(biolincc_final) Similar to previous sections, we need to install and load a few packages before proceeding. The *tabulapdf* package needs to be installed in a specific way as shown below and can sometimes be difficult to install on Macs. 
If errors are produced, follow the troubleshooting tips outlined in [this](https://stackoverflow.com/questions/67849830/how-to-install-rjava-package-in-mac-with-m1-architecture) Stack Overflow solution. -```{r eval = FALSE} +```{r 04-Chapter4-59, eval = FALSE} # To install all of the packages except for tabulapdf if (!requireNamespace("stringr")) install.packages("stringr") @@ -933,7 +933,7 @@ if (!requireNamespace("rJava")) install.packages("rJava") ``` -```{r message = FALSE, eval = FALSE} +```{r 04-Chapter4-60, message = FALSE, eval = FALSE} # To install tabulapdf if (!require("remotes")) { install.packages("remotes") @@ -945,7 +945,7 @@ remotes::install_github(c("ropensci/tabulizerjars", "ropensci/tabulapdf"), force ``` Load packages: -```{r message = FALSE, eval = FALSE} +```{r 04-Chapter4-61, message = FALSE, eval = FALSE} library(tabulapdf) library(tidyverse) library(janitor) @@ -956,17 +956,17 @@ library(stringr) #### Initial data import from PDF file The `extract_tables()` function automatically extracts tables from PDFs and stores them as tibbles (a specific tidyverse data structure similar to a dataframe) within a list. One table is extracted per page, even if the table spans multiple pages. This line of code can take a few seconds to run depending on the length of your PDF. -```{r} +```{r 04-Chapter4-62} tables <-tabulapdf::extract_tables("Chapter_4/Module4_3_Input/Module4_3_InputData4.pdf", output = "tibble") ``` Glimpsing the first three elements in the tables list, we can see that each list element is a dataframe containing the columns from the PDF tables. 
-```{r} +```{r 04-Chapter4-63} glimpse(tables[1:3]) ``` Exploring further, here is how each dataframe is formatted: -```{r} +```{r 04-Chapter4-64} datatable(tables[[1]]) ``` @@ -975,7 +975,7 @@ Notice that, although the dataframe format mirrors the PDF table format, the lab ### Cleaning dataframes First, we will select the columns we are interested in and use the `fill()` function to change the NAs in the "Num" column so that each line of text in the "Label" column has the correct "Num" value in the same row. -```{r} +```{r 04-Chapter4-65} cleaned_table1 <- data.frame(tables[[1]]) %>% # Extract the first table in the list # Select only the columns of interest @@ -991,7 +991,7 @@ datatable(cleaned_table1) ``` We still need to move all of the Label text for each variable into one cell in one row instead of across multiple rows. For this, we can use the `unlist()` function. Here is a demonstration of how the `unlist()` function works using just the first variable: -```{r} +```{r 04-Chapter4-66} cleaned_table1_var1 <- cleaned_table1 %>% # Filter dataframe to just contain rows associated with the first variable @@ -1004,7 +1004,7 @@ datatable(cleaned_table1_var1) ``` We now have all of the text we want in one cell, but we have duplicate rows that we don't need. We can get rid of these rows by assigning blank values "NA" and then omitting rows that contain NAs. -```{r warning = FALSE} +```{r 04-Chapter4-67, warning = FALSE} cleaned_table1_var1 <- cleaned_table1_var1 %>% mutate(across(Variable, na_if, "")) %>% na.omit() @@ -1013,7 +1013,7 @@ datatable(cleaned_table1_var1) ``` We need to apply this code to the whole dataframe and not just one variable, so we can add `group_by(Num)` to our cleaning workflow, followed by the code we just applied to our filtered dataframe. 
-```{r warning = FALSE} +```{r 04-Chapter4-68, warning = FALSE} cleaned_table1 <- data.frame(tables[[1]]) %>% # Extract the first table in the list # Select only the columns of interest @@ -1042,7 +1042,7 @@ datatable(cleaned_table1) Ultimately, we need to clean up each dataframe in the list the same way, and we need all of the dataframes to be in one dataframe, instead of in a list. There are a couple of different ways to do this. Both rely on the code shown above for cleaning up each dataframe. Option #1 uses a for loop, while Option #2 uses application of a function on the list of dataframes. Both result in the same ending dataframe! **Option #1** -```{r warning = FALSE} +```{r 04-Chapter4-69, warning = FALSE} # Create a dataframe for storing variables variables <- data.frame() @@ -1066,7 +1066,7 @@ datatable(variables) ``` **Option #2** -```{r warning = FALSE} +```{r 04-Chapter4-70, warning = FALSE} # Write a function that applies all of the cleaning steps to a dataframe (output = cleaned dataframe) clean_tables <- function(data) { @@ -1101,14 +1101,14 @@ First, we will read in the pdf using the PDF tools package. This results in a ve + Each line is separated by `\n` + Elements [1] and [2] of the vector contain the text "Data Set Name:", while element [3] does not because the third page was a continuation of the table from the second page and therefore did not have a table title. -```{r} +```{r 04-Chapter4-71} table_names <- pdf_text("Chapter_4/Module4_3_Input/Module4_3_InputData4.pdf") head(table_names[1:3]) ``` Similar to the table cleaning section, we will work through an example of extracting the text of interest from one of these character vectors, then apply the same code to all of the character vectors. First, we will select just the first element in the vector and make it into a dataframe.
-```{r} +```{r 04-Chapter4-72} # Create dataframe dataset_name_df_var1 <- data.frame(strsplit(table_names[1], "\n")) @@ -1120,7 +1120,7 @@ datatable(dataset_name_df_var1) ``` Next, we will extract the dataset name using the same approach used in extracting values from the nanoparticle tracking example above and assign the name to a variable. We filter by the string "Data Set Name" because this is the start of the text string in the row where our dataset name is stored and is the same across all of our datasets. -```{r} +```{r 04-Chapter4-73} # Create dataframe dataset_name_df_var1 <- dataset_name_df_var1 %>% filter(grepl("Data Set Name", dataset_name_df_var1$Text)) %>% @@ -1134,7 +1134,7 @@ dataset_name_var1 ``` Now that we have the dataset name stored as a variable, we can create a dataframe that will correspond to the rows in our `variables` dataframe. The challenge is that each dataset contains a different number of variables! We can determine how many rows each dataset contains by returning to our `variables` dataframe and calculating the number of rows associated with each dataset. The following code splits the `variables` dataframe into a list of dataframes by each occurrence of 1 in the "Num" column (when the numbering restarts for a new dataset). -```{r} +```{r 04-Chapter4-74} # Calculate the number of rows associated with each dataset for reference dataset_list <- split(variables, cumsum(variables$Num == 1)) @@ -1142,7 +1142,7 @@ glimpse(dataset_list[1:3]) ``` The number of rows in each list is the number of variables in that dataset. We can use this value in creating our dataframe of dataset names. -```{r} +```{r 04-Chapter4-75} # Store the number of rows in a variable n_rows = nrow(data.frame(dataset_list[1])) @@ -1154,7 +1154,7 @@ datatable(dataset_name_var1) ``` We now have a dataframe that can be joined with our `variables` dataframe for the first table. We can apply this approach to each table in our original PDF using a `for` loop. 
-```{r} +```{r 04-Chapter4-76} # Make dataframe to store dataset names dataset_names <- data.frame() @@ -1201,7 +1201,7 @@ datatable(dataset_names) ### Combining Dataset Names and Variable Information Last, we will merge together the dataframe containing dataset names and variable information. -```{r} +```{r 04-Chapter4-77} # Merge together final_variable_df <- cbind(dataset_names, variables) %>% rename("Variable Description" = "Label", "Variable Number Within Dataset" = "Num") %>% @@ -1211,7 +1211,7 @@ datatable(final_variable_df) ``` We can also determine how many total variables we have, all of which are accessible via the table we just generated. -```{r} +```{r 04-Chapter4-78} # Total number of variables nrow(final_variable_df) @@ -1263,7 +1263,7 @@ This training module was specifically developed to answer the following environm ### Workspace Preparation and Data Import Here, we will import the processed data that we generated at the end of **TAME 2.0 Module 4.2 Data Import, Processing, and Summary Statistics**. These data, along with the associated demographic data, were introduced in **TAME 2.0 Module 4.1 Overview of Experimental Design and Example Data**. These data represent log~2~ concentrations of inflammatory biomarkers secreted by airway epithelial cells after exposure to four different concentrations of acrolein (plus filtered air as a control). We will also load packages that will be needed for the analysis, including previously introduced packages such as *openxlsx*, *tidyverse*, *DT*, and *ggpubr*, and additional packages relevant to statistical analysis and graphing that will be discussed in greater detail below. 
-```{r message = FALSE} +```{r 04-Chapter4-79, message = FALSE} # Load packages library(openxlsx) library(tidyverse) @@ -1272,7 +1272,7 @@ library(rstatix) library(ggpubr) ``` -```{r} +```{r 04-Chapter4-80} # Import data biomarker_data <- read.xlsx("Chapter_4/Module4_4_Input/Module4_4_InputData1.xlsx") demographic_data <- read.xlsx("Chapter_4/Module4_4_Input/Module4_4_InputData2.xlsx") @@ -1327,7 +1327,7 @@ This decision matters more when dealing with smaller sample sizes (*n*<10) as sm ### Which test should I choose? We provide the following flowchart to help guide your choice of statistical test to compare two groups: -```{r, echo = FALSE, fig.align = "center", out.width = "800px"} +```{r 04-Chapter4-81, echo = FALSE, fig.align = "center", out.width = "800px" } knitr::include_graphics("Chapter_4/Module4_4_Input/Module4_4_Image1.png") ``` @@ -1345,34 +1345,34 @@ In discussions of effect size, the population size is also a consideration - a s We will start by performing a statistical test to determine whether there are significant differences in biomarker concentrations between male and female donors at baseline (0 ppm exposure). Previously we determined that the majority of our data was non-normally distributed (see **TAME 2.0 Module 4.2 Data Import, Processing, and Summary Statistics**), so we'll skip testing for that assumption in this module. Based on those results, we will use the Wilcoxon test to determine if there are significant differences between groups. The Wilcoxon test does not assume homogeneity of variance, so we do not need to test for that prior to applying the test. This is an unpaired analysis because samples collected from the cells derived from male and female donor cells are different sets of cells (i.e., independent from each other). Thus, the specific statistical test applied will be the Wilcoxon Rank Sum test. 
First, we will filter our dataframe to only data representing the control (0 ppm) exposure: -```{r} +```{r 04-Chapter4-82} biomarker_data_malevsfemale <- biomarker_data %>% filter(Dose == "0") ``` Next, we need to add the demographic data to our dataframe: -```{r} +```{r 04-Chapter4-83} biomarker_data_malevsfemale <- biomarker_data_malevsfemale %>% left_join(demographic_data %>% select(Donor, Sex), by = "Donor") ``` Here is what our data look like now: -```{r} +```{r 04-Chapter4-84} datatable(biomarker_data_malevsfemale) ``` We can demonstrate the basic anatomy of the Wilcoxon test function `wilcox.test()` by running the function on just one variable. -```{r} +```{r 04-Chapter4-85} wilcox.test(IL1B ~ Sex, data = biomarker_data_malevsfemale) ``` The p-value of 0.8371 indicates that males and females do not have significantly different concentrations of IL-1$\beta$. The `wilcox.test()` function is part of the pre-loaded package *stats*. The package [*rstatix*](https://rpkgs.datanovia.com/rstatix/) provides identical statistical tests to *stats* but in a pipe-friendly (tidyverse-friendly) format, and these functions output results as dataframes rather than the text displayed above. -```{r} +```{r 04-Chapter4-86} biomarker_data_malevsfemale %>% wilcox_test(IL1B ~ Sex) ``` Here, we can see the exact same results as with the `wilcox.test()` function. For the rest of this module, we'll proceed with using the *rstatix* version of statistical testing functions. Although it is simple to run the Wilcoxon test with the code above, it's impractical for a large number of endpoints and doesn't store the results in an organized way. Instead, we can run the Wilcoxon test over every variable of interest using a `for` loop. There are also other ways you could approach this, such as a function applied over a list. This `for` loop runs the Wilcoxon test on each endpoint, stores the results in a dataframe, and then binds together the results dataframes for each variable of interest. 
Note that you could easily change `wilcox_test()` to `t_test()` and add additional arguments to modify the way the statistical test is run. -```{r warning = FALSE} +```{r 04-Chapter4-87, warning = FALSE} # Create a vector with the names of the variables you want to run the test on endpoints <- colnames(biomarker_data_malevsfemale %>% select(IL1B:VEGF)) @@ -1433,18 +1433,18 @@ For this analysis, we will not adjust for multiple hypothesis testing due to our To demonstrate an example of a paired two group test, we can also determine whether exposure to 4 ppm acrolein significantly changes biomarker concentrations. This is now a paired design because each donor's cells were exposed to both 0 and 4 ppm acrolein. To prepare the data, we will filter the dataframe to only include 0 and 4 ppm: -```{r} +```{r 04-Chapter4-88} biomarker_data_0vs4 <- biomarker_data %>% filter(Dose == "0" | Dose == "4") ``` Let's view the dataframe. Note how the measurements for each donor are next to each other - this is an important element of the default handling of the paired analysis in R. The dataframe should have the donors in the same order for the 0 and 4 ppm data. -```{r} +```{r 04-Chapter4-89} datatable(biomarker_data_0vs4) ``` We can now run the same type of loop that we ran before, changing the independent variable in the formula to `~ Dose` and adding `paired = TRUE` to the `wilcox_test()` function.
-```{r} +```{r 04-Chapter4-90} # Create a vector with the names of the variables you want to run the test on endpoints <- colnames(biomarker_data_0vs4 %>% select(IL1B:VEGF)) @@ -1472,7 +1472,7 @@ dose_wilcoxres Although this dataframe contains useful information about our statistical test, such as the groups being compared, the sample size (*n*) of each group, and the test statistic, what we really want (and what would likely be shared in supplemental material), is a more simplified version of these results in table format and more detailed information (*n*, specific statistical test, groups being compared) in the table legend. We can clean up the results using the following code to make clearer column names and ensure that the p-values are formatted consistently. -```{r} +```{r 04-Chapter4-91} dose_wilcoxres <- dose_wilcoxres %>% select(c(.y., p)) %>% mutate(p = format(p, digits = 3, scientific = TRUE)) %>% @@ -1499,7 +1499,7 @@ Now, let's visualize our results using *ggplot2*. For an introduction to *ggplot ### Single Plots We will start by making a very basic box and whisker plot of the IL-1$\beta$ data with individual data points overlaid. It is best practice to show all data points, allowing the reader to view the whole spread of the data, which can be obscured by plots such as bar plots with mean and standard error. -```{r, fig.align = "center"} +```{r 04-Chapter4-92, fig.align = "center"} # Setting theme for plot theme_set(theme_bw()) @@ -1510,7 +1510,7 @@ ggplot(biomarker_data_0vs4, aes(x = Dose, y = IL1B)) + ``` We could add statistical markings to denote significance to this graph manually in PowerPoint or Adobe Illustrator, but there are actually R packages that act as extensions to *ggplot2* and will do this for you! 
Two of our favorites are [*ggpubr*](http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/76-add-p-values-and-significance-levels-to-ggplots/) and [*ggsignif*](https://cran.r-project.org/web/packages/ggsignif/vignettes/intro.html). Here is an example using *ggpubr*: -```{r, fig.align = "center"} +```{r 04-Chapter4-93, fig.align = "center"} ggplot(biomarker_data_0vs4, aes(x = Dose, y = IL1B)) + geom_boxplot() + geom_jitter(position = position_jitter(0.15)) + @@ -1519,7 +1519,7 @@ ggplot(biomarker_data_0vs4, aes(x = Dose, y = IL1B)) + ``` We can further clean up our figure by modifying elements of the plot's theme, including the font sizes, axis range, colors, and the way that the statistical results are presented. Perfecting figures can be time consuming but ultimately worth it, because clear figures aid greatly in presenting a coherent story that is understandable to readers/listeners. -```{r fig.align = "center"} +```{r 04-Chapter4-94, fig.align = "center"} ggplot(biomarker_data_0vs4, aes(x = Dose, y = IL1B)) + # outlier.shape = NA removes outliers geom_boxplot(aes(fill = Dose), outlier.shape = NA) + @@ -1548,7 +1548,7 @@ Making one plot was relatively straightforward, but to graph all of our endpoint While these are workable solutions and would get us to the same place, *ggplot2* actually contains a function - `facet_wrap()` - that can be used to graph multiple endpoints from the same groups in one figure panel, which takes care of a lot of the work for us! 
To prepare our data for facet plotting, first we will pivot it longer: -```{r} +```{r 04-Chapter4-95} biomarker_data_0vs4_long <- biomarker_data_0vs4 %>% pivot_longer(-c(Donor, Dose), names_to = "variable", values_to = "value") @@ -1572,7 +1572,7 @@ Then, we can use similar code to what we used to make our single graph, with a f - To add padding along the y axis, allowing space for significance asterisks, we added `scale_y_continuous(expand = expansion(mult = c(0.1, 0.4)))` -```{r warning = FALSE, fig.align = "center"} +```{r 04-Chapter4-96, warning = FALSE, fig.align = "center"} # Create clean labels for the graph titles new_labels <- c("IL10" = "IL-10", "IL1B" = "IL-1\u03B2 ", "IL6" = "IL-6", "IL8" = "IL-8", "TNFa" = "TNF-\u03b1", "VEGF" = "VEGF") @@ -1651,12 +1651,12 @@ This training module was specifically developed to answer the following environm Here, we will import the processed data that we generated at the end of TAME 2.0 Module 4.2, introduced in **TAME 2.0 Module 4.1 Overview of Experimental Design and Example Data** and the associated demographic data. These data represent log~2~ concentrations of inflammatory biomarkers secreted by airway epithelial cells after exposure to four different concentrations of acrolein (plus filtered air as a control). We will also load packages that will be needed for the analysis, including previously introduced packages such as *openxlsx*, *tidyverse*, *DT*, *ggpubr*, and *rstatix*. 
#### Cleaning the global environment -```{r, clear_envi, echo=TRUE, eval=TRUE} +```{r 04-Chapter4-97, clear_envi, echo=TRUE, eval=TRUE} rm(list=ls()) ``` #### Loading R packages required for this session -```{r, load_ libs, echo=TRUE, eval=TRUE, warning=FALSE, error=FALSE, results='hide', message=FALSE} +```{r 04-Chapter4-98, load_libs, echo=TRUE, eval=TRUE, warning=FALSE, error=FALSE, results='hide', message=FALSE} library(openxlsx) library(tidyverse) library(DT) @@ -1665,12 +1665,12 @@ library(ggpubr) ``` #### Set your working directory -```{r, file_path, echo=TRUE, eval=FALSE, error=FALSE, results='hide', message=FALSE} +```{r 04-Chapter4-99, file_path, echo=TRUE, eval=FALSE, error=FALSE, results='hide', message=FALSE} setwd("/filepath to where your input files are") ``` #### Importing example dataset -```{r, read_ data, echo=TRUE, eval=TRUE} +```{r 04-Chapter4-100, read_data, echo=TRUE, eval=TRUE} biomarker_data <- read.xlsx("Chapter_4/Module4_5_Input/Module4_5_InputData1.xlsx") demographic_data <- read.xlsx("Chapter_4/Module4_5_Input/Module4_5_InputData2.xlsx") @@ -1683,7 +1683,7 @@ datatable(demographic_data) ## Overview of Multi-Group Statistical Tests Before applying statistical tests to our data, let's first review the mechanics of multi-group statistical tests, including overall effects tests and post-hoc tests. -```{r, echo = FALSE, fig.align = "center", out.width = "600px"} +```{r 04-Chapter4-101, echo = FALSE, fig.align = "center", out.width = "600px" } knitr::include_graphics("Chapter_4/Module4_5_Input/Module4_5_Image1.png") ``` @@ -1691,7 +1691,7 @@ knitr::include_graphics("Chapter_4/Module4_5_Input/Module4_5_Image1.png") The first step for multi-group statistical testing is to run an overall effects test. The null hypothesis for the overall effects test is that there are no differences among group means. 
A significant p-value rejects the null hypothesis that the groups are drawn from populations with the same mean and indicates that at least one group differs significantly from the overall mean. Similar to two-group statistical testing, choice of the specific overall statistical test to run depends on whether the data are normally or non-normally distributed and whether the experimental design is paired: -```{r, echo = FALSE, fig.align = "center", out.width = "700px"} +```{r 04-Chapter4-102, echo = FALSE, fig.align = "center", out.width = "700px" } knitr::include_graphics("Chapter_4/Module4_5_Input/Module4_5_Image2.png") ``` @@ -1700,7 +1700,7 @@ Importantly, overall effects tests return **one** p-value regardless of the numb ### Post-Hoc Testing If significance is obtained with an overall effects test, we can use post-hoc testing to determine which specific pairs of groups are significantly different from each other. Just as with two group statistical tests and overall effects multi-group statistical tests, choosing the appropriate post-hoc test depends on the data's normality and whether the experimental design is paired: -```{r, echo = FALSE, fig.align = "center", out.width = "700px"} +```{r 04-Chapter4-103, echo = FALSE, fig.align = "center", out.width = "700px" } knitr::include_graphics("Chapter_4/Module4_5_Input/Module4_5_Image3.png") ``` @@ -1715,7 +1715,7 @@ When applying a post-hoc test, you may choose to compare every group to every ot ### Which test should I choose? 
Use the following flowchart to help guide your choice of statistical test to compare multiple groups: -```{r, echo = FALSE, fig.align = "center", out.width = "900px"} +```{r 04-Chapter4-104, echo = FALSE, fig.align = "center", out.width = "900px" } knitr::include_graphics("Chapter_4/Module4_5_Input/Module4_5_Image4.png") ``` @@ -1724,14 +1724,14 @@ knitr::include_graphics("Chapter_4/Module4_5_Input/Module4_5_Image4.png") ## Multi-Group Analysis Example To determine whether there are significant differences across all of our doses, the Friedman test is the most appropriate due to our matched experimental design and non-normally distributed data. The `friedman_test()` function is part of the [rstatix](https://github.com/kassambara/rstatix) package. This package also has many other helpful functions for statistical tests that are pipe/tidyverse friendly. To demonstrate how this test works, we will first perform the test on one variable: -```{r} +```{r 04-Chapter4-105} biomarker_data %>% friedman_test(IL1B ~ Dose | Donor) ``` A p-value of 0.01 indicates that we can reject the null hypothesis that all of our data are drawn from groups that have equivalent means. Now, we can run a `for` loop similar to our two-group comparisons in **TAME 2.0 Module 4.4 Two Group Comparisons and Visualizations** to determine the overall p-value for each endpoint: -```{r} +```{r 04-Chapter4-106} # Create a vector with the names of the variables you want to run the test on endpoints <- colnames(biomarker_data %>% select(IL1B:VEGF)) @@ -1757,7 +1757,7 @@ datatable(dose_friedmanres) ``` These results demonstrate that all of our endpoints have significant overall differences across doses (p < 0.05). To determine which pairwise comparisons are significant, we next need to apply a post-hoc test. 
We will apply a pairwise, paired Wilcoxon test due to our experimental design and data distribution, with the Benjamini-Hochberg (BH) correction for multiple testing: -```{r} +```{r 04-Chapter4-107} dose_wilcox_posthoc_IL1B <- biomarker_data %>% pairwise_wilcox_test(IL1B ~ Dose, paired = TRUE, p.adjust.method = "BH") @@ -1765,7 +1765,7 @@ dose_wilcox_posthoc_IL1B ``` Here, we can now see whether there are statistically significant differences in IL-1$\beta$ secretion between each of our doses. To generate pairwise comparison results for each of our inflammatory biomarkers, we can run a for loop similar to the one we ran for our overall test: -```{r} +```{r 04-Chapter4-108} # Create a vector with the names of the variables you want to run the test on endpoints <- colnames(biomarker_data %>% select(IL1B:VEGF)) @@ -1790,7 +1790,7 @@ datatable(dose_wilcox_posthoc) ``` We now have a dataframe storing all of our pairwise comparison results. However, this is a lot to scroll through, making it hard to interpret. We can generate a publication-quality table by manipulating the table and joining it with the overall test data. -```{r} +```{r 04-Chapter4-109} dose_results_cleaned <- dose_wilcox_posthoc %>% unite(comparison, group1, group2, sep = " vs. ") %>% select(c(.y., comparison, p.adj)) %>% @@ -1804,7 +1804,7 @@ datatable(dose_results_cleaned) ``` To more easily see overall significance patterns, we could also make the same table but with significance stars instead of p-values by keeping the `p.adjust.signif` column instead of the `p.adj` column in our post-hoc test results dataframe: -```{r} +```{r 04-Chapter4-110} dose_results_cleaned_2 <- dose_wilcox_posthoc %>% unite(comparison, group1, group2, sep = " vs. 
") %>% select(c(.y., comparison, p.adj.signif)) %>% @@ -1841,13 +1841,13 @@ We first need to format our existing statistical results so that they match the + `y.position`, which tells the function where to plot the significance markers Our results dataframe for IL-1$\beta$ already contains our groups and p-values: -```{r} +```{r 04-Chapter4-111} datatable(dose_wilcox_posthoc_IL1B) ``` We can add the position columns using the function `add_xy_position()`: -```{r} +```{r 04-Chapter4-112} dose_wilcox_posthoc_IL1B <- dose_wilcox_posthoc_IL1B %>% add_xy_position(x = "Dose", step.increase = 2) @@ -1855,7 +1855,7 @@ datatable(dose_wilcox_posthoc_IL1B) ``` Now, we are ready to make a graph of our results. We will use `stat_friedman_test()` to add our overall p-value and `stat_pvalue_manual()` to add our pairwise values. -```{r out.width = "600px", message = FALSE, fig.align = "center"} +```{r 04-Chapter4-113, out.width = "600px", message = FALSE, fig.align = "center"} # Set graphing theme theme_set(theme_bw()) @@ -1884,7 +1884,7 @@ However, to make room for all of our annotations, our data become compressed, an First, let's filter our results to significant results and change the symbol for comparisons that are not to the 0 dose to a caret (^) instead of stars. We can do this by creating a new column called label that keeps the existing label if `group1` is 0, and if not, changes the label to a caret of the same length. We then use the summarize function to paste the labels for each of the groups together, resulting in a final dataframe containing our annotations for our plot. -```{r} +```{r 04-Chapter4-114} dose_wilcox_posthoc_IL1B_2 <- dose_wilcox_posthoc_IL1B %>% # Filter results to those that are significant @@ -1909,7 +1909,7 @@ dose_wilcox_posthoc_IL1B_2 ``` Then, we can use the same code as for our previous plot, but instead of using `stat_pvalue_manual()`, we will use `geom_text()` in combination with the dataframe we just created. 
-```{r out.width = "600px", fig.align = "center"} +```{r 04-Chapter4-115, out.width = "600px", fig.align = "center"} ggplot(biomarker_data, aes(x = Dose, y = IL1B)) + geom_boxplot(aes(fill = Dose), outlier.shape = NA) + scale_fill_manual(values = c("#BFBFBF", "#D5A298", "#E38273", "#EB5F4E", "#EE2B2B")) + @@ -1950,11 +1950,11 @@ We will take similar steps here that we did when constructing our single endpoin 2. Add to the label/annotation dataframe what we want the y position for each of the labels to be, which will be different for each endpoint. First, let's create our annotations dataframe. We will start with the results dataframe from our posthoc testing: -```{r} +```{r 04-Chapter4-116} datatable(dose_wilcox_posthoc) ``` -```{r} +```{r 04-Chapter4-117} dose_wilcox_posthoc_forgraph <- dose_wilcox_posthoc %>% filter(p.adj <= 0.05) %>% @@ -1970,7 +1970,7 @@ datatable(dose_wilcox_posthoc_forgraph) The `Dose` column will be used to tell *ggplot2* where to place the annotations on the x axis, but we need to also specify where to add the annotations on the y axis. This will be different for each variable because each variable is on a different scale. We can approach this by computing the maximum value of each variable, then increasing that by 20% to add some space on top of the points. -```{r} +```{r 04-Chapter4-118} sig_labs_y <- biomarker_data %>% summarise(across(IL1B:VEGF, \(x) max(x))) %>% t() %>% as.data.frame() %>% @@ -1982,13 +1982,13 @@ sig_labs_y ``` Then, we can join these data to our labeling dataframe to complete what we need to make the annotations. -```{r} +```{r 04-Chapter4-119} dose_wilcox_posthoc_forgraph <- dose_wilcox_posthoc_forgraph %>% left_join(sig_labs_y, by = "variable") ``` Now, it's time to graph! Keep in mind that although the plotting script can get long and unweildy, each line is just a new instruction to ggplot about a formatting element or an additional layer to add to the graph. 
-```{r out.width = "800px", fig.align = "center"} +```{r 04-Chapter4-120, out.width = "800px", fig.align = "center"} # Pivot data longer biomarker_data_long <- biomarker_data %>% pivot_longer(-c(Donor, Dose), names_to = "variable", values_to = "value") @@ -2090,12 +2090,12 @@ This training module was specifically developed to answer the following environm Here, we will import the processed data that we generated at the end of TAME 2.0 Module 4.2, introduced in **TAME 2.0 Module 4.1 Overview of Experimental Design and Example Data** and associated demographic data. These data represent log~2~ concentrations of inflammatory biomarkers secreted by airway epithelial cells after exposure to four different concentrations of acrolein (plus filtered air as a control). We will also load packages that will be needed for the analysis, including previously introduced packages such as *openxlsx*, *tidyverse*, *DT*, *ggpubr*, and *rstatix*. #### Cleaning the global environment -```{r, clear__env, echo=TRUE, eval=FALSE} +```{r 04-Chapter4-121, clear__env, echo=TRUE, eval=FALSE} rm(list=ls()) ``` #### Loading R packages required for this session -```{r, load__libs, echo=TRUE, eval=TRUE, warning=FALSE, error=FALSE, results='hide', message=FALSE} +```{r 04-Chapter4-122, load__libs, echo=TRUE, eval=TRUE, warning=FALSE, error=FALSE, results='hide', message=FALSE} library(openxlsx) library(tidyverse) library(DT) @@ -2108,12 +2108,12 @@ theme_set(theme_bw()) # Set graphing theme ``` #### Set your working directory -```{r, file path, echo=TRUE, eval=FALSE, error=FALSE, results='hide', message=FALSE} +```{r 04-Chapter4-123, file_path, echo=TRUE, eval=FALSE, error=FALSE, results='hide', message=FALSE} setwd("/filepath to where your input files are") ``` #### Importing example dataset -```{r, read__data, echo=TRUE, eval=TRUE} +```{r 04-Chapter4-124, read__data, echo=TRUE, eval=TRUE} biomarker_data <- read.xlsx("Chapter_4/Module4_6_Input/Module4_6_InputData1.xlsx") demographic_data <- 
read.xlsx("Chapter_4/Module4_6_Input/Module4_6_InputData2.xlsx") @@ -2160,7 +2160,7 @@ The first step would be to check that the assumptions (independence, homogeneity To run our two-way ANOVA, we will use the `anova_test()` function from the *rstatix* package. This function allows us to define subject identifiers for matching between-subject factor variables (such as sex - factors that differ between subjects) and within-subject factors (such as dose - factors that are measured within each subject). Since we have both between- and within- subject factors, we will specifically be running a two-way mixed ANOVA. First, we need to add our demographic data to our biomarker data so that these variables can be incorporated into the analysis. Also, we need to convert `Dose` into a factor to specify the levels. -```{r} +```{r 04-Chapter4-125} biomarker_data <- biomarker_data %>% left_join(demographic_data, by = "Donor") %>% mutate(Dose = factor(Dose, levels = c("0", "0.6", "1", "2", "4"))) @@ -2170,7 +2170,7 @@ datatable(biomarker_data) ``` Then, we can demonstrate how to run the two-way ANOVA and what the results look like by running the test on just one of our variables (IL-1$\beta$). -```{r} +```{r 04-Chapter4-126} get_anova_table(anova_test(data = biomarker_data, dv = IL1B, wid = Donor, @@ -2193,7 +2193,7 @@ Similar to previous modules, we now want to apply our two-way ANOVA to each of o 1. Loop through each column in the data and apply the test to each column. 2. Pull out statistics we are interested in (for example, p-value) and bind the results from each column together into a results dataframe. -```{r} +```{r 04-Chapter4-127} # Create a vector with the names of the variables you want to run the test on endpoints <- colnames(biomarker_data %>% dplyr::select(IL1B:VEGF)) @@ -2236,7 +2236,7 @@ An appropriate title for this table could be: From this table, dose is the only variable with significant differences in concentrations in all 6 biomarkers (p-value < 0.05). 
Although we know that dose has significant differences overall, an ANOVA test doesn't tell us which doses of acrolein differ from each other or the directionality of each biomarker's change in concentration after exposure to each dose. Therefore, we need to use a post-hoc test. One common post-hoc test following a one-way or two-way ANOVA is a Tukey’s HSD. However, there is no way to pass the output of the `anova_test()` function to the `TukeyHSD()` function. A good alternative is a pairwise t-test with a Bonferroni correction. Our data are paired in that there are repeated measures (doses) on each subject. -```{r} +```{r 04-Chapter4-128} # Create data frame to store results twoway_aov_pairedt <- data.frame(Comparison = c("0_0.6", "0_1", "0_2", "0_4", "0.6_1", "0.6_2", "0.6_4", "1_2", "1_4", "2_4")) @@ -2285,7 +2285,7 @@ Note that this table and the two-way ANOVA table would likely be put into supple Since our overall p-values associated with dose were significant for a number of mediators, we will proceed with creating our final figures with our endpoints by dose, showing the overall two-way ANOVA p-value and the pairwise comparisons from our post hoc paired pairwise t-tests. To facilitate plotting in a faceted panel, we'll first pivot our `biomarker_data` dataframe longer. -```{r} +```{r 04-Chapter4-129} biomarker_data_long <- biomarker_data %>% dplyr::select(-c(Age_yr, Sex)) %>% pivot_longer(-c(Donor, Dose), names_to = "Variable", values_to = "Value") @@ -2294,7 +2294,7 @@ datatable(biomarker_data_long) ``` Then, we will create an annotation dataframe for adding our overall two-way ANOVA p-values. This dataframe needs to contain a column for our variables (to match with our variable column in our `biomarker_data_long` dataframe) and the p-value for annotation. We can extract these from our `two_way_aov_res` dataframe generated above. 
-```{r} +```{r 04-Chapter4-130} overall_dose_pvals <- twoway_aov_res %>% # Transpose dataframe column_to_rownames("Factor") %>% @@ -2308,7 +2308,7 @@ datatable(overall_dose_pvals) ``` We now have our p-values for each biomarker. Next, we'll make a column where our p-values are formatted with "p = " for annotation on the graph. -```{r} +```{r 04-Chapter4-131} overall_dose_pvals <- overall_dose_pvals %>% mutate(`P Value` = formatC(`P Value`, format = "e", digits = 2), label = paste("p = ", `P Value`, sep = "")) @@ -2317,7 +2317,7 @@ datatable(overall_dose_pvals) ``` Finally, we'll add a column indicating where to add the labels on the y-axis. This will be different for each variable because each variable is on a different scale. We can approach this by computing the maximum value of each variable, then increasing that by 10% to add some space on top of the points. -```{r} +```{r 04-Chapter4-132} sig_labs_y <- biomarker_data %>% summarise(across(IL1B:VEGF, \(x) max(x))) %>% t() %>% as.data.frame() %>% @@ -2336,7 +2336,7 @@ datatable(overall_dose_pvals) ``` Now, we'll use the `biomarker_data` dataframe to plot our individual points and boxplots (similar to the plotting demonstrated in previous TAME Chapter 4 modules) and our `overall_dose_pvals` dataframe to add our p value annotation. -```{r fig.width = 12, fig.height = 6, fig.align='center'} +```{r 04-Chapter4-133, fig.width = 12, fig.height = 6, fig.align='center'} # Create clean labels for the graph titles new_labels <- c("IL10" = "IL-10", "IL1B" = "IL-1\u03B2 ", "IL6" = "IL-6", "IL8" = "IL-8", "TNFa" = "TNF-\u03b1", "VEGF" = "VEGF") @@ -2376,7 +2376,7 @@ It's a bit more difficult to add the pairwise t test results to the boxplots com In the following ANCOVA example, we'll still investigate potential differences in cytokine concentrations as result of varying doses of acrolein. 
However, this time we'll adjust for sex and age to answer our second environmental health question: **Are there significant differences in inflammatory biomarker concentrations across different doses of acrolein after controlling for sex and age?** Let's first demonstrate how to run an ANCOVA and what the results look like by running the test on just one of our variables (IL-1$\beta$). The `Anova()` function was specifically designed to run type II or III ANOVA tests, which have different approaches to dealing with interaction terms and unbalanced datasets. For more information on Type I, II, III ANOVA tests, check out [Anova – Type I/II/III SS explained](https://md.psych.bio.uni-goettingen.de/mv/unit/lm_cat/lm_cat_unbal_ss_explained.html). For the purposes of this example just know that there isn't much of a difference between the type I, II, or III results. -```{r} +```{r 04-Chapter4-134} anova_test = aov(IL1B ~ Dose + Sex + Age_yr, data = biomarker_data) type3_anova = Anova(anova_test, type = 'III') type3_anova @@ -2384,7 +2384,7 @@ type3_anova Based on the table above, there are significant differences in IL-1$\beta$ concentrations in dose after adjusting for sex and age (p-value = 0.009). Now we'll run ANCOVA tests across all of our biomarkers. -```{r} +```{r 04-Chapter4-135} # Create data frame to store results ancova_res = data.frame() @@ -2417,7 +2417,7 @@ datatable(ancova_res) ``` Looking at the table above, there are statistically significant differences in all cytokine concentrations with the exception of IL-6 based on dose (p adj < 0.05). To determine what doses were significantly different from one another we'll need to run Tukey's post hoc tests.
-```{r} +```{r 04-Chapter4-136} # Create results data frame with a column showing the comparisons (extracted from single run vs for loop) tukey_res <- data.frame(Comparison = c("0.6 - 0", "1 - 0", "2 - 0", "4 - 0", "1 - 0.6", "2 - 0.6", "4 - 0.6", "2 - 1", "4 - 1", "4 - 2")) @@ -2463,7 +2463,7 @@ Before graphing these results, we first need to think about which ones we want t 2. Filter to comparisons including only the 0 group. 3. Pivot the dataframe longer, to match the format of our data used as input for facet plotting. 4. Filter to only p-values that are less than 0.05. -```{r} +```{r 04-Chapter4-137} tukey_res_forgraph <- tukey_res %>% separate(Comparison, into = c("group1", "group2"), sep = " - ") %>% filter(group2 == "0") %>% @@ -2480,7 +2480,7 @@ Next, we can take a few steps to add columns to the dataframe that will aid in g 1. Add a column for significance stars. 2. Add a column to indicate the y position for the significance annotation (similar to the above example with the two-way ANOVA). -```{r} +```{r 04-Chapter4-138} # Add column for significance stars tukey_res_forgraph <- tukey_res_forgraph %>% mutate(p.signif = ifelse(`P Value` < 0.0001, "****", @@ -2507,7 +2507,7 @@ datatable(tukey_res_forgraph) ``` We also need to prepare our overall p-values from our ANCOVA for display: -```{r} +```{r 04-Chapter4-139} ancova_res_forgraphing <- ancova_res %>% rename(`P Value` = Dose) %>% rownames_to_column("Variable") %>% @@ -2518,7 +2518,7 @@ ancova_res_forgraphing <- ancova_res %>% ``` Now, we are ready to make our graph! We will use similar code to the above, this time adding in our significance stars over specific columns.
-```{r fig.width = 12, fig.height = 7, fig.align='center'} +```{r 04-Chapter4-140, fig.width = 12, fig.height = 7, fig.align='center'} # Make graph ggplot(biomarker_data_long, aes(x = Dose, y = Value)) + # outlier.shape = NA removes outliers diff --git a/Chapter_5/.DS_Store b/Chapter_5/.DS_Store deleted file mode 100644 index 030439e..0000000 Binary files a/Chapter_5/.DS_Store and /dev/null differ diff --git a/Chapter_5/05-Chapter5.Rmd b/Chapter_5/05-Chapter5.Rmd index 85ed6aa..4fee44a 100644 --- a/Chapter_5/05-Chapter5.Rmd +++ b/Chapter_5/05-Chapter5.Rmd @@ -36,7 +36,7 @@ Before diving in to the applications of AI and ML in environmental health, let's **Machine Learning (ML)** can be thought of as a subset of AI and describes a computer system that iteratively learns and improves from that experience autonomously. Below is a high level taxonomy of AI. It's not meant to be an exhaustive depiction of all AI techniques but a simple visualization of how some of these methodologies are nested within each other. **Note**: AI can be categorized in different ways and may deviate from what is illustrated below. -```{r out.width = "800px", echo = FALSE, out.width = "75%", fig.align = 'center'} +```{r 05-Chapter5-1, out.width = "800px", echo = FALSE, out.width = "75%", fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_1_Input/Module5_1_Image1.png") ``` @@ -48,7 +48,7 @@ Advantages of AI and ML include the automation of repetitive tasks, complex prob It is important to understand the methodological "roots" of current methods. Otherwise, it seems like every approach is novel! AI and ML methods have been around since the mid- to late- 1900s and continue to evolve in the present day. 
The earliest conceptual roots for these approaches can be traced from antiquity; however, it is generally thought that the field was named "artificial intelligence" at the ["Dartmouth Workshop"](https://home.dartmouth.edu/about/artificial-intelligence-ai-coined-dartmouth) in 1956, led by John McCarthy and others. The following schematic demonstrates the general taxonomy (categories, sub-fields, and specific methods) of modern AI and ML: -```{r out.width = "800px", echo = FALSE, fig.align = 'center'} +```{r 05-Chapter5-2, out.width = "800px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_1_Input/Module5_1_Image2.png") ``` @@ -74,17 +74,17 @@ There are many avenues to incorporate ML into environmental health research, all One well-known problem that can be better addressed by incorporating ML is the 'too many chemicals, too little data' problem. To detail, there are thousands of chemicals in commerce today. Testing these chemicals one by one for toxicity using comprehensive animal screening experiments would take decades and is not feasible financially. Current efforts to address this problem include using cell-based high throughput screening to efficiently determine biological responses to a variety of chemical exposures and treatment conditions. -```{r out.width = "700px", echo = FALSE, fig.align = 'center'} +```{r 05-Chapter5-3, out.width = "700px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_1_Input/Module5_1_Image3.png") ``` These screening efforts result in increasing amounts of data, which can be gathered to start building big databases. 
-```{r out.width = "700px", echo = FALSE, fig.align = 'center'} +```{r 05-Chapter5-4, out.width = "700px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_1_Input/Module5_1_Image4.png") ``` When many of these datasets and databases are combined, including diversity across different types of screening platforms, technologies, cell types, species, and other experimental variables, the associated dimensionality of the data gets "big." -```{r out.width = "500px", echo = FALSE, fig.align = 'center'} +```{r 05-Chapter5-5, out.width = "500px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_1_Input/Module5_1_Image5.png") ``` @@ -101,7 +101,7 @@ This image shows graphical abstractions of how a "problem" is solved using: + Traditional statistics ((A) logistic regression and (B) linear regression), OR + Machine learning ((C) support vector machines, (D) artificial neural networks, and (E) decision trees) -```{r out.width = "700px", echo = FALSE, fig.align = 'center'} +```{r 05-Chapter5-6, out.width = "700px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_1_Input/Module5_1_Image6.png") ``` @@ -115,7 +115,7 @@ Our *working definition* is that **predictive toxicology** describes a multidisc + Can this suite of *in vitro* assays **predict** what would happen in an organism? + Can we use diverse, high dimensional data to cluster chemicals into **predicted** activity classes? -```{r out.width = "600px", echo = FALSE, fig.align = 'center'} +```{r 05-Chapter5-7, out.width = "600px", echo = FALSE, fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_1_Input/Module5_1_Image7.png") ``` @@ -193,14 +193,14 @@ Supervised machine learning includes: + Classification: Using algorithms to classify a categorical outcome (ie. plant species, disease status, etc.) + Regression: Using algorithms to predict a continuous outcome (ie. gene expression, chemical concentration, etc.) 
-```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 05-Chapter5-8, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image1.png") ```
Soni, D. (2018, March 22). Supervised vs. Unsupervised Learning. Towards Data Science. https://towardsdatascience.com/supervised-vs-unsupervised-learning-14f68e32ea8d
**Unsupervised machine learning**, on the other hand, involves using models to find patterns or associations between variables in a dataset that lacks a known or labeled outcome. For example, unsupervised machine learning has been used to identify new patterns across genes that are co-expressed, informing potential biological pathways mediating human disease ([Botía et al.](https://bmcsystbiol.biomedcentral.com/articles/10.1186/s12918-017-0420-6), [Pagnuco et al.](https://www.sciencedirect.com/science/article/pii/S0888754317300575?via%3Dihub)). -```{r, echo=FALSE, fig.width=52, fig.height=18, fig.align='center', out.width = "75%"} +```{r 05-Chapter5-9, echo=FALSE, fig.width=52, fig.height=18, fig.align='center', out.width = "75%"} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image2.png") ```
Langs, G., Röhrich, S., Hofmanninger, J., Prayer, F., Pan, J., Herold, C., & Prosch, H. (2018). Machine learning: from radiomics to discovery and routine. Der Radiologe, 58(S1), 1–6. DOI: [10.1007/s00117-018-0407-3](https://doi.org/10.1007/s00117-018-0407-3). Figure regenerated here in alignment with its published [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/).
@@ -214,19 +214,19 @@ Overall, the distinction between supervised and unsupervised learning is an impo Although this module's example will focus on a random forest model in the coding example below, other commonly used algorithms for supervised machine learning include: + **K-Nearest Neighbors (KNN):** Uses distance to classify a data point in the test set based upon the most common class of neighboring data points from the training set. For more information on KNN, see [K-Nearest Neighbor](https://www.ibm.com/topics/knn). -```{r, echo=FALSE, out.width = "50%",fig.width=4, fig.height=5, fig.align='center'} +```{r 05-Chapter5-10, echo=FALSE, out.width = "50%",fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image6.png") ``` + **Support Vector Machine (SVM):** Creates a decision boundary line (hyperplane) in n-dimensional space to separate the data into each class so that when new data is presented, they can be easily categorized. For more information on SVM, see [Support Vector Machine](https://www.javatpoint.com/machine-learning-support-vector-machine-algorithm). -```{r, echo=FALSE, out.width = "50%", fig.width=4, fig.height=5, fig.align='center'} +```{r 05-Chapter5-11, echo=FALSE, out.width = "50%", fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image7.png") ``` + **Random Forest (RF):** Uses a multitude of decision trees trained on a subset of different samples from the training set and the resulting classification of a data point in the test set is aggregated from all the decision trees. A **decision tree** is a hierarchical model that depicts decisions from predictors and their resulting outcomes. It starts with a root node, which represents an initial test from a single predictor. The root node splits into subsequent decision nodes that test another feature. 
These decision nodes can either feed into more decision nodes or leaf nodes that represent the predicted class label. A branch or a sub-tree refers to a subsection of an entire decision tree. Here is an example decision tree with potential variables and decisions informing a college basketball player's likelihood of being drafted to the NBA: -```{r, echo=FALSE, out.width = "75%",fig.width=4, fig.height=5, fig.align='center'} +```{r 05-Chapter5-12, echo=FALSE, out.width = "75%",fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image8.png") ``` @@ -253,7 +253,7 @@ Common partitions of the full dataset used to train and test a supervised machin It is common to split the dataset into a training set that contains 60% of the data and the test set that contains 40% of the data, though other common splits include 70% training / 30% test and 80% training / 20% test. -```{r, echo=FALSE, out.width = "65%", fig.align='center'} +```{r 05-Chapter5-13, echo=FALSE, out.width = "65%", fig.align='center'} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image3.png") ``` @@ -268,7 +268,7 @@ Although there are [a number of cross validation approaches](https://neptune.ai/ 1. Splitting the training data into 5 groups, or "folds". 2. Five iterations of training/testing are then run where each of the 5 folds serves as the test data once and as part of the training set four times, as seen in the figure below. 3. To measure predictive ability of each of the parameters tested, like the number of features to include, values like accuracy and specificity are calculated for each iteration. The parameters that optimize performance are selected for the final model which will be evaluated against the test set not used in training. 
-```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} +```{r 05-Chapter5-14, echo=FALSE, fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image4.png") ``` @@ -281,7 +281,7 @@ Evaluation metrics from a confusion matrix are often used to determine the best Let's imagine you're interested in predicting whether or not a player will be drafted to the National Basketball Association (NBA) based on a dataset that contains variables regarding a player's assists, points, height etc. Let's say that this dataset contains information on 253 players with 114 that were actually drafted and 139 that weren't drafted. The confusion matrix below shows a model's results where a player that is drafted is the "positive" class and a player that is not drafted is the "negative" class. -```{r, echo=FALSE, out.width = "50%", fig.width=4, fig.height=5, fig.align='center'} +```{r 05-Chapter5-15, echo=FALSE, out.width = "50%", fig.width=4, fig.height=5, fig.align='center'} knitr::include_graphics("Chapter_5/Module5_2_Input/Module5_2_Image5.png") ``` @@ -338,13 +338,13 @@ This training module was specifically developed to answer the following environm ### Script Preparations #### Cleaning the global environment -```{r, clear__envi, echo=TRUE, eval=TRUE} +```{r 05-Chapter5-16, clear__envi, echo=TRUE, eval=TRUE} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, install_libs, echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} +```{r 05-Chapter5-17, install_libs, echo=TRUE, eval=TRUE, warning=FALSE, results='hide', message=FALSE} if (!requireNamespace("readxl")) install.packages("readxl"); if (!requireNamespace("lubridate")) @@ -362,7 +362,7 @@ if (!requireNamespace("randomForest")) ``` #### Loading R packages required for this session -```{r, load_libs, 
echo=TRUE, eval=TRUE, warning=FALSE, error=FALSE, results='hide', message=FALSE} +```{r 05-Chapter5-18-load_libs, echo=TRUE, eval=TRUE, warning=FALSE, error=FALSE, results='hide', message=FALSE} library(readxl); library(lubridate); library(tidyverse); @@ -374,12 +374,12 @@ library(cardx); ``` #### Set your working directory -```{r, filepath, echo=TRUE, eval=FALSE, error=FALSE, results='hide', message=FALSE} +```{r 05-Chapter5-19-filepath, echo=TRUE, eval=FALSE, error=FALSE, results='hide', message=FALSE} setwd("/filepath to where your input files are") ``` #### Importing example dataset -```{r, read_data, echo=TRUE, eval=TRUE} +```{r 05-Chapter5-20-read_data, echo=TRUE, eval=TRUE} # Load the data arsenic_data <- data.frame(read_xlsx("Chapter_5/Module5_2_Input/Module5_2_InputData.xlsx")) @@ -400,7 +400,7 @@ The columns in this dataset are described below: ### Changing Data Types First, `Detect_Concentration` needs to be converted from a character to a factor so that Random Forest knows that the non-detect class is the baseline or "negative" class, while the detect class will be the "positive" class. `Water_Sample_Date` will be converted from a character to a date type using the `mdy()` function from the *lubridate* package. This is done so that the model understands this column contains dates. -```{r, convert_type, echo=TRUE, eval=TRUE} +```{r 05-Chapter5-21-convert_type, echo=TRUE, eval=TRUE} arsenic_data <- arsenic_data %>% # Converting `Detect_Concentration` from a character to a factor mutate(Detect_Concentration = relevel(factor(Detect_Concentration), ref = "ND"), @@ -421,7 +421,7 @@ head(arsenic_data) It is useful to run summary statistics on the variables that will be used as predictors in the algorithm to see if there are differences in distributions between the outcome classes (either non-detect or detect in this case).
Typically, greater significance often leads to better predictivity for a certain variable, since the model is better able to separate the classes. We'll use the `tbl_summary()` function from the *gtsummary* package. Note, this may only be practical with smaller datasets or for a subset of predictors if there are many. For more information on the `tbl_summary()` function, check out this helpful [Tutorial](https://www.danieldsjoberg.com/gtsummary/articles/tbl_summary.html). -```{r, tbl, echo=TRUE, eval=TRUE, warning=F, message = F} +```{r 05-Chapter5-22-tbl, echo=TRUE, eval=TRUE, warning=F, message = F} arsenic_data %>% # Displaying the mean and standard deviation in parentheses for all continuous variables tbl_summary( @@ -458,7 +458,7 @@ With these findings, we feel comfortable moving forward with these well water de ### Setting up Cross Validation At this point, we can move forward with training and testing a RF model aimed at predicting whether or not detectable levels of iAs are present in well water samples. We'll take a glance at the distribution of `Detect_Concentration` between the two classes. -```{r, train_test, echo=TRUE, eval=TRUE} +```{r 05-Chapter5-23-train_test, echo=TRUE, eval=TRUE} # Set seed for reproducibility set.seed(17) @@ -479,7 +479,7 @@ We can see that there are notably more non-detects (`ND`) than detects (`D`) in Now we can set up our cross validation and train our model. We will be using the `trainControl()` function from the *caret* package for this task. It is one of the most commonly used libraries for supervised machine learning in R and can be leveraged for a variety of algorithms including RF, SVM, KNN, and others. This model will be trained with 5-fold cross validation. Additionally, we will test 2, 3, and 6 predictors through the `mtry` parameter. See the *caret* documentation [here](https://cran.r-project.org/web/packages/caret/vignettes/caret.html).
-```{r, train, echo=TRUE, eval=TRUE} +```{r 05-Chapter5-24-train, echo=TRUE, eval=TRUE} # Establish the parameters for our cross validation with 5 folds control <- trainControl(method = 'cv', @@ -495,7 +495,7 @@ tunegrid_rf <- expand.grid(mtry = c(floor(sqrt(p)), p/2, p)) # We will test sqrt
## Predicting iAs Detection with a Random Forest (RF) Model -```{r} +```{r 05-Chapter5-25} # Look at the column names in training dataset colnames(iAs_train) @@ -529,7 +529,7 @@ rf_final ::: Now we can see how well our model does on data it hasn't seen before by applying it to our testing data. -```{r, test, echo=TRUE, eval=TRUE} +```{r 05-Chapter5-26-test, echo=TRUE, eval=TRUE} # Use our best model to predict the classes for our test data. We need to make sure we remove the column of Ds/NDs from our test data. rf_res <- predict(rf_final, iAs_test %>% select(!Detect_Concentration)) @@ -567,7 +567,7 @@ To address this issue, a few methods can be considered. Full implementation of t + **Alternative Performance Metrics**- When training the model, alternative metrics to overall accuracy may yield a more robust model capable of better predicting the minority class. Example alternatives may include balanced accuracy or an [F1-score](https://thedatascientist.com/f-1-measure-useful-imbalanced-class-problems/). The *caret* package further allows for [custom, user-defined metrics](https://topepo.github.io/caret/model-training-and-tuning.html#alternate-performance-metrics) to be evaluated during training by specifying the *summaryFunction* parameter in the `trainControl()` function, as seen below, in addition to the [`defaultSummary()` and `twoClassSummary()` functions](https://cran.r-project.org/web/packages/caret/vignettes/caret.html). In the example code below, we're creating a function (`f1`) that will calculate the F1 score and find the optimal model with the highest F1 score as opposed to the highest accuracy as we did above.
-```{r, alt_metric, echo=TRUE, eval=FALSE} +```{r 05-Chapter5-27-alt_metric, echo=TRUE, eval=FALSE} install.packages("MLmetrics") library(MLmetrics) @@ -671,7 +671,7 @@ When choosing variables for decision boundary plots, features that have the most + Cholesterol, given that it had the highest variable importance and + Vitamin D, given its synthesis can be affected by ozone despite it having a lower variable importance in the paper's models. -```{r, echo=FALSE, fig.align='center', out.width = "80%"} +```{r 05-Chapter5-28, echo=FALSE, fig.align='center', out.width = "80%"} knitr::include_graphics("Chapter_5/Module5_3_Input/Module5_3_Image1.png") ```
**Figure 5. Decision boundary plot for SVM model predicting lung response class.** Cholesterol and 25-hydroxyvitamin D were used as predictors visualizing responder status [non-responders (green) and responders (yellow)] and disease status [non-asthmatics (triangles) and asthmatics (circles)]. The shaded regions are the model’s prediction of a subject’s lung response class at a given cholesterol and 25-hydroxyvitamin D concentration.
@@ -701,13 +701,13 @@ This training module was specifically developed to answer the following environm ### Script Preparations #### Cleaning the global environment -```{r} +```{r 05-Chapter5-29} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r message=FALSE} +```{r 05-Chapter5-30, message=FALSE} if (!requireNamespace("readxl")) install.packages("readxl"); if (!requireNamespace("lubridate")) @@ -723,7 +723,7 @@ if (!requireNamespace("themis")) ``` #### Loading R packages required for this session -```{r message=FALSE} +```{r 05-Chapter5-31, message=FALSE} library(readxl) library(lubridate) library(tidyverse) @@ -735,12 +735,12 @@ library(themis) ``` #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 05-Chapter5-32, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` #### Importing example dataset -```{r} +```{r 05-Chapter5-33} # Load the data arsenic_data <- data.frame(read_excel("Chapter_5/Module5_3_Input/Module5_3_InputData.xlsx")) @@ -750,7 +750,7 @@ head(arsenic_data) ### Changing Data Types First, `Detect_Concentration` needs to be converted from a character to a factor so that Random Forest knows that the non-detect class is the baseline or "negative" class, while the detect class will be the "positive" class. `Water_Sample_Date` will be converted from a character to a date type using the `mdy()` function from the *lubridate* package. This is done so that the model understands this column contains dates. 
-```{r} +```{r 05-Chapter5-34} arsenic_data <- arsenic_data %>% # Converting `Detect_Concentration` from a character to a factor mutate(Detect_Concentration = relevel(factor(Detect_Concentration), ref = "ND"), @@ -769,7 +769,7 @@ head(arsenic_data) Note that the code below is different than the code presented in the previous module, **TAME 2.0 Module 5.2 Supervised Machine Learning**. Both coding methods are valid and produce comparable results, however we wanted to present another way to run *k*-fold cross validation and random forest. In 5-fold cross validation (CV), there are 5 equally-sized folds (ideally!). This means that 80% of the original dataset is split into the 4 folds that comprise the training set and the remaining 20% in the last fold is reserved for the test set. Previously, the `trainControl()` function was used for CV. This time we'll use the `createFolds()` function also from the *caret* package. -```{r} +```{r 05-Chapter5-35} # Setting seed for reproducibility set.seed(12) @@ -789,7 +789,7 @@ mtry_values = c(sqrt(p), p/2, p) # number of predictors to be used in the model ## Predicting iAs Detection with a Random Forest (RF) Model Notice that in the code below we are choosing the final RF model to be the one with the lowest out of bag (OOB) error. In the previous module, the final model was chosen based on the highest accuracy, however this is a similar approach here given that OOB error = 1 - Accuracy. -```{r} +```{r 05-Chapter5-36} # Setting the seed again so the predictions are consistent set.seed(12) @@ -864,18 +864,18 @@ variable_importance_df = variable_importance_df %>% ``` The confusion matrix results from the previous module are shown below. -```{r, echo=FALSE, fig.align='center', out.width = "80%"} +```{r 05-Chapter5-37, echo=FALSE, fig.align='center', out.width = "80%"} knitr::include_graphics("Chapter_5/Module5_3_Input/Module5_3_Image2.png") ``` Now let's double check that when using this new method, our results are still comparable. 
-```{r} +```{r 05-Chapter5-38} # First comparing results to the previous module round(metrics, 2) ``` They are! Now we'll take a look at the model's variable importance. -```{r} +```{r 05-Chapter5-39} variable_importance_df ``` @@ -883,7 +883,7 @@ Although we have the results we need, let's take it a step further and plot the ### Reformatting the dataframe for plotting First, the dataframe will be transformed so that the figure is more legible. Specifically, spaces will be added between the variables, and the `Predictor` column will be put into a factor to rearrange the order of the variables from lowest to highest mean decrease gini. For additional information on tricks like this to make visualizations easier to read, see **TAME 2.0 Module 3.2 Improving Data Visualizations**. -```{r} +```{r 05-Chapter5-40} # Adding spaces between the variables that need the space modified_variable_importance_df = variable_importance_df %>% mutate(Predictor = gsub("_", " ", Predictor)) @@ -897,7 +897,7 @@ head(modified_variable_importance_df) ``` ## Variable Importance Plot -```{r fig.align='center', out.width = "65%"} +```{r 05-Chapter5-41, fig.align='center', out.width = "65%"} ggplot(data = modified_variable_importance_df , aes(x = MeanDecreaseGini, y = Predictor, size = 2)) + geom_point() + @@ -934,7 +934,7 @@ Since casing depth and pH have been identified as the predictors with the highes ### Decision Boundary Calculation First, models will be trained using only casing depth and pH as variables. Since, the decision boundary plot will be used for visualization purposes, and a 2-D figure can only plot two variables, we will not worry about tuning the parameters as was previously done. In this module, we're creating a decision boundary based on a random forest model, however we'll also explore what decision boundaries look like for other algorithms including support vector machine (SVM), and k nearest neighbor (KNN), logistic regression. 
Each supervised ML method has its advantages and performance is dependent upon the situation and the dataset. Therefore, it is common to see multiple models used to predict an outcome of interest in a publication. Let's create additional boundary plots still using casing depth and pH, but this time we will use logistic regression, SVM, and KNN as comparisons to RF. -```{r} +```{r 05-Chapter5-42} # Creating a dataframe with variables based on the highest predictors highest_pred_data = data.frame(arsenic_data[,c("Casing_Depth", "pH", "Detect_Concentration")]) @@ -952,7 +952,7 @@ knn_detect_arsenic = knn3(Detect_Concentration~., data = highest_pred_data) # sp ``` From these predictions, decision boundaries will be calculated. This will be done by predicting `Detect_Concentration` between a grid of values - specifically the minimum and maximum of the two predictors (casing depth and pH). A non-linear line will be drawn on the plot to separate the two classes. -```{r} +```{r 05-Chapter5-43} get_grid_df <- function(classification_model, data, resolution = 100, predict_type) { # This function predicts the outcome (Detect_Concentration) at evenly spaced data points using the two variables (pH and casing depth) # to create a decision boundary between the outcome classes (detect and non-detect samples). @@ -1013,7 +1013,7 @@ head(grid_df) ## Decision Boundary Plot Now let's plot the grid of predictions with the sampled data. -```{r warning = FALSE, fig.width=15, fig.height=10, fig.align='center'} +```{r 05-Chapter5-44, warning = FALSE, fig.width=15, fig.height=10, fig.align='center'} # choosing palette from package ggsci_colors = pal_npg()(5) @@ -1062,7 +1062,7 @@ ggplot() + Here, we will create a decision boundary plot still using casing depth and pH, but this time we will make our dataset more balanced to see how this improves model performance visually.
The **Synthetic Minority Oversampling Technique (SMOTE)** was introduced in **TAME 2.0 Module 5.2 Supervised Machine Learning** and will be used to make the dataset more balanced by oversampling the minority class (detect values) and undersampling the majority class (non-detect values). Starting by training each model: -```{r} +```{r 05-Chapter5-45} # Using SMOTE first to balance classes balanced_highest_pred_data = smotenc(highest_pred_data, "Detect_Concentration") @@ -1080,7 +1080,7 @@ knn_detect_arsenic = knn3(Detect_Concentration~., data = balanced_highest_pred_d ``` Now calling the `get_grid_df()` function we created above to create a grid of predictions. -```{r} +```{r 05-Chapter5-46} # Calling function # RF balanced_grid_df_rf = get_grid_df(rf_detect_arsenic, balanced_highest_pred_data, predict_type = "class") %>% @@ -1110,7 +1110,7 @@ balanced_grid_df = rbind(balanced_grid_df_rf, balanced_grid_df_lr, balanced_grid head(balanced_grid_df) ``` -```{r warning = FALSE, fig.width=15, fig.height=10, fig.align='center'} +```{r 05-Chapter5-47, warning = FALSE, fig.width=15, fig.height=10, fig.align='center'} # choosing palette from package ggsci_colors = pal_npg()(5) @@ -1196,7 +1196,7 @@ To reiterate what has been discussed in the previous module, machine learning is **Note**: Unsupervised machine learning is used for exploratory purposes, and just because it can find relationships between data points, that doesn't necessarily mean that those relationships have merit, are indicative of causal relationships, or have direct biological implications. Rather, these methods can be used to find new patterns that can also inform future studies testing direct relationships. ::: -```{r, echo=FALSE, out.width = "75%", fig.align = 'center'} +```{r 05-Chapter5-48, echo=FALSE, out.width = "75%", fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_4_Input/Module5_4_Image1.png") ```
Langs, G., Röhrich, S., Hofmanninger, J., Prayer, F., Pan, J., Herold, C., & Prosch, H. (2018). Machine learning: from radiomics to discovery and routine. Der Radiologe, 58(S1), 1–6. PMID: [34013136](https://doi.org/10.1007/s00117-018-0407-3). Figure regenerated here in alignment with its published [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/)
@@ -1241,7 +1241,7 @@ Principal Component Analysis, or PCA, is a dimensionality-reduction technique us 3. Every instance (e.g. chemical) in the original dataset has a "weight" or score" on each PC. 4. Any combination of PCs can be compared to summarize relationships amongst the instances (e.g. chemicals), but typically it's the first two eigenvectors that capture a majority of the variance. -```{r, echo=FALSE, out.width= "80%", fig.align = 'center'} +```{r 05-Chapter5-49, echo=FALSE, out.width= "80%", fig.align = 'center'} knitr::include_graphics("Chapter_5/Module5_4_Input/Module5_4_Image2.png") ``` @@ -1270,13 +1270,13 @@ This training module was specifically developed to answer the following environm ### Script Preparations #### Cleaning the global environment -```{r, clear_env, echo=TRUE, eval=TRUE} +```{r 05-Chapter5-50, clear_env, echo=TRUE, eval=TRUE} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, message=FALSE} +```{r 05-Chapter5-51, message=FALSE} if (!requireNamespace("factoextra")) install.packages("factoextra"); if (!requireNamespace("pheatmap")) @@ -1286,7 +1286,7 @@ if (!requireNamespace("cowplot")) ``` #### Loading required R packages -```{r, results=FALSE, message=FALSE} +```{r 05-Chapter5-52, results=FALSE, message=FALSE} library(tidyverse) library(factoextra) library(pheatmap) #used to make heatmaps @@ -1294,7 +1294,7 @@ library(cowplot) ``` Getting help with packages and functions -```{r, eval = FALSE} +```{r 05-Chapter5-53, eval = FALSE} ?tidyverse # Package documentation for tidyverse ?kmeans # Package documentation for kmeans (a part of the standard stats R package, automatically uploaded) ?prcomp # Package documentation for deriving principal components within a PCA (a part of the standard stats R package, automatically uploaded) @@ -1302,49 +1302,49 @@ Getting help with packages and 
functions ``` #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 05-Chapter5-54, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` #### Loading the Example Dataset Let's start by loading the datasets needed for this training module. We are going to use a dataset of substances that have a diverse chemical space of PFAS and statin compounds. This list of chemicals will be uploaded alongside physicochemical property data. The chemical lists for 'PFAS' and 'Statins' were obtained from the EPA's Computational Toxicology Dashboard [Chemical Lists](https://comptox.epa.gov/dashboard/chemical-lists). The physicochemical properties were obtained by uploading these lists into the National Toxicology Program’s [Integrated Chemical Environment (ICE)](https://ice.ntp.niehs.nih.gov/). -```{r} +```{r 05-Chapter5-55} dat <- read.csv("Chapter_5/Module5_4_Input/Module5_4_InputData.csv", fileEncoding = "UTF-8-BOM") ``` #### Data Viewing Starting with the overall dimensions: -```{r} +```{r 05-Chapter5-56} dim(dat) ``` Then looking at the first four rows and five columns of data: -```{r} +```{r 05-Chapter5-57} dat[1:4,1:5] ``` Note that the first column, `List`, designates the following two larger chemical classes: -```{r} +```{r 05-Chapter5-58} unique(dat$List) ``` Let's lastly view all of the column headers: -```{r} +```{r 05-Chapter5-59} colnames(dat) ``` In the data file, the first four columns represent chemical identifier information. All remaining columns represent different physicochemical properties derived from OPERA via [Integrated Chemical Environment (ICE)](https://ice.ntp.niehs.nih.gov/). Because the original titles of these physicochemical properties contained commas and spaces, R automatically converted these into periods. Hence, titles like `OPERA..Boiling.Point`. 
For ease of downstream data analyses, let's create a more focused dataframe option containing only one chemical identifier (CASRN) as row names and then just the physicochemical property columns. -```{r} +```{r 05-Chapter5-60} # Creating a new dataframe that contains the physiocochemical properties chemical_prop_df <- dat[,5:ncol(dat)] rownames(chemical_prop_df) <- dat$CASRN ``` Now explore this data subset: -```{r} +```{r 05-Chapter5-61} dim(chemical_prop_df) # overall dimensions chemical_prop_df[1:4,1:5] # viewing the first four rows and five columns colnames(chemical_prop_df) @@ -1356,7 +1356,7 @@ colnames(chemical_prop_df) Let's first plot two physicochemical properties to determine if and how substances group together without any fancy data reduction or other machine learning techniques. This will answer **Environmental Health Question #1**: Can we differentiate between PFAS and statin chemical classes when considering just the raw physicochemical property variables without applying unsupervised machine learning techniques? Let's put molecular weight (`Molecular.Weight`) as one axis and boiling point (`OPERA..Boiling.Point`) on the other. We'll also color by the chemical classes using the `List` column from the original dataframe. -```{r fig.align='center'} +```{r 05-Chapter5-62, fig.align='center'} ggplot(chemical_prop_df[,1:2], aes(x = Molecular.Weight, y = OPERA..Boiling.Point, color = dat$List)) + geom_point(size = 2) + theme_bw() + ggtitle('Version A: Bivariate Plot of Two Original Physchem Variables') + @@ -1364,7 +1364,7 @@ ggplot(chemical_prop_df[,1:2], aes(x = Molecular.Weight, y = OPERA..Boiling.Poin ``` Let's plot two other physicochemical property variables, Henry's Law constant (`OPERA..Henry.s.Law.Constant`) and melting point (`OPERA..Melting.Point`), to see if the same separation of chemical classes is apparent. 
-```{r fig.align='center'} +```{r 05-Chapter5-63, fig.align='center'} ggplot(chemical_prop_df[,3:4], aes(x = OPERA..Henry.s.Law.Constant, y = OPERA..Melting.Point, color = dat$List)) + geom_point(size = 2) + theme_bw() + @@ -1390,12 +1390,12 @@ Let's turn our attention to **Environmental Health Question #2**: If substances For this example, let's coerce the *k*-means algorithms to calculate 2 distinct clusters (based on their corresponding mean centered values). Here, we choose to derive two distinct clusters, because we are ultimately going to see if we can use this information to predict each chemical's classification into two distinct chemical classes (i.e., PFAS vs statins). Note that we can derive more clusters using similar code depending on the question being addressed. We can give a name to this variable to easily provide the number of clusters in the next lines of code, `num.centers`: -```{r} +```{r 05-Chapter5-64} num.centers <- 2 ``` Here we derive chemical clusters using *k*-means: -```{r} +```{r 05-Chapter5-65} clusters <- kmeans(chemical_prop_df, # input dataframe centers = num.centers, # number of cluster centers to calculate iter.max = 1000, # the maximum number of iterations allowed @@ -1403,26 +1403,26 @@ clusters <- kmeans(chemical_prop_df, # input dataframe ``` The resulting property values that were derived as the final cluster centers can be pulled using: -```{r} +```{r 05-Chapter5-66} clusters$centers ``` Let's add the cluster assignments to the physicochemical data and create a new dataframe, which can then be used in a heatmap visualization to see how these physicochemical data distributions clustered according to *k*-means. These cluster assignments can be pulled from the `cluster` list output, where chemicals are designated to each cluster with either a 1 or 2. 
You can view these using: -```{r} +```{r 05-Chapter5-67} clusters$cluster ``` Because these results are listed in the exact same order as the inputted dataframe, we can simply add these assignments to the `chemical_prop_df` dataframe. -```{r} +```{r 05-Chapter5-68} dat_wclusters <- cbind(chemical_prop_df,clusters$cluster) colnames(dat_wclusters)[11] <- "Cluster" # renaming this new column "Custer" dat_wclusters <- dat_wclusters[order(dat_wclusters$Cluster),] # sorting data by cluster assignments ``` To generate a heatmap, we need to first create a separate dataframe for the cluster assignments, ordered in the same way as the physicochemical data: -```{r} +```{r 05-Chapter5-69} hm_cluster <- data.frame(dat_wclusters$Cluster, row.names = row.names(dat_wclusters)) # creating the dataframe colnames(hm_cluster) <- "Cluster" # reassigning the column name hm_cluster$Cluster <- as.factor(hm_cluster$Cluster) # coercing the cluster numbers into factor variables, to make the heatmap prettier @@ -1431,7 +1431,7 @@ head(hm_cluster) # viewing this new cluster assignment dataframe ``` We're going to go ahead and clean up the physiocochemical property names to make the heatmap a bit tidier. -```{r} +```{r 05-Chapter5-70} clean_names1 = gsub("OPERA..", "", colnames(dat_wclusters)) # "\\." denotes a period clean_names2 = gsub("\\.", " ", clean_names1) @@ -1447,7 +1447,7 @@ Then we can call this dataframe (`data_wclusters`) into the following heatmap vi
### Heatmap Visualization of the Resulting *K*-Means Clusters -```{r, fig.height=8, fig.width=10} +```{r 05-Chapter5-71, fig.height=8, fig.width=10} pheatmap(dat_wclusters[,1:10], cluster_rows = FALSE, cluster_cols = FALSE, # no further clustering, for simplicity scale = "column", # scaling the data to make differences across chemicals more apparent @@ -1481,18 +1481,18 @@ Next, we will run through some example analyses applying the common data reducti We can calculate the principal components across ALL physicochemical data across all chemicals using the `prcomp()` function. Always make sure your data is centered and scaled prior to running to PCA, since it's sensitive to variables having different scales. -```{r} +```{r 05-Chapter5-72} my.pca <- prcomp(chemical_prop_df, # input dataframe of physchem data scale = TRUE, center = TRUE) ``` We can see how much of the variance was able to be captured in each of the eigenvectors or dimensions using a scree plot. -```{r fig.align='center'} +```{r 05-Chapter5-73, fig.align='center'} fviz_eig(my.pca, addlabels = TRUE) ``` We can also calculate these values and pull them into a dataframe for future use. For example, to pull the percentage of variance explained by each principal component, we can run the following calculations, where first eigenvalues (eigs) are calculated and then used to calculate percent of variance per principal component: -```{r} +```{r 05-Chapter5-74} eigs <- my.pca$sdev^2 Comp.stats <- data.frame(eigs, eigs/sum(eigs), row.names = names(eigs)) colnames(Comp.stats) <- c("Eigen_Values", "Percent_of_Variance") @@ -1514,12 +1514,12 @@ head(Comp.stats) Next, we'll use PCA to answer **Environmental Health Question #4**: Upon reducing the data through PCA, which physicochemical property contributes the most towards informing data variance captured in the primary principal component (Comp.1)? 
Here are the resulting scores for each chemical's contribution towards each principal component (shown here as components `PC1`-`PC10`). -```{r} +```{r 05-Chapter5-75} head(my.pca$x) ``` And the resulting loading factors of each property's contribution towards each principal component. -```{r} +```{r 05-Chapter5-76} my.pca$rotation ``` @@ -1540,7 +1540,7 @@ my.pca$rotation Let's turn our attention to **Environmental Health Question #5**: If we did not have information telling us which chemical belonged to which class, could we use PCA and *k*-means to inform whether a chemical is more similar to a PFAS or a statin? We can start by answering this question by visualizing the first two principal components and coloring each chemical according to class (i.e. PFAS vs statins). -```{r fig.align='center'} +```{r 05-Chapter5-77, fig.align='center'} ggplot(data.frame(my.pca$x), aes(x = PC1, y = PC2, color = dat$List)) + geom_point(size = 2) + theme_bw() + ggtitle('Version C: PCA Plot of the First 2 PCs, colored by Chemical Class') + @@ -1566,17 +1566,17 @@ We can also identify cluster-based trends within data that are reduced after run ### Estimate *K*-Means Clusters from PCA Results Let's first run code similar to the previous *k*-means analysis and associated parameters, though instead here we will use data reduced values from the PCA analysis. Specifically, clusters across PCA "scores" values will be derived, where scores represent the relative amount each chemical contributed to each principal component. 
-```{r} +```{r 05-Chapter5-78} clusters_PCA <- kmeans(my.pca$x, centers = num.centers, iter.max = 1000, nstart = 50) ``` The resulting PCA score values that were derived as the final cluster centers can be pulled using: -```{r} +```{r 05-Chapter5-79} clusters_PCA$centers ``` Viewing the final cluster assignment per chemical: -```{r} +```{r 05-Chapter5-80} head(cbind(rownames(chemical_prop_df),clusters_PCA$cluster)) ``` @@ -1585,7 +1585,7 @@ head(cbind(rownames(chemical_prop_df),clusters_PCA$cluster)) #### Visualizing *K*-Means Clusters from PCA Results Let's now view, again, the results of the main PCA focusing on the first two principal components; though this time let's color each chemical according to *k*-means cluster. -```{r fig.align='center'} +```{r 05-Chapter5-81, fig.align='center'} ggplot(data.frame(my.pca$x), aes(x = PC1, y = PC2, color = as.factor(clusters_PCA$cluster))) + geom_point(size = 2) + theme_bw() + ggtitle('Version D: PCA Plot of the First 2 PCs, colored by k-means Clustering') + @@ -1594,7 +1594,7 @@ ggplot(data.frame(my.pca$x), aes(x = PC1, y = PC2, color = as.factor(clusters_PC ``` Let's put these two PCA plots side by side to compare them more easily. We'll also tidy up the figures a bit so they're closer to publication-ready. -```{r fig.align='center', fig.width = 20, fig.height = 6, fig.retina= 3} +```{r 05-Chapter5-82, fig.align='center', fig.width = 20, fig.height = 6, fig.retina= 3} # PCA plot colored by chemical class pcaplot1 = ggplot(data.frame(my.pca$x), aes(x = PC1, y = PC2, color = dat$List)) + geom_point(size = 2) + @@ -1694,7 +1694,7 @@ Helpful resources on *k*-means clustering include the following: [The Elements o ## Hierarchical Clustering **Hierarchical clustering** groups objects into clusters by repetitively joining similar observations until there is one large cluster (aka agglomerative or bottom-up) or repetitively splitting one large cluster until each observation stands alone (aka divisive or top-down). 
Regardless of whether agglomerative or divisive hierarchical clustering is used, the results can be visually represented in a tree-like figure called a dendrogram. The dendrogram below is based on the `USArrests` dataset available in R. The dataset contains statistics on violent crime rates (murder, assault, and rape) per capita (per 100,000 residents) for each state in the United States in 1973. For more information on the `USArrests` dataset, check out its associated [RDocumentation](https://www.rdocumentation.org/packages/datasets/versions/3.6.2/topics/USArrests). -```{r fig.align = 'center', echo=FALSE, out.width = "55%"} +```{r 05-Chapter5-83, fig.align = 'center', echo=FALSE, out.width = "55%"} knitr::include_graphics("Chapter_5/Module5_5_Input/Module5_5_Image1.png") ``` @@ -1722,21 +1722,21 @@ Each method has its advantages and disadvantages and more information on all dis Before clustering can be performed, the function needs to be informed of the number of clusters to group the objects into. In the previous module, an example was explored to see if *k*-means clustering would group the chemicals similarly to their chemical class (either a PFAS or statin). Therefore, we told the *k*-means function to cluster into 2 groups. In situations where there is little to no prior knowledge regarding the "correct" number of clusters to specify, methods exist for deriving the optimal number of clusters. Three common methods to find the optimal *k*, or number of clusters, for both *k*-means and hierarchical clustering include: the **elbow method**, **silhouette method**, and the **gap statistic method**. These techniques help us in determining the optimal *k* using visual inspection. + **Elbow Method**: uses a plot of the within cluster sum of squares (WCSS) on the y axis and different values of *k* on the x axis. The location where we no longer observe a significant reduction in WCSS, or where an "elbow" can be seen, is the optimal *k* value.
As we can see, after a certain point, having more clusters does not lead to a significant reduction in WCSS. -```{r fig.align = 'center', out.width = "75%", echo=FALSE} +```{r 05-Chapter5-84, fig.align = 'center', out.width = "75%", echo=FALSE} knitr::include_graphics("Chapter_5/Module5_5_Input/Module5_5_Image2.png") ``` Looking at the figures above, the elbow point is much clearer in the first plot versus the second, however, elbow curves from real-world datasets typically resemble the second figure. This is why it's recommended to consider more than one method to determine the optimal number of clusters. + **Silhouette Method**: uses a plot of the average silhouette width (score) on the y axis and different values of *k* on the x axis. The silhouette score is measure of each object's similarity to its own cluster and how dissimilar it is to other clusters. The location where the average silhouette width is *maximized* is the optimal *k* value. -```{r fig.align = 'center', out.width = "65%", echo=FALSE} +```{r 05-Chapter5-85, fig.align = 'center', out.width = "65%", echo=FALSE} knitr::include_graphics("Chapter_5/Module5_5_Input/Module5_5_Image3.png") ``` Based on the figure above, the optimal number of clusters is 2 using the silhouette method. + **Gap Statistic Method**: uses a plot of the gap statistic on the y axis and different values of *k* on the x axis. The gap statistic evaluates the intracluster variation in comparison to expected values derived from a Monte Carlo generated, null reference data distribution for varying values of *k*. The optimal number of clusters is the smallest value where the gap statistic of *k* is greater than or equal to the gap statistic of *k*+1 minus the standard deviation of *k*+1. More details can be found [here](https://uc-r.github.io/kmeans_clustering#:~:text=The%20gap%20statistic%20compares%20the,simulations%20of%20the%20sampling%20process.). 
-```{r fig.align = 'center', out.width = "65%", echo=FALSE} +```{r 05-Chapter5-86, fig.align = 'center', out.width = "65%", echo=FALSE} knitr::include_graphics("Chapter_5/Module5_5_Input/Module5_5_Image4.png") ``` @@ -1750,7 +1750,7 @@ For additional information and code on all three methods, check out [Determining ## Introduction to Example Data We will apply these techniques using an example dataset from a previously published study where 22 cytokine concentrations were derived from 44 subjects with varying smoking statuses (14 non-smokers, 17 e-cigarette users, and 13 cigarette smokers) from 4 different sampling regions in the body. These samples were derived from nasal lavage fluid (NLF), nasal epithelium fluid (NELF), sputum, and serum as pictured below. Samples were taken from different regions in the body to compare cytokine expression in the upper respiratory tract, lower respiratory tract, and systemic circulation. -```{r fig.align = 'center', out.width = "75%", echo=FALSE} +```{r 05-Chapter5-87, fig.align = 'center', out.width = "75%", echo=FALSE} knitr::include_graphics("Chapter_5/Module5_5_Input/Module5_5_Image5.png") ``` @@ -1765,13 +1765,13 @@ Let's read in and view the dataset we'll be working with.
### Script Preparations #### Cleaning the global environment -```{r} +```{r 05-Chapter5-88} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, message=FALSE} +```{r 05-Chapter5-89, message=FALSE} if (!requireNamespace("vegan")) install.packages("vegan"); if (!requireNamespace("ggrepel")) @@ -1785,7 +1785,7 @@ install.packages("FactoMineR"); ``` #### Loading required R packages -```{r, message=FALSE} +```{r 05-Chapter5-90, message=FALSE} library(readxl) library(factoextra) library(FactoMineR) @@ -1799,14 +1799,14 @@ suppressPackageStartupMessages(library(dendextend)) ``` #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 05-Chapter5-91, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` #### Importing example dataset Then let's read in our example dataset. As mentioned in the introduction, this example dataset contains cytokine concentrations derived from 44 subjects. Let's import and view these data: -```{r} +```{r 05-Chapter5-92} # Reading in file cytokines_df <- data.frame(read_excel("Chapter_5/Module5_5_Input/Module5_5_InputData.xlsx", sheet = 2)) @@ -1837,7 +1837,7 @@ This training module was specifically developed to answer the following environm 5. Which cytokines have the greatest contributions to the first two eigenvectors? To answer the first environmental health question, let's start by filtering to include only NELF derived samples and non-smokers. -```{r} +```{r 05-Chapter5-93} baseline_df <- cytokines_df %>% filter(Group == "NS", Compartment == "NELF") @@ -1845,7 +1845,7 @@ head(baseline_df) ``` The functions we use will require us to cast the data wider. We will accomplish this using the `dcast()` function from the *reshape2* package. 
-```{r} +```{r 05-Chapter5-94} wider_baseline_df <- reshape2::dcast(baseline_df, Protein ~ SubjectID, value.var = "Conc_pslog2") %>% column_to_rownames("Protein") @@ -1853,7 +1853,7 @@ head(wider_baseline_df) ``` Now we can derive clusters using the `fviz_nbclust()` function to determine the optimal *k* based on suggestions from the elbow, silhouette, and gap statistic methods. We can use this code for both *k*-means and hierarchical clustering by changing the `FUNcluster` parameter. Lets start with *k*-means:. -```{r fig.align = 'center'} +```{r 05-Chapter5-95, fig.align = 'center'} # Elbow method fviz_nbclust(wider_baseline_df, FUNcluster = kmeans, method = "wss") + labs(subtitle = "Elbow method") @@ -1868,7 +1868,7 @@ fviz_nbclust(wider_baseline_df, FUNcluster = kmeans, method = "gap_stat") + ``` The elbow method is suggesting 2 or 3 clusters, the silhouette method is suggesting 2, and the gap statistic method is suggesting 1. Since each of these methods is recommending different *k* values, we can go ahead and run *k*-means to visualize the clusters and test those different *k*'s. *K*-means clusters will be visualized using the `fviz_cluster()` function. -```{r fig.align = 'center'} +```{r 05-Chapter5-96, fig.align = 'center'} # Choosing to iterate through 2 or 3 clusters using i as our iterator for (i in 2:3){ # nstart = number of random starting partitions, it's recommended for nstart > 1 @@ -1888,7 +1888,7 @@ for (i in 2:3){ ::: The final cluster assignments can easily be obtained using the `kmeans()` function from the *stats* package. 
-```{r} +```{r 05-Chapter5-97} cluster_kmeans_3 <- kmeans(wider_baseline_df, centers = 3, nstart = 25) cluster_kmeans_df <- data.frame(cluster_kmeans_3$cluster) %>% rownames_to_column("Cytokine") %>% @@ -1906,7 +1906,7 @@ cluster_kmeans_df :::answer **Answer**: After choosing the number of clusters to be 3, the cluster assignments are as follows: -```{r fig.align = 'center', echo=FALSE} +```{r 05-Chapter5-98, fig.align = 'center', echo=FALSE} knitr::include_graphics("Chapter_5/Module5_5_Input/Module5_5_Image7.png") ``` ::: @@ -1918,7 +1918,7 @@ knitr::include_graphics("Chapter_5/Module5_5_Input/Module5_5_Image7.png") Next, we'll turn our attention to answering environmental health questions 3 and 4: What is the optimal number of clusters the cytokines can be grouped into that were derived from nasal epithelium fluid (NELF) in non-smokers using hierarchical clustering? How do the hierarchical cluster assignments compare to the *k*-means cluster assignments? Just as we used the elbow method, silhouette profile, and gap statistic to determine the optimal number of clusters for *k*-means, we can leverage the same approaches for hierarchical by changing the `FUNcluster` parameter. -```{r fig.align = 'center'} +```{r 05-Chapter5-99, fig.align = 'center'} # Elbow method fviz_nbclust(wider_baseline_df, FUNcluster = hcut, method = "wss") + labs(subtitle = "Elbow method") @@ -1943,13 +1943,13 @@ We can see the results are quite similar with 2-3 clusters appearing optimal. ::: Now we can perform the clustering and visualize and extract the results. We'll start by using the `dist()` function to calculate the euclidean distance between the clusters followed by the `hclust()` function to obtain the hierarchical clustering assignments. 
-```{r} +```{r 05-Chapter5-100} # Viewing the wider dataframe we'll be working with head(wider_baseline_df) ``` -```{r} +```{r 05-Chapter5-101} # First scaling data with each subject (down columns) scaled_df <- data.frame(apply(wider_baseline_df, 2, scale)) rownames(scaled_df) = rownames(wider_baseline_df) @@ -1958,7 +1958,7 @@ head(scaled_df) ``` The `dist()` function is initially used to calculate the Euclidean distance between each cytokine. Next, the `hclust()` function is used to run the hierarchical clustering analysis using the complete method by default. The method can be changed in the function using the method parameter. -```{r} +```{r 05-Chapter5-102} # Calculating euclidean dist dist_matrix <- dist(scaled_df, method = 'euclidean') @@ -1967,7 +1967,7 @@ cytokines_hc <- hclust(dist_matrix) ``` Now we can generate a dendrogram to help us evaluate the results using the `fviz_dend()` function from the *factoextra* package. We use k=3 to be consistent with the *k*-means analysis. -```{r fig.align = 'center', out.width = "75%", warning=FALSE} +```{r 05-Chapter5-103, fig.align = 'center', out.width = "75%", warning=FALSE} fviz_dend(cytokines_hc, k = 3, # Specifying k cex = 0.85, # Label size palette = "futurama", # Color palette see ?ggpubr::ggpar @@ -1979,7 +1979,7 @@ Now we can generate a dendrogram to help us evaluate the results using the `fviz ``` We can also extract those cluster assignments using the `cutree()` function from the *stats* package. -```{r} +```{r 05-Chapter5-104} hc_assignments_df <- data.frame(cutree(cytokines_hc, k = 3)) %>% rownames_to_column("Cytokine") %>% rename(`Hierarchical Cluster` = cutree.cytokines_hc..k...3.) %>% @@ -2008,14 +2008,14 @@ For additional resources on running hierarchical clustering in R, see [Visualizi ## Clustering Plot One additional way to visualize clustering is to plot the first two principal components on the axes and color the data points based on their corresponding cluster. 
This visualization can be used for both *k*-means and hierarchical clustering using the `fviz_cluster()` function. This figure is essentially a PCA plot with shapes drawn around each cluster to make them distinct from each other. -```{r fig.align = 'center', fig.height=5.5, fig.width=6} +```{r 05-Chapter5-105, fig.align = 'center', fig.height=5.5, fig.width=6} fviz_cluster(cluster_kmeans_3, data = wider_baseline_df) ``` Rather than using the `fviz_cluster()` function as shown in the figure above, we'll extract the data to recreate the sample figure using `ggplot()`. For the manuscript this was necessary, since it was important to facet the plots for each compartment (i.e., NLF, NELF, sputum, and serum). For a single plot, this data extraction isn't required, and the figure above can be further customized within the `fviz_cluster()` function. However, we'll go through the steps of obtaining the indices to recreate the same polygons in `ggplot()` directly. *K*-means actually uses principal component analysis (PCA) to reduce a dataset's dimensionality prior to obtaining the cluster assignments and plotting those clusters. Therefore, to obtain the coordinates of each cytokine within their respective clusters, PCA will need to be run first. -```{r} +```{r 05-Chapter5-106} # First running PCA pca_cytokine <- prcomp(wider_baseline_df, scale = TRUE, center = TRUE) # Only need PC1 and PC2 for plotting, so selecting the first two columns @@ -2030,7 +2030,7 @@ head(baseline_scores_df) ``` Within each cluster, the `chull()` function is used to compute the indices of the points on the convex hull. These are needed for `ggplot()` to create the polygon shapes of each cluster. -```{r} +```{r 05-Chapter5-107} # hull values for cluster 1 cluster_1 <- baseline_scores_df[baseline_scores_df$Cluster == 1, ][chull(baseline_scores_df %>% filter(Cluster == 1)),] @@ -2048,7 +2048,7 @@ head(all_hulls_baseline) ``` Now plotting the clusters using `ggplot()`. 
-```{r fig.align = 'center', fig.height=5.5, fig.width=6} +```{r 05-Chapter5-108, fig.align = 'center', fig.height=5.5, fig.width=6} ggplot() + geom_point(data = baseline_scores_df, aes(x = PC1, y = PC2, color = Cluster, shape = Cluster), size = 4) + # Adding cytokine names @@ -2090,7 +2090,7 @@ Takeaways from this clustering plot: ## Hierarchical Clustering Visualization We can also build a heatmap using the `pheatmap()` function that has the capability to display hierarchical clustering dendrograms. To do so, we'll need to go back and use the `wider_baseline_df` dataframe. -```{r fig.align = 'center', fig.height=7, fig.width=8} +```{r 05-Chapter5-109, fig.align = 'center', fig.height=7, fig.width=8} pheatmap(wider_baseline_df, cluster_cols = FALSE, # hierarchical clustering of cytokines scale = "column", # scaling the data to make differences across cytokines more apparent @@ -2116,7 +2116,7 @@ Nevertheless, let's identify some key takeaways from this heatmap: ## Variable Contributions To answer our final environmental health question: Which cytokines have the greatest contributions to the first two eigenvectors, we'll use the `fviz_contrib()` function that plots the percentage of each variable's contribution to the principal component(s). It also displays a red dashed line, and variables that fall above are considered to have significant contributions to those principal components. For a refresher on PCA and variable contributions, see the previous module, **TAME 2.0 Module 5.4 Unsupervised Machine Learning**. 
-```{r fig.align = 'center'} +```{r 05-Chapter5-110, fig.align = 'center'} # kmeans contributions fviz_contrib(pca_cytokine, choice = "ind", addlabels = TRUE, diff --git a/Chapter_6/.DS_Store b/Chapter_6/.DS_Store deleted file mode 100644 index 3a436f6..0000000 Binary files a/Chapter_6/.DS_Store and /dev/null differ diff --git a/Chapter_6/06-Chapter6.Rmd b/Chapter_6/06-Chapter6.Rmd index 85a5fbe..00d7b26 100644 --- a/Chapter_6/06-Chapter6.Rmd +++ b/Chapter_6/06-Chapter6.Rmd @@ -20,7 +20,7 @@ Our example data are derived from a study in which chemical exposure profiles we ### Workspace Preparation and Data Import -```{r message = FALSE} +```{r 06-Chapter6-1, message = FALSE} # Load packages library(tidyverse) # for data organization and manipulation library(janitor) # for data cleaning @@ -40,7 +40,7 @@ theme_set(theme_bw()) ``` First, we will import our raw chemical data and preview it. -```{r warning = FALSE} +```{r 06-Chapter6-2, warning = FALSE} wrist_data <- read.xlsx("Chapter_6/Module6_1_Input/Module6_1_InputData1.xlsx") %>% mutate(across(everything(), \(x) as.numeric(x))) @@ -72,7 +72,7 @@ Although these steps are somewhat specific to our example dataset, similar steps We can use *tidyverse* functions to quickly tabulate how many days participants wore the wristbands. -```{r} +```{r 06-Chapter6-3} wrist_data %>% # Count number of participants for each number of days @@ -100,7 +100,7 @@ wrist_data %>% Because a few participants did not wear their wristbands for all seven days, it will be important to further explore whether there are outlier participants and to normalize the chemical concentrations by number of days the wristband was worn. We can first assess whether any participants have a particularly low or high number of chemicals detected relative to the other participants. We'll prepare the data for graphing by creating a dataframe containing information about how many chemicals were detected per participant. 
-```{r} +```{r 06-Chapter6-4} wrist_det_by_participant <- wrist_data %>% # Remove Ndays column because we don't need it for this step @@ -120,7 +120,7 @@ datatable(wrist_det_by_participant) ``` Then, we can make our histogram: -```{r warning = FALSE, fig.align = "center"} +```{r 06-Chapter6-5, warning = FALSE, fig.align = "center"} det_per_participant_graph <- ggplot(wrist_det_by_participant, aes(x = n_det)) + geom_histogram(color = "black", fill = "gray60", @@ -145,7 +145,7 @@ From this histogram, we can see that the number of chemicals detected per partic Next, we want to apply a chemical detection filter to remove chemicals from the dataset with very low detection. To start, let's make a dataframe summarizing the percentage of participants in which each chemical was detected and graph this distribution using a histogram. -```{r} +```{r 06-Chapter6-6} # Create dataframe where n_detected is the sum of the rows where there are not NA values chemical_counts <- data.frame(n_detected = colSums(!is.na(wrist_data %>% select(-c(S_ID, Ndays))))) %>% @@ -164,7 +164,7 @@ chemical_counts <- data.frame(n_detected = colSums(!is.na(wrist_data %>% select( datatable(chemical_counts) ``` -```{r fig.align = "center"} +```{r 06-Chapter6-7, fig.align = "center"} det_per_chemical_graph <- ggplot(chemical_counts, aes(x = perc_detected)) + geom_histogram(color = "black", fill = "gray60", @@ -183,7 +183,7 @@ det_per_chemical_graph ``` From this histogram, we can see that many of the chemicals fall in the < 15% or > 90% detection range, with the others distributed evenly between 20 and 90% detection. How we choose to filter our data in part depends on the goals of our analysis. 
For example, if we only want to keep chemicals detected for almost all of the participants, we could set our threshold at 90% detection: -```{r fig.align = "center"} +```{r 06-Chapter6-8, fig.align = "center"} # Add annotation column chemical_counts <- chemical_counts %>% mutate(det_filter_90 = ifelse(perc_detected > 90, "Yes", "No")) @@ -213,7 +213,7 @@ det_per_chemical_graph_90 However, this only keeps 34 chemicals in our dataset, which is a significant proportion of all of the chemicals measured. We could also consider setting the filter at 20% detection to maximize inclusion of as many chemicals as possible. -```{r fig.align = "center"} +```{r 06-Chapter6-9, fig.align = "center"} # Add annotation column chemical_counts <- chemical_counts %>% mutate(det_filter_20 = ifelse(perc_detected > 20, "Yes", "No")) @@ -252,7 +252,7 @@ det_per_chemical_graph_20 We'll use the 20% detection filter for downstream analyses to maximize inclusion of data for our study. Note that selection of data filters is highly project- and goal- dependent, so be sure to take into consideration typical workflows for your type of data, study, or lab group. -```{r} +```{r 06-Chapter6-10} # Create vector of chemicals to keep chemicals_20perc <- chemical_counts %>% filter(perc_detected > 20) %>% @@ -266,7 +266,7 @@ wrist_data_filtered <- wrist_data %>% We can also summarize chemical detection vs. non-detection by chemical class to understand the number of chemicals in each class that were 1) detected in any participant or 2) detected in more than 20% of participants. 
-```{r} +```{r 06-Chapter6-11} chemical_count_byclass <- chemical_counts %>% separate(class_chemical, into = c("class", NA), remove = FALSE, sep = "_") %>% group_by(class) %>% @@ -291,7 +291,7 @@ Next, we will check to see if any participants are outliers based on the entire Here, we'll read in the fully cleaned and processed data, which contains data for all 97 participants and the 62 chemicals that passed the detection filter (imputed, time-weighted). We will also apply log2 transformation to move the data closer to a normal distribution. For more on these steps, see **TAME 2.0 Module 3.3 Normality Tests and Data Transformations** and **TAME 2.0 Module 4.2 Data Import, Processing, and Summary Statistics**. -```{r} +```{r 06-Chapter6-12} wrist_data_cleaned <- read.xlsx("Chapter_6/Module6_1_Input/Module6_1_InputData2.xlsx") %>% column_to_rownames("S_ID") %>% mutate(across(everything(), \(x) log2(x+1))) @@ -300,7 +300,7 @@ datatable(wrist_data_cleaned[ 1:6]) ``` First, let's run PCA and plot our data. -```{r fig.align = "center"} +```{r 06-Chapter6-13, fig.align = "center"} # Prepare dataframe wrist_data_cleaned_scaled <- wrist_data_cleaned %>% scale() %>% data.frame() @@ -326,7 +326,7 @@ By visual inspection, it looks like there may be some outliers, so we can use a We can apply this approach to our data by first creating a function to detect PCA outliers based on whether or not that participant passed a certain standard deviation cutoff. -```{r} +```{r 06-Chapter6-14} # Create a function to detect PCA sample outliers. The input is the PCA results data frame and the number of standard deviations for the cutoff. The output is outlier names. outlier_detection = function(pca_df, sd){ @@ -363,7 +363,7 @@ Now that we have explored our dataset and finished processing the data, we can m There are many ways to generate summary statistics tables in R. 
Here, we will demonstrate a method using the `map_dfr()` function, which takes a list of functions and applies them across columns of the data. The summary statistics are then placed in rows, with each column representing a variable. -```{r warning = FALSE} +```{r 06-Chapter6-15, warning = FALSE} # Define summary functions summary_functs <- lst(min, median, mean, max) @@ -375,7 +375,7 @@ datatable(summarystats_raw[, 1:6]) ``` Through a few cleaning steps, we can transpose and format these data so that they are publication-quality. -```{r} +```{r 06-Chapter6-16} summarystats_raw <- summarystats_raw %>% # Transpose dataframe and return to dataframe class @@ -402,7 +402,7 @@ datatable(summarystats_raw) We can apply the same steps to the cleaned data. -```{r} +```{r 06-Chapter6-17} summarystats_cleaned <- map_dfr(summary_functs, ~ summarise(wrist_data_cleaned, across(1:ncol(wrist_data_cleaned), .x, na.rm = TRUE)), .id = "statistic") %>% t() %>% as.data.frame() %>% @@ -418,7 +418,7 @@ datatable(summarystats_cleaned) Finally, we will merge the data from our `chemical_counts` dataframe (which contains detection information for all of our chemicals) with our summary statistics dataframes. -```{r} +```{r 06-Chapter6-18} summarystats_final <- chemical_counts %>% # Remove 90% detection filter column @@ -438,14 +438,14 @@ datatable(summarystats_final, width = 600) Another important element of any analysis of human data is the demographics table. The demographics table provides key information about the study participants and can help inform downstream analyses, such as exploration of the impact of covariates on the endpoint of interest. There are many different ways to make demographics tables in R. Here, we will demonstrate making a demographics table with the *table1* package. 
For more on this package, including making tables with multiple groups and testing for statistical differences in demographics between groups, see the *table1* vignette [here](https://benjaminrich.github.io/table1/vignettes/table1-examples.html). First, we'll read in and view our demographic data: -```{r} +```{r 06-Chapter6-19} demo_data <- read.xlsx("Chapter_6/Module6_1_Input/Module6_1_InputData3.xlsx") datatable(demo_data) ``` Then, we can create new labels for our variables so that they are more nicely formatted and more intuitive for display in the table. -```{r} +```{r 06-Chapter6-20} # Create new labels for the demographics table label(demo_data$mat_age_birth) <- "Age at Childbirth" label(demo_data$pc_sex) <- "Sex" @@ -456,7 +456,7 @@ label(demo_data$pc_ed) <- "Educational Attainment" ``` Our demographics data also had "F" for female in the sex column. We can change this to "Female" so that the demographics table is more readable. -```{r} +```{r 06-Chapter6-21} demo_data <- demo_data %>% mutate(pc_sex = dplyr::recode(pc_sex, "F" = "Female")) @@ -464,7 +464,7 @@ label(demo_data$pc_sex) <- "Sex" ``` Now, let's make the table. The first argument in the formula is all of the columns you want to include in the table, followed by the input dataframe. -```{r} +```{r 06-Chapter6-22} table1(~ mat_age_birth + pc_sex + pc_gender + pc_latino_hispanic + pc_race_cleaned + pc_ed, data = demo_data) ``` @@ -476,7 +476,7 @@ There are a couple of steps we could take to clean up the table: 2. Order educational attainment so that it progresses from least to most education. We can change the rendering for our continuous variable by defining our own rendering function (as demonstrated in the package's vignette). 
-```{r} +```{r 06-Chapter6-23} # Create function for custom table so that Mean (SD) is shown for continuous variables my.render.cont <- function(x) { with(stats.apply.rounding(stats.default(x), digits=2), @@ -485,7 +485,7 @@ my.render.cont <- function(x) { ``` We can order the education attainment by changing it to a factor and defining the levels. -```{r} +```{r 06-Chapter6-24} demo_data <- demo_data %>% mutate(pc_ed = factor(pc_ed, levels = c("High School or GED", "Associate Degree", "Four-Year Degree", "Master's Degree", "Professional Degree or PhD"))) @@ -494,7 +494,7 @@ label(demo_data$pc_ed) <- "Educational Attainment" ``` Then, we can make our final table. -```{r} +```{r 06-Chapter6-25} table1(~ mat_age_birth + pc_sex + pc_gender + pc_latino_hispanic + pc_race_cleaned + pc_ed, data = demo_data, render.continuous = my.render.cont) @@ -563,7 +563,7 @@ Traditional molecular biology techniques typically evaluate the function(s) of i To further understand the molecular consequences of -omics-based alterations, molecules can be overlaid onto molecular networks to uncover biological pathways and molecular functions that are perturbed at the systems biology level. An overview of these generally methods, starting with high-content technologies and ending of systems biology, is provided in the below figure (created with BioRender.com). -```{r, echo=FALSE, fig.align='center'} +```{r 06-Chapter6-26, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_6/Module6_2_Input/Module6_2_Image1.png") ``` @@ -587,7 +587,7 @@ Parallel to human genomics/epigenomics-based research is the newer "-omics" topi ## Introduction to Transcriptomics One of the most widely evaluated -omics endpoints is messenger RNA (mRNA) expression (also termed gene expression). 
As a reminder, mRNA molecules are a major type of RNA produced as the "middle step" in the [Central Dogma Theory](https://en.wikipedia.org/wiki/Central_dogma_of_molecular_biology#:~:text=The%20central%20dogma%20of%20molecular,The%20Central%20Dogma), which describes how genetic DNA is first transcribed into RNA and then translated into protein. Protein molecules are ultimately the major regulators of cellular processes and overall health. Therefore, any perturbations to this process (including changes to mRNA expression levels) can have tremendous consequences on overall cell function and health. A visualization of these steps in the Central Dogma theory are included below. -```{r, echo=FALSE, fig.align='center'} +```{r 06-Chapter6-27, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_6/Module6_2_Input/Module6_2_Image2.png") ``` @@ -651,7 +651,7 @@ if (!requireNamespace("piano")) #### Loading R packages required for this session -```{r, message=FALSE, warning=FALSE, error=FALSE} +```{r 06-Chapter6-28, message=FALSE, warning=FALSE, error=FALSE} library(tidyverse) library(DESeq2) library(edgeR) @@ -667,7 +667,7 @@ library(piano) #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 06-Chapter6-29, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` @@ -687,17 +687,17 @@ sampleinfo <- read.csv(file = "Chapter_6/Module6_2_Input/Module6_2_InputData2_Sa ### Data Viewing Let's see how many rows and columns of data are present in the countdata dataframe -```{r} +```{r 06-Chapter6-30} dim(countdata) ``` Let's also view the column headers -```{r} +```{r 06-Chapter6-31} colnames(countdata) ``` And finally let's view the top few rows of data -```{r} +```{r 06-Chapter6-32} head(countdata) ``` Together, this dataframe contains information across 30146 mRNA identifiers, that are labeled according to "Gene name" followed by an underscore and probe number assigned by the platform used in this analysis, BioSpyder TempoSeq 
Technologies. @@ -706,17 +706,17 @@ A total of 23 columns are included in this dataframe, the first of which represe Let's also see what the metadata dataframe looks like -```{r} +```{r 06-Chapter6-33} dim(sampleinfo) ``` Let's also view the column headers -```{r} +```{r 06-Chapter6-34} colnames(sampleinfo) ``` And finally let's view the top few rows of data -```{r} +```{r 06-Chapter6-35} head(sampleinfo) ``` Together, this dataframe contains information across the 22 total samples, that are labeled according to "SampleID_BioSpyderCountFile" header. These identifiers match those used as column headers in the countdata dataframe. @@ -736,7 +736,7 @@ A total of 9 columns are included in this dataframe, including the following: ### Checking for Duplicate mRNA IDs One common QC/preparation step that is helpful when organizing transcriptomics data is to check for potential duplicate mRNA IDs in the countdata. -```{r} +```{r 06-Chapter6-36} # Visualize this data quickly by viewing top left corner, to check where ID column is located: countdata[1:3,1:5] @@ -763,7 +763,7 @@ In this case, because all potential duplicate checks turn up "FALSE", these data Most of the statistical analyses included in this training module will be carried out using the DESeq2 pipeline. This package requires that the count data and sample information data be formatted in a certain manner, which will expedite the downstream coding needed to carry out the statistics. Here, we will walk users through these initial formatting steps. DESeq2 first requires a `coldata` dataframe, which includes the sample information (i.e., metadata). 
Let's create this new dataframe based on the original `sampleinfo` dataframe: -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-37, message=F, warning=F, error=F} coldata <- sampleinfo ``` @@ -771,24 +771,24 @@ coldata <- sampleinfo DESeq2 also requires a `countdata` dataframe, which we've previously created; however, this dataframe requires some minor formatting before it can be used as input for downstream script. First, the gene identifiers need to be converted into row names: -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-38, message=F, warning=F, error=F} countdata <- countdata %>% column_to_rownames("Gene") ``` Then, the column names need to be edited. Let's remind ourselves what the column names are currently: -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-39, message=F, warning=F, error=F} colnames(countdata) ``` These column identifiers need to be converted into more intuitive sample IDs, that also indicate treatment. This information can be found in the coldata dataframe. Specifically, information in the column labeled `SampleID_BioSpyderCountFile` will be helpful for these purposes. To replace these original column identifiers with these more helpful sample identifiers, let's first make sure the order of the countdata columns are in the same order as the coldata column of `SampleID_BioSpyderCountFile`: -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-40, message=F, warning=F, error=F} countdata <- setcolorder(countdata, as.character(coldata$SampleID_BioSpyderCountFile)) ``` Now, we can rename the column names within the countdata dataframe with these more helpful identifiers, since both dataframes are now arranged in the same order: -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-41, message=F, warning=F, error=F} colnames(countdata) <- coldata$ID # Rename the countdata column names with the treatment IDs. 
colnames(countdata) # Viewing these new column names ``` @@ -796,7 +796,7 @@ These new column identifiers look much better, and can better inform downstream When relabeling dataframes, it's always important to triple check any of these major edits. For example, here, let's double check that the same samples appear in the same order between the two working dataframes required for dowstream DESeq2 code: -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-42, message=F, warning=F, error=F} setequal(as.character(coldata$ID), colnames(countdata)) identical(as.character(coldata$ID), colnames(countdata)) ``` @@ -883,7 +883,7 @@ Prior to final statistical analysis, raw transcriptomic data are commonly evalua Let's start by using PCA to identify potential outliers, while providing a visualization of potential sources of variation across the dataset. First we need to move the Gene column back to the rownames so our dataframe is numeric and we can run the PCA script -```{r, message=FALSE, warning=FALSE, error=FALSE} +```{r 06-Chapter6-43, message=FALSE, warning=FALSE, error=FALSE} countdata <- countdata %>% column_to_rownames("Gene") # Let's remind ourselves what these data look like @@ -892,13 +892,13 @@ countdata[1:10,1:5] #viewing first 10 rows and 5 columns Then we can calculate principal components using transposed count data -```{r} +```{r 06-Chapter6-44} pca <- prcomp(t(countdata)) ``` And visualize the percent variation captured by each principal component (PC) with a scree plot -```{r, fig.align='center'} +```{r 06-Chapter6-45, fig.align='center'} # We can generate a scree plot that shows the eigenvalues of each component, indicating how much of the total variation is captured by each component fviz_eig(pca, addlabels = TRUE) ``` @@ -908,7 +908,7 @@ This scree plot indicates that nearly all variation is explained in PC1 and PC2, #### Visualization of Transcriptomic Data using PCA Further visualization of how these transcriptomic data appear through PCA can be 
produced through a scatter plot showing the data reduced values per sample: -```{r, fig.align='center', warning = FALSE} +```{r 06-Chapter6-46, fig.align='center', warning = FALSE} # Calculate the percent variation captured by each PC pca_percent <- round(100*pca$sdev^2/sum(pca$sdev^2),1) @@ -934,14 +934,14 @@ With this plot, we can see that samples do not demonstrate obvious groupings, wh #### Now lets implement hierarchical clustering to identify potential outliers First we need to create a dataframe of our transposed `countdata` such that samples are rows and genes are columns to input into the clustering algorithm. -```{r} +```{r 06-Chapter6-47} countdata_for_clustering <- t(countdata) countdata_for_clustering[1:5,1:10] # Viewing what this transposed dataframe looks like ``` Next we can run hierarchical clustering in conjunction with the generation of a heatmap. Note that we scale these data for improved visualization. -```{r, fig.align='center'} +```{r 06-Chapter6-48, fig.align='center'} pheatmap(scale(countdata_for_clustering), main="Hierarchical Clustering", cluster_rows=TRUE, cluster_cols = FALSE, fontsize_col = 7, treeheight_row = 60, show_colnames = FALSE) @@ -981,7 +981,7 @@ Here, we leverage the package called *RUVseq* to employ RUV on this sequencing d #### Steps in carrying out RUV using RUVseq on this example dataset: -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-49, message=F, warning=F, error=F} # First we store the treatment IDs and exposure conditions as a separate vector ID <- coldata$ID @@ -1002,14 +1002,14 @@ trt_groups ``` *RUVseq* contains its own set of plotting and normalization functions, though requires input of what's called an object of S4 class SeqExpressionSet. 
Let's go ahead and make this object, using the *RUVseq* function `newSeqExpressionSet()`: -```{r} +```{r 06-Chapter6-50} exprSet <- newSeqExpressionSet(as.matrix(countdata),phenoData = data.frame(groups,row.names=colnames(countdata))) ``` And then use this object to generate some exploratory plots using built-in tools within *RUVseq*. First starting with some bar charts summarizing overall data distributions per sample: -```{r, fig.align='center'} +```{r 06-Chapter6-51, fig.align='center'} colors <- brewer.pal(4, "Set2") plotRLE(exprSet, outline=FALSE, ylim=c(-4, 4), col=colors[groups]) ``` @@ -1018,7 +1018,7 @@ We can see from this plot that some of the samples show distributions that may v Then viewing a PCA plot of these samples: -```{r, fig.align='center'} +```{r 06-Chapter6-52, fig.align='center'} colors <- brewer.pal(4, "Set2") plotPCA(exprSet, col=colors[groups], cex=1.2) ``` @@ -1027,7 +1027,7 @@ This PCA plot shows pretty good data distributions, with samples mainly showing Now to actually run the RUVseq algorithm, to control for potential sources of sample heterogeneity, we need to first construct a matrix specifying the replicates (samples of the same exposure condition): -```{r} +```{r 06-Chapter6-53} # Construct a matrix specifying the replicates (samples of the same exposure condition) for running RUV differences <- makeGroups(groups) @@ -1039,7 +1039,7 @@ This matrix groups the samples by exposure condition. Here, each of the four row Let's now implement the RUVseq algorithm and, for this example, capture one factor (k=1) of unwanted variation. Note that the k parameter can be modified to capture additional factors, if necessary. 
-```{r} +```{r 06-Chapter6-54} # Now capture 1 factor (k=1) of unwanted variation ruv_set <- RUVs(exprSet, rownames(countdata), k=1, differences) ``` @@ -1048,21 +1048,21 @@ ruv_set <- RUVs(exprSet, rownames(countdata), k=1, differences) This results in a list of objects within `ruv_set`, which include the following important pieces of information: (1) Estimated factors of unwanted variation are provided in the phenoData object, as viewed using the following: -```{r} +```{r 06-Chapter6-55} # viewing the estimated factors of unwanted variation in the column W_1 pData(ruv_set) ``` (2) Normalized counts obtained by regressing the original counts on the unwanted factors (normalizedCounts object within `ruv_set`). Note that the normalized counts should only used for exploratory purposes and not subsequent differential expression analyses. For additional information on this topic, please refer official *RUVSeq* documentation. The normalized counts can be viewed using the following: -```{r} +```{r 06-Chapter6-56} # Viewing the head of the normalized count data, accounting for unwanted variation head(normCounts(ruv_set)) ``` Let's again generate an exploratory plot using this updated dataset, focusing on the bar chart view since that was the most informative pre-RUV. Here are the updated bar charts summarizing overall data distributions per sample: -```{r, fig.align='center'} +```{r 06-Chapter6-57, fig.align='center'} colors <- brewer.pal(4, "Set2") plotRLE(ruv_set, outline=FALSE, ylim=c(-4, 4), col=colors[groups]) ``` @@ -1114,7 +1114,7 @@ Note that these calculations, among others, are embedded within the DESeq2 funct Here we provide example script that is used to identify which genes are significantly differentially expressed in association with the example biomass smoke exposures, smoldering pine needles and flaming pine needles, as well as a positive inflammation control, LPS. 
First, we need to set-up the DESeq2 experiment: -```{r, message=FALSE, warning=FALSE, error=FALSE} +```{r 06-Chapter6-58, message=FALSE, warning=FALSE, error=FALSE} # Set up our experiment using our RUV adjusted count and phenotype data. # Our design indicates that our count data is dependent on the exposure condition (groups variable) and our factor of unwanted variation, and we have specified that there not be an intercept term through the use of '~0' dds <- DESeqDataSetFromMatrix(countData = counts(ruv_set), # Grabbing count data from the 'ruv_set' object @@ -1147,7 +1147,7 @@ vsd_matrix <- as.matrix(assay(vsd)) ``` We could also export them using code such as: -```{r eval = FALSE} +```{r 06-Chapter6-59, eval = FALSE} # Export data write.csv(normcounts, "Chapter_6/Module6_2_Input/Module6_2_Output_NormalizedCounts.csv") write.csv(vsd_matrix, "Chapter_6/Module6_2_Input/Module6_2_Output_VSDCounts.csv", row.names=TRUE) @@ -1165,7 +1165,7 @@ which we can easily code for using a loop function, as detailed below. Note that we have commented out the line of code for writing out the CSV because we do not need it for the rest of the module, but this could be used if you need to write out and view results in an external application such as Excel for supplementary materials. -```{r, message=FALSE, warning=FALSE, error=FALSE} +```{r 06-Chapter6-60, message=FALSE, warning=FALSE, error=FALSE} # Run experiment dds_run <- DESeq(dds, betaPrior=FALSE) @@ -1224,7 +1224,7 @@ for (trt in trt_groups){ # Iterate for each of the treatments listed in 'trt_gro Here, we leverage MA plots to show how log fold changes relate to expression levels. In these plots, the log fold change is plotted on the y-axis and expression values are plotted along the x-axis, and dots are colored according to statistical significance (using padj<0.05 as the statistical filter). Here we will generate an MA plot for Flaming Pine Needles. 
-```{r, message=F, warning=F, error=F, fig.align='center'} +```{r 06-Chapter6-61, message=F, warning=F, error=F, fig.align='center'} res <- results(dds_run, pAdjustMethod = "BH", contrast = c("groups","PineNeedlesFlame_4h_Lung",ctrl)) # Re-extract the DESeq2 results for the flaming pine needles MA <- data.frame(res) # Make a preliminary dataframe of the flaming pine needle results @@ -1259,7 +1259,7 @@ An appropriate title for this figure could be: Similar to MA plots, volcano plots provide visualizations of fold changes in expression from transcriptomic data. However, instead of plotting these values against expression, log fold change is plotted against (adjusted) p-values in volcano plots. Here, we use functions within the *[EnhancedVolcano package](https://www.rdocumentation.org/packages/EnhancedVolcano/versions/1.11.3/topics/EnhancedVolcano)* to generate a volcano plot for Flaming Pine Needles. Running the `EnhancedVolcano()` function to generate an example volcano plot: -```{r, message=FALSE, warning=FALSE, error=FALSE, fig.align='center', out.width = 700, out.height = 580} +```{r 06-Chapter6-62, message=FALSE, warning=FALSE, error=FALSE, fig.align='center', out.width = 700, out.height = 580} Vol <- data.frame(res) # Dataset to use for plotting EnhancedVolcano(Vol, @@ -1297,7 +1297,7 @@ To detail, the following input data are required to run *PIANO*: (1) Background gene set: -```{r} +```{r 06-Chapter6-63} # First grab the rownames of the 'res' object, which was redefined as the DESeq2 results for flaming pine needles prior to MA plot generation, and remove the BioSpyder numeric identifier using a sub function, while maintaining the gene symbol and place these IDs into a new list within the 'res' object (saved as 'id') res$id <- gsub("_.*", "", rownames(res)); @@ -1312,7 +1312,7 @@ Background[1:200] # viewing the first 200 genes in this background list ``` (2) The list of genes identified with significant differential expression associated with flaming pine 
needles: -```{r} +```{r 06-Chapter6-64} # Similar to the above script, but starting with the res$id object # and filtering for genes with padj < 0.05 @@ -1328,7 +1328,7 @@ Therefore, this gene set includes 488 *unique* genes significantly associated wi (3) The underlying KEGG pathway dataset. Note that this file was simply downloaded from [MSigDB](https://www.gsea-msigdb.org/gsea/msigdb/), ready for upload as a .gmt file. Here, we use the `loadGSC()` function enabled through the *PIANO* package to upload and organize these pathways. -```{r} +```{r 06-Chapter6-65} KEGG_Pathways <- loadGSC(file="Chapter_6/Module6_2_Input/Module6_2_InputData3_KEGGv7.gmt", type="gmt") length(KEGG_Pathways$gsc) # viewing the number of biological pathways contained in the database @@ -1337,7 +1337,7 @@ This KEGG pathway database therefore includes 186 biological pathways available With these data inputs ready, we can now run the pathway enrichment analysis. The enrichment statistic that is commonly employed through the *PIANO* package is based on a hypergeometric test, run through the `runGSAhyper()` function. This returns a p-value for each gene set from which you can determine enrichment status.
-```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-66, message=F, warning=F, error=F} # Running the piano function based on the hypergeometric statistic Results_GSA <- piano::runGSAhyper(genes=SigGenes, universe=Background,gsc=KEGG_Pathways, gsSizeLim=c(1,Inf), adjMethod = "fdr") @@ -1352,7 +1352,7 @@ This dataframe therefore summarizes the enrichment p-value for each pathway, FDR With these results, let's identify which pathways meet a statistical enrichment p-value filter of 0.05: -```{r} +```{r 06-Chapter6-67} SigPathways <- PathwayResults[which(PathwayResults$`p-value` < 0.05),] rownames(SigPathways) ``` @@ -1482,7 +1482,7 @@ Though statistical methodologies are still evolving, we will be discussing our c There are many methods that can be implemented to also elucidate relationships between individual chemicals/chemical groups in complex mixtures and their resulting toxicity/health effects. Some of the more common methods used in mixtures analyses, as identified by our team, are summarized in the below figure according to potential questions that could be asked in a study. Two of the methods, specifically quantile based g-computation (qgcomp) and bayesian kernel machine regression (BKMR) are highlighted as example mixtures scripted activities (qgcomp in this script and BKMR in Mixtures Methods 2). Throughout TAME 2.0 training materials, other methods are included such as Principal Component Analysis (PCA), K-means clustering, hierarchical clustering, and predictive modeling / machine learning (e.g., Random Forest modeling and variable selection). 
The following figure provides an overview of the types of questions that can be asked regarding mixtures and models that are commonly used to answer these questions: -```{r, echo=FALSE, fig.align='center'} +```{r 06-Chapter6-68, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_6/Module6_3_Input/Module6_3_Mixtures_Methods_Overview.png") ``` @@ -1527,7 +1527,7 @@ library(qgcomp) Optionally, you can also create a current date variable to name output files, and create an output folder. -```{r eval = FALSE} +```{r 06-Chapter6-69, eval = FALSE} # Create a current date variable to name outputfiles cur_date <- str_replace_all(Sys.Date(),"-","") @@ -1536,7 +1536,7 @@ Output_Folder <- ("Module6_3_Output/") ``` ### Data Import -```{r} +```{r 06-Chapter6-70} cohort <- read.csv(file="Chapter_6/Module6_3_Input/Module6_3_InputData.csv") colnames(cohort) head(cohort) @@ -1568,7 +1568,7 @@ Other variables of interest (outcome and covariates) in this dataset: #### Check variable formats Ensure that the outcome variable is binomial (factor) and has the correct reference level. Ensure that the exposure variables are categorical (factors). Ensure that covariates are in the correct variable format -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-71, message=F, warning=F, error=F} #outcome variable cohort <- cohort %>% @@ -1610,7 +1610,7 @@ cohort <- cohort %>% #### Fit adjusted logistic regression models for each metal, for each categorical variable First, we will fit an adjusted logistic regression model for each metal, for each categorical variable, to demonstrate a variable by variable approach before diving into mixtures methods. Note that there are different regression techniques (linear and logistic are covered in another TAME module) and that here we will start with using percentage variables. 
-```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-72, message=F, warning=F, error=F} metals <- c("Arsenic","Cadmium","Chromium", "Copper","Lead","Manganese","Zinc") @@ -1647,7 +1647,7 @@ for (i in 1:length(metals)) { ``` Plot the results: -```{r, message=F, warning=F, error=F, fig.align='center'} +```{r 06-Chapter6-73, message=F, warning=F, error=F, fig.align='center'} perc_plots <- ggarrange(Arsenic_adj_perc_plot, @@ -1663,7 +1663,7 @@ plot(perc_plots1) ``` Save the plots: -```{r, eval = FALSE} +```{r 06-Chapter6-74, eval = FALSE} tiff(file = (paste0(Output_Folder,"/", cur_date, "_NCbirths_pretermbirth_singlemetal_adjusted_models_percplots_1.tiff")), width = 10, height = 8, units = "in", pointsize = 12, res = 600) plot(perc_plots) dev.off() @@ -1675,7 +1675,7 @@ dev.off() ``` We can also run the analysis using limit variables: -```{r, message=F, warning=F, error=F, fig.align='center'} +```{r 06-Chapter6-75, message=F, warning=F, error=F, fig.align='center'} for (i in 1:length(metals)) { metal <- metals[[i]] @@ -1712,7 +1712,7 @@ Note: you will get this warning for some of the models: This is because for the variability in the exposure data, ideally the sample size would be larger (as noted above the analysis this draws from was completed on >1.3million observations). Plot the results: -```{r, message=F, warning=F, error=F, fig.align='center'} +```{r 06-Chapter6-76, message=F, warning=F, error=F, fig.align='center'} limit_plots <- ggarrange(Arsenic_adj_limit_plot, Cadmium_adj_limit_plot, Chromium_adj_limit_plot, @@ -1728,7 +1728,7 @@ plot(limit_plots1) ``` Save the plots: -```{r, eval = FALSE} +```{r 06-Chapter6-77, eval = FALSE} tiff(file = (paste0(Output_Folder,"/", cur_date, "_NCbirths_pretermbirth_singlemetal_adjusted_models_limitplots1.tiff")), width = 10, height = 8, units = "in", pointsize = 12, res = 600) plot(limit_plots) dev.off() @@ -1739,7 +1739,7 @@ dev.off() ``` Merge all of the logistic regression model results. 
This is the data frame that you could export for supplementary material or to view the results in Excel. -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-78, message=F, warning=F, error=F} #merge all model output results_df <- rbind(Arsenic_adj_perc, Arsenic_adj_limit, Cadmium_adj_perc, Cadmium_adj_limit, @@ -1751,7 +1751,7 @@ results_df <- rbind(Arsenic_adj_perc, Arsenic_adj_limit, ``` To select only the coefficients related to the primary exposures: -```{r} +```{r 06-Chapter6-79} results_df <- results_df %>% filter(str_detect(term, 'limit|50to90|over90')) ``` @@ -1773,7 +1773,7 @@ This file outputs the coefficients and the odds ratios (OR) of the logistic regr While the single contaminant models provide useful information, they cannot inform us of the effect of multiple simultaneous exposures or account for co-occurring contaminant confounding. Therefore, we want to utilize quantile g-computation to assess mixtures. ## Mixtures Model with Standard qgcomp -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-80, message=F, warning=F, error=F} #list of exposure variables Xnm <- c('Arsenic.Mean_avg', 'Cadmium.Mean_avg', 'Lead.Mean_avg', 'Manganese.Mean_avg', 'Chromium.Mean_avg', 'Copper.Mean_avg', 'Zinc.Mean_avg') #list of covariates @@ -1788,7 +1788,7 @@ PTB_adj_ppb <- qgcomp.noboot(preterm~., In English, `preterm~.` is saying fit a model that has preterm (1/0) as the dependent variable and then the independent variables (exposures and covariates) are all other variables in the dataset (`.`). `expnms=Xnm` is saying that the mixture of exposures is given by the vector `Xnm,` defined above. `dat=cohort[,c(Xnm,covars,'preterm')]` is saying that the dataset to be used to fit this model includes all columns in the cohort dataset that are listed in the `Xnm` and `covars` vectors and also the `preterm` variable. `family=binomial()` is saying that the outcome is a binary outcome and therefore the model will fit a logistic regression model.
`q=4` is saying break the exposures into quartiles, other options would be q=3 for tertiles, q=5 for quintiles and so forth. This is a summary of the qgcomp model output -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-81, message=F, warning=F, error=F} PTB_adj_ppb ``` This output can be interpreted as: @@ -1801,12 +1801,12 @@ This output can be interpreted as: IMPORTANT NOTE: as described above, these results differ from the publication (Eaves et al. 2023) because this scripted example is conducted on a smaller subsetted dataset. This is the plot that gives you the weights of the components -```{r, message=F, warning=F, error=F, fig.align='center'} +```{r 06-Chapter6-82, message=F, warning=F, error=F, fig.align='center'} plot(PTB_adj_ppb) ``` To save the plot: -```{r, eval = FALSE} +```{r 06-Chapter6-83, eval = FALSE} tiff(file = (paste0(Output_Folder,"/", cur_date, "_NCbirths_pretermbirth_qgcomp_weights.tiff")), width = 10, height = 8, units = "in", pointsize = 12, res = 600) plot(PTB_adj_ppb) dev.off() @@ -1847,7 +1847,7 @@ We can export the mixtures modeling results using the following code, which stor + Results_SlopeParams outputs the overall mixture effect results + Results_MetalCoeffs outputs the individual mixture components (metals) coefficients. Note that this will also output coefficient for covariates included in the model. + Results_MetalWeights outputs the individual mixture components (metals) weights -```{r, message=F, warning=F, error=F, eval = FALSE} +```{r 06-Chapter6-84, message=F, warning=F, error=F, eval = FALSE} allmodels <- c("PTB_adj_ppb") #if you run more than one qgcomp model, list them here and the following code can output the results in clean format all together clean_print <- function(x){ @@ -2021,7 +2021,7 @@ library(bkmr) ``` Optionally, you can also create a current date variable to name output files, and create an output folder.
-```{r eval = FALSE} +```{r 06-Chapter6-85, eval = FALSE} #Create a current date variable to name outputfiles cur_date <- str_replace_all(Sys.Date(),"-","") @@ -2030,7 +2030,7 @@ Output_Folder <- ("Module6_4_Output/") ``` ### Data Import -```{r} +```{r 06-Chapter6-86} cohort <- read.csv(file="Chapter_6/Module6_4_Input/Module6_4_InputData.csv") colnames(cohort) head(cohort) @@ -2071,7 +2071,7 @@ In addition, it is highly recommended to conduct single-contaminant modeling ini First, define a matrix/vector of the exposure mixture, outcome, and confounders/covariates. BKMR performs better when the exposures are on a similar scale and when there are not outliers. Thus, we center and scale the exposure variables first. As noted above, in a complete analysis, thorough examination of exposure variable distributions, including outliers and normality, would be conducted before any exposure-outcome modeling. For more information on normality testing, see **TAME 2.0 Module 3.3 Normality Tests and Data Transformations.** First, we'll assign the matrix variables to their own data frame and scale the data. -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-87, message=F, warning=F, error=F} #exposure mixture variables mixture <- as.matrix(cohort[,10:17]) mixture <- log(mixture) @@ -2080,7 +2080,7 @@ summary(mixture) ``` Then, we'll define the outcome variable and ensure it is the proper class and leveling. -```{r} +```{r 06-Chapter6-88} #outcome variable cohort$inflam_intense <-as.factor(cohort$inflam_intense) cohort$inflam_intense <- relevel(cohort$inflam_intense, ref = "0") @@ -2088,13 +2088,13 @@ y<-as.numeric(as.character(cohort$inflam_intense)) ``` Next, we'll assign the covariates to a matrix. -```{r} +```{r 06-Chapter6-89} #covariates covariates<-as.matrix(cohort[,7:9]) ``` Then, we can fit the BKMR model. Note that this script will take a few minutes to run. 
-```{r} +```{r 06-Chapter6-90} set.seed(111) fitkm <- kmbayes(y = y, Z = mixture, X = covariates, iter = 5000, verbose = FALSE, varsel = TRUE, family="binomial", est.h = TRUE) ``` @@ -2103,7 +2103,7 @@ For full information regarding options for the kmbayes function, refer to the BK ### Assess Variable Importance BKMR conducts a variable selection procedure and generates posterior inclusion probabilities (PIP). The larger the PIP, the more a variable is contributing to the overall exposure-outcome effect. These are relative to each other,so there is no threshold as to when a variable becomes an "important" contributor (similar to the weights in quantile g-computation). -```{r, message=F, warning=F, error=F} +```{r 06-Chapter6-91, message=F, warning=F, error=F} ExtractPIPs(fitkm) ``` @@ -2123,13 +2123,13 @@ Relative to each other, the contributions to the effect of the mixture on neonat We can use trace plots to evaluate how the parameters in the model converge over the many iterations. We hope to see that the line moves randomly but centers around a straight line -```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-92, message=F, warning=F, error=F, fig.align = "center"} sel<-seq(0,5000,by=1) TracePlot(fit = fitkm, par = "beta", sel=sel) ``` Based on this plot, it looks like the burn in period is roughly 1000 iterations. We will remove these from the results. -```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-93, message=F, warning=F, error=F, fig.align = "center"} sel<-seq(1000,5000,by=1) TracePlot(fit = fitkm, par = "beta", sel=sel) ``` @@ -2141,7 +2141,7 @@ As described above, one way to examine single effects is to calculate the odds r Here, we use the `PredictorResponseUnivar()` function to generate a dataset that details, at varying levels of each exposure (`z`), the relationship between that exposure and the outcome, holding other exposures at their 50th percentile and covariates constant. 
This relationship is given by a beta value, which because we have a binomial outcome and fit a probit model represents the log(odds) (`est`). The standard error for the beta value is also calculated (`se`). -```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-94, message=F, warning=F, error=F, fig.align = "center"} pred.resp.univar <- PredictorResponseUnivar(fit=fitkm, sel=sel, method="approx", q.fixed = 0.5) @@ -2150,7 +2150,7 @@ head(pred.resp.univar) We can then plot these data for each exposure to visualize the exposure-response function for each exposure. -```{r} +```{r 06-Chapter6-95} ggplot(pred.resp.univar, aes(z, est, ymin = est - 1.96*se, ymax = est + 1.96*se)) + geom_smooth(stat = "identity") + ylab("h(z)") + facet_wrap(~ variable) @@ -2158,7 +2158,7 @@ ggplot(pred.resp.univar, aes(z, est, ymin = est - 1.96*se, Then, we can generate a dataset that contains for each exposure (`variable`), the log(OR) (`est`) (and its standard deviation (`sd`)) corresponding to the odds of neonatal inflammation when an exposure is at its 75th compared to the odds when at the 25th percentile. The log(OR) is estimated at three levels of the other exposures (25th, 50th and 75th percentiles). We can use this dataset to identify odds ratios for neonatal inflammation (comparing the 75th to 25th percentile odds) for each exposure at differing levels of the other exposures. These odds ratios approximate risk, whereby an odds ratio >1 means there is increased risk of neonatal inflammation when that exposure is at its 75th percentile compared to its 25th percentile. We can then plot these data to see the logOR for each metal in relation to neonatal inflammation at varying levels of the rest of the exposures. 
-```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-96, message=F, warning=F, error=F, fig.align = "center"} risks.singvar <- SingVarRiskSummaries(fit=fitkm, qs.diff = c(0.25, 0.75), q.fixed = c(0.25, 0.50, 0.75), method = "approx") @@ -2198,7 +2198,7 @@ ggplot(risks.singvar, aes(variable, est, ymin = est - 1.96*sd, Next, we can generate a dataset that details the effect (ie. log(OR) (`est`) and corresponding standard deviation (`sd`)) on neonatal inflammation of all exposures when at a particular quantile (`quantile`) compared to all exposures being at the 50th percentile. We can use this dataset to identify odds ratios for neonatal inflammation upon simultaneous exposure to the entire mixture for different quantile threshold comparisons. These odds ratios approximate risk, whereby an odds ratio >1 means there is increased risk of neonatal inflammation when the entire mixture is set at the index quantile, compared to the 50th percentile. We can also plot these results to visualize the overall mixture effect dose-response relationship. -```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-97, message=F, warning=F, error=F, fig.align = "center"} risks.overall <- OverallRiskSummaries(fit=fitkm, qs=seq(0.25, 0.75, by=0.05), q.fixed = 0.5, method = "approx", sel=sel) @@ -2223,7 +2223,7 @@ ggplot(risks.overall, aes(quantile, est, ymin = est - 1.96*sd, ### Evaluating interactive effects To understand bivariate interactions, we can generate a dataset that for each pairing of exposures details at varying levels of both exposures, the log(odds) (`est`, and associated standard deviation (`sd`)) of neonatal inflammation when all the other exposures are held constant. These plots can be tricky to interpret, so another way of looking at these results is to take "cross sections" at specific quantiles of the second exposure (see next step). 
-```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-98, message=F, warning=F, error=F, fig.align = "center"} pred.resp.bivar <- PredictorResponseBivar(fit=fitkm, min.plot.dist = 1, sel=sel, method="approx") @@ -2238,7 +2238,7 @@ ggplot(pred.resp.bivar, aes(z1, z2, fill = est)) + Next, we generate a dataset that includes for each pairing of exposures, the log(odds) (`est` and associated standard deviation `sd`) of neonatal inflammation at varying concentrations (`z1`) of the first exposure (`variable 1`) when the second exposure (`variable 2` is at its 25th, 50th and 75th percentile (`quantile`). -```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-99, message=F, warning=F, error=F, fig.align = "center"} pred.resp.bivar.levels <- PredictorResponseBivarLevels(pred.resp.df= pred.resp.bivar, Z = mixture, both_pairs=TRUE, qs = c(0.25, 0.5, 0.75)) @@ -2252,7 +2252,7 @@ ggplot(pred.resp.bivar.levels, aes(z1, est)) + There is evidence of an interactive effect between two exposures when the exposure-response function for exposure 1 varies in form between the different quantiles of exposure 2. You can also zoom in on one plot, for example: -```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-100, message=F, warning=F, error=F, fig.align = "center"} HgCd <- pred.resp.bivar.levels %>% filter(variable1=="Hg_ngg") %>% filter(variable2=="Cd_ngg") @@ -2274,7 +2274,7 @@ ggplot(CdHg, aes(z1, est)) + ``` To visualize interactions between one exposure and the rest of the exposure components, we generate a dataset that details the difference in each exposure's (`variable`) log(OR) comparing 75th to 25th percentile (`est`, and associated standard deviation `sd`) when the other exposure components are at their 75th versus 25th percentile. Perhaps more intuitively, these estimates represent the blue - red points plotted in the second figure under the single exposure effects section. 
-```{r, message=F, warning=F, error=F, fig.align = "center"} +```{r 06-Chapter6-101, message=F, warning=F, error=F, fig.align = "center"} risks.int <- SingVarIntSummaries(fit=fitkm, qs.diff = c(0.25, 0.75), qs.fixed = c(0.25, 0.75)) @@ -2331,7 +2331,7 @@ Using the simulated dataset within the bkmr package (see below code for how to c Note that the outcome (y) variable is a continuous variable here, rather than binary as in the scripted example. ::: -```{r} +```{r 06-Chapter6-102} # Set seed for reproducibility set.seed(111) @@ -2395,7 +2395,7 @@ The toxicological profiles of these samples were also analyzed using *in vitro* #### Install required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, results=FALSE, message=FALSE} +```{r 06-Chapter6-103, results=FALSE, message=FALSE} if (!requireNamespace("tidyverse")) install.packages("tidyverse"); if (!requireNamespace("readxl")) @@ -2411,7 +2411,7 @@ if (!requireNamespace("ggplotify")) ``` #### Loading required packages -```{r, results=FALSE, message=FALSE} +```{r 06-Chapter6-104, results=FALSE, message=FALSE} library(readxl) #used to read in and work with excel files library(factoextra) #used to run and visualize multivariate analyses, here PCA library(pheatmap) #used to make heatmaps. This can be done in ggplot2 but pheatmap is easier and nicer @@ -2421,7 +2421,7 @@ library(tidyverse) #all tidyverse packages, including dplyr and ggplot2 ``` #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 06-Chapter6-105, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` @@ -2429,7 +2429,7 @@ setwd("/filepath to where your input files are") We need to first read in the chemistry and toxicity data from the provided excel file. Here, data were originally organized such that the actual observations start on row 2 (dataset descriptions were in the first row). 
So let's implement skip=1, which skips reading in the first row. -```{r} +```{r 06-Chapter6-106} chem <- read_xlsx("Chapter_6/Module6_5_Input/Module6_5_InputData.xlsx" , sheet = "chemistry data", skip=1) # loads the chemistry data tab tox <- read_xlsx("Chapter_6/Module6_5_Input/Module6_5_InputData.xlsx" , sheet = "in vitro data", skip=1) # loads the toxicity data tab ``` @@ -2437,13 +2437,13 @@ tox <- read_xlsx("Chapter_6/Module6_5_Input/Module6_5_InputData.xlsx" , sheet = ### View example dataset Let's first see how many rows and columns of data are present in both datasets: -```{r} +```{r 06-Chapter6-107} dim(chem) ``` The chemistry dataset contains information on 29 samples (rows); and 1 sample identifier + 12 chemicals (total of 13 columns). -```{r} +```{r 06-Chapter6-108} dim(tox) ``` @@ -2451,19 +2451,19 @@ The tox dataset contains information on 29 samples (rows); and 1 sample identifi Let's also see what kind of data are organized within the datasets: -```{r} +```{r 06-Chapter6-109} colnames(chem) ``` -```{r} +```{r 06-Chapter6-110} head(chem) ``` -```{r} +```{r 06-Chapter6-111} colnames(tox) ``` -```{r} +```{r 06-Chapter6-112} head(tox) ``` @@ -2477,18 +2477,18 @@ In summary, PCA finds dimensions (eigenvectors) in the higher dimensional origin Before we can run PCA on this chemistry dataset, we first need to scale the data across samples. We do this here for the chemistry dataset, because we specifically want to evaluate and potentially highlight/emphasize chemicals that may be at relatively low abundance. These low-abundance chemicals may actually be contaminants that drive toxicological effects. Let's first re-save the original chemistry dataset to compare off of: -```{r} +```{r 06-Chapter6-113} chem_original <- chem ``` Then, we'll make a scaled version to carry forward in this analysis. To do this, we move the sample column to the row names and then scale and center the data.
-```{r} +```{r 06-Chapter6-114} chem <- chem %>% column_to_rownames("Sample") chem <- as.data.frame(scale(as.matrix(chem))) ``` Let's now compare one of the rows of data (here, sample GbE_E) to see what scaling did: -```{r} +```{r 06-Chapter6-115} chem_original[5,] chem[5,] ``` @@ -2496,28 +2496,28 @@ chem[5,] You can see that scaling made the concentrations distributed across each chemical center around 0. Now, we can run PCA on the scaled data: -```{r} +```{r 06-Chapter6-116} chem_pca <- princomp(chem) ``` Looking at the scree plot, we see the first two principal components capture most of the variance in the data (~64%): -```{r fig.align = "center"} +```{r 06-Chapter6-117, fig.align = "center"} fviz_eig(chem_pca, addlabels = TRUE) ``` Here are the resulting PCA scores for each sample, for each principal component (shown here as components 1-12): -```{r} +```{r 06-Chapter6-118} head(chem_pca$scores) ``` And the resulting loading factors of each chemical's contribution towards each principal component. Results are arranged by a chemical's contribution to PC1, the component accounting for most of the variation in the data. -```{r} +```{r 06-Chapter6-119} head(chem_pca$loadings) ``` We can save the chemical-specific loadings into a separate matrix and view them from highest to lowest values for PC1. -```{r} +```{r 06-Chapter6-120} loadings <- as.data.frame.matrix(chem_pca$loadings) loadings %>% arrange(desc(Comp.1)) ``` @@ -2525,7 +2525,7 @@ loadings %>% arrange(desc(Comp.1)) These resulting loading factors allow us to identify which constituents (of the 12 total) contribute to the principal components explaining data variabilities. For instance, we can see here that **Quercetin** is listed at the top, with the largest loading value for principal component 1. Thus, Quercetin represents the constituents that contributes to the overall variability in the dataset to the greatest extent. 
The next three chemicals are all **Ginkgolide** constituents, followed by **Bilobalide** and **Kaempferol**, and so forth. If we look at principal component 2 (PC2), we can now see a different set of chemicals contributing to the variability captured in this component: -```{r} +```{r 06-Chapter6-121} loadings %>% arrange(desc(Comp.2)) ``` @@ -2533,7 +2533,7 @@ Here, **Ginkgolic Acids** are listed first. We can also visualize sample groupings based on these principal components 1 & 2: -```{r, warning=FALSE, message=FALSE, fig.height=6, fig.width=8, fig.align = "center"} +```{r 06-Chapter6-122, warning=FALSE, message=FALSE, fig.height=6, fig.width=8, fig.align = "center"} # First pull the percent variation captured by each component pca_percent <- round(100*chem_pca$sdev^2/sum(chem_pca$sdev^2),1) @@ -2584,7 +2584,7 @@ As an alternative way of viewing the chemical profile data, we can make a heatma By default, `pheatmap()` uses a Euclidean distance to cluster the observations, which is a very common clustering algorithm. For more details, see the following description of [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) and for more information on hierarchical clustering, see **TAME 2.0 Module 5.5 Unsupervised Machine Learning Part 2: Additional Clustering Applications**. 
-```{r, warning=FALSE, message=FALSE, fig.align = "center"} +```{r 06-Chapter6-123, warning=FALSE, message=FALSE, fig.align = "center"} chem_hm <- pheatmap(chem, main="GbE Sample Heatmap by Chemistry Profiles", cluster_rows=TRUE, cluster_cols = FALSE, angle_col = 45, fontsize_col = 7, treeheight_row = 60) @@ -2602,14 +2602,14 @@ This plot tells us a lot about the individual chemicals that differentiate the s ::: Let's now revisit the PCA plot: -```{r, warning=FALSE, message=FALSE, fig.height=3, fig.width=5, fig.align = "center"} +```{r 06-Chapter6-124, warning=FALSE, message=FALSE, fig.height=3, fig.width=5, fig.align = "center"} chem_pca_plt ``` GbE_G and GbE_N look so different from the rest of the samples that they could be outliers and potentially influencing overall data trends. Let's make sure that, if we remove these two samples, our sample groupings still look the same. First, we remove those two samples from the dataframe: -```{r, warning=FALSE, message=FALSE} +```{r 06-Chapter6-125, warning=FALSE, message=FALSE} chem_filt <- chem %>% rownames_to_column("Sample") %>% filter(!Sample %in% c("GbE_G","GbE_N")) %>% @@ -2617,7 +2617,7 @@ chem_filt <- chem %>% ``` Then, we can re-run PCA and generate a heatmap of the chemical data with these outlier samples removed: -```{r, warning=FALSE, message=FALSE, fig.align = "center"} +```{r 06-Chapter6-126, warning=FALSE, message=FALSE, fig.align = "center"} chem_filt_pca <- princomp(chem_filt) # Get the percent variation captured by each component @@ -2646,7 +2646,7 @@ chem_filt_pca_plt To view the PCA plots of all samples vs filtered samples: -```{r, warning=FALSE, message=FALSE, fig.height=9, fig.width=8, fig.align = "center"} +```{r 06-Chapter6-127, warning=FALSE, message=FALSE, fig.height=9, fig.width=8, fig.align = "center"} grid.arrange(chem_pca_plt, chem_filt_pca_plt) ``` @@ -2667,22 +2667,22 @@ grid.arrange(chem_pca_plt, chem_filt_pca_plt) Now, we will perform sufficient similarity analysis using the toxicity 
data. Unlike the chemistry dataset, we can use the toxicity dataset as is without scaling because we want to focus on genes that are showing a large response. Similarly, we want to de-emphasize genes that are showing a strong response to the exposure condition. If we scale these data, we will reduce this needed variability. Here, we first move the sample column to row names: -```{r, warning=FALSE, message=FALSE} +```{r 06-Chapter6-128, warning=FALSE, message=FALSE} tox <- tox %>% column_to_rownames("Sample") ``` Then, we can run PCA on this tox dataframe: -```{r, warning=FALSE, message=FALSE} +```{r 06-Chapter6-129, warning=FALSE, message=FALSE} tox_pca <- princomp(tox) ``` Looking at the scree plot, we see the first two principal components capture most of the variation (~93%): -```{r, warning=FALSE, message=FALSE, fig.align = "center"} +```{r 06-Chapter6-130, warning=FALSE, message=FALSE, fig.align = "center"} fviz_eig(tox_pca, addlabels = TRUE) ``` We can then create a plot of the samples by principal components: -```{r, warning=FALSE, message=FALSE, fig.height=7, fig.width=6, fig.align = "center"} +```{r 06-Chapter6-131, warning=FALSE, message=FALSE, fig.height=7, fig.width=6, fig.align = "center"} # Get the percent variation captured by each component pca_percent <- round(100*tox_pca$sdev^2/sum(tox_pca$sdev^2),1) @@ -2719,7 +2719,7 @@ This plot tells us a lot about sample groupings based on toxicity profiles! 
Similar to the chemistry data, as an alternative way of viewing the toxicity profile data, we can make a heatmap of the toxicity data: -```{r, warning=FALSE, message=FALSE, fig.align = "center"} +```{r 06-Chapter6-132, warning=FALSE, message=FALSE, fig.align = "center"} tox_hm <- pheatmap(tox, main="GbE Sample Heatmap by Toxicity Profiles", cluster_rows=TRUE, cluster_cols = FALSE, angle_col = 45, fontsize_col = 7, treeheight_row = 60) @@ -2741,18 +2741,18 @@ This plot tells us a lot about the individual genes that differentiate the sampl ## Comparing Chemistry vs. Toxicity Sufficient Similarity Analyses Let's view the PCA plots for both datasets together, side-by-side: -```{r, fig.height=8, fig.width=11, fig.align = "center"} +```{r 06-Chapter6-133, fig.height=8, fig.width=11, fig.align = "center"} pca_compare <- grid.arrange(chem_pca_plt,tox_pca_plt, nrow=1) ``` Let's also view the PCA plots for both datasets together, top-to-bottom, to visualize the trends along both axes better between these two views: -```{r, fig.height=10, fig.width=10, fig.align = "center"} +```{r 06-Chapter6-134, fig.height=10, fig.width=10, fig.align = "center"} pca_compare <- grid.arrange(chem_pca_plt,tox_pca_plt) ``` Here is an edited version of the above figures, highlighting with colored circles some chemical groups of interest identified through chemistry vs toxicity-based sufficient similarity analyses: -```{r, echo=FALSE, fig.align = "center"} +```{r 06-Chapter6-135, echo=FALSE, fig.align = "center" } knitr::include_graphics("Chapter_6/Module6_5_Input/Module6_5_Image1.png") ``` @@ -2841,7 +2841,7 @@ The bioactivity-exposure ratio (BER) is then calculated across chemicals with bo To understand what toxicokinetic modeling is, consider the following scenario: -```{r, echo=FALSE, fig.align = "center"} +```{r 06-Chapter6-136, echo=FALSE, fig.align = "center" } knitr::include_graphics("Chapter_6/Module6_6_Input/Module6_6_Image1.png") ``` @@ -2894,13 +2894,13 @@ For further information on 
TK modeling background, math, and example models, the ### Script Preparations #### Cleaning the global environment -```{r} +```{r 06-Chapter6-137} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, results=FALSE, message=FALSE} +```{r 06-Chapter6-138, results=FALSE, message=FALSE} if(!nzchar(system.file(package = "ggplot2"))){ install.packages("ggplot2")} if(!nzchar(system.file(package = "reshape2"))){ @@ -2915,7 +2915,7 @@ if(!nzchar(system.file(package = "eulerr"))){ #### Loading R packages required for this session -```{r, results=FALSE, message=FALSE} +```{r 06-Chapter6-139, results=FALSE, message=FALSE} library(ggplot2) # ggplot2 will be used to generate associated graphics library(reshape2) # reshape2 will be used to organize and transform datasets library(stringr) # stringr will be used to aid in various data manipulation steps through this module @@ -2941,14 +2941,14 @@ You can see a browsable list of vignettes by typing `browseVignettes("httk")` at You can get information about any function in *httk*, or indeed any function in any R package, by typing `help()` and placing the function name in quotation marks inside the parentheses. For example, to get information about the *httk* function `solve_model()`, type this: -```{r, eval=FALSE} +```{r 06-Chapter6-140, eval=FALSE} help("solve_model") ``` Note that this module was run with `httk` version 2.4.0. #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 06-Chapter6-141, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` @@ -2996,7 +2996,7 @@ Each of these TK models has chemical-specific parameters. The chemical-specific Look at the first few rows of this data.frame to see everything that's in there (it is a lot of information). 
-```{r} +```{r 06-Chapter6-142} head(chem.physical_and_invitro.data) ``` @@ -3010,7 +3010,7 @@ You can easily get a list of all the chemicals for which a specific TK model can For example, here is how you get a list of all the chemicals for which the PBTK model can be parameterized for humans. -```{r, warning = FALSE} +```{r 06-Chapter6-143, warning = FALSE} chems_pbtk <- get_cheminfo(info = c("Compound", "CAS", "DTXSID"), model = "pbtk", species = "Human") @@ -3020,19 +3020,19 @@ head(chems_pbtk) #first few rows How many such chemicals have parameter data to run a PBTK model in this package? -```{r} +```{r 06-Chapter6-144} nrow(chems_pbtk) ``` Here is how you get all the chemicals for which the 3-compartment steady-state model can be parameterized for humans. -```{r} +```{r 06-Chapter6-145} chems_3compss <- get_cheminfo(info = c("Compound", "CAS", "DTXSID"), model = "3compartmentss", species = "Human") ``` How many such chemicals have parameter data to run a 3-compartment steady-state model in this package? -```{r} +```{r 06-Chapter6-146} nrow(chems_3compss) ``` @@ -3042,7 +3042,7 @@ The 3-compartment steady-state model can be parameterized for a few more chemica You can solve any of the models for a specified chemical and specified dosing protocol, and get concentration vs. time predictions, using the function `solve_model()`. For example: -```{r, warning=FALSE} +```{r 06-Chapter6-147, warning=FALSE} sol_pbtk <- solve_model(chem.name = "Bisphenol-A", #chemical to simulate model = "pbtk", #TK model to use dosing = list(initial.dose = NULL, #for repeated dosing, if first dose is different from the rest, specify first dose here @@ -3056,13 +3056,13 @@ There are some cryptic-sounding warnings that can safely be ignored. (They are p The output, assigned to `sol_pbtk`, is a matrix with concentration vs. time data for each of the compartments in the pbtk model. Time is in units of days. 
Additionally, the output traces the amount excreted via passive renal filtration (`Atubules`), the amount metabolized in the liver (`Ametabolized`), and the cumulative area under the curve for plasma concentration vs. time (`AUC`). Here are the first few rows of `sol_pbtk` so you can see the format. -```{r} +```{r 06-Chapter6-148} head(sol_pbtk) ``` You can plot the results, for example plasma concentration vs. time. -```{r fig.align = "center"} +```{r 06-Chapter6-149, fig.align = "center"} sol_pbtk <- as.data.frame(sol_pbtk) #because ggplot2 requires data.frame input, not matrix ggplot(sol_pbtk) + @@ -3078,7 +3078,7 @@ ggplot(sol_pbtk) + We can calculate summary metrics of internal dose -- peak concentration, average concentration, and AUC -- using the function `calc_tkstats()`. We have to specify the dosing protocol and length of simulation. Here, we use the same dosing protocol and simulation length as in the plot above. -```{r, warning = FALSE} +```{r 06-Chapter6-150, warning = FALSE} tkstats <- calc_tkstats(chem.name = "Bisphenol-A", #chemical to simulate stats = c("AUC", "peak", "mean"), #which metrics to return (these are the only three choices) model = "pbtk", #model to use @@ -3107,7 +3107,7 @@ Another summary metric is the steady-state concentration: If the same dose is gi For example, here is a plot of plasma concentration vs. time for 1 mg/kg/day Bisphenol-A, administered for 12 days. You can see how the average plasma concentration reaches a steady state around 1.5 uM. Each peak represents one day's dose. -```{r, warning = FALSE, fig.align = "center"} +```{r 06-Chapter6-151, warning = FALSE, fig.align = "center"} foo <- as.data.frame(solve_pbtk( chem.name='Bisphenol-A', daily.dose=1, @@ -3127,7 +3127,7 @@ ggplot(foo) + Here is the result of `calc_analytic_css()` for a 1 mg/kg/day dose of bisphenol-A. 
-```{r, warning = FALSE} +```{r 06-Chapter6-152, warning = FALSE} calc_analytic_css(chem.name = "Bisphenol-A", daily.dose = 1, output.units = "uM", @@ -3151,7 +3151,7 @@ calc_analytic_css(chem.name = "Bisphenol-A", For the TK models included in the *httk* package, steady-state concentration is linear with dose for a given chemical. The slope of the line is simply the steady-state concentration for a dose of 1 mg/kg/day. This can be shown by solving `calc_analytic_css()` for several doses, and plotting the dose-$C_{ss}$ points along a line whose slope is equal to $C_{ss}$ for 1 mg/kg/day. -```{r fig.align = "center"} +```{r 06-Chapter6-153, fig.align = "center"} #choose five doses at which to find the Css doses <- c(0.1, #all mg/kg/day 0.5, @@ -3205,7 +3205,7 @@ The procedure is illustrated graphically below. 2. Draw a horizontal line over to the $C_{ss}$-dose line. 3. Drop down vertically to the x-axis and read off the corresponding dose. This is the *administered equivalent dose* (AED): the the external dose or exposure rate, in mg/kg/day, that would produce an internal steady-state plasma concentration equal to the target concentration. -```{r, echo = FALSE, warning = FALSE, fig.align = "center"} +```{r 06-Chapter6-154, echo = FALSE, warning = FALSE, fig.align = "center"} reverseTKfig <- Cssdosefig + geom_segment(aes(x = -Inf, y = 0.8671, xend = 0.75, yend = 0.8671), size = 2, @@ -3269,7 +3269,7 @@ $$ AED = \frac{C_{\textrm{target}}}{C_{ss}\textrm{ for 1 mg/kg/day}} $$ For a given dose, $C_{ss}$ is determined by the values of the parameters of the TK model. These parameters describe absorption, distribution, metabolism, and excretion (ADME) of each chemical. They include both chemical-specific parameters, describing hepatic clearance and protein binding, and chemical-independent parameters, describing physiology. A table of these parameters is presented below. 
-```{r, results = "asis", echo = FALSE} +```{r 06-Chapter6-155, results = "asis", echo = FALSE} paramtable <- data.frame("Parameter" = c("Intrinsic hepatic clearance rate", "Fraction unbound to plasma protein", "Tissue:plasma partition coefficients", @@ -3305,7 +3305,7 @@ The $C_{ss}$-dose relationship is still linear when variability and uncertainty A distribution of $C_{ss}$-dose slopes is illustrated in the figure below, along with boxplots illustrating the distributions for $C_{ss}$ itself at five different dose levels: 0.05, 0.25, 0.5, 0.75, and 0.95 mg/kg/day. -```{r, echo = FALSE, warning = FALSE, fig.align = "center"} +```{r 06-Chapter6-156, echo = FALSE, warning = FALSE, fig.align = "center"} suppressWarnings(css_examp <- calc_mc_css(chem.name = "Bisphenol-A", which.quantile = c(0.05, #specify which quantiles to return @@ -3387,7 +3387,7 @@ The steps are the same as before: 2. Draw a horizontal line over to intersect each $C_{ss}$-dose line. 3. Where the horizontal line intersects each $C_{ss}$-dose line, drop down vertically to the x-axis and read off each corresponding AED (marked with colored circles matching the color of each $C_{ss}$-dose line). -```{r, echo = FALSE, warning = FALSE, fig.align = "center"} +```{r 06-Chapter6-157, echo = FALSE, warning = FALSE, fig.align = "center"} ggplot(css_dist_wide, aes(x=dose, y = `95%`)) + @@ -3438,7 +3438,7 @@ Next, *httk* calculates the $C_{ss}$-dose slope for each "simulated individual." The following code estimates the 5th percentile, 50th percentile, and 95th percentile of the $C_{ss}$-dose slope for the chemical bisphenol-A. For the sake of simplicity, we will use the 3-compartment steady-state model (rather than the PBTK model used in the previous examples). 
-```{r, warning=FALSE} +```{r 06-Chapter6-158, warning=FALSE} css_examp <- calc_mc_css(chem.name = "Bisphenol-A", which.quantile = c(0.05, #specify which quantiles to return 0.5, @@ -3470,7 +3470,7 @@ We can easily and (fairly) quickly do this for all 998 chemicals for which the 3 In order to make the Monte Carlo sampling reproducible, set a seed for the random number generator. It doesn't matter what seed you choose -- it can be any integer. Here, the seed is set to 42, because it's the answer to the ultimate question of life, the universe, and everything [(Adams, 1979)](https://en.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy_(novel)). -```{r} +```{r 06-Chapter6-159} set.seed(42) system.time( @@ -3487,7 +3487,7 @@ system.time( ``` Organizing the results: -```{r} +```{r 06-Chapter6-160} #css_3compss comes out as a 3 x 998 array, #where rows are quantiles and columns are chemicals #transpose it so that rows are chemicals and columns are quantiles @@ -3508,17 +3508,17 @@ Here, we will plot the resulting concentration distribution quantiles for each c By default, *ggplot2* will plot the chemical CASRNs in alphabetically-sorted order. To force it to plot them in another order, we have to explicitly specify the desired order. The easiest way to do this is to add a column in the data.frame that contains the chemical names as a factor (categorical) variable, whose levels (categories) are explicitly set to be the CASRNs in our desired plotting order. Then we can tell *ggplot2* to plot that factor variable on the x-axis, rather than the original CASRN variable. Set the ordering of the chemical CASRNs from lowest to highest median value -```{r} +```{r 06-Chapter6-161} chemical_order <- order(css_3compss$`50%`) ``` Create a factor (categorical) CAS column where the factor levels are given by the CASRNs with this ordering. 
-```{r} +```{r 06-Chapter6-162} css_3compss$CAS_factor <- factor(css_3compss$CAS, levels = css_3compss$CAS[chemical_order]) ``` For plotting ease, reshape the data.frame into "long" format -- rather than having one column for each quantile of the $C_{ss}$ distribution, have a row for each chemical/quantile combination. We use the `melt()` function from the *reshape2* package. -```{r, warning = FALSE} +```{r 06-Chapter6-163, warning = FALSE} css_3compss_melt <- reshape2::melt(css_3compss, id.vars = "CAS_factor", measure.vars = c("5%", "50%", "95%"), @@ -3528,7 +3528,7 @@ head(css_3compss_melt) ``` Plot the slope percentiles. Use a log scale for the y-axis because the slopes span orders of magnitude. Suppress the x-axis labels (the CASRNs) because they are not readable anyway. -```{r fig.align = "center"} +```{r 06-Chapter6-164, fig.align = "center"} ggplot(css_3compss_melt) + geom_point(aes(x=CAS_factor, y = Css_slope, @@ -3573,7 +3573,7 @@ The latest public release of ToxCast high-throughput screening assay data can be Read in the pre-processed dataset and view the first few rows. -```{r} +```{r 06-Chapter6-165} toxcast <- read.csv("Chapter_6/Module6_6_Input/Module6_6_InputData1.csv") head(toxcast) ``` @@ -3588,7 +3588,7 @@ The columns of this data frame are: How many ToxCast chemicals are in this dataset? -```{r} +```{r 06-Chapter6-166} length(unique(toxcast$DTXSID)) ``` @@ -3609,7 +3609,7 @@ Not all of the ToxCast chemicals have TK data built into *httk* such that we can Previously, we used `get_cheminfo()` to get a list of chemicals for which we could run the 3-compartment steady state model, including the names, CASRNs, and DSSTox IDs of those chemicals. That list is stored in variable `chems_3compss`, a data.frame with compound name, CASRN, and DTXSID. Now, we can use that chemical list to subset the ToxCast data. 
-```{r} +```{r 06-Chapter6-167} toxcast_httk <- subset(toxcast, subset = toxcast$DTXSID %in% chems_3compss$DTXSID) @@ -3617,7 +3617,7 @@ toxcast_httk <- subset(toxcast, How many chemicals are in this subset? -```{r} +```{r 06-Chapter6-168} length(unique(toxcast_httk$DTXSID)) ``` @@ -3626,7 +3626,7 @@ There were 869 *httk* chemicals for which we could run the 3-compartment steady- ### Identifying the Lower-Bound *In Vitro* AC50 Value per Chemical ToxCast/Tox21 screens chemicals across multiple assays, such that each chemical has multiple resulting AC50 values, spanning a range of values. For example, here are boxplots of the AC50s for the first 20 chemicals listed in `chems_3compss`. Note that the chemical identifiers, DTXSID, are used here in these visualizations to represent unique chemicals. -```{r fig.align = "center"} +```{r 06-Chapter6-169, fig.align = "center"} ggplot(toxcast_httk[toxcast_httk$DTXSID %in% chems_3compss[1:20, "DTXSID"], @@ -3646,7 +3646,7 @@ However, sometimes we just want a general idea of what concentrations showed bio Let's calculate the tenth percentile ToxCast AC50 for each chemical. Here, we use the base R function `aggregate()`, which groups a vector (specified in the `x` argument) by a list of factors (specified in the `by` argument), and applies a function to each group (specified in the `FUN` argument). You can add any extra arguments to the `FUN` function as named arguments to `aggregate()`. -```{r} +```{r 06-Chapter6-170} toxcast_httk_P10 <- aggregate(x = toxcast_httk$log10_ac50, #aggregate the AC50s by = list(DTXSID = toxcast_httk$DTXSID), #group AC50s by DTXSID FUN = quantile, #the function to apply to each group @@ -3658,13 +3658,13 @@ names(toxcast_httk_P10) <- c("DTXSID", "log10_ac50_P10") Let's transform the tenth-percentile AC50 values back to the natural scale (they are currently on the log10 scale) and put them in a new column `AC50`. These AC50s will be in uM. 
-```{r} +```{r 06-Chapter6-171} toxcast_httk_P10$AC50 <- 10^(toxcast_httk_P10$log10_ac50_P10) ``` View the first few rows: -```{r} +```{r 06-Chapter6-172} head(toxcast_httk_P10) ``` @@ -3679,7 +3679,7 @@ Under the hood, `calc_mc_oral_equiv()` first calls `calc_mc_css()` to get percen Here, we're using the `mapply()` function in base R to call `calc_mc_oral_equiv()` in a loop over chemicals. This is because `calc_mc_oral_equiv()` requires two chemical-specific arguments -- the chemical identifier and the concentration for which to compute the equivalent dose. `mapply()` lets us provide vectors of values for each argument (in the named arguments `dtxsid` and `conc`), and will automatically loop over those vectors. We also use the argument `MoreArgs`, a named list of additional arguments to the function in `FUN` that will be the same for every iteration of the loop. Note that this line of code takes a few minutes to run. -```{r, results="hide"} +```{r 06-Chapter6-173, results="hide"} set.seed(42) system.time( @@ -3703,19 +3703,19 @@ head(toxcast_equiv_dose) #look at first few rows Let's add the DTXSIDs back into this data.frame. -```{r} +```{r 06-Chapter6-174} toxcast_equiv_dose$DTXSID <- toxcast_httk_P10$DTXSID ``` We can get the names of these chemicals by using the list of chemicals for which the 3-compartment steady-state model can be parameterized, which was stored in the variable `chems_3compss`. In that dataframe, we have the compound name and CASRN corresponding to each DTXSID. -```{r} +```{r 06-Chapter6-175} head(chems_3compss) ``` Merge `chems_3compss` with `toxcast_equiv_dose`. -```{r} +```{r 06-Chapter6-176} toxcast_equiv_dose <- merge(chems_3compss, toxcast_equiv_dose, by = "DTXSID", @@ -3727,7 +3727,7 @@ head(toxcast_equiv_dose) To find the chemicals with the lowest equivalent doses at the 95th percentile level (corresponding to the most-sensitive 5\% of the population), sort this data.frame in ascending order on the `95%` column. 
-```{r} +```{r 06-Chapter6-177} toxcast_equiv_dose <- toxcast_equiv_dose[order(toxcast_equiv_dose$`95%`), ] head(toxcast_equiv_dose, 10) #first ten rows of sorted table ``` @@ -3752,7 +3752,7 @@ To estimate potential risk, hazard -- in the form of the equivalent dose for the Here, we will use exposure estimates that have been inferred from CDC NHANES urinary biomonitoring data (Ring et al., 2019). These estimates consist of an estimated median, and estimated upper and lower 95\% credible interval bounds representing uncertainty in that estimated median. These estimates are provided here in the following csv file: -```{r} +```{r 06-Chapter6-178} exposure <- read.csv("Chapter_6/Module6_6_Input/Module6_6_InputData2.csv") head(exposure) #view first few rows ``` @@ -3761,7 +3761,7 @@ head(exposure) #view first few rows To calculate a BER for a chemical, it needs to have both an equivalent dose and an exposure estimate. Not all of the chemicals for which equivalent doses could be computed (*i.e.*, chemicals with both ToxCast AC50s and `httk` data) also have exposure estimates inferred from NHANES. Find out how many do. -```{r} +```{r 06-Chapter6-179} length(intersect(toxcast_equiv_dose$DTXSID, exposure$DTXSID)) ``` @@ -3769,7 +3769,7 @@ This means that, using the ToxCast AC50 data for bioactive concentrations, the N Merge together the ToxCast equivalent doses and the exposure data into a single data frame. Keep only the chemicals that have data in both ToxCast equivalent doses and exposure data frames. -```{r} +```{r 06-Chapter6-180} hazard_exposure <- merge(toxcast_equiv_dose, exposure, by = "DTXSID", @@ -3781,7 +3781,7 @@ head(hazard_exposure) #view first few rows of result We can visually compare the equivalent doses and the inferred exposure estimates by plotting them together. 
-```{r fig.align = "center"} +```{r 06-Chapter6-181, fig.align = "center"} ggplot(hazard_exposure) + geom_crossbar(aes(x = Compound.x, #Boxes for equivalent doses y = `50%`, @@ -3813,7 +3813,7 @@ ggplot(hazard_exposure) + The bioactivity-exposure ratio (BER) is simply the ratio of the lower-end equivalent dose (for the most-sensitive 5\% of the population) divided by the upper-end estimated exposure (here, the upper bound on the inferred population median exposure). In the data frame `hazard_exposure` containing the hazard and exposure data, the lower-end equivalent dose is in column `95%` (corresponding to the 95th-percentile $C_{ss}$-dose slope) and the upper-end exposure is in column `up95`. Calculate the BER, and assign the result to a new column in the `hazard_exposure` data frame called `BER`. -```{r} +```{r 06-Chapter6-182} hazard_exposure[["BER"]] <- hazard_exposure[["95%"]]/hazard_exposure[["up95"]] ``` @@ -3823,7 +3823,7 @@ To prioritize chemicals according to potential risk, they can be sorted from low Sort the rows of the data.frame from lowest to highest BER. -```{r} +```{r 06-Chapter6-183} hazard_exposure <- hazard_exposure[order(hazard_exposure$BER), ] head(hazard_exposure) ``` @@ -3832,7 +3832,7 @@ The hazard-exposure plot above showed chemicals in alphabetical order. It can be First, create a categorical (factor) variable for the compound names, whose levels are in order of increasing BER. (Since we already sorted the data.frame in order of increasing BER, we can just take the compound names in the order that they appear.) -```{r} +```{r 06-Chapter6-184} hazard_exposure$Compound_factor <- factor(hazard_exposure$Compound.x, levels = hazard_exposure$Compound.x) @@ -3840,7 +3840,7 @@ hazard_exposure$Compound_factor <- factor(hazard_exposure$Compound.x, Now, make the same plot as before, but use `Compound_factor` as the x-axis variable instead of `Compound`. 
-```{r fig.align = "center"} +```{r 06-Chapter6-185, fig.align = "center"} ggplot(hazard_exposure) + geom_crossbar(aes(x = Compound_factor, #Boxes for equivalent dose y = `50%`, @@ -3907,7 +3907,7 @@ Now, the chemicals are displayed in order of increasing BER. From left to right, To calculate a BER for a chemical, both bioactivity and exposure data are required, as well as sufficient TK data to perform reverse TK. In this training module, bioactivity data came from ToxCast AC50s; exposure data consisted of exposure inferences made from NHANES urinary biomonitoring data; and TK data consisted of parameter values measured *in vitro* and built into the *httk* R package. The intersections are illustrated in an Euler diagram below. BERs can only be calculated for chemicals in the triple intersection. -```{r fig.align = "center"} +```{r 06-Chapter6-186, fig.align = "center"} fit <- eulerr::euler(list('ToxCast AC50s' = unique(toxcast$DTXSID), 'HTTK' = unique(chems_3compss$DTXSID), 'NHANES inferred exposure' = unique(exposure$DTXSID) @@ -3943,7 +3943,7 @@ This training module provides an overview of toxicokinetic modeling using the *h We would like to acknowledge the developers of the *httk* package, as detailed below via the CRAN website: -```{r, echo=FALSE, fig.align='center'} +```{r 06-Chapter6-187, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_6/Module6_6_Input/Module6_6_Image2.png") ``` @@ -3992,7 +3992,7 @@ All input files (script, data, and figures) can be downloaded from the [UNC-SRP *Disclaimer: The views expressed in this document are those of the author and do not necessarily reflect the views or policies of the U.S. 
EPA.* -```{r, include=FALSE} +```{r 06-Chapter6-188, include=FALSE} #set default values for R Markdown "knitting" to HTML, Word, or PDF knitr::opts_chunk$set(echo = TRUE) #print code chunks ``` @@ -4000,7 +4000,7 @@ knitr::opts_chunk$set(echo = TRUE) #print code chunks ## Introduction to Training Module The method of **read-across** represents one type of computational approach that is commonly used to predict a chemical's toxicological effects using its properties. Other types of approaches that you will hear commonly used in this field include **SAR** and **QSAR** analyses. A high-level overview of each of these definitions and simple illustrative examples of these three computational modeling approaches is provided in the following schematic: -```{r, echo=FALSE} +```{r 06-Chapter6-189, echo=FALSE } knitr::include_graphics("Chapter_6/Module6_7_Input/Module6_7_Image1.png") ``` @@ -4036,13 +4036,13 @@ This training module was specifically developed to answer the following environm ### Script Preparations #### Cleaning the global environment -```{r} +```{r 06-Chapter6-190} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you: -```{r, results=FALSE, message=FALSE} +```{r 06-Chapter6-191, results=FALSE, message=FALSE} if (!requireNamespace("tidyverse")) install.packages("tidyverse"); if (!requireNamespace("fingerprint")) @@ -4052,14 +4052,14 @@ if (!requireNamespace("rcdk")) ``` #### Loading R packages required for this session -```{r, results=FALSE, message=FALSE} +```{r 06-Chapter6-192, results=FALSE, message=FALSE} library(tidyverse) #all tidyverse packages, including dplyr and ggplot2 library(fingerprint) # a package that supports operations on molecular fingerprint data library(rcdk) # a package that interfaces with the 'CDK', a Java framework for chemoinformatics libraries packaged for R ``` #### Set your working directory 
-```{r, eval=FALSE, echo=TRUE} +```{r 06-Chapter6-193, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` @@ -4073,21 +4073,21 @@ Let's start by loading the datasets needed for this training module. We are goin The first file to upload is named `Module6_6_InputData1.csv` and contains the list of substances and their structural information, in the form of SMILES nomenclature. SMILES stands for Simplified molecular-input line-entry system, a form of line notation to describe the structure of a chemical. The second file to upload is named `Module6_6_InputData2.csv` and contains the substances and their acute toxicity information. -```{r} +```{r 06-Chapter6-194} substances <- read.csv("Chapter_6/Module6_7_Input/Module6_7_InputData1.csv") acute_data <- read.csv("Chapter_6/Module6_7_Input/Module6_7_InputData2.csv") ``` Let's first view the substances dataset: -```{r} +```{r 06-Chapter6-195} dim(substances) ``` -```{r} +```{r 06-Chapter6-196} colnames(substances) ``` -```{r} +```{r 06-Chapter6-197} head(substances) ``` @@ -4098,20 +4098,20 @@ We can see that this dataset contains information on 6955 chemicals (rows). The + `QSAR_READY_SMILES`: `SMILES` that have been standardized related to salts, tautomers, inorganics, aromaticity, and stereochemistry (among other factors) prior to any QSAR modeling or prediction. Let's make sure that these values are recognized as character format and placed in its own vector, to ensure proper execution of functions throughout this script: -```{r} +```{r 06-Chapter6-198} all_smiles <- as.character(substances$QSAR_READY_SMILES) ``` Now let's view the acute toxicity dataset: -```{r} +```{r 06-Chapter6-199} dim(acute_data) ``` -```{r} +```{r 06-Chapter6-200} colnames(acute_data) ``` -```{r} +```{r 06-Chapter6-201} head(acute_data) ``` @@ -4125,7 +4125,7 @@ We can see that this dataset contains information on 6955 chemicals (rows). 
Some In modeling studies, the convention is to convert toxicity values expressed as mg per unit into their molar or millimolar values and then to convert these to the base 10 logarithm. To increase clarity when plotting, such that higher toxicities would be expressed by higher values, the negative logarithm is then taken. For example, substance DTXSID00142939 has a molecular weight of 99.089 (grams per mole) and a LD50 of 32 mg/kg. This would be converted to a toxicity value of ($\frac{32}{99.089} = 0.322942~mmol/kg$). The logarithm of that would be -0.4908755. By convention, the negative logarithm of the millimolar concentration would then be used i.e. -log[mmol/kg]. This conversion has been used to create the `LD50_LM` values in the acute toxicity dataset. Let's check to see whether the same chemicals are present in both datasets: -```{r} +```{r 06-Chapter6-202} # First need to make sure that both dataframes are sorted by the identifier, DTXSID substances <- substances[order(substances$DTXSID),] acute_data <- acute_data[order(acute_data$DTXSID),] @@ -4138,7 +4138,7 @@ All accounts are true, meaning they are all equal (the same chemical). ### Data Visualizations of Acute Toxicity Values Let's create a plot to show the distribution of the LD50 values in the dataset. -```{r fig.align = "center"} +```{r 06-Chapter6-203, fig.align = "center"} ggplot(data = acute_data, aes(LD50_mgkg)) + stat_ecdf(geom = "point") @@ -4155,12 +4155,12 @@ For this exercise, we will select a 'target' substance of interest from our data Our target substance for this exercise is going to be DTXSID5020281, which is 1-chloro-4-nitrobenzene. This chemical is an organic compound with the formula ClC~6~H~4~NO~2~, and is a common intermediate in the production of a number of industrially useful compounds, including common antioxidants found in rubber. 
Here is an image of the chemical structure (https://comptox.epa.gov/dashboard/dsstoxdb/results?search=DTXSID5020281): -```{r, echo=FALSE, fig.align='center'} +```{r 06-Chapter6-204, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_6/Module6_7_Input/Module6_7_Image2.png") ``` Filtering the dataframes for only data on this target substance: -```{r} +```{r 06-Chapter6-205} target_substance <-filter(substances, DTXSID == 'DTXSID5020281') target_acute_data <- filter(acute_data, DTXSID == 'DTXSID5020281') ``` @@ -4177,7 +4177,7 @@ To eventually identify chemical analogues with information that can be 'read-acr To derive structure fingerprints across all evaluated substances, we need to first convert the chemical identifiers originally provided as `QSAR_READY_SMILES` into molecular objects. The standard exchange format for molecular information is a MOL file. This is a chemical file format that contains plain text information and stores information about atoms, bonds and their connections. We can carry out these identifier conversions using the 'parse.smiles' function within the rcdk package. Here we do this for the target chemical of interest, as well as all substances in the dataset. -```{r} +```{r 06-Chapter6-206} target_mol <- parse.smiles(as.character(target_substance$QSAR_READY_SMILES)) all_mols <-parse.smiles(all_smiles) ``` @@ -4185,13 +4185,13 @@ all_mols <-parse.smiles(all_smiles) #### Computing chemical fingerprints With these mol data, we can now compute the fingerprints for our target substance, as well as all the substances in the dataset. We can compute fingerprints leveraging the `get.fingerprint()` function. 
Let's first run it on the target chemical: -```{r} +```{r 06-Chapter6-207} target.fp <- get.fingerprint(target_mol[[1]], type = 'standard') target.fp # View fingerprint ``` We can run the same function over the entire `all_mols` dataset, leveraging the `lapply()` function: -```{r} +```{r 06-Chapter6-208} all.fp <- lapply(all_mols, get.fingerprint, type = 'standard') ``` @@ -4204,7 +4204,7 @@ Using these molecular fingerprint data, we can now calculate the degree to which Once these Tanimoto similarity indices are calculated between every possible chemical pair, the similarity results can be viewed in the form of a similarity matrix. In this matrix, all substances are listed across the rows and columns, and the degree to which every possible chemical pair is similar is summarized through values contained within the matrix. Further information about chemical similarity can be found here: https://en.wikipedia.org/wiki/Chemical_similarity Steps to generate this similarity matrix are detailed here: -```{r} +```{r 06-Chapter6-209} all.fp.sim <- fingerprint::fp.sim.matrix(all.fp, method = 'tanimoto') all.fp.sim <- as.data.frame(all.fp.sim) # Convert the outputted matrix to a dataframe colnames(all.fp.sim) = substances$DTXSID # Placing chemical identifiers back as column headers @@ -4212,23 +4212,23 @@ row.names(all.fp.sim) = substances$DTXSID # Placing chemical identifiers back as ``` Since we are querying a large number of chemicals, it is difficult to view the entire resulting similarity matrix. Let's, instead view portions of these results: -```{r} +```{r 06-Chapter6-210} all.fp.sim[1:5,1:5] # Viewing the first five rows and columns of data ``` -```{r} +```{r 06-Chapter6-211} all.fp.sim[6:10,6:10] # Viewing the next five rows and columns of data ``` You can see that there is an identity line within this similarity matrix, where instances when a chemical's structure is being compared to itself, the similarity values are 1.00000. 
All other possible chemical pairings show variable similarity scores, ranging from: -```{r} +```{r 06-Chapter6-212} min(all.fp.sim) ``` a minimum of zero, indicating no similarities between chemical structures. -```{r} +```{r 06-Chapter6-213} max(all.fp.sim) ``` @@ -4238,13 +4238,13 @@ a maximum of 1, indicating the identical chemical structure (which occurs when c This step will find substances that are structurally similar to the target chemical, 1-chloro-4-nitrobenzene (with DTXSID5020281). Structurally similar chemicals are referred to as 'source analogues', with information that will be carried forward in this read-across analysis. The first step to identifying chemical analogues is to subset the full similarity matrix to focus just on our target chemical. -```{r} +```{r 06-Chapter6-214} target.sim <- all.fp.sim %>% filter(row.names(all.fp.sim) == 'DTXSID5020281') ``` Then we'll extract the substances that exceed a similarity threshold of 0.75 by selecting to keep columns which are > 0.75. -```{r} +```{r 06-Chapter6-215} target.sim <- target.sim %>% select_if(function(x) any(x > 0.75)) @@ -4252,7 +4252,7 @@ dim(target.sim) # Show dimensions of subsetted matrix ``` This gives us our analogues list! Specifically, we selected 12 columns of data, representing our target chemical plus 11 structurally similar chemicals. 
Let's create a dataframe of these substance identifiers to carry forward in the read-across analysis: -```{r} +```{r 06-Chapter6-216} source_analogues <- t(target.sim) # Transposing the filtered similarity matrix results DTXSID <-rownames(source_analogues) # Temporarily grabbing the dtxsid identifiers from this matrix source_analogues <- cbind(DTXSID, source_analogues) # Adding these identifiers as a column @@ -4276,12 +4276,12 @@ source_analogues[1:12,1:2] # Viewing the cleaned dataframe of analogues Acute toxicity data from these chemical analogues can now be extracted and read across to the target chemical (1-chloro-4-nitrobenzene) to make predictions about its toxicity. Let's first merge the acute data for these analogues into our working dataframe: -```{r} +```{r 06-Chapter6-217} source_analogues <- merge(source_analogues, acute_data, by.x = 'DTXSID', by.y = 'DTXSID') ``` Then, let's remove the target chemical of interest and create a new dataframe of just the source analogues: -```{r} +```{r 06-Chapter6-218} source_analogues_only <- source_analogues %>% filter(Target_TanimotoSim!=1) # Removing the row of data with the target chemical, identified as the chemical with a similarity of 1 to itself @@ -4294,7 +4294,7 @@ The final generalized read-across (GenRA) prediction is based on a similarity-we (pairwise similarity between the target and source analogue) * (the toxicity of the source analogue), summed across each individual analogue; and then this value is divided by the sum of all pairwise similarities. For further details surrounding this algorithm and its spelled out formulation, see [Shah et al.](https://pubmed.ncbi.nlm.nih.gov/27174420/). 
Here are the underlying calculations needed to derive the similarity weighted activity score for this current exercise: -```{r} +```{r 06-Chapter6-219} source_analogues_only$wt_tox_calc <- as.numeric(source_analogues_only$Target_TanimotoSim) * source_analogues_only$LD50_LM # Calculating (pairwise similarity between the target and source analogue) * (the toxicity of the source analogue) @@ -4311,7 +4311,7 @@ ReadAcross_Pred <- sum_tox/sum_sims # Final calculation for the weighted activi ### Converting LD50 Units Right now, these results are in units of -log~10~ millimolar. So we still need to convert them into mg/kg equivalent, by converting out of -log~10~ and multiplying by the molecular weight of 1-chloro-4-nitrobenzene (g/mol): -```{r} +```{r 06-Chapter6-220} ReadAcross_Pred <- (10^(-ReadAcross_Pred))*157.55 ReadAcross_Pred ``` @@ -4328,7 +4328,7 @@ ReadAcross_Pred ### Visual Representation of this Read-Across Approach Here is a schematic summarizing the steps we employed in this analysis: -```{r, echo=FALSE} +```{r 06-Chapter6-221, echo=FALSE } knitr::include_graphics("Chapter_6/Module6_7_Input/Module6_7_Image3.png") ```
@@ -4336,7 +4336,7 @@ knitr::include_graphics("Chapter_6/Module6_7_Input/Module6_7_Image3.png") ### Comparing Read-Across Predictions to Experimental Observations Let's now compare how close this computationally-based prediction is to the experimentally observed LD50 value: -```{r} +```{r 06-Chapter6-222} target_acute_data$LD50_mgkg ``` We can see that the experimentally observed LD50 value for this chemical is 460 mg/kg. @@ -4382,4 +4382,4 @@ Use the same input data we used in this module to answer the following questions -::: \ No newline at end of file +::: diff --git a/Chapter_7/.DS_Store b/Chapter_7/.DS_Store deleted file mode 100644 index fd69aca..0000000 Binary files a/Chapter_7/.DS_Store and /dev/null differ diff --git a/Chapter_7/07-Chapter7.Rmd b/Chapter_7/07-Chapter7.Rmd index 2898f7b..6c3d55b 100644 --- a/Chapter_7/07-Chapter7.Rmd +++ b/Chapter_7/07-Chapter7.Rmd @@ -12,7 +12,7 @@ All input files (script, data, and figures) can be downloaded from the [UNC-SRP The Comparative Toxicogenomics Database (CTD) is a publicly available, online database that provides manually curated information about chemical-gene/protein interactions, chemical-disease and gene-disease relationships. CTD also recently incorporated curation of exposure data and chemical-phenotype relationships. CTD is located at: http://ctdbase.org/.
Here is a screenshot of the CTD homepage (as of August 5, 2021): -```{r, echo=FALSE, fig.align='center'} +```{r 07-Chapter7-1, echo=FALSE, fig.align='center' } #knitr::include_graphics("_book/TAME_Toolkit_files/figure-html/Module3_1_CTD_homepage.jpg") knitr::include_graphics("Chapter_7/Module7_1_Input/Module7_1_Image1.jpg") ``` @@ -32,14 +32,14 @@ This training module was specifically developed to answer the following environm ### Script Preparations #### Cleaning the global environment -```{r} +```{r 07-Chapter7-2} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you. -```{r, results=FALSE, message=FALSE} +```{r 07-Chapter7-3, results=FALSE, message=FALSE} if (!requireNamespace("tidyverse")) install.packages("tidyverse") if (!requireNamespace("VennDiagram")) @@ -50,7 +50,7 @@ install.packages("grid") #### Loading R packages required for this session -```{r, results=FALSE, message=FALSE} +```{r 07-Chapter7-4, results=FALSE, message=FALSE} library(tidyverse) library(VennDiagram) library(grid) @@ -58,7 +58,7 @@ library(grid) #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 07-Chapter7-5, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` @@ -74,7 +74,7 @@ Navigate to the main CTD website: http://ctdbase.org/. Select at the top, 'Search' -> 'Chemical-Gene Interactions'.
-```{r, echo=FALSE, fig.align='center'} +```{r 07-Chapter7-6, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_7/Module7_1_Input/Module7_1_Image2.jpg") ``` @@ -82,7 +82,7 @@ knitr::include_graphics("Chapter_7/Module7_1_Input/Module7_1_Image2.jpg") Select to query all chemical-gene interaction data for arsenic.
-```{r, echo=FALSE, fig.align='center'} +```{r 07-Chapter7-7, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_7/Module7_1_Input/Module7_1_Image3.jpg") ```
@@ -90,7 +90,7 @@ knitr::include_graphics("Chapter_7/Module7_1_Input/Module7_1_Image3.jpg") Note that there are lots of results, represented by many, many rows of data! Scroll to the bottom of the webpage and select to download as 'CSV'.
-```{r, echo=FALSE, fig.align='center'} +```{r 07-Chapter7-8, echo=FALSE, fig.align='center' } knitr::include_graphics("Chapter_7/Module7_1_Input/Module7_1_Image4.jpg") ``` @@ -106,14 +106,14 @@ Note that the data pulled here represent data available on August 1, 2021 Read in the csv file of the results from CTD query: -```{r, results=FALSE, message=FALSE} +```{r 07-Chapter7-9, results=FALSE, message=FALSE} ctd = read_csv("Chapter_7/Module7_1_Input/Module7_1_InputData1.csv") ``` Let's first see how many rows and columns of data this file contains: -```{r} +```{r 07-Chapter7-10} dim(ctd) ``` This dataset includes 6280 observations (represented by rows) linking arsenic exposure to gene-level alterations @@ -122,12 +122,12 @@ With information spanning across 9 columns Let's also see what kind of data are organized within the columns: -```{r} +```{r 07-Chapter7-11} colnames(ctd) ``` -```{r} +```{r 07-Chapter7-12} # Viewing the first five rows of data, across all 9 columns ctd[1:9,1:5] ``` @@ -140,19 +140,19 @@ ctd[1:9,1:5] To identify genes with altered expression in association with arsenic, we can leverage the results of our CTD query and filter this dataset to include only the rows that contain the term "expression" in the "Interaction Actions" column. -```{r} +```{r 07-Chapter7-13} exp_filt = ctd %>% filter(grepl("expression", `Interaction Actions`)) ``` We now have 2586 observations, representing instances of arsenic exposure causing changes in a target gene's expression levels. -```{r} +```{r 07-Chapter7-14} dim(exp_filt) ``` Let's see how many unique genes this represents: -```{r} +```{r 07-Chapter7-15} length(unique(exp_filt$`Gene Symbol`)) ``` This reflects 1878 unique genes that show altered expression in association with arsenic. @@ -160,7 +160,7 @@ This reflects 1878 unique genes that show altered expression in association with Let's make a separate dataframe that includes only the unique genes, based on the "Gene Symbol" column.
-```{r} +```{r 07-Chapter7-16} exp_genes = exp_filt %>% distinct(`Gene Symbol`, .keep_all=TRUE) # Removing columns besides gene identifier @@ -193,18 +193,18 @@ Which genes show altered expression in response to arsenic exposure? For this dataset, let's focus on gene-level methylation as a marker of epigenetic regulation. Let's return to our main dataframe, representing the results of the CTD query, and filter these results for only the rows that contain the term "methylation" in the "Interaction Actions" column. -```{r} +```{r 07-Chapter7-17} met_filt = ctd %>% filter(grepl("methylation",`Interaction Actions`)) ``` We now have 3211 observations, representing instances of arsenic exposure causing changes in a target gene's methylation levels. -```{r} +```{r 07-Chapter7-18} dim(met_filt) ``` Let's see how many unique genes this represents. -```{r} +```{r 07-Chapter7-19} length(unique(met_filt$`Gene Symbol`)) ``` This reflects 3142 unique genes that show altered methylation in association with arsenic @@ -212,7 +212,7 @@ This reflects 3142 unique genes that show altered methylation in associat wit Let's make a separate dataframe that includes only the unique genes, based on the "Gene Symbol" column. -```{r} +```{r 07-Chapter7-20} met_genes = met_filt %>% distinct(`Gene Symbol`, .keep_all=TRUE) # Removing columns besides gene identifier @@ -231,13 +231,13 @@ With this list of genes with altered methylation, we can now compare it to previ Merge the expression results with the methylation results on the Gene Symbol column found in both datasets.
-```{r} +```{r 07-Chapter7-21} merge_df = merge(exp_genes, met_genes, by = "Gene Symbol") ``` We end up with 315 rows reflecting the 315 genes that show altered expression and altered methylation. Let's view these genes: -```{r} +```{r 07-Chapter7-22} merge_df[1:315,] ``` @@ -261,7 +261,7 @@ For further training, shown here is another method for pulling this list of inte Obtain a list of the overlapping genes in the overall expression results and the methylation results. -```{r} +```{r 07-Chapter7-23} inxn = intersect(exp_filt$`Gene Symbol`,met_filt$`Gene Symbol`) ``` Again, we end up with a list of 315 unique genes that show altered expression and altered methylation. @@ -269,21 +269,21 @@ Again, we end up with a list of 315 unique genes that show altered expression an This list can be viewed on its own or converted to a dataframe (df). -```{r} +```{r 07-Chapter7-24} inxn_df = data.frame(genes=inxn) ``` This list can also be conveniently used to filter the original query results. -```{r} +```{r 07-Chapter7-25} inxn_df_all_data = ctd %>% filter(`Gene Symbol` %in% inxn) ``` Note that in this last case, the same 315 genes are present, but this time the results contain all records from the original query results, hence the 875 rows (875 observations reflecting the 315 genes).
-```{r} +```{r 07-Chapter7-26} summary(unique(sort(inxn_df_all_data$`Gene Symbol`))==sort(merge_df$`Gene Symbol`)) dim(inxn_df_all_data) ``` @@ -405,14 +405,14 @@ This training module was specifically developed to answer the following environm ### Script Preparations #### Cleaning the global environment -```{r} +```{r 07-Chapter7-27} rm(list=ls()) ``` #### Installing required R packages If you already have these packages installed, you can skip this step, or you can run the below code which checks installation status for you -```{r, results=FALSE, message=FALSE} +```{r 07-Chapter7-28, results=FALSE, message=FALSE} if (!requireNamespace("tidyverse")) install.packages("tidyverse") if (!requireNamespace("reshape2")) @@ -426,7 +426,7 @@ BiocManager::install("GEOquery") #### Loading R packages required for this session -```{r, results=FALSE, message=FALSE, warning=FALSE} +```{r 07-Chapter7-29, results=FALSE, message=FALSE, warning=FALSE} library(tidyverse) library(reshape2) library(GEOquery) @@ -440,11 +440,11 @@ For more information on the **GEOquery package**, see its associated [Bioconduct #### Set your working directory -```{r, eval=FALSE, echo=TRUE} +```{r 07-Chapter7-30, eval=FALSE, echo=TRUE} setwd("/filepath to where your input files are") ``` -```{r, echo=FALSE} +```{r 07-Chapter7-31, echo=FALSE} #setwd("/Users/juliarager/IEHS Dropbox/Julia Rager/Research Projects/1_SRP/4_DMAC/DMAC Training Modules/Training_Modules/3_Chapter 3/3_2_Database_GEO/Clean_Files/") ``` @@ -458,7 +458,7 @@ In this first method, we will navigate to the dataset within the GEO website, ma For the purposes of this training exercise, we manually downloaded the GEO series matrix file from the GEO series webpage, located at: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE42394. 
The specific file that was downloaded was noted as "GSE42394_series_matrix.txt", pulled by clicking on the link indicated by the red arrow from the GEO series webpage: -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align = "center"} +```{r 07-Chapter7-32, echo=FALSE, fig.width=4, fig.height=5, fig.align = "center"} knitr::include_graphics("Chapter_7/Module7_2_Input/Module7_2_Image1.png") ``` @@ -467,7 +467,7 @@ For simplicity, we also have already pre-filtered this file for the samples we a At this point, we can simply read in this pre-filtered text file for the purposes of this training module -```{r} +```{r 07-Chapter7-33} geodata_manual = read.table(file="Chapter_7/Module7_2_Input/Module7_2_InputData1.txt", header=T) ``` @@ -476,7 +476,7 @@ geodata_manual = read.table(file="Chapter_7/Module7_2_Input/Module7_2_InputData1 Because this is a manual approach, we have to also manually define the treated and untreated samples (based on manually opening the surrounding metadata from the GEO webpage) Manually defining treated and untreated for these samples of interest: -```{r} +```{r 07-Chapter7-34} exposed_manual = c("GSM1150940", "GSM1150941", "GSM1150942") unexposed_manual = c("GSM1150937", "GSM1150938", "GSM1150939") ``` @@ -488,25 +488,25 @@ In this second method, we will leverage the GEOquery package, which allows for e Let's first use the getGEO function (from the GEOquery package) to load data from our series matrix ("GSE42394_series_matrix.txt", renamed "Module7_2_InputData2.txt" for use in this module). *Note that this line of code may take a couple of minutes to run.* -```{r, message=FALSE} +```{r 07-Chapter7-35, message=FALSE} geo.getGEO.data = getGEO(filename='Chapter_7/Module7_2_Input/Module7_2_InputData2.txt') ``` One of the reasons the getGEO package is so helpful is that we can automatically link a dataset with nicely organized sample information using the `pData()` function. 
-```{r} +```{r 07-Chapter7-36} sampleInfo = pData(geo.getGEO.data) ``` Let's view this sample information / metadata file, first by viewing what the column headers are. -```{r} +```{r 07-Chapter7-37} colnames(sampleInfo) ``` Then viewing the first five columns. -```{r} +```{r 07-Chapter7-38} sampleInfo[1:10,1:5] ``` @@ -514,7 +514,7 @@ This shows that each sample is provided with a unique number starting with "GSM" Let's view the next five columns. -```{r} +```{r 07-Chapter7-39} sampleInfo[1:10,6:10] ``` @@ -531,7 +531,7 @@ Now, we can use this information to define the samples we want to analyze. Note In this training exercise, we are focusing on responses in the nose, so we can easily filter for cell type = Nasal epithelial cells (specifically in the `cell type:ch1` variable). We are also focusing on responses collected after 7 days of exposure, which we can filter for using time = 7 day (specifically in the `time:ch1` variable). We will also define exposed and unexposed samples using the variable `treatment:ch1`. First, let's subset the sampleInfo dataframe to just keep the samples we're interested in -```{r} +```{r 07-Chapter7-40} # Define a vector variable (here we call it 'keep') that will store rows we want to keep keep = rownames(sampleInfo[which(sampleInfo$`cell type:ch1`=="Nasal epithelial cells" & sampleInfo$`time:ch1`=="7 day"),]) @@ -542,13 +542,13 @@ sampleInfo = sampleInfo[keep,] Next, we can pull the exposed and unexposed animal IDs. Let's first see how these are labeled within the `treatment:ch1` variable. -```{r} +```{r 07-Chapter7-41} unique(sampleInfo$`treatment:ch1`) ``` And then search for the rows of data, pulling the sample animal IDs (which are in the variable `geo_accession`). 
-```{r} +```{r 07-Chapter7-42} exposedIDs = sampleInfo[which(sampleInfo$`treatment:ch1`=="2 ppm formaldehyde"), "geo_accession"] unexposedIDs = sampleInfo[which(sampleInfo$`treatment:ch1`=="unexposed"), @@ -557,20 +557,20 @@ unexposedIDs = sampleInfo[which(sampleInfo$`treatment:ch1`=="unexposed"), The next step is to pull the expression data we want to use in our analyses. The GEOquery function, `exprs()`, allows us to easily pull these data. Here, we can pull the data we're interested in using the `exprs()` function, while defining the data we want to pull based off our previously generated 'keep' vector. -```{r} +```{r 07-Chapter7-43} # As a reminder, this is what the 'keep' vector includes # (i.e., animal IDs that we're interested in) keep ``` -```{r} +```{r 07-Chapter7-44} # Using the exprs() function geodata = exprs(geo.getGEO.data[,keep]) ``` Let's view the full dataset as is now: -```{r} +```{r 07-Chapter7-45} head(geodata) ``` This now represents a matrix of data, with animal IDs as column headers and expression levels within the matrix. @@ -580,18 +580,18 @@ This now represents a matrix of data, with animal IDs as column headers and expr These column names are not the easiest to interpret, so let's rename these columns to indicate which animals were from the exposed vs. unexposed groups. We need to first convert our expression dataset to a dataframe so we can edit columns names, and continue with downstream data manipulations that require dataframe formats. -```{r} +```{r 07-Chapter7-46} geodata = data.frame(geodata) ``` Let's remind ourselves what the column names are: -```{r} +```{r 07-Chapter7-47} colnames(geodata) ``` Which ones of these are exposed vs unexposed animals can be determined by viewing our previously defined vectors. -```{r} +```{r 07-Chapter7-48} exposedIDs unexposedIDs ``` @@ -599,14 +599,14 @@ unexposedIDs With this we can tell that the first three listed IDs are from unexposed animals, and the last three IDs are from exposed animals. 
Let's simplify the names of these columns to indicate exposure status and replicate number. -```{r} +```{r 07-Chapter7-49} colnames(geodata) = c("Control_1", "Control_2", "Control_3", "Exposed_1", "Exposed_2", "Exposed_3") ``` And we'll now need to re-define our 'exposed' vs 'unexposed' vectors for downstream script. -```{r} +```{r 07-Chapter7-50} exposedIDs = c("Exposed_1", "Exposed_2", "Exposed_3") unexposedIDs = c("Control_1", "Control_2", "Control_3") ``` @@ -614,7 +614,7 @@ unexposedIDs = c("Control_1", "Control_2", "Control_3") Viewing the data again: -```{r} +```{r 07-Chapter7-51} head(geodata) ``` @@ -630,7 +630,7 @@ In instances where we want more information surrounding the molecular identifier For example, let's pull the platform-specific annotation file for this experiment. Let's revisit the [website](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE42394) that contained the original dataset on GEO. Scroll down to where it lists "Platforms", and there is a hyperlinked platform number "GPL6247" (see arrow below). -```{r, echo=FALSE, fig.width=4, fig.height=5, fig.align = "center"} +```{r 07-Chapter7-52, echo=FALSE, fig.width=4, fig.height=5, fig.align = "center"} knitr::include_graphics("Chapter_7/Module7_2_Input/Module7_2_Image2.png") ``` @@ -641,12 +641,12 @@ Here, we're interested in pulling the corresponding gene symbol information for In this exercise, we've already done these steps and unzipped the file in our working directory. So at this point, we can simply read in this annotation dataset, renamed "Module7_2_InputData2.annot", still using the `GEOquery()` function to help automate. -```{r, warning=FALSE} +```{r 07-Chapter7-53, warning=FALSE} geo.annot = GEOquery::getGEO(filename="Chapter_7/Module7_2_Input/Module7_2_InputData3.annot") ``` Now we can use the `Table()` function from GEOquery to pull data from the annotation dataset. 
-```{r} +```{r 07-Chapter7-54} id.gene.table = GEOquery::Table(geo.annot)[,c("ID", "Gene symbol")] id.gene.table[1:10,1:2] ``` @@ -654,12 +654,12 @@ id.gene.table[1:10,1:2] With these two columns of data, we now have the needed IDs and gene symbols to match with our dataset. Within the full dataset, we need to add a new column for the probeset ID, taken from the rownames, in preparation for the merging step. -```{r} +```{r 07-Chapter7-55} geodata$ID = rownames(geodata) ``` We can now merge the gene symbol information by ID with our expression data. -```{r} +```{r 07-Chapter7-56} geodata_genes = merge(geodata, id.gene.table, by="ID") head(geodata_genes) ``` @@ -667,7 +667,7 @@ head(geodata_genes) Note that many of the probeset IDs do not map to full gene symbols, which is shown here by viewing the top few rows - this is expected in genome-wide analyses based on microarray platforms. Let's look at the first 25 unique genes in these data: -```{r} +```{r 07-Chapter7-57} UniqueGenes = unique(geodata_genes$`Gene symbol`) UniqueGenes[1:25] ``` @@ -677,22 +677,22 @@ Again, you can see that the first value listed is blank, representing probesetID You can also see that some gene symbols have multiple entries, separated by "///" To simplify identifiers, we can pull just the first gene symbol, and remove the rest by using gsub(). -```{r} +```{r 07-Chapter7-58} geodata_genes$`Gene symbol` = gsub("///.*", "", geodata_genes$`Gene symbol`) ``` Let's alphabetize by main expression dataframe by gene symbol. -```{r} +```{r 07-Chapter7-59} geodata_genes = geodata_genes[order(geodata_genes$`Gene symbol`),] ``` And then re-view these data: -```{r} +```{r 07-Chapter7-60} geodata_genes[1:5,] ``` In preparation for the visualization steps below, let's reset the probeset IDs to rownames. 
-```{r} +```{r 07-Chapter7-61} rownames(geodata_genes) = geodata_genes$ID # Can then remove this column within the dataframe @@ -700,7 +700,7 @@ geodata_genes$ID = NULL ``` Finally let's rearrange this dataset to include gene symbols as the first column, right after rownames (probeset IDs). -```{r} +```{r 07-Chapter7-62} geodata_genes = geodata_genes[,c(ncol(geodata_genes),1:(ncol(geodata_genes)-1))] geodata_genes[1:5,] dim(geodata_genes) @@ -709,7 +709,7 @@ dim(geodata_genes) Note that this dataset includes expression measures across **29,214 probes, representing 14,019 unique genes**. For simplicity in the final exercises, let's just filter for rows representing mapped genes. -```{r} +```{r 07-Chapter7-63} geodata_genes = geodata_genes[!(geodata_genes$`Gene symbol` == ""), ] dim(geodata_genes) ``` @@ -748,14 +748,14 @@ To visualize the -omics data, we can generate boxplots, heat maps, any many othe For this example, let's simply use R's built in boxplot() function. We only want to use columns with our expression data (2 to 7), so let's pull those columns when running the boxplot function. -```{r, fig.width=5, fig.height=4, fig.align = "center"} +```{r 07-Chapter7-64, fig.width=5, fig.height=4, fig.align = "center"} boxplot(geodata_genes[,2:7]) ``` There seem to be a lot of variability within each sample's range of expression levels, with many outliers. This makes sense given that we are analyzing the expression levels across the rat's entire genome, where some genes won't be expressed at all while others will be highly expressed due to biological and/or potential technical variability. To show plots without outliers, we can simply use outline=F. -```{r, fig.width=5, fig.height=4, fig.align = "center"} +```{r 07-Chapter7-65, fig.width=5, fig.height=4, fig.align = "center"} boxplot(geodata_genes[,2:7], outline=F) ``` @@ -767,7 +767,7 @@ There are many different packages you can use to generate heat maps. 
Here, we us It also takes awhile to plot all genes across the genome, so to save time for this training module, let's randomly select 100 rows to plot. -```{r, fig.width=9, fig.height=7, fig.align = "center"} +```{r 07-Chapter7-66, fig.width=9, fig.height=7, fig.align = "center"} # To ensure that the same subset of genes are selected each time set.seed = 101 @@ -794,18 +794,18 @@ Z-score is a very common method of scaling that transforms data points to reflec Let's see what happens when we scale this gene expression dataset by z-score across each probe. This can be easily done using the `scale()` function. This specific `scale()` function works by centering and scaling across columns, but since we want to use it across probesets (organized as rows), we need to first transpose our dataset, then run the scale function. -```{r} +```{r 07-Chapter7-67} geodata_genes_scaled = scale(t(geodata_genes[,2:7]), center=T, scale=T) ``` Now we can transpose it back to the original format (i.e., before it was transposed). -```{r} +```{r 07-Chapter7-68} geodata_genes_scaled = t(geodata_genes_scaled) ``` And then view what the normalized and now scaled expression data look like for now a random subset of 100 probesets (representing genes). -```{r, echo=FALSE, fig.width=9, fig.height=7, fig.align = "center"} +```{r 07-Chapter7-69, echo=FALSE, fig.width=9, fig.height=7, fig.align = "center"} superheat::superheat(geodata_genes_scaled[row.sample,], pretty.order.rows = TRUE, pretty.order.cols = TRUE, @@ -844,13 +844,13 @@ We need to run a t-test for each row of our dataset. This exercise demonstrates #### Method 1 (m1): 'For Loop' Let's first re-save the molecular probe IDs to a column within the dataframe, since we need those values in the loop function. -```{r} +```{r 07-Chapter7-70} geodata_genes$ID = rownames(geodata_genes) ``` We also need to initially create an empty dataframe to eventually store p-values. 
-```{r} +```{r 07-Chapter7-71} pValue_m1 = matrix(0, nrow=nrow(geodata_genes), ncol=3) colnames(pValue_m1) = c("ID", "pval", "padj") head(pValue_m1) @@ -859,7 +859,7 @@ head(pValue_m1) You can see the empty dataframe that was generated through this code. Then we can loop through the entire dataset to acquire p-values from t-test statistics, comparing n=3 exposed vs n=3 unexposed samples. -```{r} +```{r 07-Chapter7-72} for (i in 1:nrow(geodata_genes)) { #Get the ID @@ -876,7 +876,7 @@ for (i in 1:nrow(geodata_genes)) { ``` View the results: -```{r} +```{r 07-Chapter7-73} # Note that we're not pulling the last column (padj) since we haven't calculated these yet pValue_m1[1:5,1:2] ``` @@ -886,14 +886,14 @@ pValue_m1[1:5,1:2] #### Method 2 (m2): Apply Function For the second method, we can use the *apply()* function to calculate resulting t-test p-values more efficiently labeled. -```{r} +```{r 07-Chapter7-74} pValue_m2 = apply(geodata_genes[,2:7], 1, function(x) t.test(x[unexposedIDs], x[exposedIDs])$p.value) names(pValue_m2) = geodata_genes[,"ID"] ``` We can convert the results into a dataframe to make it similar to m1 matrix we created above. -```{r} +```{r 07-Chapter7-75} pValue_m2 = data.frame(pValue_m2) # Now create an ID column @@ -901,7 +901,7 @@ pValue_m2$ID = rownames(pValue_m2) ``` Then we can view at the two datasets to see they result in the same pvalues. -```{r} +```{r 07-Chapter7-76} head(pValue_m1) head(pValue_m2) ``` @@ -912,25 +912,25 @@ We can see from these results that both methods (m1 and m2) generate the same st Let's again merge these data with the gene symbols to tell which genes are significant. First, let's convert to a dataframe and then merge as before, for one of the above methods as an example (m1). 
-```{r} +```{r 07-Chapter7-77} pValue_m1 = data.frame(pValue_m1) pValue_m1 = merge(pValue_m1, id.gene.table, by="ID") ``` We can also add a multiple test correction by applying a false discovery rate-adjusted p-value; here, using the Benjamini Hochberg (BH) method. -```{r} +```{r 07-Chapter7-78} # Here fdr is an alias for B-H method pValue_m1[,"padj"] = p.adjust(pValue_m1[,"pval"], method=c("fdr")) ``` Now, we can sort these statistical results by adjusted p-values. -```{r} +```{r 07-Chapter7-79} pValue_m1.sorted = pValue_m1[order(pValue_m1[,'padj']),] head(pValue_m1.sorted) ``` Pulling just the significant genes using an adjusted p-value threshold of 0.05. -```{r} +```{r 07-Chapter7-80} adj.pval.sig = pValue_m1[which(pValue_m1[,'padj'] < .05),] # Viewing these genes @@ -950,13 +950,13 @@ What genes are altered in expression by formaldehyde inhalation exposure? Finally, let's plot these using a mini heat map. Note that we can use probesetIDs, then gene symbols, in rownames to have them show in heat map labels. -```{r, echo=FALSE, fig.width=8, fig.height=4, fig.align = "center"} +```{r 07-Chapter7-81, echo=FALSE, fig.width=8, fig.height=4, fig.align = "center"} rownames(geodata_genes) = paste(geodata_genes$ID, ": ",geodata_genes$`Gene symbol`) superheat::superheat(geodata_genes[which(geodata_genes$ID %in% adj.pval.sig[,"ID"]),2:7]) ``` Note that this statistical filter is pretty strict when comparing only n=3 vs n=3 biological replicates. 
If we loosen the statistical criteria to p-value < 0.05, this is what we can find: -```{r} +```{r 07-Chapter7-82} pval.sig = pValue_m1[which(pValue_m1[,'pval'] < .05),] nrow(pval.sig) ``` @@ -1008,7 +1008,7 @@ Using the same dataset that was used in this module, available from the [UNC-SRP # 7.3 CompTox Dashboard Data through APIs -```{r, include = FALSE} +```{r 07-Chapter7-83, include = FALSE} tpl <- knitr::opts_template$get("TAME_options") merged <- c( list(collapse = TRUE, comment = "#>"), @@ -1085,14 +1085,14 @@ This training module was specifically developed to answer the following question ### Cleaning the Global Environment -```{r, eval=FALSE} +```{r 07-Chapter7-84, eval=FALSE} rm(list=ls()) ``` ### Installing Required R Packages -```{r, eval=FALSE} +```{r 07-Chapter7-85, eval=FALSE} if (!requireNamespace('ctxR')) install.packages('ctxR') @@ -1102,7 +1102,7 @@ if (!requireNamespace('ggplot2')) ### Loading R Packages -```{r} +```{r 07-Chapter7-86} # Used to interface with CompTox Chemicals Dashboard library(ctxR) @@ -1123,7 +1123,7 @@ The CCD can be searched either one chemical at a time, or using a batch search. In single-substance search, the user types a full or partial chemical identifier (name, CASRN, InChiKey, or DSSTox ID) into a search box on the CCD homepage. Autocomplete provides a list of possible matches; the user selects one by clicking on it, and is then taken to the CCD page for that substance. Here is an example of the CCD page for the chemical Bisphenol A: -```{r, echo = FALSE, out.width= "90%", fig.align= 'center'} +```{r 07-Chapter7-87, echo = FALSE, out.width= "90%", fig.align= 'center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image1.png') ``` @@ -1135,13 +1135,13 @@ The different domains of data available for this chemical are shown by the tabs In batch search, the user enters a list of search inputs, separated by newlines, into a batch-search box on https://comptox.epa.gov/dashboard/batch-search . 
The user selects the type(s) of inputs by selecting one or more checkboxes – these may include chemical identifiers, monoisotopic masses, or molecular formulas. Then, the user selects “Display All Chemicals” to display the list of substances matching the batch-search inputs, or “Choose Export Options” to choose options for exporting the batch-search results as a spreadsheet. The exported spreadsheet may include data from most of the domains available on an individual substance’s CCD page. -```{r, echo = FALSE, out.width = "90%", fig.align = 'center'} +```{r 07-Chapter7-88, echo = FALSE, out.width = "90%", fig.align = 'center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image2.png') ``` The user can download the selected information in various formats, such as Excel (.xlsx), comma-separated values (.csv), or different types of chemical table files (.e.g, MOL). -```{r, echo=FALSE, out.width="90%", fig.align='center'} +```{r 07-Chapter7-89, echo=FALSE, out.width="90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image3.png') ``` @@ -1192,7 +1192,7 @@ For more information on the data accessible through the CTX APIs and related too The APIs are organized into four sets of "endpoints" (chemical data domains): `Chemical`, `Hazard`, `Bioactivity`, and `Exposure`. Pictured below is what the `Chemical` section looks like and can be found at [CTX API Chemical Endpoints](https://api-ccte.epa.gov/docs/chemical.html). -```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-90, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image4.png') ``` @@ -1202,7 +1202,7 @@ The APIs can be explored through the pictured web interface at https://api-ccte. `Authentication` is the first tab on the left. Authentication is required to use the APIs. To authenticate yourself in the API web interface, input your unique API key. 
-```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-91, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image5.png') ``` @@ -1221,7 +1221,7 @@ In the CTX API web interface, the colored boxes next to each endpoint indicate t Click on the second item under `Chemical Details Resource`, the tab labeled `Get data by dtxsid`. The following page will appear. -```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-92, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image6.png') ``` @@ -1232,14 +1232,14 @@ This page has two subheadings: "Path Parameters" and "Query-String Parameters". The default return format is displayed below and includes a variety of fields with data types represented. -```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-93, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image7.png') ``` We show what reRturned data from searching Bisphenol A looks like using this endpoint with the `chemicaldetailstandard` value for `projection` selected. -```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-94, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image8.png') ``` @@ -1254,7 +1254,7 @@ Formatting an http request is not necessarily intuitive nor worth the time for s We store the API key required to access the APIs. To do this for the current session, run the first command. If you want to store your key across multiple sessions, run the second command. 
-```{r, eval=FALSE} +```{r 07-Chapter7-95, eval=FALSE} # This stores the key in the current session register_ctx_api_key(key = '') @@ -1263,7 +1263,7 @@ register_ctx_api_key(key = '') register_ctx_api_key(key = '', write = TRUE) ``` -```{r, echo=FALSE, warning = FALSE} +```{r 07-Chapter7-96, echo=FALSE, warning = FALSE} # This stores the key in the current session register_ctx_api_key(key = '706401cd-8bda-469d-9cdb-ac27f489c93a') @@ -1274,7 +1274,7 @@ register_ctx_api_key(key = '706401cd-8bda-469d-9cdb-ac27f489c93a', write = TRUE) To check that your key has successfully been stored for the session, run the following command. -```{r, eval=FALSE} +```{r 07-Chapter7-97, eval=FALSE} ctx_key() ``` @@ -1282,7 +1282,7 @@ ctx_key() Now, we demonstrate how to retrieve the information for BPA given by the `Chemical Detail Resource` endpoint under the `chemicaldetailstandard` value for `projection`. Note, this `projection` value is the default value for the function `get_chemical_details()`. -```{r} +```{r 07-Chapter7-98} BPA_chemical_detail <- get_chemical_details(DTXSID = 'DTXSID7020182') dim(BPA_chemical_detail) class(BPA_chemical_detail) @@ -1301,7 +1301,7 @@ These lists can be found in the CCD at [CCL4](https://comptox.epa.gov/dashboard/ We explore details about these two lists of chemicals before diving into analyzing the data contained in each list. -```{r} +```{r 07-Chapter7-99} options(width = 100) ccl4_information <- get_public_chemical_list_by_name('CCL4') print(ccl4_information, trunc.cols = TRUE) @@ -1312,7 +1312,7 @@ print(natadb_information, trunc.cols = TRUE) Now we pull the actual chemicals contained in the lists using the APIs. -```{r} +```{r 07-Chapter7-100} ccl4 <- get_chemicals_in_list('ccl4') ccl4 <- data.table::as.data.table(ccl4) @@ -1322,7 +1322,7 @@ natadb <- data.table::as.data.table(natadb) We examine the dimensions of the data, the column names, and display a single row for illustrative purposes. 
-```{r} +```{r 07-Chapter7-101} dim(ccl4) dim(natadb) @@ -1335,7 +1335,7 @@ head(ccl4, 1) Once we have the chemicals in each list, we access their physico-chemical properties. We will use the batch search forms of the function `get_chem_info()`, to which we supply a list of DTXSIDs. -```{r} +```{r 07-Chapter7-102} ccl4$dtxsid natadb$dtxsid @@ -1347,7 +1347,7 @@ Observe that this returns a single data.table for each query, and the data.table Before any deeper analysis, let's take a look at the dimensions of the data and the column names. -```{r} +```{r 07-Chapter7-103} dim(ccl4_phys_chem) colnames(ccl4_phys_chem) ``` @@ -1357,14 +1357,14 @@ Next, we display the unique values for the columns `propertyID` and `propType`. -```{r} +```{r 07-Chapter7-104} ccl4_phys_chem[, unique(propName)] ccl4_phys_chem[, unique(propType)] ``` Let's explore this further by examining the mean of the "boiling-point" and "melting-point" data. -```{r} +```{r 07-Chapter7-105} ccl4_phys_chem[propName == 'Boiling Point', .(Mean = mean(propValue, na.rm = TRUE))] ccl4_phys_chem[propName == 'Boiling Point', .(Mean = mean(propValue, na.rm = TRUE)), by = .(propType)] @@ -1388,7 +1388,7 @@ These results tell us about some of the reported physico-chemical properties of To explore **all** the values of the physico-chemical properties and calculate their means, we can do the following procedure. First we look at all the physico-chemical properties individually, then group them by each property ("Boiling Point", "Melting Point", etc...), and then additionally group those by property type ("experimental" vs "predicted"). In the grouping, we look at the columns `propValue`, `unit`, `propName` and `propType`. We also demonstrate how take the mean of the values for each grouping. We examine the chemical with `DTXSID` "DTXSID0020153" from CCL4. 
-```{r} +```{r 07-Chapter7-106} head(ccl4_phys_chem[dtxsid == 'DTXSID0020153', ]) ccl4_phys_chem[dtxsid == 'DTXSID0020153', .(propType, propValue, propUnit), by = .(propName)] @@ -1413,7 +1413,7 @@ We first examine the vapor pressures for all the chemicals in each list. We then Group first by DTXSID. -```{r} +```{r 07-Chapter7-107} ccl4_vapor_all <- ccl4_phys_chem[propName %in% 'Vapor Pressure', .(mean_vapor_pressure = sapply(.SD, function(t) {mean(t, na.rm = TRUE)})), .SDcols = c('propValue'), by = .(dtxsid)] @@ -1424,7 +1424,7 @@ natadb_vapor_all <- natadb_phys_chem[propName %in% 'Vapor Pressure', Then group by DTXSID and then by property type. -```{r} +```{r 07-Chapter7-108} ccl4_vapor_grouped <- ccl4_phys_chem[propName %in% 'Vapor Pressure', .(mean_vapor_pressure = sapply(.SD, function(t) {mean(t, na.rm = TRUE)})), .SDcols = c('propValue'), @@ -1438,7 +1438,7 @@ natadb_vapor_grouped <- natadb_phys_chem[propName %in% 'Vapor Pressure', Then examine the summary statistics of the data. -```{r} +```{r 07-Chapter7-109} summary(ccl4_vapor_all) summary(ccl4_vapor_grouped) summary(natadb_vapor_all) @@ -1447,7 +1447,7 @@ summary(natadb_vapor_grouped) With such a large range of values covering several orders of magnitude, we log transform the data. Since some of these value are non-positive, some transformations may result in non-numeric values. These will be removed when plotting. We expect these values to be positive in general so we go ahead with these transformations. -```{r} +```{r 07-Chapter7-110} ccl4_vapor_all[, log_transform_mean_vapor_pressure := log(mean_vapor_pressure)] ccl4_vapor_grouped[, log_transform_mean_vapor_pressure := log(mean_vapor_pressure)] @@ -1461,7 +1461,7 @@ natadb_vapor_grouped[, log_transform_mean_vapor_pressure := Now we plot the log transformed data. First plot the CCL4 data. 
-```{r, fig.align='center'} +```{r 07-Chapter7-111, fig.align='center'} ggplot(ccl4_vapor_all, aes(log_transform_mean_vapor_pressure)) + geom_boxplot() + coord_flip() @@ -1471,7 +1471,7 @@ ggplot(ccl4_vapor_grouped, aes(propType, log_transform_mean_vapor_pressure)) + Then plot the NATA data. -```{r, fig.align='center'} +```{r 07-Chapter7-112, fig.align='center'} ggplot(natadb_vapor_all, aes(log_transform_mean_vapor_pressure)) + geom_boxplot() + coord_flip() ggplot(natadb_vapor_grouped, aes(propType, log_transform_mean_vapor_pressure)) + @@ -1480,7 +1480,7 @@ ggplot(natadb_vapor_grouped, aes(propType, log_transform_mean_vapor_pressure)) + Finally, we compare both sets simultaneously. We add in a column to each data.table denoting to which data set the rows correspond and then combine the rows from both data sets together using the function `rbind()`. -```{r} +```{r 07-Chapter7-113} ccl4_vapor_grouped[, set := 'CCL4'] natadb_vapor_grouped[, set := 'NATADB'] @@ -1489,7 +1489,7 @@ all_vapor_grouped <- rbind(ccl4_vapor_grouped, natadb_vapor_grouped) Now we plot the combined data. First we color the boxplots based on the property type, with mean log transformed vapor pressure plotted for each data set and property type. -```{r, fig.align='center'} +```{r 07-Chapter7-114, fig.align='center'} vapor_box <- ggplot(all_vapor_grouped, aes(set, log_transform_mean_vapor_pressure)) + geom_boxplot(aes(color = propType)) @@ -1498,7 +1498,7 @@ vapor_box Next we color the boxplots based on the data set. -```{r, , fig.align='center'} +```{r 07-Chapter7-115, fig.align='center'} vapor <- ggplot(all_vapor_grouped, aes(log_transform_mean_vapor_pressure)) + geom_boxplot((aes(color = set))) + coord_flip() @@ -1511,7 +1511,7 @@ We also explore Henry's Law constant and boiling point in a similar fashion. Group by DTXSID. 
-```{r} +```{r 07-Chapter7-116} ccl4_hlc_all <- ccl4_phys_chem[propName %in% "Henry's Law Constant", .(mean_hlc = sapply(.SD, function(t) {mean(t, na.rm = TRUE)})), .SDcols = c('propValue'), by = .(dtxsid)] @@ -1522,7 +1522,7 @@ natadb_hlc_all <- natadb_phys_chem[propName %in% "Henry's Law Constant", Group by DTXSID and property type. -```{r} +```{r 07-Chapter7-117} ccl4_hlc_grouped <- ccl4_phys_chem[propName %in% "Henry's Law Constant", .(mean_hlc = sapply(.SD, function(t) {mean(t, na.rm = TRUE)})), .SDcols = c('propValue'), @@ -1535,7 +1535,7 @@ natadb_hlc_grouped <- natadb_phys_chem[propName %in% "Henry's Law Constant", Examine summary statistics. -```{r} +```{r 07-Chapter7-118} summary(ccl4_hlc_all) summary(ccl4_hlc_grouped) summary(natadb_hlc_all) @@ -1544,7 +1544,7 @@ summary(natadb_hlc_grouped) Again, we log transform the data as it covers several orders of magnitude. We expect these values to be positive in general so we go ahead with these transformations. -```{r} +```{r 07-Chapter7-119} ccl4_hlc_all[, log_transform_mean_hlc := log(mean_hlc)] ccl4_hlc_grouped[, log_transform_mean_hlc := log(mean_hlc)] @@ -1557,7 +1557,7 @@ We compare both sets simultaneously. We add in a column to each data.table denot Label and combine data. -```{r} +```{r 07-Chapter7-120} ccl4_hlc_grouped[, set := 'CCL4'] natadb_hlc_grouped[, set := 'NATADB'] @@ -1566,7 +1566,7 @@ all_hlc_grouped <- rbind(ccl4_hlc_grouped, natadb_hlc_grouped) Plot data. Some rows are removed due to transformations above that result in non-valid values. -```{r, , fig.align='center'} +```{r 07-Chapter7-121, fig.align='center'} hlc_box <- ggplot(all_hlc_grouped, aes(set, log_transform_mean_hlc)) + geom_boxplot(aes(color = propType)) hlc_box @@ -1583,7 +1583,7 @@ Finally, we consider boiling point. Group by DTXSID. 
-```{r} +```{r 07-Chapter7-122} ccl4_boiling_all <- ccl4_phys_chem[propName %in% 'Boiling Point', .(mean_boiling_point = sapply(.SD, function(t) {mean(t, na.rm = TRUE)})), .SDcols = c('propValue'), by = .(dtxsid)] @@ -1595,7 +1595,7 @@ natadb_boiling_all <- natadb_phys_chem[propName %in% 'Boiling Point', Group by DTXSID and property type. -```{r} +```{r 07-Chapter7-123} ccl4_boiling_grouped <- ccl4_phys_chem[propName %in% 'Boiling Point', .(mean_boiling_point = sapply(.SD, function(t) {mean(t, na.rm = TRUE)})), @@ -1610,7 +1610,7 @@ natadb_boiling_grouped <- natadb_phys_chem[propName %in% 'Boiling Point', Calculate summary statistics. -```{r} +```{r 07-Chapter7-124} summary(ccl4_boiling_all) summary(ccl4_boiling_grouped) summary(natadb_boiling_all) @@ -1619,7 +1619,7 @@ summary(natadb_boiling_grouped) Since some of the boiling point values have negative values, we cannot log transform these values. If we try, as you will see below, there will be warnings of NaNs produced. -```{r, eval} +```{r 07-Chapter7-125} ccl4_boiling_all[, log_transform := log(mean_boiling_point)] ccl4_boiling_grouped[, log_transform := log(mean_boiling_point)] @@ -1631,7 +1631,7 @@ We compare both sets simultaneously. We add in a column to each data.table denot Label and combine data. -```{r} +```{r 07-Chapter7-126} ccl4_boiling_grouped[, set := 'CCL4'] natadb_boiling_grouped[, set := 'NATADB'] @@ -1640,7 +1640,7 @@ all_boiling_grouped <- rbind(ccl4_boiling_grouped, natadb_boiling_grouped) Plot the data. 
-```{r, , fig.align='center'} +```{r 07-Chapter7-127, fig.align='center'} boiling_box <- ggplot(all_boiling_grouped, aes(set, mean_boiling_point)) + geom_boxplot(aes(color = propType)) boiling_box @@ -1672,13 +1672,13 @@ Now, having examined some of the distributions of the physico-chemical propertie Using the standard CompTox Chemicals Dashboard approach to access genotoxicity, one would again navigate to the individual chemical page -```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-128, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image9.png') ``` Once one navigates to the genotoxicity tab highlighted in the previous page, the following is displayed as seen here: -```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-129, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image10.png') ``` @@ -1686,7 +1686,7 @@ This page includes two sets of information, the first of which provides a summar We again use the CTX APIs to streamline the process of retrieving this information in a programmatic fashion. To this end, we will use the genotoxicity endpoints found within the `Hazard` endpoints of the CTX APIs. Pictured below is the particular set of genotoxicity resources available in the `Hazard` endpoints of the CTX APIs. -```{r, echo = FALSE, out.width = "90%", fig.align='center'} +```{r 07-Chapter7-130, echo = FALSE, out.width = "90%", fig.align='center'} knitr::include_graphics('Chapter_7/Module7_3_Input/Module7_3_Image11.png') ``` @@ -1695,21 +1695,21 @@ There are both summary and detail resources, reflecting the information one can To access the genetox endpoint, we will use the function `get_genetox_summary()`. 
Since we have a list of chemicals, rather than searching individually for each chemical, we use the batch search version of the function, named `get_genetox_summary_batch()`. We will examine this and then access the details. Grab the data using the APIs. -```{r} +```{r 07-Chapter7-131} ccl4_genotox <- get_genetox_summary_batch(DTXSID = ccl4$dtxsid) natadb_genetox <- get_genetox_summary_batch(DTXSID = natadb$dtxsid) ``` Examine the dimensions. -```{r} +```{r 07-Chapter7-132} dim(ccl4_genotox) dim(natadb_genetox) ``` Examine the column names and data from the first six chemicals with genetox data from CCL4. -```{r} +```{r 07-Chapter7-133} colnames(ccl4_genotox) head(ccl4_genotox) ``` @@ -1718,7 +1718,7 @@ The information returned is of the first variety highlighted in the image above, Observe that we have information on 71 chemicals from the CCL4 data and 153 from the NATA data. We note the chemicals not included in the results and then dig into the returned results. -```{r} +```{r 07-Chapter7-134} ccl4[!(dtxsid %in% ccl4_genotox$dtxsid), .(dtxsid, casrn, preferredName, molFormula)] natadb[!(dtxsid %in% natadb_genetox$dtxsid), @@ -1729,21 +1729,21 @@ Now, we access the genotoxicity details of the chemicals in each data set using Grab the data from the CTX APIs. -```{r} +```{r 07-Chapter7-135} ccl4_genetox_details <- get_genetox_details_batch(DTXSID = ccl4$dtxsid) natadb_genetox_details <- get_genetox_details_batch(DTXSID = natadb$dtxsid) ``` Examine the dimensions. -```{r} +```{r 07-Chapter7-136} dim(ccl4_genetox_details) dim(natadb_genetox_details) ``` Look at the column names and the first six rows of the data from the CCL4 chemicals. -```{r} +```{r 07-Chapter7-137} colnames(ccl4_genetox_details) head(ccl4_genetox_details) ``` @@ -1752,14 +1752,14 @@ We examine the information returned for the first chemical in each set of result Look at the dimensions first. 
-```{r} +```{r 07-Chapter7-138} dim(ccl4_genetox_details[dtxsid %in% 'DTXSID0020153', ]) dim(natadb_genetox_details[dtxsid %in% 'DTXSID0020153', ]) ``` Now examine the first few rows. -```{r} +```{r 07-Chapter7-139} head(ccl4_genetox_details[dtxsid %in% 'DTXSID0020153', ]) ``` @@ -1770,13 +1770,13 @@ We now explore the assays present for chemicals in each data set. We first deter Determine the unique assay categories. -```{r} +```{r 07-Chapter7-140} ccl4_genetox_details[, unique(assayCategory)] natadb_genetox_details[, unique(assayCategory)] ``` Determine the unique assays for each data set and list them. -```{r} +```{r 07-Chapter7-141} ccl4_genetox_details[, unique(assayType)] natadb_genetox_details[, unique(assayType)] @@ -1788,7 +1788,7 @@ natadb_genetox_details[, unique(assayType)] Determine the number of assays per unique `assayCategory` value. -```{r} +```{r 07-Chapter7-142} ccl4_genetox_details[, .(Assays = length(unique(assayType))), by = .(assayCategory)] @@ -1799,14 +1799,14 @@ natadb_genetox_details[, .(Assays = length(unique(assayType))), We can analyze these results more closely, counting the number of assay results and grouping by `assayCategory`, and `assayType`. We also examine the different numbers of `assayCategory` and `assayTypes` values used. -```{r} +```{r 07-Chapter7-143} ccl4_genetox_details[, .N, by = .(assayCategory, assayType, assayResult)] ccl4_genetox_details[, .N, by = .(assayCategory)] ``` We look at the `assayType` values and numbers of each for the three different `assayCategory` values. -```{r} +```{r 07-Chapter7-144} ccl4_genetox_details[assayCategory == 'in vitro', .N, by = .(assayType)] ccl4_genetox_details[assayCategory == 'ND', .N, by = .(assayType)] ccl4_genetox_details[assayCategory == 'in vivo', .N, by = .(assayType)] @@ -1814,14 +1814,14 @@ ccl4_genetox_details[assayCategory == 'in vivo', .N, by = .(assayType)] Now we repeat this for NATADB. 
-```{r} +```{r 07-Chapter7-145} natadb_genetox_details[, .N, by = .(assayCategory, assayType, assayResult)] natadb_genetox_details[, .N, by = .(assayCategory)] ``` Examine the number of rows for each `assayType` value by each `assaycategory` value. -```{r, R.options=list(width=150) } +```{r 07-Chapter7-146, R.options=list(width=150) } natadb_genetox_details[assayCategory == 'in vitro', .N, by = .(assayType)] natadb_genetox_details[assayCategory == 'ND', .N, by = .(assayType)] natadb_genetox_details[assayCategory == 'in vivo', .N, by = .(assayType)] @@ -1839,7 +1839,7 @@ natadb_genetox_details[assayCategory == 'in vivo', .N, by = .(assayType)] Next, we dig into the results of the assays. One may be interested in looking at the number of chemicals for which an assay resulted in a positive or negative result for instance. We group by `assayResult` and determine the number of unique `dtxsid` values associated with each `assayResult` value. -```{r} +```{r 07-Chapter7-147} ccl4_genetox_details[, .(DTXSIDs = length(unique(dtxsid))), by = .(assayResult)] natadb_genetox_details[, .(DTXSIDs = length(unique(dtxsid))), by = .(assayResult)] @@ -1857,7 +1857,7 @@ natadb_genetox_details[, .(DTXSIDs = length(unique(dtxsid))), We now determine the chemicals from each data set that are known to have genotoxic effects. For this, we look to see which chemicals produce at least one positive response in the `assayResult` column. -```{r} +```{r 07-Chapter7-148} ccl4_genetox_details[, .(is_positive = any(assayResult == 'positive')), by = .(dtxsid)][is_positive == TRUE, dtxsid] natadb_genetox_details[, .(is_positive = any(assayResult == 'positive')), @@ -1866,7 +1866,7 @@ natadb_genetox_details[, .(is_positive = any(assayResult == 'positive')), With so much genotoxicity data, let us explore this data for one chemical more deeply to get a sense of the assays and results present for it. We will explore the chemical with DTXSID0020153. 
We will look at the assays, the number of each type of result, and which correspond to "positive" results. To determine this, we group by `assayResult` and calculate `.N` for each group. We also isolate which were positive and output a data.table with the number of each type. -```{r} +```{r 07-Chapter7-149} ccl4_genetox_details[dtxsid == 'DTXSID0020153', .(Number = .N), by = .(assayResult)] ccl4_genetox_details[dtxsid == 'DTXSID0020153' & assayResult == 'positive', @@ -1889,20 +1889,20 @@ ccl4_genetox_details[dtxsid == 'DTXSID0020153' & assayResult == 'positive', Finally, we examine the hazard data associated with the chemicals in each data set. For each chemical, there will be potentially hundreds of rows of hazard data, so the returned results will be much larger than in most other API endpoints. -```{r} +```{r 07-Chapter7-150} ccl4_hazard <- get_hazard_by_dtxsid_batch(DTXSID = ccl4$dtxsid) natadb_hazard <- get_hazard_by_dtxsid_batch(DTXSID = natadb$dtxsid) ``` We do some preliminary exploration of the data. First we determine the dimensions of the data sets. -```{r} +```{r 07-Chapter7-151} dim(ccl4_hazard) dim(natadb_hazard) ``` Next we record the column names and display the first six results in the CCL4 hazard data. -```{r} +```{r 07-Chapter7-152} colnames(ccl4_hazard) head(ccl4_hazard) ``` @@ -1911,26 +1911,26 @@ We determine the number of unique values in the `criticalEffect`, `toxvalTypeSup The number of unique values for `criticalEffect`. -```{r} +```{r 07-Chapter7-153} length(ccl4_hazard[, unique(criticalEffect)]) length(natadb_hazard[, unique(criticalEffect)]) ``` The number of unique values of `toxvalTypeSuperCategory`. -```{r} +```{r 07-Chapter7-154} length(ccl4_hazard[, unique(toxvalTypeSuperCategory)]) length(natadb_hazard[, unique(toxvalTypeSuperCategory)]) ``` The number of unique values for `toxvalType`. 
-```{r} +```{r 07-Chapter7-155} length(ccl4_hazard[, unique(toxvalType)]) length(natadb_hazard[, unique(toxvalType)]) ``` Now we look at the number of entries per `toxvalTypeSuperCategory`. -```{r} +```{r 07-Chapter7-156} ccl4_hazard[, .N, by = .(toxvalTypeSuperCategory)] natadb_hazard[, .N, by = .(toxvalTypeSuperCategory)] @@ -1938,7 +1938,7 @@ natadb_hazard[, .N, by = .(toxvalTypeSuperCategory)] With over 7,000 results for the `toxvalTypeSuperCategory` value "Dose Response Summary Value" for each data set, we dig into this further. We determine the number of rows grouped by `toxvalType` that have the "Dose Response Summary Value" `toxvalTypeSuperCategory` value, and display this descending. -```{r} +```{r 07-Chapter7-157} ccl4_hazard[toxvalTypeSuperCategory %in% 'Dose Response Summary Value', .N, by = .(toxvalType)][order(-N),] natadb_hazard[toxvalTypeSuperCategory %in% 'Dose Response Summary Value', .N, @@ -1949,7 +1949,7 @@ We explore "NOAEL", "LOAEL", and "NOEL" further. Let us look at the the case whe First, we look at "food". We order by `toxvalType` and by the minimum `toxvalNumeric` value in each group, descending. -```{r} +```{r 07-Chapter7-158} ccl4_hazard[media %in% 'food' & toxvalType %in% c('LOAEL', 'NOAEL', 'NOEL'), .(toxvalNumeric = min(toxvalNumeric)), by = .(toxvalType, toxvalUnits, dtxsid)][order(toxvalType, @@ -1962,7 +1962,7 @@ natadb_hazard[media %in% 'food' & toxvalType %in% c('LOAEL', 'NOAEL', 'NOEL'), Next we look at "culture", repeating the same grouping and ordering as in the previous case. -```{r} +```{r 07-Chapter7-159} ccl4_hazard[media %in% 'culture' & toxvalType %in% c('LOAEL', 'NOAEL', 'NOEL'), .(toxvalNumeric = min(toxvalNumeric)), by = .(toxvalType, toxvalUnits, dtxsid)][order(toxvalType, @@ -1977,7 +1977,7 @@ Now, let us restrict our attention to human hazard and focus on the exposure rou First, let us determine the exposure routes in general. 
-```{r} +```{r 07-Chapter7-160} ccl4_hazard[humanEco %in% 'human health', unique(exposureRoute)] natadb_hazard[humanEco %in% 'human health', unique(exposureRoute)] ``` @@ -1988,7 +1988,7 @@ Then, let's focus on the inhalation and oral exposure routes for human hazard. To answer this, filter the data into the corresponding exposure routes, then group by `exposureRoute` and `riskAssessmentClass`, and finally count the number of instances for each grouping. To determine the most represented class, one can order the results descending. -```{r} +```{r 07-Chapter7-161} ccl4_hazard[humanEco %in% 'human health' & exposureRoute %in% c('inhalation', 'oral'), .(Hits = .N), by = .(exposureRoute, riskAssessmentClass)][order(exposureRoute, @@ -2018,7 +2018,7 @@ To answer this, we filter the rows to the "human health" `humanEco` value and "i First we look at CCL4. -```{r} +```{r 07-Chapter7-162} ccl4_hazard[humanEco %in% 'human health' & exposureRoute %in% c('inhalation'), unique(toxvalType)] ccl4_hazard[humanEco %in% 'human health' & @@ -2028,7 +2028,7 @@ intersect(ccl4_hazard[humanEco %in% 'human health' & exposureRoute %in% 'inhalat Then we look at NATADB. -```{r} +```{r 07-Chapter7-163} natadb_hazard[humanEco %in% 'human health' & exposureRoute %in% c('inhalation'), unique(toxvalType)] natadb_hazard[humanEco %in% 'human health' & @@ -2049,7 +2049,7 @@ intersect(natadb_hazard[humanEco %in% 'human health' & exposureRoute %in% 'inhal For the next data exploration, we will examine the "NOAEL" and "LOAEL" values for chemicals with oral exposure and human hazard. We also examine the units to determine whether any unit conversions are necessary to compare numeric values. 
-```{r} +```{r 07-Chapter7-164} ccl4_hazard[humanEco %in% 'human health' & exposureRoute %in% 'oral' & toxvalType %in% c('NOAEL', 'LOAEL'), ] ccl4_hazard[humanEco %in% 'human health' & exposureRoute %in% 'oral' & @@ -2062,7 +2062,7 @@ natadb_hazard[humanEco %in% 'human health' & exposureRoute %in% 'oral' & Observe that for both CCL4 and NATADB, the units are given by "mg/kg-day", "ppm", "mg/L" and additionally "-" for NATADB. In this case, we treat "mg/kg-day" and "ppm" the same and exclude "-" and "mg/L". We group by DTXSID to find the lowest or highest value. -```{r} +```{r 07-Chapter7-165} ccl4_hazard[humanEco %in% 'human health' & exposureRoute %in% 'oral' & toxvalType %in% c('NOAEL', 'LOAEL') & !(toxvalUnits %in% c('-', 'mg/L')), .(numeric_value = min(toxvalNumeric), @@ -2077,7 +2077,7 @@ natadb_hazard[humanEco %in% 'human health' & exposureRoute %in% 'oral' & Now, we also explore the values of "RfD", "RfC", and "cancer slope factor" of the `toxvalType` rows. We first determine the set of units for each, make appropriate conversions if necessary, and then make comparisons. -```{r} +```{r 07-Chapter7-166} ccl4_hazard[humanEco %in% 'human health' & toxvalType %in% c('cancer slope factor', 'RfD', 'RfC'), .N, by = .(toxvalType, toxvalUnits)][order(toxvalType, -N)] @@ -2089,7 +2089,7 @@ For CCL4 and NATADB, there is a single unit type for each `toxvalType` value, so First, we filter and separate out the relevant data subsets. -```{r} +```{r 07-Chapter7-167} # Separate out into relevant data subsets ccl4_csf <- ccl4_hazard[humanEco %in% 'human health' & toxvalType %in% c('cancer slope factor') & (toxvalUnits != 'mg/kg-day'), ] @@ -2101,7 +2101,7 @@ ccl4_rfd <- ccl4_hazard[humanEco %in% 'human health' & While there are no unit conversions needed, we demonstrate how we would convert units if they were required. 
-```{r} +```{r 07-Chapter7-168} # Set mass by volume units to mg/m3, so scale g/m3 by 1E3 and ug/m3 by 1E-3 ccl4_rfc[toxvalUnits == 'mg/m3', conversion := 1] ccl4_rfc[toxvalUnits == 'g/m3', conversion := 1E3] @@ -2114,7 +2114,7 @@ ccl4_rfd[toxvalUnits %in% c('mg/kg-day', 'mg/kg'), units := 'mg/kg'] Then aggregate the data. -```{r} +```{r 07-Chapter7-169} # Run data aggregations grouping by dtxsid and taking either the max or the min # depending on the toxvalType we are considering. ccl4_csf[,.(numeric_value = max(toxvalNumeric), @@ -2130,7 +2130,7 @@ ccl4_rfd[,.(numeric_value = min(toxvalNumeric*conversion), Repeat the process for NATADB, first separating out the relevant subsets of the data. -```{r} +```{r 07-Chapter7-170} # Separate out into relevant data subsets natadb_csf <- natadb_hazard[humanEco %in% 'human health' & toxvalType %in% c('cancer slope factor') & (toxvalUnits != 'mg/kg-day'), ] @@ -2142,7 +2142,7 @@ natadb_rfd <- natadb_hazard[humanEco %in% 'human health' & Now handle the unit conversions. -```{r} +```{r 07-Chapter7-171} # Set mass by mass units to mg/kg. Note that ppm is already in mg/kg natadb_rfc <- natadb_rfc[toxvalUnits != 'ppm',] natadb_rfd[, units := 'mg/kg-day'] @@ -2150,7 +2150,7 @@ natadb_rfd[, units := 'mg/kg-day'] Finally, aggregate the data. -```{r} +```{r 07-Chapter7-172} # Run data aggregations grouping by dtxsid and taking either the max or the min # depending on the toxvalType we are considering. natadb_csf[, .(numeric_value = max(toxvalNumeric), diff --git a/DESCRIPTION b/DESCRIPTION index 3ff119c..f9a9683 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: tame2book Title: TAME 2.0 Bookdown Project Version: 0.0.1 Authors@R: - person("Rager", "Lab", role = c("aut", "cre"), email = "you@example.com") + person("Rager", "Lab", role = c("aut", "cre"), email = "jrager@unc.edu") Description: Bookdown project for the TAME 2.0 toolkit. 
License: MIT Encoding: UTF-8 @@ -45,10 +45,10 @@ Imports: themis, e1071, ggsci, + ggrepel, factoextra, dendextend, FactoMineR, - ggrepel, vegan, patchwork, purrr, diff --git a/index.Rmd b/index.Rmd index 5c7ddc3..ecc96f9 100644 --- a/index.Rmd +++ b/index.Rmd @@ -14,7 +14,7 @@ github-repo: rstudio/bookdown-demo favicon: images/icons/Favicon.png --- -```{r , include=FALSE} +```{r index-1, include=FALSE} source("R/_common.R") ``` @@ -83,7 +83,7 @@ This study was supported by the National Institutes of Health (NIH) from the Nat **P42ES031007**: The [University of North Carolina (UNC)-Superfund Research Program](https://sph.unc.edu/superfund-pages/srp/) (SRP) seeks to develop new solutions for reducing exposure to inorganic arsenic and prevent arsenic-induced diabetes through mechanistic and translational research. The [UNC-SRP Data Analysis and Management Core (UNC-SRP-DMAC)](https://sph.unc.edu/superfund-pages/dmac/) provides the UNC-SRP with critical expertise in bioinformatics, statistics, data management, and data integration. -```{r, echo=FALSE, out.width="40%", fig.align='center'} +```{r index-2, echo=FALSE, out.width="40%", fig.align='center'} knitr::include_graphics("images/index_images/Module0_Image2.png") ```
@@ -91,7 +91,7 @@ knitr::include_graphics("images/index_images/Module0_Image2.png")
**T32ES007126**: The [UNC Curriculum in Toxicology and Environmental Medicine (CiTEM)](https://www.med.unc.edu/toxicology/) seeks to provide a cutting edge research and mentoring environment to train students and postdoctoral fellows in environmental health and toxicology. Towards this goal, the CiTEM has a T32 Training Program for Pre- and Postdoctoral Training in Toxicology to support the development of future investigators in environmental health and toxicology. -```{r, echo=FALSE, out.width="15%",fig.align='center'} +```{r index-3, echo=FALSE, out.width="15%",fig.align='center'} knitr::include_graphics("images/index_images/Module0_Image3.png") ```
@@ -99,7 +99,7 @@ knitr::include_graphics("images/index_images/Module0_Image3.png")
Support was additionally provided through the [Institute for Environmental Health Solutions (IEHS)](https://sph.unc.edu/iehs/institute-for-environmental-health-solutions/) at the University of North Carolina (UNC) Gillings School of Global Public Health. The IEHS is aimed at protecting those who are particularly vulnerable to diseases caused by environmental factors, putting solutions directly into the hands of individuals and communities of North Carolina and beyond. -```{r, echo=FALSE, out.width="60%", fig.align='center'} +```{r index-4, echo=FALSE, out.width="60%", fig.align='center'} knitr::include_graphics("images/index_images/Module0_Image4.png") ``` -
\ No newline at end of file +