From 40cf2d2b2e9f8212449b060b0e9783768e96306d Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Sun, 16 Mar 2025 16:48:39 +0200 Subject: [PATCH 1/8] up --- inst/assets/bibliography.bib | 17 ++++++- inst/pages/beta_diversity.qmd | 1 - inst/pages/time_series.qmd | 85 +++++++++++++++++++++++++++++++++++ inst/pages/transformation.qmd | 34 +++++++++++++- 4 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 inst/pages/time_series.qmd diff --git a/inst/assets/bibliography.bib b/inst/assets/bibliography.bib index e99b8600a..8db470dfa 100644 --- a/inst/assets/bibliography.bib +++ b/inst/assets/bibliography.bib @@ -2447,7 +2447,7 @@ @article{Marchesi2015 publisher = {Springer Science and Business Media LLC}, author = {Marchesi, Julian R. and Ravel, Jacques}, year = {2015}, - month = jul + month = jul } @article{Nickols2024, @@ -2459,3 +2459,18 @@ @article{Nickols2024 year = {2024}, month = dec } + +@article{Schmidt2018, + title = {The Human Gut Microbiome: From Association to Modulation}, + volume = {172}, + ISSN = {0092-8674}, + url = {http://dx.doi.org/10.1016/j.cell.2018.02.044}, + DOI = {10.1016/j.cell.2018.02.044}, + number = {6}, + journal = {Cell}, + publisher = {Elsevier BV}, + author = {Schmidt, Thomas S.B. and Raes, Jeroen and Bork, Peer}, + year = {2018}, + month = mar, + pages = {1198–1215} +} diff --git a/inst/pages/beta_diversity.qmd b/inst/pages/beta_diversity.qmd index 5332066a7..435dc8b00 100644 --- a/inst/pages/beta_diversity.qmd +++ b/inst/pages/beta_diversity.qmd @@ -479,7 +479,6 @@ tse_sub <- runMDS( Then let's do the same with rarefaction: - ```{r} #| label = "MDS plot based on the Bray-Curtis distances using rarefaction." diff --git a/inst/pages/time_series.qmd b/inst/pages/time_series.qmd new file mode 100644 index 000000000..af8e8010a --- /dev/null +++ b/inst/pages/time_series.qmd @@ -0,0 +1,85 @@ +# Time series {#sec-time-series} + +```{r setup, echo = FALSE, results = "asis"} +library(rebook) +chapterPreamble() +``` + +```{r include = FALSE} +# global knitr options +knitr::opts_chunk$set( + message = FALSE, + fig.width = 10, + dpi = 300, + dev = "png", + dev.args = list(type = "cairo-png") +) +``` + +Microbial communities are dynamicm, constantly evolving over time. Despite +temporal fluctuations driven by environmental and other factors, the +microbiome retains its core characteristics [@Schmidt2018]. + +Compared to cross-sectional studies, a longitudinal study design allows +researchers to control time-related factors, enabling a more focused +investigation of specific research questions. However, longitudinal studies are +usually more complex than cross-sectional. Analyzing such data includes specific +aspects that need to be taken into account such as the dependence between +samples collected at multiple time points. Since these samples come from the +same set of subjects, they are paired, allowing each patient to serve as their +own control. + +Most of the methods introduced earlier in this book can be applied to temporal +studies with appropriate modifications and considerations. + +In this chapter, we use an example dataset from probiotic/placebo intervention +study. + +```{r} +#| label: load_data + +library(microbiomeDataSets) + +mae <- microbiomeDataSets::peerj32() +tse <- mae[[1]] +``` + +## Alpha diversity + +[@sec-alpha-diversity] introduced methods and main idea of alpha diversity. + +MIten alfa-diversiteetti eroaa? Viiva potilainen näytteidenn välillä. +Väritä, onko punaine tai sininen, muutoksen suunta. + +Tee nopea testi, jolla testataan, onko paired samples wilcoxon test, merkitsevä ero + + +## Time divergence + +Analysoi näytteiden välinen ero, miten visualisoida? + +Box plot, divergence y-askelilla, x -askelilla ryhmä + +## Ordination + +Viiva pootlaiden välillä, nuoli + +In datasets including time points, researcher might be interested on how similar +samples are within a subject or patient and how the microbial communities +evolve. This can be done for instance by connecting points of each subject. + +```{r} +library(miaTime) +library(scater) +data("temporalMicrobiome20") +tse <- temporalMicrobiome20 + +tse <- transformAssay(tse, method = "clr", pseudocount = 1) + +tse <- runPCA(tse, assay.type = "clr") + +p <- plotReducedDim(tse, "PCA") +``` + +Lisää tehtäviä. Lisää esimerkkivastauksiin chunk + diff --git a/inst/pages/transformation.qmd b/inst/pages/transformation.qmd index ff4144bb7..1643425e2 100644 --- a/inst/pages/transformation.qmd +++ b/inst/pages/transformation.qmd @@ -120,10 +120,12 @@ multiple pseudocount values. See [@sec-differential-abundance]. ## Transformations in practice ```{r} +#| label: transf1 + # Load example data library(mia) -data("GlobalPatterns", package = "mia") -tse <- GlobalPatterns +data("Tengeler2020") +tse <- Tengeler2020 # Transform "counts" assay to relative abundances ("relabundance") tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") @@ -140,6 +142,8 @@ Get the values in the resulting assay, and view some of the first entries of it with the `head` command. ```{r} +#| label: transf2 + assay(tse, "clr") |> head() ``` @@ -151,6 +155,8 @@ performance in microbiome-based classification performance [@Giliberti2022, Karwowska2024]. ```{r} +#| label: transf2 + # Here, `assay.type` is not explicitly specified. # Then The function uses the "counts" assay for the transformation. tse <- transformAssay(tse, method = "pa") @@ -160,9 +166,33 @@ assay(tse, "pa") |> head() You can now view the entire list of abundance assays in your data object with: ```{r} +#| label: transf3 + assays(tse) ``` +To incorporate phylogenetic information, one can apply the phylogenetic +isometric log-ratio (PhILR) transformation [@Silverman2017]. Unlike standard +transformations, PhILR accounts for the genetic relationships between taxa. +This is important because closely related species often share similar +properties, which traditional transformations fail to capture. + +```{r} +#| label: transf4 + +tse <- transformAssay(tse, method = "philr", MARGIN = 1L, pseudocount = TRUE) +``` + +Unlike other transformations, PhILR outputs a table where rows represent nodes +of phylogeny. These new features do not match with features of `TreeSE` which +is why this new dataset is stored into `altExp`. + +```{r} +#| label: transf5 + +altExp(tse, "philr") +``` + ::: {.callout-tip} ## Summary From 936fb0e10b3bef6f23d963d3f719636703cb4318 Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Mon, 17 Mar 2025 21:03:06 +0200 Subject: [PATCH 2/8] up --- inst/pages/time_series.qmd | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/inst/pages/time_series.qmd b/inst/pages/time_series.qmd index af8e8010a..9ec5dcdc9 100644 --- a/inst/pages/time_series.qmd +++ b/inst/pages/time_series.qmd @@ -41,18 +41,41 @@ study. library(microbiomeDataSets) mae <- microbiomeDataSets::peerj32() -tse <- mae[[1]] +tse <- getWithColData(mae, 1L) +tse[["time_point"]] <- tse[["time"]] |> as.factor() ``` ## Alpha diversity [@sec-alpha-diversity] introduced methods and main idea of alpha diversity. +Here we calculate MIten alfa-diversiteetti eroaa? Viiva potilainen näytteidenn välillä. Väritä, onko punaine tai sininen, muutoksen suunta. Tee nopea testi, jolla testataan, onko paired samples wilcoxon test, merkitsevä ero +```{r} +#| label: alpha1 + +library(mia) + +tse <- addAlpha(tse, index = "shannon") +``` + +```{r} +library(dplyr) +df <- colData(tse) +df <- df |> as.data.frame() |> arrange(time) |> group_by(subject) |> mutate(diff = diff(shannon)) |> ungroup() |> DataFrame() + + +library(ggplot2) +p <- ggplot(df, aes(x = time_point, y = shannon, group = interaction(time_point, group))) + + geom_boxplot(outlier.shape = NA) + + geom_point() +p +``` + ## Time divergence From 511f49c03a2900924b8f667bd36a6294c6ed5a0b Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Fri, 21 Mar 2025 13:15:20 +0200 Subject: [PATCH 3/8] up --- inst/pages/time_series.qmd | 109 +++++++++++++++++++++++++++++++++---- 1 file changed, 97 insertions(+), 12 deletions(-) diff --git a/inst/pages/time_series.qmd b/inst/pages/time_series.qmd index 9ec5dcdc9..ed297c22a 100644 --- a/inst/pages/time_series.qmd +++ b/inst/pages/time_series.qmd @@ -16,7 +16,7 @@ knitr::opts_chunk$set( ) ``` -Microbial communities are dynamicm, constantly evolving over time. Despite +Microbial communities are dynamic, constantly evolving over time. Despite temporal fluctuations driven by environmental and other factors, the microbiome retains its core characteristics [@Schmidt2018]. @@ -48,12 +48,7 @@ tse[["time_point"]] <- tse[["time"]] |> as.factor() ## Alpha diversity [@sec-alpha-diversity] introduced methods and main idea of alpha diversity. -Here we calculate - -MIten alfa-diversiteetti eroaa? Viiva potilainen näytteidenn välillä. -Väritä, onko punaine tai sininen, muutoksen suunta. - -Tee nopea testi, jolla testataan, onko paired samples wilcoxon test, merkitsevä ero +Here we calculate Shannon diversity index. ```{r} #| label: alpha1 @@ -63,19 +58,109 @@ library(mia) tse <- addAlpha(tse, index = "shannon") ``` +In addition to visualizing the diversity in respect to sample groups, we want +to visualize how much the diversity changes between time points. That is why +we calculate the difference for each subject. + ```{r} +#| label: alpha2 + library(dplyr) -df <- colData(tse) -df <- df |> as.data.frame() |> arrange(time) |> group_by(subject) |> mutate(diff = diff(shannon)) |> ungroup() |> DataFrame() +df <- colData(tse) |> + as.data.frame() |> + # Sort based on time + arrange(time) |> + # Group based on subject so that we calculate the difference between each + # subject's samples + group_by(subject) |> + # Calculate difference + mutate(diff = diff(shannon)) |> + ungroup() |> + DataFrame() + +# Sort to original order and add difference info to colData +df <- df[match(tse[["sample"]], df[["sample"]]), , drop = FALSE] +rownames(df) <- colnames(tse) +colData(tse) <- df + +colData(tse) |> head() +``` + +Now we can continue to visualize the alpha diversities. First we create a simple +boxplot showing Shannon index in respect to groups and time. + +```{r} +#| label: alpha3 library(ggplot2) -p <- ggplot(df, aes(x = time_point, y = shannon, group = interaction(time_point, group))) + - geom_boxplot(outlier.shape = NA) + - geom_point() + +# Get colData +df <- colData(tse) |> + as.data.frame() + +# Create a second df which is sorted +df2 <- df |> + arrange(subject, time_point, group) + +# Create a jitter +jitter_position <- position_jitterdodge( + dodge.width = 0.75, + jitter.width = 0, + jitter.height = 0 + ) + +# Create a plot +p <- ggplot(mapping = aes(x = time_point, y = shannon, fill = group)) + + # We disable outlier plotting as we plot points with jitter. Otherwise + # outliers would be duplicated + geom_boxplot(data = df, outlier.shape = NA) + + # Add jitter plot + geom_point(data = df2, aes( + group = interaction(subject, group)), + position = jitter_position) + p ``` +Additionally, we might be interested on linking the sample-pairs to visualize +their difference. + +```{r} +#| label: alpha4 + +# Add line linking subjects +p_path <- p + + geom_line( + data = df2, + aes(group = interaction(subject, group)), + position = jitter_position + ) +p_path +``` + +```{r} +# Add line linking subjects +p_path <- p + + geom_line( + data = df2, + arrow = arrow(length = unit(2, "mm")), + aes(group = interaction(subject, group), color = diff), + position = jitter_position + ) + + scale_color_gradient2( + low="blue", mid="white", high="red", + limits = c(-max(abs(df[["diff"]])), max(abs(df[["diff"]])))) + +p_path +``` + + +MIten alfa-diversiteetti eroaa? Viiva potilainen näytteidenn välillä. +Väritä, onko punaine tai sininen, muutoksen suunta. + +Tee nopea testi, jolla testataan, onko paired samples wilcoxon test, merkitsevä ero + ## Time divergence From 9c13b5cc1e89908f50c2ff6ca05ab7370f9def24 Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Sun, 23 Mar 2025 23:08:48 +0200 Subject: [PATCH 4/8] up --- inst/pages/beta_diversity.qmd | 2 +- inst/pages/time_series.qmd | 205 +++++++++++++++++++++++++++++----- 2 files changed, 178 insertions(+), 29 deletions(-) diff --git a/inst/pages/beta_diversity.qmd b/inst/pages/beta_diversity.qmd index 435dc8b00..e57a1c2aa 100644 --- a/inst/pages/beta_diversity.qmd +++ b/inst/pages/beta_diversity.qmd @@ -177,7 +177,7 @@ between two points is always the shortest possible distance. ::: -## Divergence +## Divergence {#sec-divergence} Divergence measure refers to a difference in community composition between the given sample(s) and a reference sample. This can be diff --git a/inst/pages/time_series.qmd b/inst/pages/time_series.qmd index ed297c22a..084904c70 100644 --- a/inst/pages/time_series.qmd +++ b/inst/pages/time_series.qmd @@ -16,8 +16,8 @@ knitr::opts_chunk$set( ) ``` -Microbial communities are dynamic, constantly evolving over time. Despite -temporal fluctuations driven by environmental and other factors, the +Microbial communities are dynamic, constantly evolving over time. However, +despite temporal fluctuations driven by environmental and other factors, the microbiome retains its core characteristics [@Schmidt2018]. Compared to cross-sectional studies, a longitudinal study design allows @@ -33,14 +33,15 @@ Most of the methods introduced earlier in this book can be applied to temporal studies with appropriate modifications and considerations. In this chapter, we use an example dataset from probiotic/placebo intervention -study. +study. The dataset includes gut microbiota samples from 22 subjects and 2 time +points. The first time point was sampled before introducing a treatment. ```{r} #| label: load_data library(microbiomeDataSets) -mae <- microbiomeDataSets::peerj32() +mae <- peerj32() tse <- getWithColData(mae, 1L) tse[["time_point"]] <- tse[["time"]] |> as.factor() ``` @@ -59,8 +60,8 @@ tse <- addAlpha(tse, index = "shannon") ``` In addition to visualizing the diversity in respect to sample groups, we want -to visualize how much the diversity changes between time points. That is why -we calculate the difference for each subject. +to analyse how much the diversity changes between time points. That is why +we calculate the difference between time points for each subject. ```{r} #| label: alpha2 @@ -123,28 +124,35 @@ p <- ggplot(mapping = aes(x = time_point, y = shannon, fill = group)) + p ``` -Additionally, we might be interested on linking the sample-pairs to visualize -their difference. +The boxplot above shows alpha diversity in each group and time point. In the +dataset, each subject was sampled in each time point, however, from the plot +we cannot see which samples correspond to which subject. That is why we +might be interested on linking the sample-pairs to highlight +the subject-specific deviation between time points. ```{r} #| label: alpha4 -# Add line linking subjects +# Add line to link subjects p_path <- p + geom_line( data = df2, aes(group = interaction(subject, group)), - position = jitter_position + position = jitter_position, + alpha = 0.3 ) p_path ``` +We can further refine the plot by adding the magnitude of change with a color. + ```{r} +#| label: alpha5 + # Add line linking subjects p_path <- p + geom_line( data = df2, - arrow = arrow(length = unit(2, "mm")), aes(group = interaction(subject, group), color = diff), position = jitter_position ) + @@ -155,39 +163,180 @@ p_path <- p + p_path ``` +The visualization does not show clear effect of treatment. We can still test +if we can get support on these null findings with statistical tests. The +simplest test might be the Wilcoxon test that we have to apply for each +treatment group separately. Moreover, we have to apply the paired version of +test as the samples are from same subjects. -MIten alfa-diversiteetti eroaa? Viiva potilainen näytteidenn välillä. -Väritä, onko punaine tai sininen, muutoksen suunta. +We first test if placebo treatment had effect on alpha diversity. -Tee nopea testi, jolla testataan, onko paired samples wilcoxon test, merkitsevä ero +```{r} +#| label: alpha6 +# Get data on placebo samples +df <- tse[, tse[["group"]] == "Placebo"] |> colData() +# Get first and second time points separately +before <- df[df[["time_point"]] == 1, "shannon"] +after <- df[df[["time_point"]] == 2, "shannon"] -## Time divergence +# Apply paired Wilcoxon test +wilcox.test(before, after, paired = TRUE) +``` -Analysoi näytteiden välinen ero, miten visualisoida? +Based on results, alpha diversity in second time point equals to first time +point for the placebo group, which is expected result. We can then do the same +test for group that got probiotics. -Box plot, divergence y-askelilla, x -askelilla ryhmä +```{r} +#| label: alpha7 -## Ordination +# Get data on placebo samples +df <- tse[, tse[["group"]] == "LGG"] |> colData() +# Get first and second time points separately +before <- df[df[["time_point"]] == 1, "shannon"] +after <- df[df[["time_point"]] == 2, "shannon"] + +# Apply paired Wilcoxon test +wilcox.test(before, after, paired = TRUE) +``` + +Wilcoxon test confirms the results also for probiotics group: probiotic +treatment do not increase or decrease diversity of microbioal community. + +More elegant test would be model-based approach. Below, we fit a simple linear +mixed effects model to examine the impact of time and treatment group on +Shannon diversity. In this model, both time and group are treated as fixed +effects, meaning that we assume their influence on Shannon diversity is +consistent across all subjects. Additionally, we include subject as a random +effect to account for the repeated measurements from the same individuals. + +If you are not familiar with R formulas, here’s an explanation of how to +interpret the components: + +- The left-hand side of the formula represents the predicted variable +(in this case, shannon diversity). + +- The `+` sign in the formula means that we are adding the terms on the +right-hand side of the formula together (i.e., combining them as separate +effects). + +- The `*` sign indicates an interaction between the terms on either side. This +means that we are not only testing the individual effects of time and group, but +also how these two factors interact with each other to influence Shannon +diversity. Specifically, it allows us to assess how the effect of treatment +group differs between time point 1 and time point 2. + +```{r} +#| label: alpha8 + +library(lme4) +library(lmerTest) + +# Get sample metadata +df <- colData(tse) |> + as.data.frame() + +# Fit the model +model <- lmer(shannon ~ time * group + (1 | subject), data = df) + +# Display summary with p-values +summary(model) +``` -Viiva pootlaiden välillä, nuoli +The results indicate that none of the variables in the model show statistically +significant effects. Specifically, "time" represents the overall effect of time +on Shannon diversity, while "groupPlacebo" represents the overall difference in +Shannon diversity between the Placebo group and the LGG group, averaged across +both time points. -In datasets including time points, researcher might be interested on how similar -samples are within a subject or patient and how the microbial communities -evolve. This can be done for instance by connecting points of each subject. +The most interesting variable in the results is the "time:groupPlacebo" +interaction term. This term investigates whether the effect of time on Shannon +diversity differs between the Placebo group and the LGG group when comparing the +second time point to the first one. + +A significant interaction would indicate that the change in diversity over time +is different between the two groups. However, in this case, the interaction is +not statistically significant, suggesting that the treatment effect did not +differ significantly between the Placebo group and the LGG group between the two +time points. + +## Time divergence + +[@sec-divergence] introduced divergence which means that we calculate +dissimilarity between samples. We can do the same and calculate dissimilarity +between time points for each subject. Below we calculate Bray-Curtis +dissimilarity between consecutive time points. ```{r} +#| label: divergence1 + library(miaTime) + +tse <- transformAssay(tse, method = "relabundance") +tse <- addStepwiseDivergence( + tse, + time.col = "time", + group = "subject", + assay.type = "relabundance", + method = "bray" +) +``` + +Then we can visualize the results with a box plot. + +```{r} +#| label: divergence2 + library(scater) -data("temporalMicrobiome20") -tse <- temporalMicrobiome20 -tse <- transformAssay(tse, method = "clr", pseudocount = 1) +plotColData(tse, x = "group", y = "divergence", show_boxplot = TRUE) +``` + +Figure above summarizes the divergence. Each point represent a subject: +dissimilarity between samples from the two time points. Divergence between +consecutive time points seems to be equal between treatment groups. + +## Ordination + +Ordination plots, e.g., Principal Coordinate Analysis (PCoA) plot, is a common +visualization technique to find patterns from the data. Below, we perform PCoA +and visualize the results as showed in [@#sec-community-similarity]. -tse <- runPCA(tse, assay.type = "clr") +Note that it is important to sort the data based on time so that the arrows that +we add in the next step have correct direction. -p <- plotReducedDim(tse, "PCA") +```{r} +#| label: pcoa + +# Sort data based on time +tse <- tse[, order(tse[["time"]])] + +# Perform PCoA +tse <- addMDS(tse, assay.type = "relabundance", method = "bray") +# Create ordination plot +p <- plotReducedDim( + tse, dimred = "MDS", colour_by = "group", + # Add other fields that are used later to add paths + other_fields = c("subject", "time") +) +p ``` -Lisää tehtäviä. Lisää esimerkkivastauksiin chunk +After initializing a scatter plot, we can add arrows that show how the microbial +communities evolved over time. + +```{r} +#| label: pcoa2 + +# Add arrows to connect samples from subjects +p <- p + geom_path( + mapping = aes(group = subject, ), + arrow = arrow(length = unit(0.2, "cm")), + alpha = 0.5 +) +p +``` +From the PCoA plot, we cannot observe clear patterns. As a final conclusion, it +seems that probiotics did not have effect on microbiome compared to placebo. From f16eb41cbf3cb13d5a33b88429569f2bc9cdeac2 Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Sun, 23 Mar 2025 23:11:43 +0200 Subject: [PATCH 5/8] up --- DESCRIPTION | 4 +++- inst/assets/_book.yml | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c1b636f03..d9d270061 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: OMA Title: Orchestrating Microbiome Analysis with Bioconductor -Version: 0.98.37 +Version: 0.98.38 Date: 2025-03-13 Authors@R: c( @@ -63,6 +63,8 @@ Suggests: igraph, IntegratedLearner, knitr, + lme4, + lmerTest, Maaslin2, maaslin3, mediation, diff --git a/inst/assets/_book.yml b/inst/assets/_book.yml index 5237f97ae..cb61a1980 100644 --- a/inst/assets/_book.yml +++ b/inst/assets/_book.yml @@ -60,9 +60,12 @@ book: - pages/cross_correlation.qmd - pages/multiassay_ordination.qmd - pages/integrated_learner.qmd - - part: "Machine learning & statistical modeling" + - part: "Machine learning" chapters: - pages/machine_learning.qmd + - part: "Time series" + chapters: + - pages/time_series.qmd - part: "Workflows" chapters: - pages/introductory_workflow.qmd From 2c3f0b581512dbecb009775301b1f721d20f541b Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Sun, 23 Mar 2025 23:23:22 +0200 Subject: [PATCH 6/8] up --- inst/pages/transformation.qmd | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/inst/pages/transformation.qmd b/inst/pages/transformation.qmd index 1643425e2..4bca481af 100644 --- a/inst/pages/transformation.qmd +++ b/inst/pages/transformation.qmd @@ -171,22 +171,6 @@ You can now view the entire list of abundance assays in your data object with: assays(tse) ``` -To incorporate phylogenetic information, one can apply the phylogenetic -isometric log-ratio (PhILR) transformation [@Silverman2017]. Unlike standard -transformations, PhILR accounts for the genetic relationships between taxa. -This is important because closely related species often share similar -properties, which traditional transformations fail to capture. - -```{r} -#| label: transf4 - -tse <- transformAssay(tse, method = "philr", MARGIN = 1L, pseudocount = TRUE) -``` - -Unlike other transformations, PhILR outputs a table where rows represent nodes -of phylogeny. These new features do not match with features of `TreeSE` which -is why this new dataset is stored into `altExp`. - ```{r} #| label: transf5 From 9b13d9debd0df56449090214fc1dc3c4b4085fca Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Sun, 23 Mar 2025 23:26:28 +0200 Subject: [PATCH 7/8] up --- inst/pages/transformation.qmd | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/inst/pages/transformation.qmd b/inst/pages/transformation.qmd index 4bca481af..ff4144bb7 100644 --- a/inst/pages/transformation.qmd +++ b/inst/pages/transformation.qmd @@ -120,12 +120,10 @@ multiple pseudocount values. See [@sec-differential-abundance]. ## Transformations in practice ```{r} -#| label: transf1 - # Load example data library(mia) -data("Tengeler2020") -tse <- Tengeler2020 +data("GlobalPatterns", package = "mia") +tse <- GlobalPatterns # Transform "counts" assay to relative abundances ("relabundance") tse <- transformAssay(tse, assay.type = "counts", method = "relabundance") @@ -142,8 +140,6 @@ Get the values in the resulting assay, and view some of the first entries of it with the `head` command. ```{r} -#| label: transf2 - assay(tse, "clr") |> head() ``` @@ -155,8 +151,6 @@ performance in microbiome-based classification performance [@Giliberti2022, Karwowska2024]. ```{r} -#| label: transf2 - # Here, `assay.type` is not explicitly specified. # Then The function uses the "counts" assay for the transformation. tse <- transformAssay(tse, method = "pa") @@ -166,17 +160,9 @@ assay(tse, "pa") |> head() You can now view the entire list of abundance assays in your data object with: ```{r} -#| label: transf3 - assays(tse) ``` -```{r} -#| label: transf5 - -altExp(tse, "philr") -``` - ::: {.callout-tip} ## Summary From f94e693967e38b954c487ebb44b2b8c9df7e5b76 Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Mon, 31 Mar 2025 10:47:22 +0300 Subject: [PATCH 8/8] up --- inst/assets/_book.yml | 1 + inst/pages/time_diversity.qmd | 327 ++++++++++++++++++++++++++++++++++ inst/pages/time_series.qmd | 327 +--------------------------------- 3 files changed, 331 insertions(+), 324 deletions(-) create mode 100644 inst/pages/time_diversity.qmd diff --git a/inst/assets/_book.yml b/inst/assets/_book.yml index cb61a1980..11c4862f8 100644 --- a/inst/assets/_book.yml +++ b/inst/assets/_book.yml @@ -66,6 +66,7 @@ book: - part: "Time series" chapters: - pages/time_series.qmd + - pages/time_diversity.qmd - part: "Workflows" chapters: - pages/introductory_workflow.qmd diff --git a/inst/pages/time_diversity.qmd b/inst/pages/time_diversity.qmd new file mode 100644 index 000000000..70d556a63 --- /dev/null +++ b/inst/pages/time_diversity.qmd @@ -0,0 +1,327 @@ +# Diversity and time divergence {#sec-time-diversity} + +```{r setup, echo = FALSE, results = "asis"} +library(rebook) +chapterPreamble() +``` + +```{r include = FALSE} +# global knitr options +knitr::opts_chunk$set( + message = FALSE, + fig.width = 10, + dpi = 300, + dev = "png", + dev.args = list(type = "cairo-png") +) +``` + +In this chapter, we use an example dataset from probiotic/placebo intervention +study. The dataset includes gut microbiota samples from 22 subjects and 2 time +points. The first time point was sampled before introducing a treatment. + +```{r} +#| label: load_data + +library(microbiomeDataSets) + +mae <- peerj32() +tse <- getWithColData(mae, 1L) +tse[["time_point"]] <- tse[["time"]] |> as.factor() +``` + +## Alpha diversity + +[@sec-alpha-diversity] introduced methods and main idea of alpha diversity. +Here we calculate Shannon diversity index. + +```{r} +#| label: alpha1 + +library(mia) + +tse <- addAlpha(tse, index = "shannon") +``` + +In addition to visualizing the diversity in respect to sample groups, we want +to analyse how much the diversity changes between time points. That is why +we calculate the difference between time points for each subject. + +```{r} +#| label: alpha2 + +library(dplyr) + +df <- colData(tse) |> + as.data.frame() |> + # Sort based on time + arrange(time) |> + # Group based on subject so that we calculate the difference between each + # subject's samples + group_by(subject) |> + # Calculate difference + mutate(diff = diff(shannon)) |> + ungroup() |> + DataFrame() + +# Sort to original order and add difference info to colData +df <- df[match(tse[["sample"]], df[["sample"]]), , drop = FALSE] +rownames(df) <- colnames(tse) +colData(tse) <- df + +colData(tse) |> head() +``` + +Now we can continue to visualize the alpha diversities. First we create a simple +boxplot showing Shannon index in respect to groups and time. + +```{r} +#| label: alpha3 + +library(ggplot2) + +# Get colData +df <- colData(tse) |> + as.data.frame() + +# Create a second df which is sorted +df2 <- df |> + arrange(subject, time_point, group) + +# Create a jitter +jitter_position <- position_jitterdodge( + dodge.width = 0.75, + jitter.width = 0, + jitter.height = 0 + ) + +# Create a plot +p <- ggplot(mapping = aes(x = time_point, y = shannon, fill = group)) + + # We disable outlier plotting as we plot points with jitter. Otherwise + # outliers would be duplicated + geom_boxplot(data = df, outlier.shape = NA) + + # Add jitter plot + geom_point(data = df2, aes( + group = interaction(subject, group)), + position = jitter_position) + +p +``` + +The boxplot above shows alpha diversity in each group and time point. In the +dataset, each subject was sampled in each time point, however, from the plot +we cannot see which samples correspond to which subject. That is why we +might be interested on linking the sample-pairs to highlight +the subject-specific deviation between time points. + +```{r} +#| label: alpha4 + +# Add line to link subjects +p_path <- p + + geom_line( + data = df2, + aes(group = interaction(subject, group)), + position = jitter_position, + alpha = 0.3 + ) +p_path +``` + +We can further refine the plot by adding the magnitude of change with a color. + +```{r} +#| label: alpha5 + +# Add line linking subjects +p_path <- p + + geom_line( + data = df2, + aes(group = interaction(subject, group), color = diff), + position = jitter_position + ) + + scale_color_gradient2( + low="blue", mid="white", high="red", + limits = c(-max(abs(df[["diff"]])), max(abs(df[["diff"]])))) + +p_path +``` + +The visualization does not show clear effect of treatment. We can still test +if we can get support on these null findings with statistical tests. The +simplest test might be the Wilcoxon test that we have to apply for each +treatment group separately. Moreover, we have to apply the paired version of +test as the samples are from same subjects, thus allowing each patient to serve +as their own control. + +We first test if placebo treatment had effect on alpha diversity. + +```{r} +#| label: alpha6 + +# Get data on placebo samples +df <- tse[, tse[["group"]] == "Placebo"] |> colData() +# Get first and second time points separately +before <- df[df[["time_point"]] == 1, "shannon"] +after <- df[df[["time_point"]] == 2, "shannon"] + +# Apply paired Wilcoxon test +wilcox.test(before, after, paired = TRUE) +``` + +Based on results, alpha diversity in second time point equals to first time +point for the placebo group, which is expected result. We can then do the same +test for group that got probiotics. + +```{r} +#| label: alpha7 + +# Get data on placebo samples +df <- tse[, tse[["group"]] == "LGG"] |> colData() +# Get first and second time points separately +before <- df[df[["time_point"]] == 1, "shannon"] +after <- df[df[["time_point"]] == 2, "shannon"] + +# Apply paired Wilcoxon test +wilcox.test(before, after, paired = TRUE) +``` + +Wilcoxon test confirms the results also for probiotics group: probiotic +treatment do not increase or decrease diversity of microbioal community. + +More elegant test would be model-based approach. Below, we fit a simple linear +mixed effects model to examine the impact of time and treatment group on +Shannon diversity. In this model, both time and group are treated as fixed +effects, meaning that we assume their influence on Shannon diversity is +consistent across all subjects. Additionally, we include subject as a random +effect to account for the repeated measurements from the same individuals. + +If you are not familiar with R formulas, here’s an explanation of how to +interpret the components: + +- The left-hand side of the formula represents the predicted variable +(in this case, shannon diversity). + +- The `+` sign in the formula means that we are adding the terms on the +right-hand side of the formula together (i.e., combining them as separate +effects). + +- The `*` sign indicates an interaction between the terms on either side. This +means that we are not only testing the individual effects of time and group, but +also how these two factors interact with each other to influence Shannon +diversity. Specifically, it allows us to assess how the effect of treatment +group differs between time point 1 and time point 2. + +```{r} +#| label: alpha8 + +library(lme4) +library(lmerTest) + +# Get sample metadata +df <- colData(tse) |> + as.data.frame() + +# Fit the model +model <- lmer(shannon ~ time * group + (1 | subject), data = df) + +# Display summary with p-values +summary(model) +``` + +The results indicate that none of the variables in the model show statistically +significant effects. Specifically, "time" represents the overall effect of time +on Shannon diversity, while "groupPlacebo" represents the overall difference in +Shannon diversity between the Placebo group and the LGG group, averaged across +both time points. + +The most interesting variable in the results is the "time:groupPlacebo" +interaction term. This term investigates whether the effect of time on Shannon +diversity differs between the Placebo group and the LGG group when comparing the +second time point to the first one. + +A significant interaction would indicate that the change in diversity over time +is different between the two groups. However, in this case, the interaction is +not statistically significant, suggesting that the treatment effect did not +differ significantly between the Placebo group and the LGG group between the two +time points. + +## Time divergence + +[@sec-divergence] introduced divergence which means that we calculate +dissimilarity between samples. We can do the same and calculate dissimilarity +between time points for each subject. Below we calculate Bray-Curtis +dissimilarity between consecutive time points. + +```{r} +#| label: divergence1 + +library(miaTime) + +tse <- transformAssay(tse, method = "relabundance") +tse <- addStepwiseDivergence( + tse, + time.col = "time", + group = "subject", + assay.type = "relabundance", + method = "bray" +) +``` + +Then we can visualize the results with a box plot. + +```{r} +#| label: divergence2 + +library(scater) + +plotColData(tse, x = "group", y = "divergence", show_boxplot = TRUE) +``` + +Figure above summarizes the divergence. Each point represent a subject: +dissimilarity between samples from the two time points. Divergence between +consecutive time points seems to be equal between treatment groups. + +## Ordination + +Ordination plots, e.g., Principal Coordinate Analysis (PCoA) plot, is a common +visualization technique to find patterns from the data. Below, we perform PCoA +and visualize the results as showed in [@#sec-community-similarity]. + +Note that it is important to sort the data based on time so that the arrows that +we add in the next step have correct direction. + +```{r} +#| label: pcoa + +# Sort data based on time +tse <- tse[, order(tse[["time"]])] + +# Perform PCoA +tse <- addMDS(tse, assay.type = "relabundance", method = "bray") +# Create ordination plot +p <- plotReducedDim( + tse, dimred = "MDS", colour_by = "group", + # Add other fields that are used later to add paths + other_fields = c("subject", "time") +) +p +``` + +After initializing a scatter plot, we can add arrows that show how the microbial +communities evolved over time. + +```{r} +#| label: pcoa2 + +# Add arrows to connect samples from subjects +p <- p + geom_path( + mapping = aes(group = subject, ), + arrow = arrow(length = unit(0.2, "cm")), + alpha = 0.5 +) +p +``` + +From the PCoA plot, we cannot observe clear patterns. As a final conclusion, it +seems that probiotics did not have effect on microbiome compared to placebo. diff --git a/inst/pages/time_series.qmd b/inst/pages/time_series.qmd index 084904c70..8d32ff6ce 100644 --- a/inst/pages/time_series.qmd +++ b/inst/pages/time_series.qmd @@ -5,17 +5,6 @@ library(rebook) chapterPreamble() ``` -```{r include = FALSE} -# global knitr options -knitr::opts_chunk$set( - message = FALSE, - fig.width = 10, - dpi = 300, - dev = "png", - dev.args = list(type = "cairo-png") -) -``` - Microbial communities are dynamic, constantly evolving over time. However, despite temporal fluctuations driven by environmental and other factors, the microbiome retains its core characteristics [@Schmidt2018]. @@ -25,318 +14,8 @@ researchers to control time-related factors, enabling a more focused investigation of specific research questions. However, longitudinal studies are usually more complex than cross-sectional. Analyzing such data includes specific aspects that need to be taken into account such as the dependence between -samples collected at multiple time points. Since these samples come from the -same set of subjects, they are paired, allowing each patient to serve as their -own control. +samples collected at multiple time points. Most of the methods introduced earlier in this book can be applied to temporal -studies with appropriate modifications and considerations. - -In this chapter, we use an example dataset from probiotic/placebo intervention -study. The dataset includes gut microbiota samples from 22 subjects and 2 time -points. The first time point was sampled before introducing a treatment. - -```{r} -#| label: load_data - -library(microbiomeDataSets) - -mae <- peerj32() -tse <- getWithColData(mae, 1L) -tse[["time_point"]] <- tse[["time"]] |> as.factor() -``` - -## Alpha diversity - -[@sec-alpha-diversity] introduced methods and main idea of alpha diversity. -Here we calculate Shannon diversity index. - -```{r} -#| label: alpha1 - -library(mia) - -tse <- addAlpha(tse, index = "shannon") -``` - -In addition to visualizing the diversity in respect to sample groups, we want -to analyse how much the diversity changes between time points. That is why -we calculate the difference between time points for each subject. - -```{r} -#| label: alpha2 - -library(dplyr) - -df <- colData(tse) |> - as.data.frame() |> - # Sort based on time - arrange(time) |> - # Group based on subject so that we calculate the difference between each - # subject's samples - group_by(subject) |> - # Calculate difference - mutate(diff = diff(shannon)) |> - ungroup() |> - DataFrame() - -# Sort to original order and add difference info to colData -df <- df[match(tse[["sample"]], df[["sample"]]), , drop = FALSE] -rownames(df) <- colnames(tse) -colData(tse) <- df - -colData(tse) |> head() -``` - -Now we can continue to visualize the alpha diversities. First we create a simple -boxplot showing Shannon index in respect to groups and time. - -```{r} -#| label: alpha3 - -library(ggplot2) - -# Get colData -df <- colData(tse) |> - as.data.frame() - -# Create a second df which is sorted -df2 <- df |> - arrange(subject, time_point, group) - -# Create a jitter -jitter_position <- position_jitterdodge( - dodge.width = 0.75, - jitter.width = 0, - jitter.height = 0 - ) - -# Create a plot -p <- ggplot(mapping = aes(x = time_point, y = shannon, fill = group)) + - # We disable outlier plotting as we plot points with jitter. Otherwise - # outliers would be duplicated - geom_boxplot(data = df, outlier.shape = NA) + - # Add jitter plot - geom_point(data = df2, aes( - group = interaction(subject, group)), - position = jitter_position) - -p -``` - -The boxplot above shows alpha diversity in each group and time point. In the -dataset, each subject was sampled in each time point, however, from the plot -we cannot see which samples correspond to which subject. That is why we -might be interested on linking the sample-pairs to highlight -the subject-specific deviation between time points. - -```{r} -#| label: alpha4 - -# Add line to link subjects -p_path <- p + - geom_line( - data = df2, - aes(group = interaction(subject, group)), - position = jitter_position, - alpha = 0.3 - ) -p_path -``` - -We can further refine the plot by adding the magnitude of change with a color. - -```{r} -#| label: alpha5 - -# Add line linking subjects -p_path <- p + - geom_line( - data = df2, - aes(group = interaction(subject, group), color = diff), - position = jitter_position - ) + - scale_color_gradient2( - low="blue", mid="white", high="red", - limits = c(-max(abs(df[["diff"]])), max(abs(df[["diff"]])))) - -p_path -``` - -The visualization does not show clear effect of treatment. We can still test -if we can get support on these null findings with statistical tests. The -simplest test might be the Wilcoxon test that we have to apply for each -treatment group separately. Moreover, we have to apply the paired version of -test as the samples are from same subjects. - -We first test if placebo treatment had effect on alpha diversity. - -```{r} -#| label: alpha6 - -# Get data on placebo samples -df <- tse[, tse[["group"]] == "Placebo"] |> colData() -# Get first and second time points separately -before <- df[df[["time_point"]] == 1, "shannon"] -after <- df[df[["time_point"]] == 2, "shannon"] - -# Apply paired Wilcoxon test -wilcox.test(before, after, paired = TRUE) -``` - -Based on results, alpha diversity in second time point equals to first time -point for the placebo group, which is expected result. We can then do the same -test for group that got probiotics. - -```{r} -#| label: alpha7 - -# Get data on placebo samples -df <- tse[, tse[["group"]] == "LGG"] |> colData() -# Get first and second time points separately -before <- df[df[["time_point"]] == 1, "shannon"] -after <- df[df[["time_point"]] == 2, "shannon"] - -# Apply paired Wilcoxon test -wilcox.test(before, after, paired = TRUE) -``` - -Wilcoxon test confirms the results also for probiotics group: probiotic -treatment do not increase or decrease diversity of microbioal community. - -More elegant test would be model-based approach. Below, we fit a simple linear -mixed effects model to examine the impact of time and treatment group on -Shannon diversity. In this model, both time and group are treated as fixed -effects, meaning that we assume their influence on Shannon diversity is -consistent across all subjects. Additionally, we include subject as a random -effect to account for the repeated measurements from the same individuals. - -If you are not familiar with R formulas, here’s an explanation of how to -interpret the components: - -- The left-hand side of the formula represents the predicted variable -(in this case, shannon diversity). - -- The `+` sign in the formula means that we are adding the terms on the -right-hand side of the formula together (i.e., combining them as separate -effects). - -- The `*` sign indicates an interaction between the terms on either side. This -means that we are not only testing the individual effects of time and group, but -also how these two factors interact with each other to influence Shannon -diversity. Specifically, it allows us to assess how the effect of treatment -group differs between time point 1 and time point 2. - -```{r} -#| label: alpha8 - -library(lme4) -library(lmerTest) - -# Get sample metadata -df <- colData(tse) |> - as.data.frame() - -# Fit the model -model <- lmer(shannon ~ time * group + (1 | subject), data = df) - -# Display summary with p-values -summary(model) -``` - -The results indicate that none of the variables in the model show statistically -significant effects. Specifically, "time" represents the overall effect of time -on Shannon diversity, while "groupPlacebo" represents the overall difference in -Shannon diversity between the Placebo group and the LGG group, averaged across -both time points. - -The most interesting variable in the results is the "time:groupPlacebo" -interaction term. This term investigates whether the effect of time on Shannon -diversity differs between the Placebo group and the LGG group when comparing the -second time point to the first one. - -A significant interaction would indicate that the change in diversity over time -is different between the two groups. However, in this case, the interaction is -not statistically significant, suggesting that the treatment effect did not -differ significantly between the Placebo group and the LGG group between the two -time points. - -## Time divergence - -[@sec-divergence] introduced divergence which means that we calculate -dissimilarity between samples. We can do the same and calculate dissimilarity -between time points for each subject. Below we calculate Bray-Curtis -dissimilarity between consecutive time points. - -```{r} -#| label: divergence1 - -library(miaTime) - -tse <- transformAssay(tse, method = "relabundance") -tse <- addStepwiseDivergence( - tse, - time.col = "time", - group = "subject", - assay.type = "relabundance", - method = "bray" -) -``` - -Then we can visualize the results with a box plot. - -```{r} -#| label: divergence2 - -library(scater) - -plotColData(tse, x = "group", y = "divergence", show_boxplot = TRUE) -``` - -Figure above summarizes the divergence. Each point represent a subject: -dissimilarity between samples from the two time points. Divergence between -consecutive time points seems to be equal between treatment groups. - -## Ordination - -Ordination plots, e.g., Principal Coordinate Analysis (PCoA) plot, is a common -visualization technique to find patterns from the data. Below, we perform PCoA -and visualize the results as showed in [@#sec-community-similarity]. - -Note that it is important to sort the data based on time so that the arrows that -we add in the next step have correct direction. - -```{r} -#| label: pcoa - -# Sort data based on time -tse <- tse[, order(tse[["time"]])] - -# Perform PCoA -tse <- addMDS(tse, assay.type = "relabundance", method = "bray") -# Create ordination plot -p <- plotReducedDim( - tse, dimred = "MDS", colour_by = "group", - # Add other fields that are used later to add paths - other_fields = c("subject", "time") -) -p -``` - -After initializing a scatter plot, we can add arrows that show how the microbial -communities evolved over time. - -```{r} -#| label: pcoa2 - -# Add arrows to connect samples from subjects -p <- p + geom_path( - mapping = aes(group = subject, ), - arrow = arrow(length = unit(0.2, "cm")), - alpha = 0.5 -) -p -``` - -From the PCoA plot, we cannot observe clear patterns. As a final conclusion, it -seems that probiotics did not have effect on microbiome compared to placebo. +studies with appropriate modifications and considerations. These are discussed +in [@sec-time-diversity]. \ No newline at end of file