scater_tutorials_open_data/zeisel_mouse_cortex.Rmd at master · davismcc/scater_tutorials_open_data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
---
title: 'Single-cell dataset: Zeisel mouse cortex'
author: "Davis McCarthy"
date: "`r Sys.Date()`"
output:
    html_document:
        toc: true
        toc_float: true
        highlight: tango
        number_sections: true
        code_folding: hide
---

```{r load-libararies, message=FALSE, warning=FALSE}
library(scater)
library(data.table)
library(cowplot)
library(DT)
library(knitr)
library(scran)
opts_chunk$set(cache = FALSE, warning = FALSE)
```

# Load the Zeisel et al mouse cortex data

The mouse cortex data from [Zeisel et al, 2015](http://science.sciencemag.org/content/347/6226/1138)
have been made available as UMI (unique molecular identifier) counts at the
[Linnarsson Lab website](http://linnarssonlab.org/blobs/cortex/).

Here we download the data directly from the Linnarrsson Lab page and produce an
`SCESet` object, which is the data structure used in the `scater` package.

First load the data for metadata for mRNA genes:

```{r load-data-mrna, cache=FALSE}
## download mRNA data
mrna_meta <- fread("https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt",
              nrows = 10, header = FALSE)
mrna_meta <- as.data.frame(t(mrna_meta), stringsAsFactors = FALSE)
colnames(mrna_meta) <- mrna_meta[2,]
mrna_meta <- mrna_meta[-c(1,2),]
rownames(mrna_meta) <- mrna_meta$cell_id
datatable(mrna_meta[1:6,])
```

Now load the count data for the mRNA genes:

```{r load-data-mrna2, cache=FALSE}
mrna <- fread("https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt",
              skip = 10)
mrna <- as.data.frame(mrna)
rownames(mrna) <- mrna$V1
mrna_gene_cluster <- mrna$V2
mrna <- mrna[, -c(1,2)]
colnames(mrna) <- mrna_meta$cell_id
datatable(mrna[1:20, 1:6])
```

Next load the metadata for mitochondrial genes:

```{r load-data-mtgenes, cache=FALSE}
## download data for mitochondrial genes
mtgenes_meta <- fread("https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mito_17-Aug-2014.txt",
                      nrows = 10, header = FALSE)
mtgenes_meta <- as.data.frame(t(mtgenes_meta), stringsAsFactors = FALSE)
colnames(mtgenes_meta) <- mtgenes_meta[2,]
mtgenes_meta <- mtgenes_meta[-c(1,2),]
rownames(mtgenes_meta) <- mtgenes_meta$cell_id
datatable(mtgenes_meta[1:6,])
```

Followed by the count data for mitochondrial genes:

```{r load-data-mtgenes2, cache=FALSE}
mtgenes <- fread("https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mito_17-Aug-2014.txt",
                 skip = 10)
mtgenes <- as.data.frame(mtgenes)
rownames(mtgenes) <- mtgenes$V1
mtgenes <- mtgenes[, -c(1,2)]
colnames(mtgenes) <- mtgenes_meta$cell_id
datatable(mtgenes[1:10, 1:6])
```

Finally load the metadata for ERCC spike-ins:

```{r load-data-ercc, cache=FALSE}
## download data for ERCC spike-ins
ercc_meta <- fread("https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_spikes_17-Aug-2014.txt",
                   nrows = 10, header = FALSE)
ercc_meta <- as.data.frame(t(ercc_meta), stringsAsFactors = FALSE)
colnames(ercc_meta) <- ercc_meta[2,]
ercc_meta <- ercc_meta[-c(1,2),]
rownames(ercc_meta) <- ercc_meta$cell_id
datatable(ercc_meta[1:6,])
```

and then the count data for the ERCC spike-ins:

```{r load-data-ercc2, cache=FALSE}
ercc <- fread("https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_spikes_17-Aug-2014.txt",
                   skip = 10)
ercc <- as.data.frame(ercc)
rownames(ercc) <- ercc$V1
ercc <- ercc[, -c(1,2)]
colnames(ercc) <- ercc_meta$cell_id
datatable(ercc[1:10, 1:6])
```

Now, after checking that the cell IDs match up in the different data frames we
have loaded, combine data and form into an SCESet object:

```{r check-ids, results='hide'}
identical(mtgenes_meta[rownames(mrna_meta),], mrna_meta)
identical(ercc_meta[rownames(mrna_meta),], mrna_meta)
## cell metadata matches as long as cells are matched
mtgenes <- mtgenes[, rownames(mrna_meta)]
identical(colnames(mtgenes), colnames(mrna))
ercc <- ercc[, rownames(mrna_meta)]
identical(colnames(ercc), colnames(mrna))
```

```{r make-sceset}
## combine expression values
counts <- rbind(mrna, mtgenes, ercc)
dim(counts)
pd <- new("AnnotatedDataFrame", mrna_meta)
fd <- data.frame(gene_cluster = c(mrna_gene_cluster,
                                rep(NA, (nrow(ercc) + nrow(mtgenes)))))
rownames(fd) <- rownames(counts)
fd <- new("AnnotatedDataFrame", fd)
sce_zeisel_raw <- newSCESet(countData = counts, phenoData = pd, featureData = fd)
sce_zeisel_raw
```

This data were produced using UMIs, so `cpm` (counts-per-million) should be the
correct units for analysis (transcript length becomes irrelevant as we are
counting molecules directly).

# Calculate QC metrics

Calculate QC metrics with `scater`'s `calculateQCMetrics` function. It is
helpful to define "feature controls", that is features (here genes) that can be
used for QC purposes. Typically, it is helpful to define ERCC spike-ins and
mitochondrial genes as feature controls, which is exactly what we do here:

```{r calc-qc-metrics}
ercc_genes <- grep("ERCC", featureNames(sce_zeisel_raw))
mt_genes <- grep("mt-", featureNames(sce_zeisel_raw))
sce_zeisel_raw <- calculateQCMetrics(
    sce_zeisel_raw, feature_controls = list(ERCC = ercc_genes, mt = mt_genes))
```

For this data set the original authors classified the cells into the broad cell
classes shown in the output below.

```{r, echo=TRUE, eval=TRUE}
table(sce_zeisel_raw$level1class)
```

We will explore the cell types throughout the QC procedures described subsequently.

---

# QC the genes


Scater allows you to set minimum QC thresholds for a gene to be considered sufficiently expressed in your downstream analysis. Here,  using the inbuilt  ```is_exprs()``` function,  we enforce that a gene must have least one count in at least 98 cells (as this is the smallest group of cells as identified by Zeisel et al). The rationale behind this is that we would want to keep a gene if it was expressed in just one group of cells, but we don't want genes with very sparse expression overall.


```{r, echo=TRUE, eval=TRUE, }
keep_gene <- rowSums(is_exprs(sce_zeisel_raw)) >= 98
fData(sce_zeisel_raw)$use <- keep_gene
```

We retain `r sum(keep_gene)` genes for the analysis and drop `r sum(!keep_gene)`
lowly-expressed genes from the analysis.

It can be useful to plot gene expression frequency versus mean expression level
to assess the effects of technical dropout in the dataset. We fit a non-linear
least squares curve for the relationship between expression frequency and mean
expression and use this to define the number of genes above high technical
dropout and the numbers of genes that are expressed (here defined as at least 1
count) in at least 50% and at least 25% of cells. A subset of genes to be
treated as feature controls can be specified, otherwise any feature controls
previously defined are used.


```{r, echo=TRUE, eval=TRUE, fig.width=7, fig.height=5, fig.align="center"}
plotQC(sce_zeisel_raw, type = "exprs")
```

The ```plotQC()``` function provides several useful  QC plots,  such as the
example below that considers the the number of reads consumed by the top 50
expressed genes. Aside from spike-ins, these are typically mitochondrial and
housekeeping genes. Here, as with most single-cell experiments,  a large
proportion of reads are being are taken up by uninteresting biology.

```{r, echo=T, eval=T, message=F, warning=F, fig.width=6, fig.height=6, fig.align="center"}
plotQC( sce_zeisel_raw[fData(sce_zeisel_raw)$use, ],
        type = "highest-expression", col_by_variable = "level1class" )
```

As well as the expected ERCC and mitochondrial genes among the most expressed,
we see *Actb*, involved in cell motility, structure and integrity.


---

# QC the cells

A useful first step is flagging/failing poorly performing cells. This can be
done from the sample meta-data using the automated QC metrics generated above,
any additional sequencing metrics from sequencing aligner/mapping software,
and additional cell phenotypes such as from imaging. For the sake of
demonstration, here we focus on four metrics. Others you may want to consider
are % reads mapped to mitochondrial genes, library PCR duplication rate,
and mean sequencing bias per cell.

```plotPhenoData()``` can be used to explore specific sample meta-data values.
For example, in the plots below we can see how different QC metrics distinguish
the cells.

```{r, echo=T, eval=T, message=F, warning=F, fig.align="center", fig.height=11, fig.width=12}
p1 <- plotPhenoData(sce_zeisel_raw, aes_string(x = "log10_total_counts",
                                  y = "pct_counts_feature_controls_mt",
                                  colour = "level1class"))
p2 <- plotPhenoData(sce_zeisel_raw, aes_string(x = "log10_total_counts",
                                  y = "pct_counts_feature_controls_ERCC",
                                  colour = "level1class"))
p3 <- plotPhenoData(sce_zeisel_raw, aes_string(x = "total_features",
                                  y = "pct_counts_feature_controls_mt",
                                  colour = "level1class"))
p4 <- plotPhenoData(sce_zeisel_raw, aes_string(x = "total_features",
                                  y = "pct_counts_feature_controls_ERCC",
                                  colour = "level1class"))
plot_grid(p1, p2, p3, p4, labels = letters[1:4], nrow = 2)
```

```{r, echo=T, eval=T, message=F, warning=F, fig.align="center", fig.height=7, fig.width=8}
plotPhenoData(sce_zeisel_raw, aes_string(x = "total_features",
                                  y = "pct_counts_feature_controls",
                                  colour = "filter_on_pct_counts_feature_controls",
                                  shape_by = "filter_on_total_counts")) +
    geom_vline(xintercept = 1000, linetype = 2)
```

Use QC metrics to select a subset of cells for use.

```{r, echo=T, eval=T, message=F, warning=F}
sce_zeisel_raw$use <- (sce_zeisel_raw$total_features > 1000 & #sufficient features
                           sce_zeisel_raw$total_counts > 5000 & # sufficient molucules counted
                           !sce_zeisel_raw$filter_on_pct_counts_feature_controls & # sufficient endogenous RNA

                           !sce_zeisel_raw$filter_on_total_features & # remove cells with unusual numbers of genes
                           !sce_zeisel_raw$is_cell_control # controls shouldn't be used in downstream analysis
)
table(sce_zeisel_raw$use)
```
\

This would lead us to drop a further `r sum(!sce_zeisel_raw$use)` cells from
this dataset.

Box plots aren't particularly useful for visualising sparse data, so
```plot()``` applied to an SCESet object helps visualise all cells as a
cumulative proportion of reads per cell. You can see from the plot below that
the two failed cells have curves that look more like the blank controls.
\

Cumulative expression plots:

```{r cum-exprs-plots, fig.height=9, fig.width=10, fig.align="center"}
plot(sce_zeisel_raw, block1 = "level1class", colour_by = "level2class",
     exprs_values = "counts")
```

We observe that certain cell types (e.g. oligodendrocytes and microglia) have a
larger proportion of cells with their libraries accounted for by a small number
of features than to pyramidal cells.

```{r cum-exprs-plots-use, fig.height=9, fig.width=10, fig.align="center"}
plot(sce_zeisel_raw, block1 = "level1class", colour_by = "use",
     exprs_values = "counts")
```

The plots above show that some (but certainly not all) of the cells we have
opted not to use have a large proportion of their library accounted for by a
handful of very highly-expressed genes.

Scater allows users total flexibility to run their favourite dimension reduction
methods,  as described [here]() and in the supporting help files. Here we use
```plotPCA()``` to further explore the cells. The t-SNE plot works particularly
nicely for this dataset to separate the different cell types as identified by
Zeisel et al.

Let's take a look at a PCA plot:

```{r pca-plot}
sce_zeisel_raw <- plotPCA(sce_zeisel_raw, colour_by = "tissue",
                          return_SCESet = TRUE)
plotReducedDim(sce_zeisel_raw, colour_by = "level1class")
```

Take a look using a diffusion map (better suited to cells distributed along a
process):

```{r diff-map-plot}
plotDiffusionMap(sce_zeisel_raw, colour_by = "level1class")
```

And finally look at a t-SNE plot:

```{r tsne-plot}
plotTSNE(sce_zeisel_raw, colour_by = "level1class", rand_seed = 20160128)
```

The t-SNE gives nice, tight cell-type clusters.

```{r, echo=T, eval=T, message=F, warning=F, fig.width=7, fig.height=6, fig.align="center"}
plotPCA(sce_zeisel_raw, colour_by = "use", shape_by = "level1class" )
```
\
Another option available in `scater` is to conduct PCA on a set of QC metrics.
The advantage of doing this is that the QC metrics focus on technical aspects of
the libraries that are likely to distinguish problematics cells. Automatic
outlier detection on PCA plots using QC metrics is available to help identify
potentially problematic cells.

By default, the following metrics are used for PCA-based outlier detection:

* `pct_counts_top_100_features`
* `total_features`
* `pct_counts_feature_controls`
* `n_detected_feature_controls`
* `log10_counts_endogenous_features`
* `log10_counts_feature_controls`

A particular set of variables to be used can be specified with the
`selected_variables` argument as shown in the example below.

```{r pca-qc-metrics, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.width=7, fig.height=5, fig.align="center"}
## PCA on the phenoData cannot handle missing values
## for the exercise here we thus set cdna_recovered_in_ng_per_ul to 100 where
## there are NA values actually (creating a new dummy variable)
sce_zeisel_raw <- plotPCA(sce_zeisel_raw, size_by = "total_features",
                          shape_by = "filter_on_total_features",
                          pca_data_input = "pdata", detect_outliers = TRUE,
                          return_SCESet = TRUE)
```

This dataset has already been carefully QC'd by the original authors and this
PCA plot confirms that, with the PCA on QC metrics detecting no further outliers.


With `scater`, any specific set of features based on prior knowledge can be used
for PCA, t-SNE or diffusion maps. A feature set to use can be defined by
supplying the `feature_set` argument to `plotPCA`, `plotTSNE` or
`plotDiffusionMap`. This allows, for example, using only housekeeping features
or control features or cell cycle genes to produce reduced-dimension plots. The
plots below use only the spike-in genes defined as such earlier.

```{r pca-difmap-ercc-genes, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.width=10, fig.height=5, fig.align="center"}
p1 <- plotPCA(sce_zeisel_raw, feature_set = fData(sce_zeisel_raw)$is_feature_control,
               colour_by = "use", shape_by = "tissue",
               size_by = "outlier" ) + ggtitle("PCA")
p2 <- plotDiffusionMap(sce_zeisel_raw,
                       feature_set = fData(sce_zeisel_raw)$is_feature_control,
                       colour_by = "use", shape_by = "tissue",
                       size_by = "outlier" ) + ggtitle("Diffusion map")
multiplot(p1, p2, cols = 2)
```

In addition to outliers, we prefer to use cells that have a good coverage of
detected genes. After gene filtering, we demand that cells have detectable
expression for at least 10% of retained genes.

```{r further-cell-filtering}
keep_cell <- (colSums(is_exprs(sce_zeisel_raw[fData(sce_zeisel_raw)$use,])) >
                  0.1 * sum(fData(sce_zeisel_raw)$use))
sce_zeisel_raw$use <- (sce_zeisel_raw$use & keep_cell)
```

After this procedure we retain `r sum(sce_zeisel_raw$use)` cells and drop
`r sum(!sce_zeisel_raw$use)` cells.


---

# Investigate explanatory variables

Once you have filtered cells and genes,  a next step is to explore technical
drivers of variability in the data to inform data normalisation before
downstream analysis.

Experimental design is a critical, but neglected, aspect of sc-RNA-seq
studies. To the best of our knowledge, methods like those described in this
section for exploring experimental and QC variables and the experimental
design, do not feature in any scRNA-seq software packages apart from `scater`.
There are a very large number of potential confounders, artifacts and biases in
sc-RNA-seq studies. Exploring the effects of such explanatory variables (both
those recorded during the experiment and computed QC metrics) is crucial for
appropriate modeling of the data. The `scater` package provides a set of methods
specifically for quality control of experimental and explanatory variables,
which will be demonstrated briefly here.

Using the ```plotPCA()``` function we can see that principal component one
appears to be driven by differences in number of genes detected ("total
features"). Differences in number of detected genes is a
common driver of cell clustering  and can be result of biology (e.g. different
cell types, cell cycle). Indeed, in the PCA here we see that the
endothelial-mural and astrocyte-ependymal cells have typically many fewer
features than the other cell types.

However, number of genes detected often has a strong technical component due
to variably recovered RNA, reverse transcription, or library amplification. Its
effect can also be notably non-linear,  affecting low expressed and high
expressed genes differently. The ```plotQC()``` function can be used to explore
the marginal % variance explained (per gene) of the various technical
factors. In the second plot we can see that it's not unusual for gene coverage
to explain more than 10% of the expression variance of a gene.

```{r pca-diffmap-normalisation, echo=T, eval=T, message=F, warning=F, fig.width=7, fig.height=5, fig.align="center"}
# we can easily subset to only plot genes and cells of interest
plotPCA( sce_zeisel_raw[fData(sce_zeisel_raw)$use, sce_zeisel_raw$use],
         colour_by = "level1class", shape_by = "tissue", size_by = "total_features" )
```

One can also easily produce plots to identify principal components that
correlate with experimental and QC variables of interest. The function `plotQC`
with the option `type = "find-pcs"` ranks the principal components in decreasing
order of R-squared from a linear model regressing PC value against the variable of
interest. The default behaviour is to show the relationships between the
variable of interest and the six principal components with the strongest
relationship to the variable (as measured by R-squared). This works both for
continous and categorical variables. This type of plot  can indicate which
explanatory variables may be driving differences between cells as detected by
PCA and highlight which PCs are associated with the variable. The "total
features" variable shows very strong correlation with both principal components
1 and 2.

```{r, echo=T, eval=T, message=F, warning=F, fig.width=10, fig.height=6, fig.align="center"}
p1 <- plotQC(sce_zeisel_raw[fData(sce_zeisel_raw)$use, sce_zeisel_raw$use],
        type = "find-pcs", variable = "total_features" )
p2 <- plotQC(sce_zeisel_raw[fData(sce_zeisel_raw)$use, sce_zeisel_raw$use],
        type = "find-pcs", variable = "level1class" ) +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))
plot_grid(p1, p2, ncol = 2, labels = letters[1:2])
```

Very clearly, the first principal component is affected strongly by the total number of features detected in a cell.

The relative importance of different explanatory variables can be explored with
some of the `plotQC` function options. Supplying the `type = "expl"` argument to
`plotQC` computes the marginal R-squared for each variable in the
\textsf{SCESet} when fitting a linear model regressing expression values for
each gene against just that variable, and displays a density plot of the
gene-wise marginal R-squared values for the variables. The default approach
looks at all variables in the \textsf{phenoData} slot of the object and plots
the top `nvars_to_plot` variables (default is 10).

Alternatively, one can choose a subset of variables to plot in this manner,
which we do here. The density curves for marginal R-squared show the relative
importance of different variables for explaining variance in expression between
cells. In the plot we can see that it's not unusual for gene coverage
to explain more than 10% of the expression variance of a gene.

```{r plotqc-expl-vars, echo=T, eval=T, message=F, warning=F, fig.width=8, fig.height=4.5, fig.align="center"}
plotQC(sce_zeisel_raw[fData(sce_zeisel_raw)$use, sce_zeisel_raw$use],
        type = "explanatory-variables",
        variables = c("pct_counts_top_100_features", "total_features",
                      "pct_counts_feature_controls", "level1class",
                      "n_detected_feature_controls",
                      "log10_counts_endogenous_features",
                      "log10_counts_feature_controls", "tissue") )
```


This analysis indicates that total number of features detected and the
sequencing depth (number of counts) for endogenous genes, in particular, have
substantial explanatory power for many genes, so these variables are good
candidates for conditioning out in a normalisation step, or including in
downstream statistical models. The number of detected feature controls
(spike-in genes) does not appear to be an important explanatory variable.


The `plotQC()` function can also be used to produce a pairs plot of explanatory
variables (with the same calls as above, but with `method = "pairs"`). The plot
below shows this use case for looking at the % counts from the top 100
most-expressed genes, the total number of expressed genes, the % of counts from
feature controls, the number of detected feature controls, the  number of counts
(on the log-10 scale) from endogenous features, the number of counts
(log-10 scale) from feature controls and sample type. The explanatory variables
are ordered by the median R-squared of the variable across all genes, and this
value is reported on the plot. This type of plot is useful for finding correlations
between experimental and QC variables with substantial explanatory power.

```{r, echo=T, eval=T, message=F, warning=F, fig.width=8, fig.height=8, fig.align="center"}
plotQC(sce_zeisel_raw[fData(sce_zeisel_raw)$use, sce_zeisel_raw$use],
        type = "expl", method = "pairs",
        variables = c("pct_counts_top_100_features", "total_features",
                      "pct_counts_feature_controls",
                      "n_detected_feature_controls",
                      "log10_counts_endogenous_features",
                      "log10_counts_feature_controls", "level1class"),
        theme_size = 6)

```
\

The plot above shows that the variables `level1class` (cell type),
`total_features`, `log10_counts_endogenous_features`,
`pct_counts_top_100_features` and `pct_counts_feature_controls` are all
potentially important explanatory variables, while `tissue` is less important
and `n_detected_feature_controls` and `log10_counts_feature_controls` do not
look important at all.

---

# Normalise the data

After important explanatory variables have been identified with the tools shown
above, their effects can be accounted for in subsequent statistical models, or
they can be conditioned out using `normaliseExprs()`, if so desired. If a design
matrix incorporating a selection of explanatory variables is supplied as an
argument to `normaliseExprs`, then normalised expression values returned for
each feature will be the residuals from a linear model fitted with the design
matrix, after any size-factor normalisation has been applied to the expression
data.

Normalising single-cell RNA-seq data is a topic in its infancy, but many of the
basic principles still apply. How much you choose to initially correct for
technical factors depends on your question of interest and the ease with which
they can be accounted for in downstream models.

## Size-factor normalisation

In `scater` it is easy to perform size-factor normalisation using the "TMM"
approach (the default in the
[edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html) package),
the "RLE" approach (default in
the [DESeq](https://bioconductor.org/packages/release/bioc/html/DESeq.html) package),
an "upper-quartile approach" (as proposed by
[Bullard et al (2010)](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-94))
or simple scaling by total counts. These approaches were designed for bulk
RNA-seq and thus are not optimal for single-cell data, but are provided for
convenience and interest. Careful thought should be given to applying these
scale-factor normalisation methods as underlying assumptions may not be
appropriate for all single-cell datasets.

For instance, the RLE size-factor method does not work for this data set, as
the distribution of expression values for each cell has too many zeroes for this
method to handle. Similarly, the Bullard et al method only works here for
quantile values greater than 90%, because the quantile values less than 90% give
a value of zero for numerous cells - when this is the case, the normalisation
method fails. If inappropriate normalisation factors are computed, a warning is
provided and normalisation factors are set to 1 (the same as applying
`method="none"`).

However, [Lun et al (2016)](http://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0947-7) recently published a size-factor normalisation method specifically designed
for scRNA-seq data. This method performs much better on single-cell data and its
implementation in the Bioconductor package `scran` allows seamless integration
into the `scater` workflow. (The `scran` package itself depends on `scater`).

We recommend using the `scran` size-factor normalisation approach and
demonstrate its use here.

The code below demonstrates how to carry out size-factor normalisation on a
subsetted SCESet object, normalising either to ERCC spike-in genes (not shown)
or to endogenous genes. Under certain circumstances either may be appropriate.


```{r sizefactor-normalisation, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE}
## subset to form a QC'd version of the data
sce_zeisel_qc <- sce_zeisel_raw[fData(sce_zeisel_raw)$use |
                            fData(sce_zeisel_raw)$is_feature_control,
                        sce_zeisel_raw$use]
endog_genes <- !fData(sce_zeisel_qc)$is_feature_control

## size factor normalisation with scran
qclust <- quickCluster(sce_zeisel_qc)
sce_zeisel_qc <- computeSumFactors(sce_zeisel_qc, clusters = qclust)
summary(sce_zeisel_qc$size_factor)
sce_zeisel_qc <- normalise(sce_zeisel_qc)

# sce_zeisel_qc <- normaliseExprs(sce_zeisel_qc, method = "upperquartile",
#                                 feature_set = endog_genes, p = 0.99)
# set_exprs(sce_zeisel_qc, "norm_exprs_sf") <- norm_exprs(sce_zeisel_qc)
# summary(sce_zeisel_qc$norm_factors)
```

Here normalisation using size factors using the `computeSumFactors` function from
the `scran` package is undertaken. The `quickCluster` method from `scran` was
used to obtain a rough clustering of cells to guide the size-factor computation
step.

We can investigate the effects of normalisation with PCA plots, here
using only endogenous gene expression values to produce the plots.

```{r pca-norm-plots, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.width=10, fig.height=5.5}
## subset again so that only endogenous genes are used
plt_pca_prenorm <- plotPCA(
    sce_zeisel_raw[fData(sce_zeisel_raw)$use, sce_zeisel_raw$use],
    exprs_values = "exprs",
    size_by = "total_features",  colour_by = "level1class", shape_by = "tissue") +
    ggtitle("PCA - no normalisation") +
    theme(legend.position = "bottom")
plt_pca_endog_norm <- plotPCA(
    sce_zeisel_qc[fData(sce_zeisel_qc)$use,], exprs_values = "exprs",
    size_by = "total_features",  colour_by = "level1class", shape_by = "tissue") +
    ggtitle("PCA - endogenous size-factor normalisation") +
    theme(legend.position = "bottom")
multiplot(plt_pca_prenorm, plt_pca_endog_norm, cols = 2)
```

Here, where there are a large number of distinct cell types, we see that
size-factor normalisation has little effect on the large-scale structure in the
dataset as determined by PCA.

However, in the t-SNE plots below the overall structure is very similar in both
cases, but we see that the normalised data combines the two clusters of
oligodendrocytes from the un-normalised plot, and that the microglia and
endothelial-mural cells are better differentiated with the normalised data.

```{r tsne-norm-plots, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.width=10, fig.height=5.5}
## subset again so that only endogenous genes are used
plt_tsne_prenorm <- plotTSNE(
    sce_zeisel_raw[fData(sce_zeisel_raw)$use, sce_zeisel_raw$use],
    exprs_values = "exprs",
    size_by = "total_features",  colour_by = "level1class", shape_by = "tissue",
    random_seed = 20160203) +
    ggtitle("t-SNE - no normalisation") +
    theme(legend.position = "bottom")
plt_tsne_endog_norm <- plotTSNE(
    sce_zeisel_qc[fData(sce_zeisel_qc)$use,], exprs_values = "exprs",
    size_by = "total_features",  colour_by = "level1class", shape_by = "tissue",
    random_seed = 20160203) +
    ggtitle("t-SNE - endogenous size-factor normalisation") +
    theme(legend.position = "bottom")
multiplot(plt_tsne_prenorm, plt_tsne_endog_norm, cols = 2)
```


## Regressing out explanatory variables

In the section above we used the `plotQC()` function to identify the following
technical variables that (marginally) explained a substantial proportion of the
variance for many genes:
* `log10_counts_endogenous_features`
* `pct_counts_top_100_features`
* `pct_counts_feature_controls`.
The `normaliseExprs()` function enables such explanatory variables to be
regressed out of the expression values for downstream analyses. (NB:
in some settings `total_features` may also be considered a techinical variable,
but here it could reflect real biology, so we do not regress it out.)

The residuals of a linear regression are returned to `norm_exprs(object)`. Here,
the `exprs` values in the `SCESet` are assumed to be on a log scale - if they
are not, then this regression approach may not work reliably. For similar
reasons, `norm_counts`, `norm_tpm`, `norm_cpm` and `norm_fpkm` values are not
affected by regressing out a set of explanatory variables. As using regression
residuals is unreliable for these strictly non-negative quantities, they are
only affected by size-factor normalisation.

To regress out a set of explanatory variables, we simply need to supply a
"design matrix" to `normaliseExprs` that contains the matrix of values for the
variables we wish to regress out. This is easy to construct in R with the
built-in `model.matrix` function. Here we regress out the four explanatory
variables listed above, combined with the same upper-quartile size-factor
normalisation as above. To retain all normalised expression values, here we save
the normalised expression residuals to a new object (`norm_exprs_resid`) and
then add these values to a slot with the same name in the `SCESet` object:

```{r regress-expl-vars, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE}
design <- model.matrix(~sce_zeisel_qc$log10_counts_endogenous_features +
                           sce_zeisel_qc$pct_counts_top_100_features +
                           sce_zeisel_qc$pct_counts_feature_controls)
# sce_zeisel_qc <- normaliseExprs(sce_zeisel_qc, design = design,
#                                 method = "upperquartile",
#                                 feature_set = endog_genes, p = 0.99)
sce_zeisel_qc <- normaliseExprs(sce_zeisel_qc, design = design,
                                method = "none", exprs_values = "exprs",
                                feature_set = endog_genes,
                                return_norm_as_exprs = FALSE)
set_exprs(sce_zeisel_qc, "norm_exprs_resid") <- norm_exprs(sce_zeisel_qc)
smoothScatter(exprs(sce_zeisel_qc), get_exprs(sce_zeisel_qc,
                                                   "norm_exprs_resid"),
              xlab = "Size-factor normalised expression values",
              ylab = "Size-factor normalised expression residuals")
abline(0, 1, col = "firebrick", lty = 2)
```

As the smoothed scatter plot above shows, regressing out the explanatory
variables does alter expression levels quite substantially, because there are
several technical factors that have large effects on expression.

As previously, we can investigate the affects of normalisation on large-scale
cell population structure.

```{r pca-norm-plots-regress, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.width=10, fig.height=5.5}
## subset again so that only endogenous genes are used
plt_pca_endog_norm_resid <- plotPCA(
    sce_zeisel_qc, exprs_values = "norm_exprs_resid",
    size_by = "total_features",  colour_by = "level1class", shape_by = "tissue") +
    ggtitle("PCA - endogenous size-factor normalisation residuals") +
    theme(legend.position = "bottom")
#multiplot(plt_pca_endog_norm, plt_pca_endog_norm_resid, cols = 2)
plt_pca_endog_norm_resid
```

Earlier PCA plots suggested that technical factors were strongly affecting the
PCA results. The plot above confirms this - when important technical variables
are regressed out, the cells of different level 1 clases group together more
closely when looking at the first two principal components.

```{r, echo=T, eval=T, message=F, warning=F, fig.width=10, fig.height=6, fig.align="center"}
p1 <- plotQC(sce_zeisel_qc, type = "find-pcs",
             variable = "pct_counts_feature_controls",
             exprs_values = "norm_exprs_resid")
p2 <- plotQC(sce_zeisel_qc, type = "find-pcs", variable = "level1class",
             exprs_values = "norm_exprs_resid") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))
plot_grid(p1, p2, ncol = 2, labels = letters[1:2])
```

The QC plot above shows that PCs no longer tag `total_features`, and that later
PCs (2, 4, 5, 7) capture more signal for differences in cell type.

```{r pca-norm-plots-regress-n7, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.width=12, fig.height=12}
## subset again so that only endogenous genes are used
plt_pca_endog_norm_resid_n7 <- plotPCA(
    sce_zeisel_qc, exprs_values = "norm_exprs_resid", ncomponents = 7,
    colour_by = "level1class", shape_by = "tissue") +
    ggtitle("PCA - endogenous size-factor normalisation residuals") +
    theme(legend.position = "bottom")
#multiplot(plt_pca_endog_norm, plt_pca_endog_norm_resid, cols = 2)
plt_pca_endog_norm_resid_n7
```

It is still eminently possible to distinguish between cell types, but with
strong technical effects (which here appear confounded with cell type to a
substantial extent) the later PCs are required to separate cell types with PCA.


```{r tsne-norm-plots-regress, echo=TRUE, eval=TRUE, message=FALSE, warning=FALSE, fig.align="center", fig.width=8, fig.height=10}
## subset again so that only endogenous genes are used
plt_tsne_endog_norm_resid <- plotTSNE(
    sce_zeisel_qc, exprs_values = "norm_exprs_resid",
    size_by = "total_features",  colour_by = "level1class", shape_by = "tissue",
    random_seed = 20160203) +
    ggtitle("t-SNE - endogenous size-factor normalisation residuals") +
    theme(legend.position = "bottom")
#multiplot(plt_tsne_endog_norm, plt_tsne_endog_norm_resid, cols = 2)
plt_tsne_endog_norm_resid
plt_tsne_endog_norm_resid2 <- plotTSNE(
    sce_zeisel_qc, exprs_values = "norm_exprs_resid",
    size_by = "total_features",  colour_by = "level2class", shape_by = "tissue",
    random_seed = 20160203) +
    ggtitle("t-SNE - endogenous size-factor normalisation residuals") +
    theme(legend.position = "bottom")
plt_tsne_endog_norm_resid2
```

The user will need to make a decision for each dataset on the value of
regressing out explanatory variables for their analysis. The example here
highlights a tricky situation in which technical sources of variation seem to
be confounded with biology of interest to a substantial effect.

NB: Users who prefer North American to British/Australian spelling can
*normalize* their data with `normalizeExprs()`, which is exactly the same as
`normaliseExprs()`.

## Customised normalisation approaches

"Customised" normalisation approaches can be easily incorporated into the
`scater` workflow, because it is very easy to add data to an `SCESet` obejct
with the `set_exprs` function. In a subsequent `scater` tutorial we provide an
example in which we normalise and standardise counts conditioned on expression
level. One advantage of this approach is that a biologically 'noisy' gene is
naturally defined as one with greater dispersion than other genes at a similar
expression level. In the normalised data these are genes with a mean absolute
deviation greater than 1.

Other sophisticated normalisation approaches can be attempted, such as those
implemented in the [BASiCS](https://github.com/catavallejos/BASiCS) package.
Alternative normalisation approaches can easily be incorporated into a `scater`
workflow.

Thus, after convenient pre-processing, QC and normalisation with \scater, the
data are well organised (with feature and cell metadata and many data
transformations), clean and tidy, and are ready for further statistical modeling
and analysis.

---

# Downstream analysis

Expression data at a single-cell resolution not only allows testing for
differential expression,  but exploring how this is dependent on sub-types of
cells and/or how genes coexpress with each other across cells. As with
normalisation,  single-cell analysis methodology is an area in its infancy and
deserving of discussion that is beyond this case study. Here we simply
demonstrate how QC'ed and normalised data contained within an SCESet object
allows for easy downstream interrogation. Let's assume we are interested in
defining differential expression as change in expression frequency. This can be
done with a standard generalised linear model and the ```qvalue``` package to
control false discovery rates.
\
Here, we simply test whether the expression frequency of each gene for
endothelial-mural cells is different from that of astrocytes-ependymal cells
(which is the default baseline in this model).

```{r de-testing, echo=T, eval=T, message=F, warning=F, fig.align="center", cache=FALSE}
library("qvalue")
celltype <- sce_zeisel_qc$level1class
de <- data.frame(t(apply(1 * is_exprs(sce_zeisel_qc), 1, # test change in expression frequency
                         function(y) coef(
                             summary(glm(y ~ celltype, family = "binomial")))[2, c(1, 4)])),
                 check.names = FALSE )
de$qvalue <- qvalue(de[, "Pr(>|z|)"], fdr.level = 0.05)$qvalues # control for false disovery rate
de <- de[order(de$qvalue, decreasing = FALSE ), ] # order by global statistical evidence
datatable(de[1:1000,])
```

In `scater` the `plotExpression` function enables the convenient visualisation
of expression values for a set of features. Here, the expression values for the
six most DE genes for expression frequency between cell types are shown. The
units for expression in the plot can be defined with the `exprs_values` argument
(the expression values must exist with the provided name in the `assayData`
slot of the SCESet object; if not the default `exprs` values will be used,
with a warning). As with other plots in `scater` we can use phenotype data
variables to define the colour and shape of the points.

```{r plot-exprs-de, cache=FALSE, echo=T, eval=T, message=F, warning=F, fig.align="center", fig.width=8, fig.height=5}
plotExpression(sce_zeisel_qc, rownames(de)[1:6], x = "level1class", ncol = 3,
               colour_by = "level1class") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))
plotExpression(sce_zeisel_qc, tail(rownames(de), n = 6), exprs_values = "norm_exprs",
               x = "level1class", colour_by = "level1class",
               shape_by = "tissue", ncol = 3) +
    theme(axis.text.x = element_text(angle = 60, hjust = 1, vjust = 1))
```
\

As one would hope, the genes with the least evidence for differential expression
frequency are spike-in genes.

The `scater` package also makes it easy to investigate correlations between
genes and cells by supporting cell-cell and gene-gene distances in an `SCESet`
object. Below we show how cell-cell distances can be defined and then how these
distances can easily be used to produce multidimensional-scaling (MDS) plots to
investigate the structure between cells.
\

```{r, echo=T, eval=T, message=F, warning=F, fig.width=10, fig.height=8, fig.align="center"}

cellDist(sce_zeisel_qc) <- (1 - cor(norm_exprs(sce_zeisel_qc),
                                    method = "spearman")) / 2

plotMDS(sce_zeisel_qc, size_by = "total_features",  colour_by = "level1class",
        shape_by = "tissue") +
    ggtitle("MDS plot based on spearman correlation")

```

With the `plotMDS` function the user can apply their favourite way of defining
the distance between cells to then plot the cells in a low-dimensional space.
The plot above suggests that Spearman correlation is not the best distance
metric for single-cell RNA-seq data unless you are particularly interested in
separating cells by total number of detected genes. Other metrics may work
substantially better.

Finally, let us save the QC'd version of the data that can be used for any
subsequent analyses.

```{r save-data}
save(sce_zeisel_qc, file = "zeisel_sce_qc.RData", compress = "xz",
     compression_level = 9)
```

---

# Technical stuff


Scater has been tested on Mac OS X and Linux environments and requires the R packages:

* Biobase
* BiocGenerics
* ggplot2
* methods

and imports the packages:

* biomaRt
* data.table
* dplyr
* edgeR
* grid
* limma
* matrixStats
* plyr
* reshape2
* rhdf5
* rjson
* viridis

This case study was run using the following platform and R package versions:
\

```{r, echo=FALSE, eval=TRUE}

sessionInfo()

```


```{r figures-for-paper, echo=FALSE, eval = FALSE, results='hide'}
### Figures for the paper
if ( requireNamespace("cowplot") ) {
    ## QC figure
    p1 <- plot( scData, colour_by = "sample_type", exprs_values = "counts")
    p2 <- plotPCA( scData, shape_by = "sample_type",
                   pca_data_input = "pdata", detect_outliers = TRUE,
                   selected_variables = c("cdna_recovered_in_ng_per_ul_no_miss",
                                          "pct_counts_top_100_features",
                                          "total_features",
                                          "pct_counts_feature_controls",
                                          "n_detected_feature_controls",
                                          "log10_counts_endogenous_features",
                                          "log10_counts_feature_controls"))
    p3 <- plotQC( scData[fData( scData )$use, pData( scData )$use],
        type = "highest-expression",
        col_by_variable = "sample_type" )
    p4 <- plotQC(scData, type = "exprs")
    p5 <- plotQC( scData[fData( scData )$use, scData$use],
        type = "explanatory-variables",
        variables = c("pct_counts_top_100_features", "total_features",
                      "pct_counts_feature_controls", "c1_machine",
                      "n_detected_feature_controls",
                      "log10_counts_endogenous_features",
                      "log10_counts_feature_controls", "sample_type") )
    p6 <- plotQC( scData[fData( scData )$use, scData$use],
        type = "find-pcs", variable = "total_features")
    figplot1 <- cowplot::plot_grid(p1, p2, p3, p4, p5, p6,
                                  labels = letters[1:6], ncol = 2)
    cowplot::save_plot("../figures/figure2.pdf", figplot1, ncol = 2, nrow = 3,
                       base_height = 5, base_aspect_ratio = 1.4)
    cowplot::save_plot("../figures/figure2.png", figplot1, ncol = 2, nrow = 3,
                       base_height = 5, base_aspect_ratio = 1.4)

    ## dimension reduction figure
    p1 <- plotPCA(scData[, scData$use], size_by = "total_features",
                  colour_by = "sample_type",
                  shape_by = "c1_machine") + ggtitle("PCA - all genes") +
        theme(legend.position = "bottom")
    p2 <- plotTSNE(scData[, scData$use], size_by = "total_features",
                   colour_by = "sample_type",
                   shape_by = "c1_machine", rand_seed = 20151225) +
        ggtitle("t-SNE - all genes") +
        theme(legend.position = "bottom")
    p3 <- plotDiffusionMap( scData[, scData$use], size_by = "total_features",
                            colour_by = "sample_type",
                            shape_by = "c1_machine") +
        ggtitle("Diffusion map - all genes") +
        theme(legend.position = "bottom")
    p4 <- plotPCA(scData[, scData$use], feature_set = cell_cycle_genes,
                  colour_by = featureNames(scData)[most_exprs_ccycle[8]],
                  shape_by = "sample_type") + ggtitle("PCA - cell cycle genes") +
        theme(legend.position = "bottom")
    p5 <- plotTSNE(scData[, scData$use], feature_set = cell_cycle_genes,
                   colour_by = featureNames(scData)[most_exprs_ccycle[8]],
                   shape_by = "sample_type", rand_seed = 20151225) +
        ggtitle("t-SNE - cell cycle genes") +
        theme(legend.position = "bottom")
    p6 <- plotDiffusionMap( scData[, scData$use], feature_set = cell_cycle_genes,
                            colour_by = featureNames(scData)[most_exprs_ccycle[8]],
                            shape_by = "sample_type") +
        ggtitle("Diffusion map - cell cycle genes") +
        theme(legend.position = "bottom")

    figplot2 <- cowplot::plot_grid(p1, p2, p3, p4, p5, p6, plt_pca_ercc_norm,
                                   plt_pca_endog_norm, plt_cond_norm,
                                   labels = letters[1:9], ncol = 3,
                                   rel_heights = c(1.1, 1))
    cowplot::save_plot("../figures/figure4.pdf", figplot2, ncol = 3, nrow = 3,
                       base_height = 5, base_aspect_ratio = 0.8)
    cowplot::save_plot("../figures/figure4.png", figplot2, ncol = 3, nrow = 3,
                       base_height = 5, base_aspect_ratio = 0.8)
}

```