8 RNA-Seq (an example)

8.1 Introduction

This workflow below is based on Mary Piper.¹ For analysis, we used the DESeq2 package² and also considered Michael I. Love, Simon Anders, and Wolfgang Huber³ description.

The research questions and databases used in the workflow are based on the publication of Casimiro Gerarduzzi et al.⁴ RNA sequencing data can be downloaded from the Gene Expression Omnibus (GEO) database (GEO accession: [GSE85209] (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85209)).

8.2 Loading packages

# Load library for DESeq2
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# .libPaths()
# BiocManager::install("DESeq2")
# devtools::install_github("stephenturner/annotables")

library(DESeq2)

# Load library for RColorBrewer
library(RColorBrewer)

# Load library for pheatmap
# install.packages("pheatmap")
library(pheatmap)

# Load library for tidyverse
library(tidyverse)

8.3 Read counts

smoc2_rawcounts <- read.csv("data/fibrosis_smoc2_rawcounts_unordered.csv") 
rownames(smoc2_rawcounts) <- smoc2_rawcounts$X
smoc2_rawcounts$X <- NULL

# Explore the first six observations of smoc2_rawcounts
head(smoc2_rawcounts)
#>                    smoc2_fibrosis1 smoc2_fibrosis4
#> ENSMUSG00000102693               0               0
#> ENSMUSG00000064842               0               0
#> ENSMUSG00000051951              72              30
#> ENSMUSG00000102851               0               0
#> ENSMUSG00000103377               0               0
#> ENSMUSG00000104017               0               0
#>                    smoc2_normal1 smoc2_normal3
#> ENSMUSG00000102693             0             0
#> ENSMUSG00000064842             0             0
#> ENSMUSG00000051951             0             3
#> ENSMUSG00000102851             0             0
#> ENSMUSG00000103377             1             0
#> ENSMUSG00000104017             0             0
#>                    smoc2_fibrosis3 smoc2_normal4
#> ENSMUSG00000102693               0             0
#> ENSMUSG00000064842               0             0
#> ENSMUSG00000051951              36             1
#> ENSMUSG00000102851               0             0
#> ENSMUSG00000103377               0             0
#> ENSMUSG00000104017               0             0
#>                    smoc2_fibrosis2
#> ENSMUSG00000102693               0
#> ENSMUSG00000064842               0
#> ENSMUSG00000051951              51
#> ENSMUSG00000102851               0
#> ENSMUSG00000103377               0
#> ENSMUSG00000104017               0

# Explore the structure of smoc2_rawcounts
str(smoc2_rawcounts)
#> 'data.frame':    47729 obs. of  7 variables:
#>  $ smoc2_fibrosis1: int  0 0 72 0 0 0 0 0 0 1 ...
#>  $ smoc2_fibrosis4: int  0 0 30 0 0 0 0 0 0 1 ...
#>  $ smoc2_normal1  : int  0 0 0 0 1 0 0 0 0 1 ...
#>  $ smoc2_normal3  : int  0 0 3 0 0 0 0 0 0 0 ...
#>  $ smoc2_fibrosis3: int  0 0 36 0 0 0 0 0 0 1 ...
#>  $ smoc2_normal4  : int  0 0 1 0 0 0 0 0 0 0 ...
#>  $ smoc2_fibrosis2: int  0 0 51 0 0 0 0 0 0 1 ...

8.4 Distribution of counts

library(ggplot2)
ggplot(smoc2_rawcounts) + 
  geom_histogram(aes(x=smoc2_fibrosis1), bins = 300)

summary(smoc2_rawcounts$smoc2_fibrosis1)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>      0.0      0.0      1.0    579.7    184.0 420026.0

8.5 Setting of metadata

# Create genotype vector
genotype <- c("smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe")

# Create condition vector
condition <- c("fibrosis", "fibrosis", "fibrosis", "fibrosis", "normal", "normal", "normal")

# Create data frame
smoc2_metadata <- data.frame(genotype, condition)

# Assign the row names of the data frame
rownames(smoc2_metadata) <- c("smoc2_fibrosis1", "smoc2_fibrosis2", "smoc2_fibrosis3", "smoc2_fibrosis4", "smoc2_normal1", "smoc2_normal3", "smoc2_normal4")

smoc2_metadata
#>                 genotype condition
#> smoc2_fibrosis1 smoc2_oe  fibrosis
#> smoc2_fibrosis2 smoc2_oe  fibrosis
#> smoc2_fibrosis3 smoc2_oe  fibrosis
#> smoc2_fibrosis4 smoc2_oe  fibrosis
#> smoc2_normal1   smoc2_oe    normal
#> smoc2_normal3   smoc2_oe    normal
#> smoc2_normal4   smoc2_oe    normal

8.6 Matching metadata and counts data

# Use the match() function to reorder the columns of the raw counts
reorder_idx <- match(rownames(smoc2_metadata), colnames(smoc2_rawcounts))

# Reorder the columns of the count data
reordered_smoc2_rawcounts <- smoc2_rawcounts[ , reorder_idx]

all(rownames(smoc2_metadata) == colnames(reordered_smoc2_rawcounts))
#> [1] TRUE

# Create a DESeq2 object
dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
                              colData = smoc2_metadata,
                              design = ~ condition)

8.7 Normalizing counts

# Determine the size factors to use for normalization
dds_smoc2 <- estimateSizeFactors(dds_smoc2)

sizeFactors(dds_smoc2)
#> smoc2_fibrosis1 smoc2_fibrosis2 smoc2_fibrosis3 
#>       1.4319832       1.1189642       1.2480024 
#> smoc2_fibrosis4   smoc2_normal1   smoc2_normal3 
#>       1.0826799       0.7106482       0.7989734 
#>   smoc2_normal4 
#>       0.8482426

# Extract the normalized counts
smoc2_normalized_counts <- counts(dds_smoc2, normalized=TRUE)
# smoc2_normalized_counts

8.8 Quality control - Hierarchical heatmap

# Transform the normalized counts 
vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

# Extract the matrix of transformed counts
vsd_mat_smoc2 <- assay(vsd_smoc2)

# Compute the correlation values between samples
vsd_cor_smoc2 <- cor(vsd_mat_smoc2) 

# Plot the heatmap
pheatmap(vsd_cor_smoc2, annotation = select(smoc2_metadata, condition))

8.9 Quality control - Principal component analysis

# Transform the normalized counts 
vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

# Plot the PCA of PC1 and PC2
plotPCA(vsd_smoc2, intgroup="condition")

8.10 Creating the DeSeq2 object

# Create DESeq2 object
dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
                              colData = smoc2_metadata,
                              design = ~ condition)

# Run the DESeq2 analysis
dds_smoc2 <- DESeq(dds_smoc2)

8.11 DESeq2 model - dispersion

mean_counts <- apply(smoc2_rawcounts, 1, mean)
variance_counts <- apply(smoc2_rawcounts, 1, var)
df <- data.frame(mean_counts, variance_counts)
ggplot(df) + geom_point(aes(x=mean_counts, y=variance_counts)) + 
  scale_y_log10() + 
  scale_x_log10() +
  labs(x="Mean counts per gene", y="Vaiance per gene")

Dispersion formula: \(Var=\mu+\alpha\times\mu^2\)

\(Var\): variance
\(\mu\): mean
\(\alpha\): dispersion

# Plot dispersions
plotDispEsts(dds_smoc2)

8.12 DESeq2 model - contrasts

DESeq2 Negative Binomial Model:

\[K_{ij} \sim NB(\mu_{ij},\alpha_i)\] \[\mu_{ij}=s_{j}q_{ij}\]

\[log_2(q_{ij})=x_j\beta_j\]

\(K_{ij}\) - raw count for gene i, samle j
\(s_{ij}\) - size factor
\(g_{ij}\) - normalized count

# Extract the results of the differential expression analysis
smoc2_res <- results(dds_smoc2, 
                contrast = c("condition", "fibrosis", "normal"), 
                alpha = 0.05)
plotMA(smoc2_res)

8.12.1 DESeq2 rsults - LFC shrinkage

# Shrink the log2 fold change estimates to be more accurate
smoc2_res <- lfcShrink(dds_smoc2,
                    contrast = c("condition", "fibrosis", "normal"),
                    res = smoc2_res, type = "ashr")
plotMA(smoc2_res)

8.13 DESeq2 results

mcols(smoc2_res)
#> DataFrame with 5 rows and 2 columns
#>                        type            description
#>                 <character>            <character>
#> baseMean       intermediate mean of normalized c..
#> log2FoldChange      results log2 fold change (MM..
#> lfcSE               results posterior SD: condit..
#> pvalue              results Wald test p-value: c..
#> padj                results   BH adjusted p-values
head(smoc2_res)
#> log2 fold change (MMSE): condition fibrosis vs normal 
#> Wald test p-value: condition fibrosis vs normal 
#> DataFrame with 6 rows and 5 columns
#>                     baseMean log2FoldChange     lfcSE
#>                    <numeric>      <numeric> <numeric>
#> ENSMUSG00000102693  0.000000       0.000000  1.317258
#> ENSMUSG00000064842  0.000000       0.000000  1.317258
#> ENSMUSG00000051951 22.478090       3.830238  0.789374
#> ENSMUSG00000102851  0.000000       0.000000  1.317258
#> ENSMUSG00000103377  0.201024      -0.144713  1.160598
#> ENSMUSG00000104017  0.000000       0.000000  1.317258
#>                         pvalue        padj
#>                      <numeric>   <numeric>
#> ENSMUSG00000102693          NA          NA
#> ENSMUSG00000064842          NA          NA
#> ENSMUSG00000051951 5.82520e-08 2.54201e-07
#> ENSMUSG00000102851          NA          NA
#> ENSMUSG00000103377 6.76672e-01          NA
#> ENSMUSG00000104017          NA          NA
summary(smoc2_res)
#> 
#> out of 29556 with nonzero total read count
#> adjusted p-value < 0.05
#> LFC > 0 (up)       : 5776, 20%
#> LFC < 0 (down)     : 5332, 18%
#> outliers [1]       : 15, 0.051%
#> low counts [2]     : 7207, 24%
#> (mean count < 1)
#> [1] see 'cooksCutoff' argument of ?results
#> [2] see 'independentFiltering' argument of ?results

# Extract results
smoc2_res <- results(dds_smoc2, 
                contrast = c("condition", "fibrosis", "normal"), 
                alpha = 0.05, 
                lfcThreshold = 0.32)

# Shrink the log2 fold changes
smoc2_res <- lfcShrink(dds_smoc2, 
                    contrast = c("condition", "fibrosis", "normal"), 
                    res = smoc2_res, type = "ashr")

# Get an overview of the results    
summary(smoc2_res)
#> 
#> out of 29556 with nonzero total read count
#> adjusted p-value < 0.05
#> LFC > 0 (up)       : 3716, 13%
#> LFC < 0 (down)     : 3322, 11%
#> outliers [1]       : 15, 0.051%
#> low counts [2]     : 7207, 24%
#> (mean count < 1)
#> [1] see 'cooksCutoff' argument of ?results
#> [2] see 'independentFiltering' argument of ?results

8.14 DESeq2 significant results

# Save results as a data frame
smoc2_res_all <- data.frame(smoc2_res)

# Subset the results to only return the significant genes with p-adjusted values less than 0.05
smoc2_res_sig <- subset(smoc2_res_all, padj < 0.05)
head(smoc2_res_sig)
#>                      baseMean log2FoldChange      lfcSE
#> ENSMUSG00000051951   22.47809      3.8302376 0.78937355
#> ENSMUSG00000025900   12.06950     -2.0769846 0.62565698
#> ENSMUSG00000033845 1380.35712     -0.8819001 0.09788832
#> ENSMUSG00000025903 2522.97515     -1.8975565 0.15007654
#> ENSMUSG00000002459   11.55182      1.9315914 0.74833900
#> ENSMUSG00000033793 1921.19192     -0.8971164 0.08467708
#>                          pvalue         padj
#> ENSMUSG00000051951 4.698974e-07 2.829520e-06
#> ENSMUSG00000025900 5.479409e-04 2.278479e-03
#> ENSMUSG00000033845 3.838750e-09 2.815588e-08
#> ENSMUSG00000025903 1.242343e-26 2.749899e-25
#> ENSMUSG00000002459 3.154811e-03 1.160401e-02
#> ENSMUSG00000033793 3.841942e-12 3.594718e-11

8.15 Visualization of results - MA és Vulcano plots

# Create MA plot
plotMA(smoc2_res)


# Generate logical column 
smoc2_res_all <- data.frame(smoc2_res) %>% mutate(threshold = padj < 0.05)
              
# Create the volcano plot
ggplot(smoc2_res_all) + 
        geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + 
        xlab("log2 fold change") + 
        ylab("-log10 adjusted p-value") + 
        theme(legend.position = "none", 
              plot.title = element_text(size = rel(1.5), hjust = 0.5), 
              axis.title = element_text(size = rel(1.25)))

8.16 Visualization of results - heatmap

# Subset normalized counts to significant genes
sig_norm_counts_smoc2 <- smoc2_normalized_counts[rownames(smoc2_res_sig), ]

# Choose heatmap color palette
heat_colors <- brewer.pal(n = 6, name = "YlOrRd")

# Plot heatmap
pheatmap(sig_norm_counts_smoc2, 
         color = heat_colors, 
         cluster_rows = TRUE, 
         show_rownames = FALSE,
         annotation = select(smoc2_metadata, condition), 
         scale = "row")

8.17 Significant genes

library(annotables)
head(grcm38)
#> # A tibble: 6 x 9
#>   ensgene  entrez symbol chr     start    end strand biotype
#>   <chr>     <int> <chr>  <chr>   <int>  <int>  <int> <chr>  
#> 1 ENSMUSG~  14679 Gnai3  3      1.08e8 1.08e8     -1 protei~
#> 2 ENSMUSG~  54192 Pbsn   X      7.78e7 7.79e7     -1 protei~
#> 3 ENSMUSG~  12544 Cdc45  16     1.88e7 1.88e7     -1 protei~
#> 4 ENSMUSG~     NA H19    7      1.43e8 1.43e8     -1 lincRNA
#> 5 ENSMUSG~ 107815 Scml2  X      1.61e8 1.61e8      1 protei~
#> 6 ENSMUSG~  11818 Apoh   11     1.08e8 1.08e8      1 protei~
#> # ... with 1 more variable: description <chr>

smoc2_res_all <- data.frame(smoc2_res) %>% 
  rownames_to_column(var="ensgene") %>% 
  left_join(, y=grcm38[, c("ensgene", "symbol", "description")],
            by="ensgene")

head(smoc2_res_all)
#>              ensgene   baseMean log2FoldChange     lfcSE
#> 1 ENSMUSG00000102693  0.0000000      0.0000000 1.3172581
#> 2 ENSMUSG00000064842  0.0000000      0.0000000 1.3172581
#> 3 ENSMUSG00000051951 22.4780900      3.8302376 0.7893735
#> 4 ENSMUSG00000102851  0.0000000      0.0000000 1.3172581
#> 5 ENSMUSG00000103377  0.2010237     -0.1447134 1.1605983
#> 6 ENSMUSG00000104017  0.0000000      0.0000000 1.3172581
#>         pvalue        padj        symbol
#> 1           NA          NA 4933401J01Rik
#> 2           NA          NA       Gm26206
#> 3 4.698974e-07 2.82952e-06          Xkr4
#> 4           NA          NA       Gm18956
#> 5 7.390045e-01          NA       Gm37180
#> 6           NA          NA       Gm37363
#>                                                             description
#> 1        RIKEN cDNA 4933401J01 gene [Source:MGI Symbol;Acc:MGI:1918292]
#> 2             predicted gene, 26206 [Source:MGI Symbol;Acc:MGI:5455983]
#> 3 X-linked Kx blood group related 4 [Source:MGI Symbol;Acc:MGI:3528744]
#> 4             predicted gene, 18956 [Source:MGI Symbol;Acc:MGI:5011141]
#> 5             predicted gene, 37180 [Source:MGI Symbol;Acc:MGI:5610408]
#> 6             predicted gene, 37363 [Source:MGI Symbol;Acc:MGI:5610591]

smoc2_res_sig <- subset(smoc2_res_all, padj < 0.05)
smoc2_res_sig <- smoc2_res_sig %>% arrange(padj)
head(smoc2_res_sig)
#>              ensgene  baseMean log2FoldChange      lfcSE
#> 1 ENSMUSG00000029811  8710.652       5.593885 0.11145593
#> 2 ENSMUSG00000027896  3819.931      -3.523220 0.09295523
#> 3 ENSMUSG00000062908 24293.902      -3.507301 0.11564406
#> 4 ENSMUSG00000028222  5848.681      -4.496932 0.16082443
#> 5 ENSMUSG00000032332  5162.864       4.557115 0.16437178
#> 6 ENSMUSG00000036887  3638.590       3.859276 0.13865758
#>          pvalue          padj  symbol
#> 1  0.000000e+00  0.000000e+00    Aoc1
#> 2 4.544391e-261 5.074721e-257 Slc16a4
#> 3 4.634505e-168 3.450235e-164   Acadm
#> 4 4.327404e-150 2.416206e-146   Calb1
#> 5 6.044426e-148 2.699924e-144 Col12a1
#> 6 1.030528e-144 3.835968e-141    C1qa
#>                                                                                                 description
#> 1                                    amine oxidase, copper-containing 1 [Source:MGI Symbol;Acc:MGI:1923757]
#> 2 solute carrier family 16 (monocarboxylic acid transporters), member 4 [Source:MGI Symbol;Acc:MGI:2385183]
#> 3                             acyl-Coenzyme A dehydrogenase, medium chain [Source:MGI Symbol;Acc:MGI:87867]
#> 4                                                             calbindin 1 [Source:MGI Symbol;Acc:MGI:88248]
#> 5                                             collagen, type XII, alpha 1 [Source:MGI Symbol;Acc:MGI:88448]
#> 6               complement component 1, q subcomponent, alpha polypeptide [Source:MGI Symbol;Acc:MGI:88223]

8.18 Top 20 genes

top_20 <- data.frame(sig_norm_counts_smoc2)[1:20, ] %>% 
  rownames_to_column(var="ensgene")

top_20 <- top_20 %>% pivot_longer(cols = 2:8, values_to = "normalized_counts", names_to="samplename")

head(top_20)
#> # A tibble: 6 x 3
#>   ensgene            samplename      normalized_counts
#>   <chr>              <chr>                       <dbl>
#> 1 ENSMUSG00000051951 smoc2_fibrosis1             50.3 
#> 2 ENSMUSG00000051951 smoc2_fibrosis2             45.6 
#> 3 ENSMUSG00000051951 smoc2_fibrosis3             28.8 
#> 4 ENSMUSG00000051951 smoc2_fibrosis4             27.7 
#> 5 ENSMUSG00000051951 smoc2_normal1                0   
#> 6 ENSMUSG00000051951 smoc2_normal3                3.75

top_20 <- inner_join(top_20, 
                     rownames_to_column(smoc2_metadata, var="samplename"),
                     by="samplename")
# write.table(x = top_20, file = "tmp.csv", sep=";", dec=",", row.names = F, quote = T)
ggplot(top_20) + 
  geom_point(aes(x=ensgene, y=normalized_counts, color=condition)) +
  scale_y_log10() +
  labs(title = "Top 20 genes", x="Genes", y="Normalized counts") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45, hjust=1))

7 Bioconductor

9 Summary