8 RNA-Seq (an example)

8.1 Introduction

This workflow below is based on Mary Piper.1 For analysis, we used the DESeq2 package2 and also considered Michael I. Love, Simon Anders, and Wolfgang Huber3 description.

The research questions and databases used in the workflow are based on the publication of Casimiro Gerarduzzi et al.4 RNA sequencing data can be downloaded from the Gene Expression Omnibus (GEO) database (GEO accession: [GSE85209] (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE85209)).

8.2 Loading packages

# Load library for DESeq2
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# 
# .libPaths()
# BiocManager::install("DESeq2")
# devtools::install_github("stephenturner/annotables")

library(DESeq2)

# Load library for RColorBrewer
library(RColorBrewer)

# Load library for pheatmap
# install.packages("pheatmap")
library(pheatmap)

# Load library for tidyverse
library(tidyverse)

8.3 Read counts

smoc2_rawcounts <- read.csv("data/fibrosis_smoc2_rawcounts_unordered.csv") 
rownames(smoc2_rawcounts) <- smoc2_rawcounts$X
smoc2_rawcounts$X <- NULL

# Explore the first six observations of smoc2_rawcounts
head(smoc2_rawcounts)
#>                    smoc2_fibrosis1 smoc2_fibrosis4
#> ENSMUSG00000102693               0               0
#> ENSMUSG00000064842               0               0
#> ENSMUSG00000051951              72              30
#> ENSMUSG00000102851               0               0
#> ENSMUSG00000103377               0               0
#> ENSMUSG00000104017               0               0
#>                    smoc2_normal1 smoc2_normal3
#> ENSMUSG00000102693             0             0
#> ENSMUSG00000064842             0             0
#> ENSMUSG00000051951             0             3
#> ENSMUSG00000102851             0             0
#> ENSMUSG00000103377             1             0
#> ENSMUSG00000104017             0             0
#>                    smoc2_fibrosis3 smoc2_normal4
#> ENSMUSG00000102693               0             0
#> ENSMUSG00000064842               0             0
#> ENSMUSG00000051951              36             1
#> ENSMUSG00000102851               0             0
#> ENSMUSG00000103377               0             0
#> ENSMUSG00000104017               0             0
#>                    smoc2_fibrosis2
#> ENSMUSG00000102693               0
#> ENSMUSG00000064842               0
#> ENSMUSG00000051951              51
#> ENSMUSG00000102851               0
#> ENSMUSG00000103377               0
#> ENSMUSG00000104017               0

# Explore the structure of smoc2_rawcounts
str(smoc2_rawcounts)
#> 'data.frame':    47729 obs. of  7 variables:
#>  $ smoc2_fibrosis1: int  0 0 72 0 0 0 0 0 0 1 ...
#>  $ smoc2_fibrosis4: int  0 0 30 0 0 0 0 0 0 1 ...
#>  $ smoc2_normal1  : int  0 0 0 0 1 0 0 0 0 1 ...
#>  $ smoc2_normal3  : int  0 0 3 0 0 0 0 0 0 0 ...
#>  $ smoc2_fibrosis3: int  0 0 36 0 0 0 0 0 0 1 ...
#>  $ smoc2_normal4  : int  0 0 1 0 0 0 0 0 0 0 ...
#>  $ smoc2_fibrosis2: int  0 0 51 0 0 0 0 0 0 1 ...

8.4 Distribution of counts

library(ggplot2)
ggplot(smoc2_rawcounts) + 
  geom_histogram(aes(x=smoc2_fibrosis1), bins = 300)
summary(smoc2_rawcounts$smoc2_fibrosis1)
#>     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
#>      0.0      0.0      1.0    579.7    184.0 420026.0

8.5 Setting of metadata

# Create genotype vector
genotype <- c("smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe", "smoc2_oe")

# Create condition vector
condition <- c("fibrosis", "fibrosis", "fibrosis", "fibrosis", "normal", "normal", "normal")

# Create data frame
smoc2_metadata <- data.frame(genotype, condition)

# Assign the row names of the data frame
rownames(smoc2_metadata) <- c("smoc2_fibrosis1", "smoc2_fibrosis2", "smoc2_fibrosis3", "smoc2_fibrosis4", "smoc2_normal1", "smoc2_normal3", "smoc2_normal4")

smoc2_metadata
#>                 genotype condition
#> smoc2_fibrosis1 smoc2_oe  fibrosis
#> smoc2_fibrosis2 smoc2_oe  fibrosis
#> smoc2_fibrosis3 smoc2_oe  fibrosis
#> smoc2_fibrosis4 smoc2_oe  fibrosis
#> smoc2_normal1   smoc2_oe    normal
#> smoc2_normal3   smoc2_oe    normal
#> smoc2_normal4   smoc2_oe    normal

8.6 Matching metadata and counts data

# Use the match() function to reorder the columns of the raw counts
reorder_idx <- match(rownames(smoc2_metadata), colnames(smoc2_rawcounts))

# Reorder the columns of the count data
reordered_smoc2_rawcounts <- smoc2_rawcounts[ , reorder_idx]

all(rownames(smoc2_metadata) == colnames(reordered_smoc2_rawcounts))
#> [1] TRUE

# Create a DESeq2 object
dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
                              colData = smoc2_metadata,
                              design = ~ condition)

8.7 Normalizing counts

# Determine the size factors to use for normalization
dds_smoc2 <- estimateSizeFactors(dds_smoc2)

sizeFactors(dds_smoc2)
#> smoc2_fibrosis1 smoc2_fibrosis2 smoc2_fibrosis3 
#>       1.4319832       1.1189642       1.2480024 
#> smoc2_fibrosis4   smoc2_normal1   smoc2_normal3 
#>       1.0826799       0.7106482       0.7989734 
#>   smoc2_normal4 
#>       0.8482426

# Extract the normalized counts
smoc2_normalized_counts <- counts(dds_smoc2, normalized=TRUE)
# smoc2_normalized_counts

8.8 Quality control - Hierarchical heatmap

# Transform the normalized counts 
vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

# Extract the matrix of transformed counts
vsd_mat_smoc2 <- assay(vsd_smoc2)

# Compute the correlation values between samples
vsd_cor_smoc2 <- cor(vsd_mat_smoc2) 

# Plot the heatmap
pheatmap(vsd_cor_smoc2, annotation = select(smoc2_metadata, condition))

8.9 Quality control - Principal component analysis

# Transform the normalized counts 
vsd_smoc2 <- vst(dds_smoc2, blind = TRUE)

# Plot the PCA of PC1 and PC2
plotPCA(vsd_smoc2, intgroup="condition")

8.10 Creating the DeSeq2 object

# Create DESeq2 object
dds_smoc2 <- DESeqDataSetFromMatrix(countData = reordered_smoc2_rawcounts,
                              colData = smoc2_metadata,
                              design = ~ condition)

# Run the DESeq2 analysis
dds_smoc2 <- DESeq(dds_smoc2)

8.11 DESeq2 model - dispersion

mean_counts <- apply(smoc2_rawcounts, 1, mean)
variance_counts <- apply(smoc2_rawcounts, 1, var)
df <- data.frame(mean_counts, variance_counts)
ggplot(df) + geom_point(aes(x=mean_counts, y=variance_counts)) + 
  scale_y_log10() + 
  scale_x_log10() +
  labs(x="Mean counts per gene", y="Vaiance per gene") 

Dispersion formula: \(Var=\mu+\alpha\times\mu^2\)

  • \(Var\): variance
  • \(\mu\): mean
  • \(\alpha\): dispersion
# Plot dispersions
plotDispEsts(dds_smoc2)

8.12 DESeq2 model - contrasts

DESeq2 Negative Binomial Model:

\[K_{ij} \sim NB(\mu_{ij},\alpha_i)\] \[\mu_{ij}=s_{j}q_{ij}\]

\[log_2(q_{ij})=x_j\beta_j\]

  • \(K_{ij}\) - raw count for gene i, samle j
  • \(s_{ij}\) - size factor
  • \(g_{ij}\) - normalized count
# Extract the results of the differential expression analysis
smoc2_res <- results(dds_smoc2, 
                contrast = c("condition", "fibrosis", "normal"), 
                alpha = 0.05)
plotMA(smoc2_res)

8.12.1 DESeq2 rsults - LFC shrinkage

# Shrink the log2 fold change estimates to be more accurate
smoc2_res <- lfcShrink(dds_smoc2,
                    contrast = c("condition", "fibrosis", "normal"),
                    res = smoc2_res, type = "ashr")
plotMA(smoc2_res)

8.13 DESeq2 results

mcols(smoc2_res)
#> DataFrame with 5 rows and 2 columns
#>                        type            description
#>                 <character>            <character>
#> baseMean       intermediate mean of normalized c..
#> log2FoldChange      results log2 fold change (MM..
#> lfcSE               results posterior SD: condit..
#> pvalue              results Wald test p-value: c..
#> padj                results   BH adjusted p-values
head(smoc2_res)
#> log2 fold change (MMSE): condition fibrosis vs normal 
#> Wald test p-value: condition fibrosis vs normal 
#> DataFrame with 6 rows and 5 columns
#>                     baseMean log2FoldChange     lfcSE
#>                    <numeric>      <numeric> <numeric>
#> ENSMUSG00000102693  0.000000       0.000000  1.317258
#> ENSMUSG00000064842  0.000000       0.000000  1.317258
#> ENSMUSG00000051951 22.478090       3.830238  0.789374
#> ENSMUSG00000102851  0.000000       0.000000  1.317258
#> ENSMUSG00000103377  0.201024      -0.144713  1.160598
#> ENSMUSG00000104017  0.000000       0.000000  1.317258
#>                         pvalue        padj
#>                      <numeric>   <numeric>
#> ENSMUSG00000102693          NA          NA
#> ENSMUSG00000064842          NA          NA
#> ENSMUSG00000051951 5.82520e-08 2.54201e-07
#> ENSMUSG00000102851          NA          NA
#> ENSMUSG00000103377 6.76672e-01          NA
#> ENSMUSG00000104017          NA          NA
summary(smoc2_res)
#> 
#> out of 29556 with nonzero total read count
#> adjusted p-value < 0.05
#> LFC > 0 (up)       : 5776, 20%
#> LFC < 0 (down)     : 5332, 18%
#> outliers [1]       : 15, 0.051%
#> low counts [2]     : 7207, 24%
#> (mean count < 1)
#> [1] see 'cooksCutoff' argument of ?results
#> [2] see 'independentFiltering' argument of ?results
# Extract results
smoc2_res <- results(dds_smoc2, 
                contrast = c("condition", "fibrosis", "normal"), 
                alpha = 0.05, 
                lfcThreshold = 0.32)

# Shrink the log2 fold changes
smoc2_res <- lfcShrink(dds_smoc2, 
                    contrast = c("condition", "fibrosis", "normal"), 
                    res = smoc2_res, type = "ashr")
# Get an overview of the results    
summary(smoc2_res)
#> 
#> out of 29556 with nonzero total read count
#> adjusted p-value < 0.05
#> LFC > 0 (up)       : 3716, 13%
#> LFC < 0 (down)     : 3322, 11%
#> outliers [1]       : 15, 0.051%
#> low counts [2]     : 7207, 24%
#> (mean count < 1)
#> [1] see 'cooksCutoff' argument of ?results
#> [2] see 'independentFiltering' argument of ?results

8.14 DESeq2 significant results

# Save results as a data frame
smoc2_res_all <- data.frame(smoc2_res)

# Subset the results to only return the significant genes with p-adjusted values less than 0.05
smoc2_res_sig <- subset(smoc2_res_all, padj < 0.05)
head(smoc2_res_sig)
#>                      baseMean log2FoldChange      lfcSE
#> ENSMUSG00000051951   22.47809      3.8302376 0.78937355
#> ENSMUSG00000025900   12.06950     -2.0769846 0.62565698
#> ENSMUSG00000033845 1380.35712     -0.8819001 0.09788832
#> ENSMUSG00000025903 2522.97515     -1.8975565 0.15007654
#> ENSMUSG00000002459   11.55182      1.9315914 0.74833900
#> ENSMUSG00000033793 1921.19192     -0.8971164 0.08467708
#>                          pvalue         padj
#> ENSMUSG00000051951 4.698974e-07 2.829520e-06
#> ENSMUSG00000025900 5.479409e-04 2.278479e-03
#> ENSMUSG00000033845 3.838750e-09 2.815588e-08
#> ENSMUSG00000025903 1.242343e-26 2.749899e-25
#> ENSMUSG00000002459 3.154811e-03 1.160401e-02
#> ENSMUSG00000033793 3.841942e-12 3.594718e-11

8.15 Visualization of results - MA és Vulcano plots

# Create MA plot
plotMA(smoc2_res)

# Generate logical column 
smoc2_res_all <- data.frame(smoc2_res) %>% mutate(threshold = padj < 0.05)
              
# Create the volcano plot
ggplot(smoc2_res_all) + 
        geom_point(aes(x = log2FoldChange, y = -log10(padj), color = threshold)) + 
        xlab("log2 fold change") + 
        ylab("-log10 adjusted p-value") + 
        theme(legend.position = "none", 
              plot.title = element_text(size = rel(1.5), hjust = 0.5), 
              axis.title = element_text(size = rel(1.25)))

8.16 Visualization of results - heatmap

# Subset normalized counts to significant genes
sig_norm_counts_smoc2 <- smoc2_normalized_counts[rownames(smoc2_res_sig), ]

# Choose heatmap color palette
heat_colors <- brewer.pal(n = 6, name = "YlOrRd")

# Plot heatmap
pheatmap(sig_norm_counts_smoc2, 
         color = heat_colors, 
         cluster_rows = TRUE, 
         show_rownames = FALSE,
         annotation = select(smoc2_metadata, condition), 
         scale = "row")

8.17 Significant genes

library(annotables)
head(grcm38)
#> # A tibble: 6 x 9
#>   ensgene  entrez symbol chr     start    end strand biotype
#>   <chr>     <int> <chr>  <chr>   <int>  <int>  <int> <chr>  
#> 1 ENSMUSG~  14679 Gnai3  3      1.08e8 1.08e8     -1 protei~
#> 2 ENSMUSG~  54192 Pbsn   X      7.78e7 7.79e7     -1 protei~
#> 3 ENSMUSG~  12544 Cdc45  16     1.88e7 1.88e7     -1 protei~
#> 4 ENSMUSG~     NA H19    7      1.43e8 1.43e8     -1 lincRNA
#> 5 ENSMUSG~ 107815 Scml2  X      1.61e8 1.61e8      1 protei~
#> 6 ENSMUSG~  11818 Apoh   11     1.08e8 1.08e8      1 protei~
#> # ... with 1 more variable: description <chr>

smoc2_res_all <- data.frame(smoc2_res) %>% 
  rownames_to_column(var="ensgene") %>% 
  left_join(, y=grcm38[, c("ensgene", "symbol", "description")],
            by="ensgene")

head(smoc2_res_all)
#>              ensgene   baseMean log2FoldChange     lfcSE
#> 1 ENSMUSG00000102693  0.0000000      0.0000000 1.3172581
#> 2 ENSMUSG00000064842  0.0000000      0.0000000 1.3172581
#> 3 ENSMUSG00000051951 22.4780900      3.8302376 0.7893735
#> 4 ENSMUSG00000102851  0.0000000      0.0000000 1.3172581
#> 5 ENSMUSG00000103377  0.2010237     -0.1447134 1.1605983
#> 6 ENSMUSG00000104017  0.0000000      0.0000000 1.3172581
#>         pvalue        padj        symbol
#> 1           NA          NA 4933401J01Rik
#> 2           NA          NA       Gm26206
#> 3 4.698974e-07 2.82952e-06          Xkr4
#> 4           NA          NA       Gm18956
#> 5 7.390045e-01          NA       Gm37180
#> 6           NA          NA       Gm37363
#>                                                             description
#> 1        RIKEN cDNA 4933401J01 gene [Source:MGI Symbol;Acc:MGI:1918292]
#> 2             predicted gene, 26206 [Source:MGI Symbol;Acc:MGI:5455983]
#> 3 X-linked Kx blood group related 4 [Source:MGI Symbol;Acc:MGI:3528744]
#> 4             predicted gene, 18956 [Source:MGI Symbol;Acc:MGI:5011141]
#> 5             predicted gene, 37180 [Source:MGI Symbol;Acc:MGI:5610408]
#> 6             predicted gene, 37363 [Source:MGI Symbol;Acc:MGI:5610591]

smoc2_res_sig <- subset(smoc2_res_all, padj < 0.05)
smoc2_res_sig <- smoc2_res_sig %>% arrange(padj)
head(smoc2_res_sig)
#>              ensgene  baseMean log2FoldChange      lfcSE
#> 1 ENSMUSG00000029811  8710.652       5.593885 0.11145593
#> 2 ENSMUSG00000027896  3819.931      -3.523220 0.09295523
#> 3 ENSMUSG00000062908 24293.902      -3.507301 0.11564406
#> 4 ENSMUSG00000028222  5848.681      -4.496932 0.16082443
#> 5 ENSMUSG00000032332  5162.864       4.557115 0.16437178
#> 6 ENSMUSG00000036887  3638.590       3.859276 0.13865758
#>          pvalue          padj  symbol
#> 1  0.000000e+00  0.000000e+00    Aoc1
#> 2 4.544391e-261 5.074721e-257 Slc16a4
#> 3 4.634505e-168 3.450235e-164   Acadm
#> 4 4.327404e-150 2.416206e-146   Calb1
#> 5 6.044426e-148 2.699924e-144 Col12a1
#> 6 1.030528e-144 3.835968e-141    C1qa
#>                                                                                                 description
#> 1                                    amine oxidase, copper-containing 1 [Source:MGI Symbol;Acc:MGI:1923757]
#> 2 solute carrier family 16 (monocarboxylic acid transporters), member 4 [Source:MGI Symbol;Acc:MGI:2385183]
#> 3                             acyl-Coenzyme A dehydrogenase, medium chain [Source:MGI Symbol;Acc:MGI:87867]
#> 4                                                             calbindin 1 [Source:MGI Symbol;Acc:MGI:88248]
#> 5                                             collagen, type XII, alpha 1 [Source:MGI Symbol;Acc:MGI:88448]
#> 6               complement component 1, q subcomponent, alpha polypeptide [Source:MGI Symbol;Acc:MGI:88223]

8.18 Top 20 genes

top_20 <- data.frame(sig_norm_counts_smoc2)[1:20, ] %>% 
  rownames_to_column(var="ensgene")

top_20 <- top_20 %>% pivot_longer(cols = 2:8, values_to = "normalized_counts", names_to="samplename")

head(top_20)
#> # A tibble: 6 x 3
#>   ensgene            samplename      normalized_counts
#>   <chr>              <chr>                       <dbl>
#> 1 ENSMUSG00000051951 smoc2_fibrosis1             50.3 
#> 2 ENSMUSG00000051951 smoc2_fibrosis2             45.6 
#> 3 ENSMUSG00000051951 smoc2_fibrosis3             28.8 
#> 4 ENSMUSG00000051951 smoc2_fibrosis4             27.7 
#> 5 ENSMUSG00000051951 smoc2_normal1                0   
#> 6 ENSMUSG00000051951 smoc2_normal3                3.75

top_20 <- inner_join(top_20, 
                     rownames_to_column(smoc2_metadata, var="samplename"),
                     by="samplename")
# write.table(x = top_20, file = "tmp.csv", sep=";", dec=",", row.names = F, quote = T)
ggplot(top_20) + 
  geom_point(aes(x=ensgene, y=normalized_counts, color=condition)) +
  scale_y_log10() +
  labs(title = "Top 20 genes", x="Genes", y="Normalized counts") +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45, hjust=1))