library(tidyverse)
library(tximport)
library(DESeq2)
library(cowplot)
library(ggpubr)
library(ggvenn)
library(ComplexHeatmap)
wd <- "~/data/bulk-rnaseq/mouse/merged_by_sample/output_bulk-rnaseq_align-sort-assemble-count"
dir.inputs <- wd
transcript.cutoff = 150
N_nonzt_samples = 2
cpm.thresh = 1.5
cpm.N_samples = 2
R version 3.6.2 (2019-12-12)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Debian GNU/Linux 10 (buster)
Matrix products: default
BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.3.5.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=C
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] circlize_0.4.8 zFPKM_1.8.0 ComplexHeatmap_2.2.0
[4] ggvenn_0.1.10 ggpubr_0.6.0 cowplot_1.1.1
[7] DESeq2_1.26.0 tximport_1.14.2 forcats_1.0.0
[10] stringr_1.5.0 dplyr_1.1.3 purrr_1.0.2
[13] readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
[16] tidyverse_1.3.0 muscat_1.0.1 scater_1.14.6
[19] ggplot2_3.4.3 SingleCellExperiment_1.8.0 SummarizedExperiment_1.16.1
[22] DelayedArray_0.12.3 BiocParallel_1.20.1 matrixStats_1.0.0
[25] Biobase_2.46.0 GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
[28] IRanges_2.20.2 S4Vectors_0.24.4 BiocGenerics_0.32.0
[31] Seurat_3.1.0
loaded via a namespace (and not attached):
[1] R.methodsS3_1.8.2 acepack_1.4.2 bit64_4.0.5 knitr_1.43
[5] irlba_2.3.5.1 multcomp_1.4-25 R.utils_2.12.2 data.table_1.14.8
[9] rpart_4.1-15 RCurl_1.98-1.12 doParallel_1.0.17 generics_0.1.3
[13] metap_1.8 TH.data_1.1-2 RSQLite_2.3.1 RANN_2.6.1
[17] future_1.33.0 bit_4.0.5 tzdb_0.4.0 mutoss_0.1-13
[21] xml2_1.3.5 lubridate_1.9.2 viridis_0.5.1 xfun_0.40
[25] jquerylib_0.1.4 hms_1.1.3 evaluate_0.21 fansi_1.0.4
[29] progress_1.2.2 readxl_1.4.3 caTools_1.18.2 dbplyr_2.3.3
[33] igraph_1.5.1 DBI_1.1.3 geneplotter_1.64.0 htmlwidgets_1.6.2
[37] backports_1.4.1 annotate_1.64.0 deldir_1.0-9 vctrs_0.6.3
[41] ROCR_1.0-11 abind_1.4-5 cachem_1.0.8 withr_2.5.0
[45] vroom_1.6.3 checkmate_2.2.0 sctransform_0.3.5 prettyunits_1.1.1
[49] mnormt_2.1.1 cluster_2.1.0 ape_5.7-1 lazyeval_0.2.2
[53] crayon_1.5.2 genefilter_1.68.0 labeling_0.4.3 edgeR_3.28.1
[57] pkgconfig_2.0.3 qqconf_1.1.1 nlme_3.1-142 vipor_0.4.5
[61] blme_1.0-5 nnet_7.3-12 rlang_1.1.1 globals_0.16.2
[65] lifecycle_1.0.3 sandwich_3.0-2 mathjaxr_1.6-0 modelr_0.1.11
[69] rsvd_1.0.3 cellranger_1.1.0 lmtest_0.9-40 Matrix_1.6-1
[73] carData_3.0-5 boot_1.3-23 zoo_1.8-12 reprex_2.0.2
[77] base64enc_0.1-3 beeswarm_0.4.0 ggridges_0.5.4 GlobalOptions_0.1.2
[81] png_0.1-8 viridisLite_0.4.2 rjson_0.2.9 bitops_1.0-7
[85] R.oo_1.25.0 KernSmooth_2.23-16 blob_1.2.4 DelayedMatrixStats_1.8.0
[89] shape_1.4.6 parallelly_1.36.0 jpeg_0.1-10 rstatix_0.7.2
[93] ggsignif_0.6.4 scales_1.2.1 memoise_2.0.1 magrittr_2.0.3
[97] plyr_1.8.8 ica_1.0-3 gplots_3.1.3 zlibbioc_1.32.0
[101] compiler_3.6.2 RColorBrewer_1.1-3 plotrix_3.8-2 clue_0.3-64
[105] lme4_1.1-34 fitdistrplus_1.1-11 cli_3.6.1 XVector_0.26.0
[109] lmerTest_3.1-3 listenv_0.9.0 pbapply_1.7-2 TMB_1.9.6
[113] htmlTable_2.4.1 Formula_1.2-5 MASS_7.3-51.4 tidyselect_1.2.0
[117] stringi_1.7.12 yaml_2.3.7 BiocSingular_1.2.2 locfit_1.5-9.4
[121] latticeExtra_0.6-30 ggrepel_0.9.3 sass_0.4.7 tools_3.6.2
[125] timechange_0.2.0 future.apply_1.11.0 rstudioapi_0.15.0 foreach_1.5.2
[129] foreign_0.8-72 gridExtra_2.3 farver_2.1.1 Rtsne_0.16
[133] digest_0.6.33 Rcpp_1.0.11 car_3.1-2 broom_1.0.5
[137] SDMTools_1.1-221.2 RcppAnnoy_0.0.21 httr_1.4.7 AnnotationDbi_1.48.0
[141] Rdpack_2.5 colorspace_2.1-0 rvest_1.0.3 fs_1.6.3
[145] XML_3.99-0.3 reticulate_1.31 splines_3.6.2 uwot_0.1.16
[149] sn_2.1.1 multtest_2.42.0 plotly_4.10.2 xtable_1.8-4
[153] jsonlite_1.8.7 nloptr_2.0.3 R6_2.5.1 TFisher_0.2.0
[157] Hmisc_4.4-0 pillar_1.9.0 htmltools_0.5.6 glue_1.6.2
[161] fastmap_1.1.1 minqa_1.2.5 BiocNeighbors_1.4.2 codetools_0.2-16
[165] tsne_0.1-3.1 mvtnorm_1.2-3 utf8_1.2.3 bslib_0.5.1
[169] lattice_0.20-38 numDeriv_2016.8-1.1 pbkrtest_0.4-8.6 ggbeeswarm_0.7.2
[173] leiden_0.4.3 colorRamps_2.3.1 gtools_3.9.4 interp_1.1-4
[177] survival_3.1-8 limma_3.42.2 glmmTMB_1.1.7 rmarkdown_2.24
[181] munsell_0.5.0 GetoptLong_1.0.5 GenomeInfoDbData_1.2.2 iterators_1.0.14
[185] variancePartition_1.16.1 haven_2.5.3 reshape2_1.4.4 gtable_0.3.4
[189] rbibutils_2.2.15
# hisat2-samtools-stringtie pipeline example, not run here
# Step 1: Alignment
hisat2 -p 36 -x "$REFERENCE_GENOME" -1 "$r1_merged" -2 "$r2_merged" -S "$OUTPUT_DIR/$sam_file"
# Step 2: Converting and sorting
samtools view -bS "$OUTPUT_DIR/$sam_file" > "$OUTPUT_DIR/bams/$bam_file"
samtools sort --threads 36 "$OUTPUT_DIR/bams/$bam_file" -o "$OUTPUT_DIR/bams/$sorted_bam_file"
# Step 3: Assemble and quantify with stringtie
stringtie -p 4 -eB "$OUTPUT_DIR/bams/$sorted_bam_file" -G "$ANNOTATION_GTF" -o "$OUTPUT_DIR/$sample/$transcripts_gtf" -A "$OUTPUT_DIR/$sample/$gene_abundances_tsv"
colData <- read.csv(file.path(dir.inputs, "pheno_data_v1.csv"),
sep = ",",
row.names = 1) %>%
mutate(condition3_sex = factor(condition3_sex, levels = c("Female", "Male"))) %>%
mutate(condition4_treatment = factor(condition4_treatment, levels = c("Saline", "LPS"))) %>%
mutate(condition5_time = factor(condition5_time, levels = c("3h", "24h", "72h"))) %>%
arrange(condition4_treatment, condition5_time, condition3_sex) %>%
mutate(condition1_sex.treatment.time = factor(
condition1_sex.treatment.time,
levels = unique(condition1_sex.treatment.time)
))
colData
dds.list <- list()
fpkm.mat.list <- list()
DESeq.result.list <- list()
timepoints <- c(
"3h" = "3h",
"24h" = "24h",
"72h" = "72h",
"All" = "3h|24h|72h",
"All+male" = "3h|24h|72h"
)
timepoint_name = "3h"
for (timepoint_name in names(timepoints)) {
# Filter the metadata
colData.subset <- colData %>%
filter(str_detect(
condition3_sex,
ifelse(timepoint_name == "All+male", "Male|Female", "Female")
) &
str_detect(condition5_time, timepoints[[timepoint_name]]))
# Load the data into a DDS object via tximport
t_data.ctabs <-
list.files(
file.path(dir.inputs, "stringtie_rerun"),
recursive = T,
pattern = "t_data.ctab",
full.names = T
)
names(t_data.ctabs) <- word(t_data.ctabs, -2, sep = "/")
t_data.ctabs <- t_data.ctabs[rownames(colData.subset)]
tmp <- read_tsv(t_data.ctabs[1])
tx2gene <- tmp[, c("t_name", "gene_name")]
txi <-
tximport(
t_data.ctabs,
type = "stringtie",
tx2gene = tx2gene,
ignoreTxVersion = F
)
dds <-
DESeqDataSetFromTximport(txi,
colData = colData.subset,
design = ~ condition4_treatment)
# Filtering based on the transcript count row sum
keep <- rowSums(counts(dds)) >= transcript.cutoff
dds <- dds[keep,]
# Filtering based on counts > 0 in more than N samples in either of the groups
mat.Saline <-
counts(dds)[, rownames(filter(colData.subset, condition4_treatment == "Saline"))]
mat.Saline.filtered <-
mat.Saline[rowSums(mat.Saline != 0) >= N_nonzt_samples,]
mat.LPS <-
counts(dds)[, rownames(filter(colData.subset, condition4_treatment == "LPS"))]
mat.LPS.filtered <-
mat.LPS[rowSums(mat.LPS != 0) >= N_nonzt_samples,]
genes.filtered <-
c(rownames(mat.Saline.filtered), rownames(mat.LPS.filtered)) %>% unique()
dds <- dds[genes.filtered, ]
# Filtering by CPM > threshold
mat.CPM <- fpm(dds)
keep <- rowSums(mat.CPM > cpm.thresh) >= cpm.N_samples
dds <- dds[keep, ]
# DESeq2 analysis
dds <- DESeq(dds)
res <- results(dds)
# FPKM matrix
fpkm.mat <- fpkm(dds)
DE.genes.ordered <- res %>%
as.data.frame() %>%
rownames()
fpkm.mat.ordered <- fpkm.mat[DE.genes.ordered,]
dds.list[[timepoint_name]] <- dds
fpkm.mat.list[[timepoint_name]] <- fpkm.mat.ordered
DESeq.result.list[[timepoint_name]] <- res %>% as.data.frame()
}
Female samples.
Filtering strategy:
# Plotting
log2.axis.breaks.fpkm = c(2 ^ -5, 2 ^ 0, 2 ^ 5, 2 ^ 10, 2 ^ 15)
log2.axis.labels.fpkm = c("2^-5", "2^0", "2^5", "2^10", "2^15")
plot.FPKM.list <- list()
for (timepoint in names(fpkm.mat.list)[1:3]) {
fpkm.mat.filtered <- fpkm.mat.list[[timepoint]]
res <- DESeq.result.list[[timepoint]]
plot.FPKM.list[[timepoint]] <- data.frame(
FPKM.mean.Saline = fpkm.mat.filtered[, str_which(colnames(fpkm.mat.filtered), "Saline")] %>%
rowMeans(),
FPKM.mean.LPS = fpkm.mat.filtered[, str_which(colnames(fpkm.mat.filtered), "LPS")] %>%
rowMeans()
) %>%
mutate(highlight = case_when(res$padj > 0.05 |
is.na(res$padj) ~ FALSE,
TRUE ~ TRUE)) %>%
filter(FPKM.mean.Saline != 0 & FPKM.mean.LPS != 0) %>%
ggplot(aes(x = FPKM.mean.Saline, y = FPKM.mean.LPS)) +
geom_point(aes(color = highlight), size = 1) +
scale_x_continuous(
trans = "log2",
limits = c(2 ^ -5, 2 ^ 15),
breaks = log2.axis.breaks.fpkm,
labels = log2.axis.labels.fpkm
) +
scale_y_continuous(
trans = "log2",
limits = c(2 ^ -5, 2 ^ 15),
breaks = log2.axis.breaks.fpkm,
labels = log2.axis.labels.fpkm
) +
scale_color_manual(values = c("black", "#FF40FC")) +
labs(color = "p.adj < 0.05") +
labs(x = "Saline (FPKM)", y = "LPS (FPKM)",
title = timepoint) +
cowplot::theme_cowplot()
}
ggpubr::ggarrange(
plotlist = plot.FPKM.list,
common.legend = T,
nrow = 1,
legend = "bottom"
)
All samples, same filtering strategy.
PCA.data <-
plotPCA(vst(dds.list[[5]]), intgroup = "condition1_sex.treatment.time", returnData = T)
PCA.data %>%
mutate(
group_labels = str_replace_all(group, "_", " "),
group_labels = str_replace(group_labels, "\\s(\\d+h)", " (\\1)"),
group_labels = factor(group_labels, levels = unique(group_labels))
) %>%
ggplot(aes(x = PC1, y = -PC2)) +
geom_point((aes(color = group_labels)), size = 2) +
labs(y = "PC2", color = NULL)
Filtered transcripts expressed in LPS-treated female mice compared to saline, additionally filtered to show genes with log2(fold change) >= 1 & adjusted p value < 0.05.
DESeq.result.list.filtered <- list()
for(timepoint in names(DESeq.result.list)[1:3]){
DESeq.result.list.filtered[[timepoint]] <- DESeq.result.list[[timepoint]] %>%
filter(log2FoldChange >= 1 & padj < 0.05)
}
ggvenn::ggvenn(
data = lapply(DESeq.result.list.filtered, rownames),
fill_color = c("red", "blue", "green"),
show_percentage = F,
text_size = 5
)
Log2(fold change) of selected genes.
top.genes.list <- list(
"3h" = c("Csf3", "Serpina3g", "Oasl1", "Gvin1", "Il6", "Socs3", "Gpr84"),
"24h" = c("Timp1", "C3", "Lrg1", "C1ra", "Serping1", "Serpina3m", "Fbln5"),
"72h" = c("Cybb", "Cxcl13", "AI607873", "Rac2", "Cd55"),
"All" = c("Cp", "H2-D1", "Gfap", "Trim30a", "Zbp1", "H2-Q6", "Gbp6")
)
top.genes.lfc <- list()
for (timepoint in names(top.genes.list)) {
gene.set <- top.genes.list[[timepoint]]
top.genes.3h <- DESeq.result.list[["3h"]][gene.set,]
top.genes.24h <- DESeq.result.list[["24h"]][gene.set,]
top.genes.72h <- DESeq.result.list[["72h"]][gene.set,]
rownames(top.genes.3h) <- gene.set
rownames(top.genes.24h) <- gene.set
rownames(top.genes.72h) <- gene.set
top.genes.lfc[[timepoint]] <- rbind(
rownames_to_column(top.genes.3h, "genename") %>%
mutate(timepoint = "Saline", log2FoldChange = 0),
rownames_to_column(top.genes.3h, "genename") %>% mutate(timepoint = "3h"),
rownames_to_column(top.genes.24h, "genename") %>% mutate(timepoint = "24h"),
rownames_to_column(top.genes.72h, "genename") %>% mutate(timepoint = "72h")
) %>%
select(genename, log2FoldChange, timepoint) %>%
mutate(log2FoldChange = replace_na(log2FoldChange, 0))
}
plot.list <- list()
for(plot_name in names(top.genes.lfc)){
plot.list[[plot_name]] <- ggline(top.genes.lfc[[plot_name]],
x = "timepoint",
y = "log2FoldChange",
color = "genename", point.size = 1) +
theme(legend.position = "right") +
labs(y = "log2 fold (compared to saline)", x = NULL, color = NULL)
}
ggarrange(plotlist = plot.list, labels = names(plot.list), label.x = 0.11)
Filtered transcripts expressed in LPS-treated female mice compared to saline, additionally filtered to show genes with log2(fold change) >= 2 & adjusted p value < 0.05.
# HT matrix
zscore<- function(x){
z<- (x - mean(x)) / sd(x)
return(z)
}
fpkm.mat.z <- zscore(fpkm.mat.list[["All+male"]])
significant.genes <- DESeq.result.list[["All+male"]] %>%
filter(log2FoldChange >= 2 & padj < 0.05) %>%
arrange(padj) %>%
rownames()
fpkm.mat.sig.z <- fpkm.mat.z[significant.genes, rownames(colData)]
# HT annotation
## Samples
sample.names <- colData[colnames(fpkm.mat.sig.z),] %>%
mutate(
sample_names = paste(condition4_treatment, condition3_sex, condition5_time, sep = "_"),
sample_names = factor(sample_names, levels = unique(sample_names))
) %>%
pull(sample_names) %>%
str_replace_all(., "_", " ") %>%
str_replace(., "\\s(\\d+h)", " (\\1)")
sample.colors <- unique(sample.names) %>% length() %>%
RColorBrewer::brewer.pal(., "Set2")
names(sample.colors) <- unique(sample.names)
## Treatment
treatment.names <-
colData[colnames(fpkm.mat.sig.z),]$condition4_treatment
treatment.colors <- c("gray20", "gray70")
names(treatment.colors) <- unique(treatment.names)
treatment.groups <- word(colnames(fpkm.mat.sig.z), 2, sep = "_")
treatment.groups <- str_replace(treatment.groups, "Saline", "gray20")
treatment.groups <- str_replace(treatment.groups, "LPS", "gray70")
names(treatment.groups) <- word(colnames(fpkm.mat.sig.z), 2, sep = "_")
treatment.anno <- HeatmapAnnotation(
"Group" = treatment.names,
"Sample" = factor(sample.names, levels = names(sample.colors)),
col = list("Group" = treatment.colors, "Sample" = sample.colors),
show_legend = T,
show_annotation_name = F
)
highlight.genes <- c("Cybb", "Cxcl13", "Timp1", "Igtp", "Ifitm3", "H2-D1", "Ccl2", "Il6")
# HT
ComplexHeatmap::Heatmap(
fpkm.mat.sig.z,
name = "zFPKM",
col = circlize::colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
show_row_names = T,
show_column_names = T,
cluster_columns = F,
cluster_rows = T,
show_row_dend = F,
bottom_annotation = treatment.anno
) +
rowAnnotation(link = anno_mark(
at = which(rownames(fpkm.mat.sig.z) %in% highlight.genes),
labels = highlight.genes,
padding = unit(1, "mm")
))
[1] 291
library(zFPKM)
library(dplyr)
library(ComplexHeatmap)
library(stringr)
library(circlize)
fpkm.mat.list.plus.one <- lapply(fpkm.mat.list, function(mat) mat + 1)
# Convert the matrix to a data.frame
fpkm.df <- as.data.frame(fpkm.mat.list.plus.one[["All+male"]])
# Compute z-scores using zFPKM
fpkm.mat.z <- zFPKM(fpkm.df)
# Convert the result back to a matrix
fpkm.mat.z <- as.matrix(fpkm.mat.z)
significant.genes <- DESeq.result.list[["All+male"]] %>%
filter(log2FoldChange >= 2 & padj < 0.05) %>%
arrange(padj) %>%
rownames()
fpkm.mat.sig.z <- fpkm.mat.z[significant.genes, rownames(colData)]
# HT annotation
## Samples
sample.names <- colData[colnames(fpkm.mat.sig.z),] %>%
mutate(
sample_names = paste(condition4_treatment, condition3_sex, condition5_time, sep = "_"),
sample_names = factor(sample_names, levels = unique(sample_names))
) %>%
pull(sample_names) %>%
str_replace_all(., "_", " ") %>%
str_replace(., "\\s(\\d+h)", " (\\1)")
sample.colors <- unique(sample.names) %>% length() %>%
RColorBrewer::brewer.pal(., "Set2")
names(sample.colors) <- unique(sample.names)
## Treatment
treatment.names <-
colData[colnames(fpkm.mat.sig.z),]$condition4_treatment
treatment.colors <- c("gray20", "gray70")
names(treatment.colors) <- unique(treatment.names)
treatment.groups <- word(colnames(fpkm.mat.sig.z), 2, sep = "_")
treatment.groups <- str_replace(treatment.groups, "Saline", "gray20")
treatment.groups <- str_replace(treatment.groups, "LPS", "gray70")
names(treatment.groups) <- word(colnames(fpkm.mat.sig.z), 2, sep = "_")
treatment.anno <- HeatmapAnnotation(
"Group" = treatment.names,
"Sample" = factor(sample.names, levels = names(sample.colors)),
col = list("Group" = treatment.colors, "Sample" = sample.colors),
show_legend = T,
show_annotation_name = F
)
highlight.genes <- c("Cybb", "Cxcl13", "Timp1", "Igtp", "Ifitm3", "H2-D1", "Ccl2", "Il6")
# HT
ComplexHeatmap::Heatmap(
fpkm.mat.sig.z,
name = "zFPKM",
col = circlize::colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
show_row_names = T,
show_column_names = T,
cluster_columns = F,
cluster_rows = T,
show_row_dend = F,
bottom_annotation = treatment.anno
) +
rowAnnotation(link = anno_mark(
at = which(rownames(fpkm.mat.sig.z) %in% highlight.genes),
labels = highlight.genes,
padding = unit(1, "mm")
))
[1] 291