suppressPackageStartupMessages({
library(VariantAnnotation)
library(tidyverse)
library(RColorBrewer)
library(ggrepel)
library(cowplot)
})
First, we need to load the VCF files using readVcf
function from VariantAnnotation
package. We load both
classic GATK and Parabricks GATK VCFs as objects into a named list.
vcf.filenames <- c("classic GATK" = "~/WES_data/vcf_analysis/WES_tumor.annotated.vcf",
"Parabricks GATK" = "~/WES_data/vcf_analysis/PB_WES_tumor.annotated.vcf")
vcf.list <- lapply(vcf.filenames, readVcf, genome = "hg38")
Here’s what looking at dimensions of each element of this list gives us:
lapply(vcf.list, dim)
$`classic GATK`
[1] 7480 2
$`Parabricks GATK`
[1] 8994 2
First value is the number of mutations and the second is the number of samples (normal and tumor). That is a lot of variants, luckly, we applied a Mutect2 filter. Let’s subset the VCF objects leaving only variants which passed the filter:
vcf.list.qc <-
lapply(vcf.list, function(x)
x[rowRanges(x)$FILTER == "PASS"])
# Also, filter only variants mapped correctly to a chromosome:
vcf.list.qc <-
lapply(vcf.list.qc, function(x)
x[!(grepl(names(x), pattern = "chrUn_")),])
lapply(vcf.list.qc, dim)
$`classic GATK`
[1] 2603 2
$`Parabricks GATK`
[1] 3146 2
Next block of code will extract only the relevant fields and columns from the VCF objects, creating a data frame where each row is a detected variant. It will include positional information and annotations of interest (gene name, type of SNV, allelic frequence) along with additional filter information which we will use to further prune the variants.
extractFUNCOTATION <-
function(vcf,
fields = c(
"Gencode_34_hugoSymbol",
"Gencode_34_variantClassification",
"Gencode_34_secondaryVariantClassification",
"Gencode_34_variantType"
)) {
annotation_colnames <-
info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
stringr::str_remove("Functional annotation from the Funcotator tool. Funcotation fields are: ") %>%
stringr::str_split(pattern = "\\|") %>%
unlist()
as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
dplyr::select("FUNCOTATION" = value) %>%
dplyr::mutate(
FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
) %>%
tidyr::separate(FUNCOTATION,
into = annotation_colnames,
sep = "\\|",
fill = "right") %>%
dplyr::select(all_of(fields))
}
vcf.list.anno <- lapply(vcf.list.qc, function(x) extractFUNCOTATION(x))
snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
vcf <- vcf.list.qc[[sample.name]]
df.anno <- vcf.list.anno[[sample.name]]
snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
# since the sample is the same in both VCF, we rename the AF columns equally
setNames(., c("AF_CTRL", "AF_TUMOR")) %>%
mutate_all( ~ unlist(.)) %>%
cbind(., df.anno) %>%
mutate(
DP = info(vcf)[["DP"]],
TLOD = info(vcf)[["TLOD"]] %>% unlist()
) %>%
rownames_to_column("Variant") %>%
mutate(Position_hg38 = word(Variant, 1, sep = "_"),
.before = "Variant") %>%
mutate(Variant = word(Variant, 2, sep = "_"))
}
lapply(snv.list, head, 10) # Print only first 10 rows
$`classic GATK`
$`Parabricks GATK`
NA
Variant filters
Most of these SNVs are artefacts that we will need to filter out.
First, we can remove non-coding variants: introns, variants from
intergenic regions, etc. Next block of code creates a
second_filter
column in the data frame with SNVs which will
help us filter the mutations without losing possible false
negatives.
noncoding.vars <- c("IGR", "INTRON", "COULD_NOT_DETERMINE", "RNA", "SILENT")
snv.list <- snv.list %>%
lapply(
mutate,
second_filter = case_when(
Gencode_34_variantClassification %in% noncoding.vars ~ "non-coding",
Gencode_34_secondaryVariantClassification %in% noncoding.vars ~ "non-coding",
TRUE ~ "PASS"
)
)
lapply(snv.list, filter, second_filter == "PASS") %>%
lapply(dim)
$`classic GATK`
[1] 1195 11
$`Parabricks GATK`
[1] 1380 11
Next, we can filter the mutations by thresholding them by tumor
allele frequency > 10% and TLOD
> 15.
TLOD
stands for tumor log odds and is a of probability that
the variant is present in the tumor sample relative to the expected
noise.
AF_TUMOR_thresh <- 0.1
TLOD_thresh <- 15
snv.list <- snv.list %>%
lapply(
mutate,
second_filter = case_when(
second_filter == "PASS" & TLOD < TLOD_thresh ~ "low_TLOD",
second_filter == "PASS" &
AF_TUMOR < AF_TUMOR_thresh ~ "low_TUMOR_AF",
TRUE ~ second_filter
)
)
lapply(snv.list, filter, second_filter == "PASS") %>%
lapply(dim)
$`classic GATK`
[1] 871 11
$`Parabricks GATK`
[1] 1064 11
Scatterplot of
max.TLOD <- bind_rows(snv.list) %>%
select(TLOD) %>% max()
min.TLOD <- bind_rows(snv.list) %>%
select(TLOD) %>% min()
lapply(names(snv.list),
function(name) {
snv.list[[name]] %>%
filter(second_filter != "non-coding") %>%
ggplot(aes(y = AF_TUMOR, x = TLOD)) +
theme_bw() +
labs(title = name) +
geom_point(
aes(color = Gencode_34_variantClassification),
shape = 1,
size = 2,
stroke = 1
) +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(
trans = "log10",
limits = c(min.TLOD, max.TLOD),
breaks = c(1, 10, 20, 50, 100, 200)
) +
geom_vline(xintercept = TLOD_thresh, linetype = 2) +
geom_hline(yintercept = AF_TUMOR_thresh, linetype = 2)
})
[[1]]
[[2]]
Compare the allele frequencies for SNV detected in both samples
Let’s combine the snv.list
into one data frame and
inspect the overlap of filtered mutations:
combined.snv.df <- snv.list %>%
bind_rows(.id = "pipeline") %>%
filter(second_filter == "PASS") %>%
dplyr::select(pipeline, Position_hg38, Variant, AF_TUMOR,
Gencode_34_hugoSymbol, Gencode_34_variantClassification) %>%
pivot_wider(names_from = "pipeline", values_from = "AF_TUMOR")
combined.snv.df %>%
mutate(detection = case_when(
!is.na(`classic GATK`) & !is.na(`Parabricks GATK`) ~ "classic and Parabricks",
!is.na(`classic GATK`) & is.na(`Parabricks GATK`) ~ "classic",
is.na(`classic GATK`) & !is.na(`Parabricks GATK`) ~ "Parabricks"
)) %>%
group_by(detection, Gencode_34_variantClassification) %>%
summarise(N = n()) %>%
pivot_wider(names_from = detection, values_from = N)
`summarise()` has grouped output by 'detection'. You can override using the `.groups` argument.
One of the most commonly mutated genes in tumors is TP53. Using filter by gene name we can check the presence of TP53 mutations.
Finally, lets compare the allele frequencies for the variants detected by both pipelines:
combined.snv.df %>%
ggplot(aes(x = `classic GATK`, y = `Parabricks GATK`)) +
geom_point(aes(fill = Gencode_34_variantClassification),
shape = 21,
size = 2) +
theme_bw() +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(labels = scales::percent, limits = c(0, 1))
Session info
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS/LAPACK: /home/bench-user/.apps/conda/lib/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Etc/UTC
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_1.1.3 ggrepel_0.9.5 RColorBrewer_1.1-3 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[7] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1 ggplot2_3.4.4
[13] tidyverse_2.0.0 VariantAnnotation_1.48.1 Rsamtools_2.18.0 Biostrings_2.70.1 XVector_0.42.0 SummarizedExperiment_1.32.0
[19] Biobase_2.62.0 GenomicRanges_1.54.1 GenomeInfoDb_1.38.1 IRanges_2.36.0 S4Vectors_0.40.2 MatrixGenerics_1.14.0
[25] matrixStats_1.2.0 BiocGenerics_0.48.1
loaded via a namespace (and not attached):
[1] DBI_1.2.1 bitops_1.0-7 biomaRt_2.58.0 rlang_1.1.3 magrittr_2.0.3 compiler_4.3.1 RSQLite_2.3.4
[8] GenomicFeatures_1.54.1 png_0.1-8 vctrs_0.6.5 pkgconfig_2.0.3 crayon_1.5.2 fastmap_1.1.1 dbplyr_2.4.0
[15] labeling_0.4.3 utf8_1.2.4 rmarkdown_2.25 tzdb_0.4.0 ggbeeswarm_0.7.2 bit_4.0.5 xfun_0.41
[22] zlibbioc_1.48.0 cachem_1.0.8 jsonlite_1.8.8 progress_1.2.3 blob_1.2.4 DelayedArray_0.28.0 BiocParallel_1.36.0
[29] parallel_4.3.1 prettyunits_1.2.0 R6_2.5.1 bslib_0.6.1 stringi_1.8.3 rtracklayer_1.62.0 jquerylib_0.1.4
[36] Rcpp_1.0.12 knitr_1.45 timechange_0.3.0 Matrix_1.6-5 tidyselect_1.2.0 rstudioapi_0.15.0 abind_1.4-5
[43] yaml_2.3.8 codetools_0.2-19 curl_5.1.0 lattice_0.22-5 withr_3.0.0 KEGGREST_1.42.0 evaluate_0.23
[50] BiocFileCache_2.10.1 xml2_1.3.6 pillar_1.9.0 filelock_1.0.3 rsconnect_1.2.0 generics_0.1.3 RCurl_1.98-1.14
[57] hms_1.1.3 munsell_0.5.0 scales_1.3.0 glue_1.7.0 tools_4.3.1 BiocIO_1.12.0 BSgenome_1.70.1
[64] GenomicAlignments_1.38.0 XML_3.99-0.16.1 grid_4.3.1 AnnotationDbi_1.64.1 colorspace_2.1-0 GenomeInfoDbData_1.2.11 beeswarm_0.4.0
[71] restfulr_0.0.15 vipor_0.4.7 cli_3.6.2 rappdirs_0.3.3 fansi_1.0.6 S4Arrays_1.2.0 gtable_0.3.4
[78] sass_0.4.8 digest_0.6.34 SparseArray_1.2.2 farver_2.1.1 rjson_0.2.21 memoise_2.0.1 htmltools_0.5.7
[85] lifecycle_1.0.4 httr_1.4.7 bit64_4.0.5