suppressPackageStartupMessages({
library(VariantAnnotation)
library(tidyverse)
library(RColorBrewer)
library(ggrepel)
library(cowplot)
})
First, we need to load the VCF files using readVcf
function from VariantAnnotation
package. We load both
classic GATK and Parabricks GATK VCFs as objects into a named list.
vcf.filenames <- c("classic GATK" = "~/WES_data/vcf_analysis/WES_tumor.annotated.vcf",
"Parabricks GATK" = "~/WES_data/vcf_analysis/PB_WES_tumor.annotated.vcf")
vcf.list <- lapply(vcf.filenames, readVcf, genome = "hg38")
Here’s what looking at dimensions of each element of this list gives us:
lapply(vcf.list, dim)
$`classic GATK`
[1] 7480 2
$`Parabricks GATK`
[1] 8994 2
First value is the number of mutations and the second is the number of samples (normal and tumor). That is a lot of variants, luckly, we applied a Mutect2 filter. Let’s subset the VCF objects leaving only variants which passed the filter:
vcf.list.qc <-
lapply(vcf.list, function(x)
x[rowRanges(x)$FILTER == "PASS"])
# Also, filter only variants mapped correctly to a chromosome:
vcf.list.qc <-
lapply(vcf.list.qc, function(x)
x[!(grepl(names(x), pattern = "chrUn_")),])
lapply(vcf.list.qc, dim)
$`classic GATK`
[1] 2603 2
$`Parabricks GATK`
[1] 3146 2
Next block of code will extract only the relevant fields and columns from the VCF objects, creating a data frame where each row is a detected variant. It will include positional information and annotations of interest (gene name, type of SNV, allelic frequence) along with additional filter information which we will use to further prune the variants.
extractFUNCOTATION <-
function(vcf,
fields = c(
"Gencode_34_hugoSymbol",
"Gencode_34_variantClassification",
"Gencode_34_secondaryVariantClassification",
"Gencode_34_variantType"
)) {
annotation_colnames <-
info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
stringr::str_remove("Functional annotation from the Funcotator tool. Funcotation fields are: ") %>%
stringr::str_split(pattern = "\\|") %>%
unlist()
as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
dplyr::select("FUNCOTATION" = value) %>%
dplyr::mutate(
FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
) %>%
tidyr::separate(FUNCOTATION,
into = annotation_colnames,
sep = "\\|",
fill = "right") %>%
dplyr::select(all_of(fields))
}
vcf.list.anno <- lapply(vcf.list.qc, function(x) extractFUNCOTATION(x))
snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
vcf <- vcf.list.qc[[sample.name]]
df.anno <- vcf.list.anno[[sample.name]]
snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
# since the sample is the same in both VCF, we rename the AF columns equally
setNames(., c("AF_CTRL", "AF_TUMOR")) %>%
mutate_all( ~ unlist(.)) %>%
cbind(., df.anno) %>%
mutate(
DP = info(vcf)[["DP"]],
TLOD = info(vcf)[["TLOD"]] %>% unlist()
) %>%
rownames_to_column("Variant") %>%
mutate(Position_hg38 = word(Variant, 1, sep = "_"),
.before = "Variant") %>%
mutate(Variant = word(Variant, 2, sep = "_"))
}
lapply(snv.list, head, 10) # Print only first 10 rows
$`classic GATK`
$`Parabricks GATK`
NA
Variant filters
Most of these SNVs are artefacts that we will need to filter out.
First, we can remove non-coding variants: introns, variants from
intergenic regions, etc. Next block of code creates a
second_filter
column in the data frame with SNVs which will
help us filter the mutations without losing possible false
negatives.
noncoding.vars <- c("IGR", "INTRON", "COULD_NOT_DETERMINE", "RNA", "SILENT")
snv.list <- snv.list %>%
lapply(
mutate,
second_filter = case_when(
Gencode_34_variantClassification %in% noncoding.vars ~ "non-coding",
Gencode_34_secondaryVariantClassification %in% noncoding.vars ~ "non-coding",
TRUE ~ "PASS"
)
)
lapply(snv.list, filter, second_filter == "PASS") %>%
lapply(dim)
$`classic GATK`
[1] 1195 11
$`Parabricks GATK`
[1] 1380 11
Next, we can filter the mutations by thresholding them by tumor
allele frequency > 10% and TLOD
> 15.
TLOD
stands for tumor log odds and is a of probability that
the variant is present in the tumor sample relative to the expected
noise.
AF_TUMOR_thresh <- 0.1
TLOD_thresh <- 15
snv.list <- snv.list %>%
lapply(
mutate,
second_filter = case_when(
second_filter == "PASS" & TLOD < TLOD_thresh ~ "low_TLOD",
second_filter == "PASS" &
AF_TUMOR < AF_TUMOR_thresh ~ "low_TUMOR_AF",
TRUE ~ second_filter
)
)
lapply(snv.list, filter, second_filter == "PASS") %>%
lapply(dim)
$`classic GATK`
[1] 871 11
$`Parabricks GATK`
[1] 1064 11
Scatterplot of
max.TLOD <- bind_rows(snv.list) %>%
select(TLOD) %>% max()
min.TLOD <- bind_rows(snv.list) %>%
select(TLOD) %>% min()
lapply(names(snv.list),
function(name) {
snv.list[[name]] %>%
filter(second_filter != "non-coding") %>%
ggplot(aes(y = AF_TUMOR, x = TLOD)) +
theme_bw() +
labs(title = name) +
geom_point(
aes(color = Gencode_34_variantClassification),
shape = 1,
size = 2,
stroke = 1
) +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(
trans = "log10",
limits = c(min.TLOD, max.TLOD),
breaks = c(1, 10, 20, 50, 100, 200)
) +
geom_vline(xintercept = TLOD_thresh, linetype = 2) +
geom_hline(yintercept = AF_TUMOR_thresh, linetype = 2)
})
[[1]]
[[2]]