suppressPackageStartupMessages({
  library(VariantAnnotation)
  library(tidyverse)
  library(RColorBrewer)
  library(ggrepel)
  library(cowplot)
})

Load and process VCF files

First, we need to load the VCF files using readVcf function from VariantAnnotation package. We load both classic GATK and Parabricks GATK VCFs as objects into a named list.

vcf.filenames <- c("classic GATK" = "~/WES_data/vcf_analysis/WES_tumor.annotated.vcf",
                   "Parabricks GATK" = "~/WES_data/vcf_analysis/PB_WES_tumor.annotated.vcf")

vcf.list <- lapply(vcf.filenames, readVcf, genome = "hg38")

Here’s what looking at dimensions of each element of this list gives us:

lapply(vcf.list, dim)
$`classic GATK`
[1] 7480    2

$`Parabricks GATK`
[1] 8994    2

First value is the number of mutations and the second is the number of samples (normal and tumor). That is a lot of variants, luckly, we applied a Mutect2 filter. Let’s subset the VCF objects leaving only variants which passed the filter:

vcf.list.qc <-
  lapply(vcf.list, function(x)
    x[rowRanges(x)$FILTER == "PASS"])

# Also, filter only variants mapped correctly to a chromosome:
vcf.list.qc <-
  lapply(vcf.list.qc, function(x)
    x[!(grepl(names(x), pattern = "chrUn_")),])

lapply(vcf.list.qc, dim)
$`classic GATK`
[1] 2603    2

$`Parabricks GATK`
[1] 3146    2

Next block of code will extract only the relevant fields and columns from the VCF objects, creating a data frame where each row is a detected variant. It will include positional information and annotations of interest (gene name, type of SNV, allelic frequence) along with additional filter information which we will use to further prune the variants.

extractFUNCOTATION <-
  function(vcf,
           fields = c(
             "Gencode_34_hugoSymbol",
             "Gencode_34_variantClassification",
             "Gencode_34_secondaryVariantClassification",
             "Gencode_34_variantType"
           )) {
    annotation_colnames <-
      info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
      stringr::str_remove("Functional annotation from the Funcotator tool.  Funcotation fields are: ") %>%
      stringr::str_split(pattern = "\\|") %>%
      unlist()
    
    as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
      dplyr::select("FUNCOTATION" = value) %>%
      dplyr::mutate(
        FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
        FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
      ) %>%
      tidyr::separate(FUNCOTATION,
                      into = annotation_colnames,
                      sep = "\\|",
                      fill = "right") %>%
      dplyr::select(all_of(fields))
  }

vcf.list.anno <- lapply(vcf.list.qc, function(x) extractFUNCOTATION(x))

snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
  vcf <- vcf.list.qc[[sample.name]]
  df.anno <- vcf.list.anno[[sample.name]]
  snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
    
    # since the sample is the same in both VCF, we rename the AF columns equally
    setNames(., c("AF_CTRL", "AF_TUMOR")) %>%
    
    mutate_all( ~ unlist(.)) %>%
    cbind(., df.anno) %>%
    mutate(
      DP = info(vcf)[["DP"]],
      TLOD = info(vcf)[["TLOD"]] %>% unlist()
    ) %>%
    rownames_to_column("Variant") %>%
    mutate(Position_hg38 = word(Variant, 1, sep = "_"),
           .before = "Variant") %>%
    mutate(Variant = word(Variant, 2, sep = "_"))
}

lapply(snv.list, head, 10) # Print only first 10 rows
$`classic GATK`

$`Parabricks GATK`
NA

Variant filters

Most of these SNVs are artefacts that we will need to filter out.

First, we can remove non-coding variants: introns, variants from intergenic regions, etc. Next block of code creates a second_filter column in the data frame with SNVs which will help us filter the mutations without losing possible false negatives.

noncoding.vars <- c("IGR", "INTRON", "COULD_NOT_DETERMINE", "RNA", "SILENT")

snv.list <- snv.list %>%
  lapply(
  mutate,
    second_filter = case_when(
      Gencode_34_variantClassification %in% noncoding.vars ~ "non-coding",
      Gencode_34_secondaryVariantClassification %in% noncoding.vars ~ "non-coding",
      TRUE ~ "PASS"
    )
  )

lapply(snv.list, filter, second_filter == "PASS") %>% 
  lapply(dim)
$`classic GATK`
[1] 1195   11

$`Parabricks GATK`
[1] 1380   11

Next, we can filter the mutations by thresholding them by tumor allele frequency > 10% and TLOD > 15. TLOD stands for tumor log odds and is a of probability that the variant is present in the tumor sample relative to the expected noise.

AF_TUMOR_thresh <- 0.1
TLOD_thresh <- 15

snv.list <- snv.list %>%
  lapply(
    mutate,
    second_filter = case_when(
      second_filter == "PASS" & TLOD < TLOD_thresh ~ "low_TLOD",
      second_filter == "PASS" &
        AF_TUMOR < AF_TUMOR_thresh ~ "low_TUMOR_AF",
      TRUE ~ second_filter
    )
  )

lapply(snv.list, filter, second_filter == "PASS") %>% 
  lapply(dim)
$`classic GATK`
[1] 871  11

$`Parabricks GATK`
[1] 1064   11

Scatterplot of

max.TLOD <- bind_rows(snv.list) %>%
  select(TLOD) %>% max()
min.TLOD <- bind_rows(snv.list) %>%
  select(TLOD) %>% min()

lapply(names(snv.list),
       function(name) {
         snv.list[[name]] %>%
           filter(second_filter != "non-coding") %>%
           ggplot(aes(y = AF_TUMOR, x = TLOD)) +
           theme_bw() +
           labs(title = name) +
           geom_point(
             aes(color = Gencode_34_variantClassification),
             shape = 1,
             size = 2,
             stroke = 1
           ) +
           scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
           scale_x_continuous(
             trans = "log10",
             limits = c(min.TLOD, max.TLOD),
             breaks = c(1, 10, 20, 50, 100, 200)
           ) +
           geom_vline(xintercept = TLOD_thresh, linetype = 2) +
           geom_hline(yintercept = AF_TUMOR_thresh, linetype = 2)
       })
[[1]]

[[2]]