library(VariantAnnotation)
library(openxlsx)
library(tidyverse)
library(RColorBrewer)
library(ggrepel)
library(cowplot)
wd <- here::here()

extractFUNCOTATION <-
  function(vcf,
           fields = c(
             "Gencode_43_hugoSymbol",
             "Gencode_43_variantClassification",
             "Gencode_43_variantType"
           )) {
    annotation_colnames <-
      info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
      stringr::str_remove("Functional annotation from the Funcotator tool.  Funcotation fields are: ") %>%
      stringr::str_split(pattern = "\\|") %>%
      unlist()
    
    as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
      dplyr::select("FUNCOTATION" = value) %>%
      dplyr::mutate(
        FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
        FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
      ) %>%
      tidyr::separate(FUNCOTATION, into = annotation_colnames,
                      sep = "\\|", fill = "right") %>%
      dplyr::select(all_of(fields))
  }

Load sample

vcf.list <- list()
vcf.filenames <- c("GATK" = "LL.annotated.vcf.gz", "PB" = "pb-LL.vcf")
for(i in seq_along(vcf.filenames)){
  vcf.dir <- file.path(wd, "VCF")
  vcf.filepath <- file.path(vcf.dir, vcf.filenames[[i]])
  
  vcf.list[[names(vcf.filenames)[[i]]]] <- readVcf(vcf.filepath, "hg38")
}
Warning: duplicate keys in header will be forced to unique rownames
  
vcf.list <- vcf.list[1]

Subsetting only mutations that passed the initial upstream quality filter and were not unmapped.

vcf.list.qc <-
  lapply(vcf.list, function(x)
    x[rowRanges(x)$FILTER == "PASS"])
vcf.list.qc <-
  lapply(vcf.list.qc, function(x)
    x[!(grepl(names(x), pattern = "chrUn_")),])

lapply(vcf.list.qc, dim)
$GATK
[1] 2605    2
vcf.list.anno <- lapply(vcf.list.qc, function(x)
  extractFUNCOTATION(
    x,
    fields = c(
      "Gencode_43_hugoSymbol",
      "Gencode_43_variantClassification",
      "Gencode_43_variantType"
    )
  ))

Construct a data frame with useful values for easier plotting:

snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
  vcf <- vcf.list.qc[[sample.name]]
  df.anno <- vcf.list.anno[[sample.name]]
  snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
    setNames(., c("AF_CTRL", "AF_PDLC")) %>%
    mutate_all( ~ unlist(.)) %>%
    cbind(., df.anno) %>%
    mutate(
      DP = info(vcf)[["DP"]],
      GERMQ = info(vcf)[["GERMQ"]],
      NALOD = info(vcf)[["NALOD"]] %>% unlist(),
      NLOD = info(vcf)[["NLOD"]] %>% unlist(),
      TLOD = info(vcf)[["TLOD"]] %>% unlist()
    ) %>%
    rownames_to_column("Variant") %>%
    mutate(Position_hg38 = word(Variant, 1, sep = "_"),
           .before = "Variant") %>%
    mutate(Variant = word(Variant, 2, sep = "_"))
}
snv.list
$GATK