library(VariantAnnotation)
library(openxlsx)
library(tidyverse)
library(RColorBrewer)
library(ggrepel)
library(cowplot)
wd <- here::here()
extractFUNCOTATION <-
function(vcf,
fields = c(
"Gencode_43_hugoSymbol",
"Gencode_43_variantClassification",
"Gencode_43_variantType"
)) {
annotation_colnames <-
info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
stringr::str_remove("Functional annotation from the Funcotator tool. Funcotation fields are: ") %>%
stringr::str_split(pattern = "\\|") %>%
unlist()
as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
dplyr::select("FUNCOTATION" = value) %>%
dplyr::mutate(
FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
) %>%
tidyr::separate(FUNCOTATION, into = annotation_colnames,
sep = "\\|", fill = "right") %>%
dplyr::select(all_of(fields))
}
vcf.list <- list()
vcf.filenames <- c("GATK" = "LL.annotated.vcf.gz", "PB" = "pb-LL.vcf")
for(i in seq_along(vcf.filenames)){
vcf.dir <- file.path(wd, "VCF")
vcf.filepath <- file.path(vcf.dir, vcf.filenames[[i]])
vcf.list[[names(vcf.filenames)[[i]]]] <- readVcf(vcf.filepath, "hg38")
}
Warning: duplicate keys in header will be forced to unique rownames
vcf.list <- vcf.list[1]
Subsetting only mutations that passed the initial upstream quality filter and were not unmapped.
vcf.list.qc <-
lapply(vcf.list, function(x)
x[rowRanges(x)$FILTER == "PASS"])
vcf.list.qc <-
lapply(vcf.list.qc, function(x)
x[!(grepl(names(x), pattern = "chrUn_")),])
lapply(vcf.list.qc, dim)
$GATK
[1] 2605 2
vcf.list.anno <- lapply(vcf.list.qc, function(x)
extractFUNCOTATION(
x,
fields = c(
"Gencode_43_hugoSymbol",
"Gencode_43_variantClassification",
"Gencode_43_variantType"
)
))
Construct a data frame with useful values for easier plotting:
snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
vcf <- vcf.list.qc[[sample.name]]
df.anno <- vcf.list.anno[[sample.name]]
snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
setNames(., c("AF_CTRL", "AF_PDLC")) %>%
mutate_all( ~ unlist(.)) %>%
cbind(., df.anno) %>%
mutate(
DP = info(vcf)[["DP"]],
GERMQ = info(vcf)[["GERMQ"]],
NALOD = info(vcf)[["NALOD"]] %>% unlist(),
NLOD = info(vcf)[["NLOD"]] %>% unlist(),
TLOD = info(vcf)[["TLOD"]] %>% unlist()
) %>%
rownames_to_column("Variant") %>%
mutate(Position_hg38 = word(Variant, 1, sep = "_"),
.before = "Variant") %>%
mutate(Variant = word(Variant, 2, sep = "_"))
}
snv.list
$GATK