library(VariantAnnotation)
library(openxlsx)
library(tidyverse)
library(RColorBrewer)
library(ggrepel)
library(cowplot)
wd <- here::here()
VCF files are loaded into a list. Names of the “samples”:
vcf.list <- list()
vcf.filenames <- c("GATK" = "LL.annotated.vcf.gz", "PB" = "pb-LL.annotated.vcf")
for(i in seq_along(vcf.filenames)){
vcf.dir <- file.path(wd, "VCF")
vcf.filepath <- file.path(vcf.dir, vcf.filenames[[i]])
vcf.list[[names(vcf.filenames)[[i]]]] <- readVcf(vcf.filepath, "hg38")
}
names(vcf.list)
[1] "GATK" "PB"
Sub-setting only mutations that passed the initial upstream quality filter and were not unmapped:
vcf.list.qc <-
lapply(vcf.list, function(x)
x[rowRanges(x)$FILTER == "PASS"])
vcf.list.qc <-
lapply(vcf.list.qc, function(x)
x[!(grepl(names(x), pattern = "chrUn_")),])
lapply(vcf.list.qc, dim)
$GATK
[1] 2605 2
$PB
[1] 3007 2
Get only gene symbol, variant class and type from the functional annotation of the VCF.
vcf.list.anno[[1]] %>% colnames()
[1] "Gencode_43_hugoSymbol" "Gencode_43_variantClassification" "Gencode_43_variantType"
Construct a data frame with extracted values.
snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
vcf <- vcf.list.qc[[sample.name]]
df.anno <- vcf.list.anno[[sample.name]]
snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
setNames(., c("AF_CTRL", "AF_PDLC")) %>%
mutate_all( ~ unlist(.)) %>%
cbind(., df.anno) %>%
mutate(
DP = info(vcf)[["DP"]],
GERMQ = info(vcf)[["GERMQ"]],
NALOD = info(vcf)[["NALOD"]] %>% unlist(),
NLOD = info(vcf)[["NLOD"]] %>% unlist(),
TLOD = info(vcf)[["TLOD"]] %>% unlist()
) %>%
rownames_to_column("Variant") %>%
mutate(Position_hg38 = word(Variant, 1, sep = "_"),
.before = "Variant") %>%
mutate(Variant = word(Variant, 2, sep = "_"))
}
lapply(snv.list, head)
$GATK
$PB
NA
The list of SNV data frames is bound into one data frame with a sample name column.
snv.df <- bind_rows(snv.list, .id = "sample")
head(snv.df)
Now we can compare the samples side by side.
A plot of allelic frequency for all the SNV in the data frame.
We can look for the SNV in a particular gene:
snv.df %>%
filter(Gencode_43_hugoSymbol == "TP53")
GATK filter tool provides VCF with columns of data by which the SNV could be further filtered to remove germline variants and sequencing artifacts.
DP is an approximate read depth. We want to remove variants under a certain DP threshold.
GERMQ - The phred-scaled posterior probability that the alternate allele(s) are NOT germline variants.
NALOD (Normal Allele Log Odds) is negative log 10 odds of artifact in normal with same allele fraction as tumor. 5% chance is NALOD ~ 1.3
NLOD (Normal Log Odds) is log odds that the variant is not present in the normal sample
TLOD (Tumor Log Odds) is a log odds ratio that represents the likelihood that the observed tumor allele depth (coverage) is due to a real mutation.
filters = list(
"DP" = 30,
"GERMQ" = 90,
"NALOD" = -log10(0.05),
"NLOD" = 6,
"TLOD" = 30,
"AF_CTRL" = 0.025,
"AF_TUMOR" = 0.1
)
snv.list[["GATK"]] %>%
ggplot(.,
aes(y = AF_CTRL, x = DP)) +
geom_point(aes(color = GERMQ),
shape = 1, alpha = 0.75,
size = 1.5, stroke = 1
) +
scale_colour_steps2(
high = "green",
midpoint = 50,
mid = "blue",
low = "blue",
breaks = filters[["GERMQ"]]
) +
scale_x_continuous(trans = "log10") +
scale_y_continuous(labels = scales::percent) +
geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
geom_vline(xintercept = filters[["DP"]], linetype = "dashed") +
labs(y = "CTRL AF, %")
snv.list[["GATK"]] %>%
dplyr::filter(
GERMQ >= filters[["GERMQ"]],
DP >= filters[["DP"]]
) %>%
ggplot(.,
aes(x = TLOD,
y = NLOD)) +
geom_density_2d(color = "black") +
geom_point(
aes(color = NALOD,
shape = GERMQ
), size = 2, stroke = 1) +
scale_y_continuous(trans = "log10") +
scale_x_continuous(trans = "log10") +
scale_colour_steps2(
high = "green",
midpoint = filters[["NALOD"]] / 2,
mid = "blue",
low = "blue",
breaks = filters[["NALOD"]]
) +
scale_shape_binned(breaks = c(filters[["GERMQ"]]), solid = FALSE) +
geom_hline(yintercept = filters[["NLOD"]], linetype = 2) +
geom_vline(xintercept = filters[["TLOD"]], linetype = 2)
First plot shows thresholds based on CTRL allelic frequency, sequencing depth and GERMQ score. The SNV which pass these thresholds are shown in the second plot. They are thresholded using NLOD, NALOD and TLOD scores.
filters = list(
"DP" = 30,
"GERMQ" = 90,
"NALOD" = -log10(0.05),
"NLOD" = 6,
"TLOD" = 30,
"AF_CTRL" = 0.025,
"AF_TUMOR" = 0.1
)
snv.list[["PB"]] %>%
ggplot(.,
aes(y = AF_CTRL, x = DP)) +
geom_point(aes(color = GERMQ),
shape = 1, alpha = 0.75,
size = 1.5, stroke = 1
) +
scale_colour_steps2(
high = "green",
midpoint = 50,
mid = "blue",
low = "blue",
breaks = filters[["GERMQ"]]
) +
scale_x_continuous(trans = "log10") +
scale_y_continuous(labels = scales::percent) +
geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
geom_vline(xintercept = filters[["DP"]], linetype = "dashed") +
labs(y = "CTRL AF, %")
snv.list[["PB"]] %>%
dplyr::filter(
GERMQ >= filters[["GERMQ"]],
DP >= filters[["DP"]]
) %>%
ggplot(.,
aes(x = TLOD,
y = NLOD)) +
geom_density_2d(color = "black") +
geom_point(
aes(color = NALOD,
shape = GERMQ
), size = 2, stroke = 1) +
scale_y_continuous(trans = "log10") +
scale_x_continuous(trans = "log10") +
scale_colour_steps2(
high = "green",
midpoint = filters[["NALOD"]] / 2,
mid = "blue",
low = "blue",
breaks = filters[["NALOD"]]
) +
scale_shape_binned(breaks = c(filters[["GERMQ"]]), solid = FALSE) +
geom_hline(yintercept = filters[["NLOD"]], linetype = 2) +
geom_vline(xintercept = filters[["TLOD"]], linetype = 2)
First plot shows thresholds based on CTRL allelic frequency, sequencing depth and GERMQ score. The SNV which pass these thresholds are shown in the second plot. They are thresholded using NLOD, NALOD and TLOD scores.
Constructing a filter vector for vcf_qc
object:
colnames(snv.df)
[1] "sample" "Position_hg38"
[3] "Variant" "AF_CTRL"
[5] "AF_PDLC" "Gencode_43_hugoSymbol"
[7] "Gencode_43_variantClassification" "Gencode_43_variantType"
[9] "DP" "GERMQ"
[11] "NALOD" "NLOD"
[13] "TLOD"
Rather than just filtering, we annotate the SNV based on first reason for filtering out.
snv.filter.df <- snv.df %>%
mutate(
second_filter = case_when(
DP <= filters[["DP"]] ~ "low_DP",
GERMQ <= filters[["GERMQ"]] ~ "low_GERMQ",
AF_CTRL > filters[["AF_CTRL"]] ~ "high_CTRL_AF",
AF_PDLC < filters[["AF_TUMOR"]] ~ "low_Tumor_AF",
NALOD < filters[["NALOD"]] ~ "low_NALOD",
NLOD < filters[["NLOD"]] ~ "low_NLOD",
TLOD < filters[["TLOD"]] ~ "low_TLOD",
TRUE ~ "PASS"
)
)
snv.filter.df %>%
filter(second_filter == "PASS")
Again, we can check for the presence of a specific mutation in the dataset:
snv.filter.df %>%
filter(Gencode_43_hugoSymbol == "TP53")
c(
"low_DP" = "red",
"low_GERMQ" = "orange",
"low_NALOD" = "purple",
"low_NLOD" = "violet",
"low_TLOD" = "firebrick",
"high_CTRL_AF" = "coral2",
"low_Tumor_AF" = "blueviolet",
"PASS" = "green"
) -> pal.colors
snv.filter.df %>%
pivot_longer(cols = contains("AF"), values_to = "AF") %>%
ggplot(aes(x = name, y = AF)) +
ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
theme_bw() + labs(x = NULL) +
scale_color_manual(values = pal.colors) +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~ sample)
Shawarma plots of allelic frequencies for all mutations (including filtered out).
snv.filter.df %>%
filter(second_filter == "PASS") %>%
pivot_longer(cols = contains("AF"), values_to = "AF") %>%
ggplot(aes(x = name, y = AF)) +
ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
theme_bw() + labs(x = NULL) +
scale_color_manual(values = pal.colors) +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~ sample)
Same plot but only SNV which passed the filters.
Data frame of all filtered SNVs:
snv.filter.df.wide <- snv.filter.df %>%
filter(second_filter == "PASS") %>%
dplyr::select(sample, Position_hg38, Variant, AF_PDLC,
Gencode_43_hugoSymbol, Gencode_43_variantClassification) %>%
pivot_wider(names_from = "sample", values_from = "AF_PDLC")
snv.filter.df.wide
Data frame of missense filtered SNVs:
snv.filter.df.wide %>%
filter(Gencode_43_variantClassification == "MISSENSE")
NB: Not all mutations detected by PB-GATK were detected by GATK!
dot_position = position_jitter(height = 0,
width = 0.25,
seed = 413)
snv.filter.df %>%
filter(second_filter == "PASS") %>%
ggplot() +
lemon::geom_pointline(
aes(
x = sample,
y = AF_PDLC,
group = Position_hg38,
fill = Gencode_43_variantClassification
),
linecolor = "gray30",
linetype = 3,
position = dot_position,
shape = 21,
size = 2
) +
geom_point(
aes(
x = sample,
y = AF_PDLC,
group = Position_hg38,
fill = Gencode_43_variantClassification
),
position = dot_position,
shape = 21,
size = 2
) +
geom_hline(yintercept = 0.05, linetype = "dashed") +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
theme_bw() +
labs(x = NULL, y = "Allelic frequency",
fill = "SNV type")
Visual comparison of allelic frequencies for both runs. Only filtered SNV present in both samples are plotted.
dot_position = position_jitter(height = 0,
width = 0.25,
seed = 413)
snv.filter.df %>%
filter(second_filter == "PASS") %>%
filter(Gencode_43_variantClassification == "MISSENSE") %>%
ggplot() +
lemon::geom_pointline(
aes(x = sample,
y = AF_PDLC,
group = Position_hg38),
linecolor = "gray30",
linetype = 3,
position = dot_position,
shape = 21,
size = 2,
fill = "gray90"
) +
geom_hline(yintercept = 0.05, linetype = "dashed") +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
theme_bw() +
labs(x = NULL, y = "Allelic frequency",
fill = "SNV type")
Visual comparison of allelic frequencies for missence mutations present in both samples.
output_dir <- "~/test_data/SNVcalls/VCF"
output_file_xlsx <- paste0("Filtered_SNV.xlsx")
output_path_xlsx <- file.path(output_dir, output_file_xlsx)
list(
"Total SNV" = snv.filter.df,
"Filtered SNV" = snv.filter.df %>%
filter(second_filter == "PASS") %>%
dplyr::select(-second_filter)
) %>%
write.xlsx(., output_path_xlsx, overwrite = T)
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS/LAPACK: /home/bench-user/.apps/conda/lib/libopenblasp-r0.3.25.so; LAPACK version 3.11.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
[4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
[10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Etc/UTC
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_1.1.2 ggrepel_0.9.4 RColorBrewer_1.1-3
[4] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[7] dplyr_1.1.4 purrr_1.0.2 readr_2.1.4
[10] tidyr_1.3.0 tibble_3.2.1 ggplot2_3.4.4
[13] tidyverse_2.0.0 openxlsx_4.2.5.2 VariantAnnotation_1.48.1
[16] Rsamtools_2.18.0 Biostrings_2.70.1 XVector_0.42.0
[19] SummarizedExperiment_1.32.0 Biobase_2.62.0 GenomicRanges_1.54.1
[22] GenomeInfoDb_1.38.1 IRanges_2.36.0 S4Vectors_0.40.2
[25] MatrixGenerics_1.14.0 matrixStats_1.2.0 BiocGenerics_0.48.1
loaded via a namespace (and not attached):
[1] DBI_1.2.0 bitops_1.0-7 gridExtra_2.3 biomaRt_2.58.0
[5] rlang_1.1.2 magrittr_2.0.3 compiler_4.3.1 RSQLite_2.3.4
[9] GenomicFeatures_1.54.1 png_0.1-8 vctrs_0.6.5 pkgconfig_2.0.3
[13] crayon_1.5.2 fastmap_1.1.1 dbplyr_2.4.0 lemon_0.4.7
[17] labeling_0.4.3 utf8_1.2.4 rmarkdown_2.25 tzdb_0.4.0
[21] ggbeeswarm_0.7.2 bit_4.0.5 xfun_0.41 zlibbioc_1.48.0
[25] cachem_1.0.8 progress_1.2.3 blob_1.2.4 DelayedArray_0.28.0
[29] BiocParallel_1.36.0 parallel_4.3.1 prettyunits_1.2.0 R6_2.5.1
[33] stringi_1.8.3 rtracklayer_1.62.0 Rcpp_1.0.11 knitr_1.45
[37] timechange_0.2.0 Matrix_1.6-4 tidyselect_1.2.0 rstudioapi_0.15.0
[41] abind_1.4-5 yaml_2.3.8 codetools_0.2-19 curl_5.1.0
[45] lattice_0.22-5 plyr_1.8.9 withr_2.5.2 KEGGREST_1.42.0
[49] evaluate_0.23 isoband_0.2.7 BiocFileCache_2.10.1 zip_2.3.0
[53] xml2_1.3.6 pillar_1.9.0 filelock_1.0.3 generics_0.1.3
[57] rprojroot_2.0.4 RCurl_1.98-1.13 hms_1.1.3 munsell_0.5.0
[61] scales_1.3.0 glue_1.6.2 tools_4.3.1 BiocIO_1.12.0
[65] BSgenome_1.70.1 GenomicAlignments_1.38.0 XML_3.99-0.16 grid_4.3.1
[69] AnnotationDbi_1.64.1 colorspace_2.1-0 GenomeInfoDbData_1.2.11 beeswarm_0.4.0
[73] restfulr_0.0.15 vipor_0.4.7 cli_3.6.2 rappdirs_0.3.3
[77] fansi_1.0.6 S4Arrays_1.2.0 gtable_0.3.4 digest_0.6.33
[81] SparseArray_1.2.2 farver_2.1.1 rjson_0.2.21 memoise_2.0.1
[85] htmltools_0.5.7 lifecycle_1.0.4 httr_1.4.7 here_1.0.1
[89] MASS_7.3-60 bit64_4.0.5