library(VariantAnnotation)
library(openxlsx)
library(tidyverse)
library(RColorBrewer)
library(ggrepel)
library(cowplot)
wd <- here::here()
extractFUNCOTATION <-
function(vcf,
fields = c(
"Gencode_43_hugoSymbol",
"Gencode_43_variantClassification",
"Gencode_43_variantType"
)) {
annotation_colnames <-
info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
stringr::str_remove("Functional annotation from the Funcotator tool. Funcotation fields are: ") %>%
stringr::str_split(pattern = "\\|") %>%
unlist()
as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
dplyr::select("FUNCOTATION" = value) %>%
dplyr::mutate(
FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
) %>%
tidyr::separate(FUNCOTATION, into = annotation_colnames,
sep = "\\|", fill = "right") %>%
dplyr::select(all_of(fields))
}
vcf.list <- list()
vcf.filenames <- c("GATK" = "LL.annotated.vcf.gz", "PB" = "pb-LL.vcf")
for(i in seq_along(vcf.filenames)){
vcf.dir <- file.path(wd, "VCF")
vcf.filepath <- file.path(vcf.dir, vcf.filenames[[i]])
vcf.list[[names(vcf.filenames)[[i]]]] <- readVcf(vcf.filepath, "hg38")
}
Warning: duplicate keys in header will be forced to unique rownames
vcf.list <- vcf.list[1]
Subsetting only mutations that passed the initial upstream quality filter and were not unmapped.
vcf.list.qc <-
lapply(vcf.list, function(x)
x[rowRanges(x)$FILTER == "PASS"])
vcf.list.qc <-
lapply(vcf.list.qc, function(x)
x[!(grepl(names(x), pattern = "chrUn_")),])
lapply(vcf.list.qc, dim)
$GATK
[1] 2605 2
vcf.list.anno <- lapply(vcf.list.qc, function(x)
extractFUNCOTATION(
x,
fields = c(
"Gencode_43_hugoSymbol",
"Gencode_43_variantClassification",
"Gencode_43_variantType"
)
))
Construct a data frame with useful values for easier plotting:
snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
vcf <- vcf.list.qc[[sample.name]]
df.anno <- vcf.list.anno[[sample.name]]
snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
setNames(., c("AF_CTRL", "AF_PDLC")) %>%
mutate_all( ~ unlist(.)) %>%
cbind(., df.anno) %>%
mutate(
DP = info(vcf)[["DP"]],
GERMQ = info(vcf)[["GERMQ"]],
NALOD = info(vcf)[["NALOD"]] %>% unlist(),
NLOD = info(vcf)[["NLOD"]] %>% unlist(),
TLOD = info(vcf)[["TLOD"]] %>% unlist()
) %>%
rownames_to_column("Variant") %>%
mutate(Position_hg38 = word(Variant, 1, sep = "_"),
.before = "Variant") %>%
mutate(Variant = word(Variant, 2, sep = "_"))
}
snv.list
$GATK
NA
snv.df <- bind_rows(snv.list, .id = "sample")
ggplot(snv.df,
aes(y = AF_PDLC,
x = AF_CTRL)) +
geom_jitter(aes(color = Gencode_43_variantClassification), size = 2) +
ylim(0, 1) + xlim(0, 1) +
theme_bw()
snv.df %>%
filter(Gencode_43_hugoSymbol == "TP53")
DP is an approximate read depth
GERMQ The phred-scaled posterior probability that the alternate allele(s) are NOT germline variants
NALOD (Normal Allele Log Odds) is negative log 10 odds of artifact in normal with same allele fraction as tumor. 5% chance is NALOD ~ 1.3
NLOD (Normal Log Odds) is log odds that the variant is not present in the normal sample
TLOD (Tumor Log Odds) is a log odds ratio that represents the likelihood that the observed tumor allele depth (coverage) is due to a real mutation.
filters = list(
"DP" = 30,
"GERMQ" = 90,
"NALOD" = -log10(0.05),
"NLOD" = 6,
"TLOD" = 30,
"AF_CTRL" = 0.025,
"AF_TUMOR" = 0.1
)
snv.df %>%
ggplot(.,
aes(y = AF_CTRL, x = DP)) +
geom_point(aes(color = GERMQ),
shape = 1, alpha = 0.75,
size = 1.5, stroke = 1
) +
scale_colour_steps2(
high = "green",
midpoint = 50,
mid = "blue",
low = "blue",
breaks = filters[["GERMQ"]]
) +
scale_x_continuous(trans = "log10") +
scale_y_continuous(labels = scales::percent) +
geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
geom_vline(xintercept = filters[["DP"]], linetype = "dashed") +
labs(y = "CTRL AF, %")
snv.df %>%
dplyr::filter(
GERMQ >= filters[["GERMQ"]],
DP >= filters[["DP"]]
) %>%
ggplot(.,
aes(x = TLOD,
y = NLOD)) +
geom_density_2d(color = "black") +
geom_point(
aes(color = NALOD,
shape = GERMQ
), size = 2, stroke = 1) +
scale_y_continuous(trans = "log10") +
scale_x_continuous(trans = "log10") +
scale_colour_steps2(
high = "green",
midpoint = filters[["NALOD"]] / 2,
mid = "blue",
low = "blue",
breaks = filters[["NALOD"]]
) +
scale_shape_binned(breaks = c(filters[["GERMQ"]]), solid = FALSE) +
geom_hline(yintercept = filters[["NLOD"]], linetype = 2) +
geom_vline(xintercept = filters[["TLOD"]], linetype = 2)
pull(snv.df, Gencode_43_variantClassification) %>% unique() %>%
length() -> n.SNV.types.unfiltered
c(
brewer.pal(n = 12, name = "Paired"),
brewer.pal(n = n.SNV.types.unfiltered - 12, name = "Dark2")
) -> pal.colors
filter(snv.df, sample != "P069_PT_REL_BM") %>%
dplyr::filter(
GERMQ >= filters[["GERMQ"]],
DP >= filters[["DP"]]
) %>%
ggplot(.,
aes(x = TLOD,
y = NLOD)) +
geom_hex(aes(fill = Gencode_43_variantClassification), alpha = 0.75) +
scale_y_continuous(trans = "log10") +
scale_x_continuous(trans = "log10") +
geom_hline(yintercept = filters[["NLOD"]], linetype = 2) +
geom_vline(xintercept = filters[["TLOD"]], linetype = 2) +
scale_fill_manual(values = pal.colors)
Constructing a filter vector for vcf_qc
object:
colnames(snv.df)
[1] "sample" "Position_hg38" "Variant" "AF_CTRL"
[5] "AF_PDLC" "Gencode_43_hugoSymbol" "Gencode_43_variantClassification" "Gencode_43_variantType"
[9] "DP" "GERMQ" "NALOD" "NLOD"
[13] "TLOD"
snv.filter.df <- snv.df %>%
mutate(
second_filter = case_when(
DP <= filters[["DP"]] ~ "low_DP",
GERMQ <= filters[["GERMQ"]] ~ "low_GERMQ",
AF_CTRL > filters[["AF_CTRL"]] ~ "high_CTRL_AF",
AF_PDLC < filters[["AF_TUMOR"]] ~ "low_Tumor_AF",
NALOD < filters[["NALOD"]] ~ "low_NALOD",
NLOD < filters[["NLOD"]] ~ "low_NLOD",
TLOD < filters[["TLOD"]] ~ "low_TLOD",
TRUE ~ "PASS"
)
)
allowed.positions <- snv.filter.df %>%
filter(sample != "P069_PT_REL_BM" &
second_filter == "PASS") %>% pull(Position_hg38)
snv.filter.df <- snv.filter.df %>%
mutate(
second_filter = case_when(
sample == "P069_PT_REL_BM" &
Position_hg38 %in% allowed.positions ~ "PASS",
TRUE ~ second_filter
)
)
snv.filter.df %>%
filter(second_filter == "PASS")
snv.filter.df %>%
filter(Gencode_43_hugoSymbol == "TP53")
c(
"low_DP" = "red",
"low_GERMQ" = "orange",
"low_NALOD" = "purple",
"low_NLOD" = "violet",
"low_TLOD" = "firebrick",
"high_CTRL_AF" = "coral2",
"low_Tumor_AF" = "blueviolet",
"PASS" = "green"
) -> pal.colors
snv.filter.df %>%
pivot_longer(cols = contains("AF"), values_to = "AF") %>%
ggplot(aes(x = name, y = AF)) +
ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
theme_bw() + labs(x = NULL) +
scale_color_manual(values = pal.colors) +
scale_y_continuous(labels = scales::percent)
snv.filter.df %>%
filter(second_filter == "PASS") %>%
pivot_longer(cols = contains("AF"), values_to = "AF") %>%
ggplot(aes(x = name, y = AF)) +
ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
theme_bw() + labs(x = NULL) +
scale_color_manual(values = pal.colors) +
scale_y_continuous(labels = scales::percent)
Same plot but without the SNV which did not pass the filters.
Data frame of all filtered SNVs:
snv.filter.df.wide <- snv.filter.df %>%
filter(second_filter == "PASS") %>%
dplyr::select(sample, Position_hg38, Variant, AF_PDLC,
Gencode_43_hugoSymbol, Gencode_43_variantClassification) %>%
pivot_wider(names_from = "sample", values_from = "AF_PDLC") %>%
mutate(across(everything(), ~replace_na(., 0)))
snv.filter.df.wide
Data frame of missense filtered SNVs:
snv.filter.df %>%
filter(second_filter == "PASS") %>%
filter(Gencode_43_variantClassification == "MISSENSE") %>%
dplyr::select(sample, Position_hg38, Variant, AF_PDLC,
Gencode_43_hugoSymbol) %>%
pivot_wider(names_from = "sample", values_from = "AF_PDLC")
dot_position = position_jitter(height = 0,
width = 0.25,
seed = 413)
snv.filter.df %>%
filter(second_filter == "PASS") %>%
ggplot() +
lemon::geom_pointline(
aes(
x = sample,
y = AF_PDLC,
group = Position_hg38,
fill = Gencode_43_variantClassification
),
linecolor = "gray30",
linetype = 3,
position = dot_position,
shape = 21,
size = 2
) +
geom_point(
aes(
x = sample,
y = AF_PDLC,
group = Position_hg38,
fill = Gencode_43_variantClassification
),
position = dot_position,
shape = 21,
size = 2
) +
geom_hline(yintercept = 0.05, linetype = "dashed") +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
theme_bw() +
labs(x = NULL, y = "Allelic frequency",
fill = "SNV type")
dot_position = position_jitter(height = 0,
width = 0.25,
seed = 413)
snv.filter.df %>%
filter(second_filter == "PASS") %>%
filter(Gencode_43_variantClassification == "MISSENSE") %>%
ggplot() +
lemon::geom_pointline(
aes(x = sample,
y = AF_PDLC,
group = Position_hg38),
linecolor = "gray30",
linetype = 3,
position = dot_position,
shape = 21,
size = 2,
fill = "gray90"
) +
geom_hline(yintercept = 0.05, linetype = "dashed") +
scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
theme_bw() +
labs(x = NULL, y = "Allelic frequency",
fill = "SNV type")
TODO: add the second sample after fixing the filters.
output_file_xlsx <- paste0("FilteredSNV.xlsx")
output_path_xlsx <- file.path(output_dir, output_file_xlsx)
list(
"Total SNV" = snv_filter.df,
"Filtered SNV" = snv_filter.df %>%
filter(second_filter == "PASS") %>%
dplyr::select(-second_filter)
) %>%
write.xlsx(., output_path_xlsx, overwrite = T)
sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS
Matrix products: default
BLAS/LAPACK: /home/bench-user/.apps/conda/lib/libopenblasp-r0.3.25.so; LAPACK version 3.11.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Etc/UTC
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggrepel_0.9.4 VariantAnnotation_1.48.1 Rsamtools_2.18.0 Biostrings_2.70.1 XVector_0.42.0
[6] SummarizedExperiment_1.32.0 Biobase_2.62.0 GenomicRanges_1.54.1 GenomeInfoDb_1.38.1 IRanges_2.36.0
[11] S4Vectors_0.40.2 MatrixGenerics_1.14.0 matrixStats_1.2.0 BiocGenerics_0.48.1 cowplot_1.1.2
[16] RColorBrewer_1.1-3 openxlsx_4.2.5.2 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[21] dplyr_1.1.4 purrr_1.0.2 readr_2.1.4 tidyr_1.3.0 tibble_3.2.1
[26] ggplot2_3.4.4 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] DBI_1.2.0 bitops_1.0-7 gridExtra_2.3 biomaRt_2.58.0 rlang_1.1.2 magrittr_2.0.3
[7] compiler_4.3.1 RSQLite_2.3.4 GenomicFeatures_1.54.1 png_0.1-8 vctrs_0.6.5 pkgconfig_2.0.3
[13] crayon_1.5.2 fastmap_1.1.1 dbplyr_2.4.0 lemon_0.4.7 labeling_0.4.3 utf8_1.2.4
[19] rmarkdown_2.25 tzdb_0.4.0 ggbeeswarm_0.7.2 bit_4.0.5 xfun_0.41 zlibbioc_1.48.0
[25] cachem_1.0.8 progress_1.2.3 blob_1.2.4 DelayedArray_0.28.0 BiocParallel_1.36.0 parallel_4.3.1
[31] prettyunits_1.2.0 R6_2.5.1 stringi_1.8.3 rtracklayer_1.62.0 Rcpp_1.0.11 knitr_1.45
[37] Matrix_1.6-4 timechange_0.2.0 tidyselect_1.2.0 rstudioapi_0.15.0 abind_1.4-5 yaml_2.3.8
[43] codetools_0.2-19 curl_5.1.0 lattice_0.22-5 plyr_1.8.9 withr_2.5.2 KEGGREST_1.42.0
[49] evaluate_0.23 isoband_0.2.7 BiocFileCache_2.10.1 xml2_1.3.6 zip_2.3.0 filelock_1.0.3
[55] pillar_1.9.0 BiocManager_1.30.22 generics_0.1.3 rprojroot_2.0.4 RCurl_1.98-1.13 hms_1.1.3
[61] munsell_0.5.0 scales_1.3.0 glue_1.6.2 tools_4.3.1 hexbin_1.28.3 BiocIO_1.12.0
[67] BSgenome_1.70.1 GenomicAlignments_1.38.0 XML_3.99-0.16 grid_4.3.1 AnnotationDbi_1.64.1 colorspace_2.1-0
[73] GenomeInfoDbData_1.2.11 beeswarm_0.4.0 restfulr_0.0.15 vipor_0.4.7 cli_3.6.2 rappdirs_0.3.3
[79] fansi_1.0.6 S4Arrays_1.2.0 gtable_0.3.4 digest_0.6.33 SparseArray_1.2.2 farver_2.1.1
[85] rjson_0.2.21 memoise_2.0.1 htmltools_0.5.7 lifecycle_1.0.4 httr_1.4.7 here_1.0.1
[91] MASS_7.3-60 bit64_4.0.5