suppressPackageStartupMessages({
  library(VariantAnnotation)
  library(tidyverse)
  library(RColorBrewer)
  library(ggrepel)
  library(cowplot)
})
Warning: package ‘VariantAnnotation’ was built under R version 4.3.2Warning: package ‘BiocGenerics’ was built under R version 4.3.2Warning: package ‘MatrixGenerics’ was built under R version 4.3.2Warning: package ‘matrixStats’ was built under R version 4.3.2Warning: package ‘GenomeInfoDb’ was built under R version 4.3.2Warning: package ‘S4Vectors’ was built under R version 4.3.2Warning: package ‘IRanges’ was built under R version 4.3.2Warning: package ‘GenomicRanges’ was built under R version 4.3.2Warning: package ‘SummarizedExperiment’ was built under R version 4.3.2Warning: package ‘Biobase’ was built under R version 4.3.2Warning: package ‘Rsamtools’ was built under R version 4.3.2Warning: package ‘Biostrings’ was built under R version 4.3.2Warning: package ‘XVector’ was built under R version 4.3.2Warning: package ‘tidyr’ was built under R version 4.3.2Warning: package ‘readr’ was built under R version 4.3.2Warning: package ‘dplyr’ was built under R version 4.3.2Warning: package ‘stringr’ was built under R version 4.3.2Warning: package ‘ggrepel’ was built under R version 4.3.2Warning: package ‘cowplot’ was built under R version 4.3.2

Load and process VCF files

First, we need to load the VCF files using readVcf function from VariantAnnotation package. We load both classic GATK and Parabricks GATK VCFs as objects into a named list.

vcf.filenames <- c("classic GATK" = "~/WES_data/WES_tumor.annotated.vcf",
                   "Parabricks GATK" = "~/WES_data/PB_WES_tumor.annotated.vcf")

vcf.list <- lapply(vcf.filenames, readVcf, genome = "hg38")

Here’s what looking at dimensions of each element of this list gives us:

lapply(vcf.list, dim)
$`classic GATK`
[1] 7480    2

$`Parabricks GATK`
[1] 8994    2

First value is the number of mutations and the second is the number of samples (normal and tumor). That is a lot of variants, luckly, we applied a Mutect2 filter. Let’s subset the VCF objects leaving only variants which passed the filter:

vcf.list.qc <-
  lapply(vcf.list, function(x)
    x[rowRanges(x)$FILTER == "PASS"])

# Also, filter only variants mapped correctly to a chromosome:
vcf.list.qc <-
  lapply(vcf.list.qc, function(x)
    x[!(grepl(names(x), pattern = "chrUn_")),])

lapply(vcf.list.qc, dim)
$`classic GATK`
[1] 2603    2

$`Parabricks GATK`
[1] 3146    2

That’s substantially less variants to worry about, but still a lot.

Next block of code will extract only the relevant fields and columns from the VCF objects, creating a data frame where each row is a detected variant. It will include positional information and annotations of interest (gene, type of SNV, allelic frequence) along with additional filter information which we will use to further prune the variants.

extractFUNCOTATION <-
  function(vcf,
           fields = c(
             "Gencode_34_hugoSymbol",
             "Gencode_34_variantClassification",
             "Gencode_34_variantType"
           )) {
    annotation_colnames <-
      info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
      stringr::str_remove("Functional annotation from the Funcotator tool.  Funcotation fields are: ") %>%
      stringr::str_split(pattern = "\\|") %>%
      unlist()
    
    as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
      dplyr::select("FUNCOTATION" = value) %>%
      dplyr::mutate(
        FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
        FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
      ) %>%
      tidyr::separate(FUNCOTATION,
                      into = annotation_colnames,
                      sep = "\\|",
                      fill = "right") %>%
      dplyr::select(all_of(fields))
  }

vcf.list.anno <- lapply(vcf.list.qc, function(x) extractFUNCOTATION(x))

snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
  vcf <- vcf.list.qc[[sample.name]]
  df.anno <- vcf.list.anno[[sample.name]]
  snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
    
    # since the sample is the same in both VCF, we rename the AF columns equally
    setNames(., c("AF_CTRL", "AF_TUMOR")) %>%
    
    mutate_all( ~ unlist(.)) %>%
    cbind(., df.anno) %>%
    mutate(
      DP = info(vcf)[["DP"]],
      GERMQ = info(vcf)[["GERMQ"]],
      NALOD = info(vcf)[["NALOD"]] %>% unlist(),
      NLOD = info(vcf)[["NLOD"]] %>% unlist(),
      TLOD = info(vcf)[["TLOD"]] %>% unlist()
    ) %>%
    rownames_to_column("Variant") %>%
    mutate(Position_hg38 = word(Variant, 1, sep = "_"),
           .before = "Variant") %>%
    mutate(Variant = word(Variant, 2, sep = "_"))
}

lapply(snv.list, head, 10) # Print only first 10 rows
$`classic GATK`

$`Parabricks GATK`
NA

Variant filters

GATK FilterMutectCalls gives the these values for each processed variant:

  • DP is the read depth. The bigger it is - the more statistical power we have to say that the observed read is real.

  • GERMQ - The phred-scaled posterior probability that the alternate alleles are NOT germline variants. The higher the better.

  • NALOD (Normal Allele Log Odds) Negative log 10 odds of the variant being an an artifact in the normal sample with the same allele fraction as the tumor sample for each alternate allele.

  • NLOD (Normal Log Odds) is log odds that the variant is not present in the normal sample. Confidence that the variant is not a germline variant.

  • TLOD (Tumor Log Odds) Log odds that the variant is present in the tumor sample relative to the expected noise.

We can threshold them to further filter the VCF, removing germline variants and sequencing artifacts. Along with it, we threshold by allelic frequence - variant should be present in more than 10% of tumor reads and less than 2.5% of normal DNA reads.

filters = list(
  "DP" = 30,
  "GERMQ" = 90,
  "NALOD" = -log10(0.05),
  "NLOD" = 6,
  "TLOD" = 30,
  
  "AF_CTRL" = 0.025,
  "AF_TUMOR" = 0.1
)

lapply(names(snv.list),
       function(name)
         ggplot(snv.list[[name]],
                aes(y = AF_CTRL, x = DP)) +
         geom_point(
           aes(color = GERMQ),
           shape = 1,
           alpha = 0.75,
           size = 1.5,
           stroke = 1
         ) +
         scale_colour_steps2(
           high = "green",
           midpoint = 50,
           mid = "blue",
           low = "blue",
           breaks = filters[["GERMQ"]]
         ) +
         scale_x_continuous(trans = "log10") +
         scale_y_continuous(labels = scales::percent) +
         geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
         geom_vline(xintercept = filters[["DP"]], linetype = "dashed") +
         labs(y = "CTRL AF, %", title = name))
[[1]]

[[2]]

The first plots shows DP, normal allelic frequency and GERMQ thresholds. Variants in the lower right quadrant in green pass it.

lapply(names(snv.list),
       function(name)
         snv.list[[name]] %>%
         dplyr::filter(GERMQ >= filters[["GERMQ"]],
                       DP >= filters[["DP"]]) %>%
         mutate(
           NALOD_threshold = case_when(
             NALOD > filters[["NALOD"]] ~ "PASS",
             TRUE ~ "Too low"
           )
         ) %>% 
         ggplot(.,
                aes(x = TLOD,
                    y = NLOD)) +
         geom_density_2d(color = "black") +
         geom_point(
           aes(color = NALOD_threshold),
           size = 2,
           stroke = 1
         ) +
         scale_color_manual(values = c("PASS" = "#5ec962", "Too low" = "#440154")) +
         scale_y_continuous(trans = "log10") +
         scale_x_continuous(trans = "log10") +
         geom_hline(yintercept = filters[["NLOD"]], linetype = 2) +
         geom_vline(xintercept = filters[["TLOD"]], linetype = 2) +
         labs(title = name))
[[1]]

[[2]]

The SNV which pass the first plot thresholds are shown here. They are thresholded using NLOD, NALOD and TLOD scores.

Apply the variant filters

Rather than just filtering, we annotate the SNV based on first reason for filtering out.

filtered.snv.list <- lapply(snv.list, function(x)
  mutate(
    x,
    second_filter = case_when(
      DP <= filters[["DP"]] ~ "low_DP",
      GERMQ <= filters[["GERMQ"]] ~ "low_GERMQ",
      AF_CTRL > filters[["AF_CTRL"]] ~ "high_CTRL_AF",
      AF_TUMOR < filters[["AF_TUMOR"]] ~ "low_Tumor_AF",
      NALOD < filters[["NALOD"]] ~   "low_NALOD",
      NLOD < filters[["NLOD"]] ~     "low_NLOD",
      TLOD < filters[["TLOD"]] ~     "low_TLOD",
      TRUE ~ "PASS"
    )
  ))

lapply(filtered.snv.list, filter, second_filter == "PASS") %>% 
  lapply(., dim)
$`classic GATK`
[1] 536  13

$`Parabricks GATK`
[1] 587  13
c(
  "low_DP" = "red",
  "low_GERMQ" = "orange",
  "low_NALOD" = "purple",
  "low_NLOD" = "violet",
  "low_TLOD" = "firebrick",
  "high_CTRL_AF" = "coral2",
  "low_Tumor_AF" = "blueviolet",
  "PASS" = "green"
) -> pal.colors

lapply(names(filtered.snv.list),
       function(name)
         pivot_longer(filtered.snv.list[[name]], cols = contains("AF"), values_to = "AF") %>%
         ggplot(aes(x = name, y = AF)) +
         ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
         geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
         theme_bw() + labs(x = NULL) +
         scale_color_manual(values = pal.colors) +
         scale_y_continuous(labels = scales::percent) +
         labs(title = name))
[[1]]

[[2]]

Beeswarm plots of allelic frequencies for all mutations (including filtered out).

lapply(names(filtered.snv.list),
       function(name)
         filter(filtered.snv.list[[name]], second_filter == "PASS") %>%
         pivot_longer(cols = contains("AF"), values_to = "AF") %>%
         ggplot(aes(x = name, y = AF)) +
         ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
         geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
         theme_bw() + labs(x = NULL) +
         scale_color_manual(values = pal.colors) +
         scale_y_continuous(labels = scales::percent) +
         labs(title = name))
[[1]]

[[2]]

Same plot but only SNV which passed the filters.

Compare the allelic frequencies for SNV detected in both samples

Let’s combine the filtered.snv.list into one dataframe and inspect the overlap of detected mutations:

combined.snv.df <- filtered.snv.list %>% 
  bind_rows(.id = "pipeline") %>% 
  filter(second_filter == "PASS") %>% 
  dplyr::select(pipeline, Position_hg38, Variant, AF_TUMOR,
         Gencode_34_hugoSymbol, Gencode_34_variantClassification) %>% 
  pivot_wider(names_from = "pipeline", values_from = "AF_TUMOR")

combined.snv.df

Variants that were not detected by Parabricks but only by classic GATK:

combined.snv.df %>% 
  filter(!is.na(`classic GATK`) & is.na(`Parabricks GATK`))

And vice versa:

combined.snv.df %>% 
  filter(is.na(`classic GATK`) & !is.na(`Parabricks GATK`))

Parabricks GATK detected 53 more and ‘missed’ two mutations.

One of the most commonly mutated genes in tumors is TP53. Using filter by gene name we can check the presence of TP53 mutations.

Finally, lets compare the allelic frequencies for the variants detected by both pipelines:

combined.snv.df %>%
  ggplot(aes(x = `classic GATK`, y = `Parabricks GATK`)) +
  geom_point(
    aes(
      fill = Gencode_34_variantClassification
    ),
    shape = 21,
    size = 2
  ) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(labels = scales::percent, limits = c(0, 1))

We can limit the variant types to show only the biologically revevant mutations:

relevant.snv.df <- combined.snv.df %>%
  filter(Gencode_34_variantClassification %in% c("MISSENSE", "NONSENSE",
                                                 "IN_FRAME_DEL", "IN_FRAME_INS",
                                                 "FRAME_SHIFT_DEL", "FRAME_SHIFT_INS")
         )


  ggplot(relevant.snv.df,
         aes(x = `classic GATK`, y = `Parabricks GATK`)) +
  geom_point(
    aes(
      fill = Gencode_34_variantClassification
    ),
    shape = 21,
    size = 2
  ) + 
  geom_text_repel( # add text labels for some of the top AF mutations
    data = filter(relevant.snv.df, `classic GATK` > 0.9 & `Parabricks GATK` > 0.9),
    aes(label = Gencode_34_hugoSymbol), max.overlaps = 30, nudge_y = -0.2, nudge_x = -0.6
  ) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(labels = scales::percent, limits = c(0, 1))

Session info

sessionInfo()
R version 4.3.1 (2023-06-16)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 22.04.3 LTS

Matrix products: default
BLAS/LAPACK: /home/bench-user/.apps/conda/lib/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] cowplot_1.1.3               ggrepel_0.9.5               RColorBrewer_1.1-3         
 [4] lubridate_1.9.3             forcats_1.0.0               stringr_1.5.1              
 [7] dplyr_1.1.4                 purrr_1.0.2                 readr_2.1.5                
[10] tidyr_1.3.1                 tibble_3.2.1                ggplot2_3.4.4              
[13] tidyverse_2.0.0             VariantAnnotation_1.48.1    Rsamtools_2.18.0           
[16] Biostrings_2.70.1           XVector_0.42.0              SummarizedExperiment_1.32.0
[19] Biobase_2.62.0              GenomicRanges_1.54.1        GenomeInfoDb_1.38.1        
[22] IRanges_2.36.0              S4Vectors_0.40.2            MatrixGenerics_1.14.0      
[25] matrixStats_1.2.0           BiocGenerics_0.48.1        

loaded via a namespace (and not attached):
 [1] DBI_1.2.1                bitops_1.0-7             biomaRt_2.58.0           rlang_1.1.3             
 [5] magrittr_2.0.3           compiler_4.3.1           RSQLite_2.3.4            GenomicFeatures_1.54.1  
 [9] png_0.1-8                vctrs_0.6.5              pkgconfig_2.0.3          crayon_1.5.2            
[13] fastmap_1.1.1            dbplyr_2.4.0             labeling_0.4.3           utf8_1.2.4              
[17] tzdb_0.4.0               ggbeeswarm_0.7.2         bit_4.0.5                xfun_0.41               
[21] zlibbioc_1.48.0          cachem_1.0.8             progress_1.2.3           blob_1.2.4              
[25] DelayedArray_0.28.0      BiocParallel_1.36.0      parallel_4.3.1           prettyunits_1.2.0       
[29] R6_2.5.1                 stringi_1.8.3            rtracklayer_1.62.0       Rcpp_1.0.12             
[33] knitr_1.45               Matrix_1.6-5             timechange_0.3.0         tidyselect_1.2.0        
[37] rstudioapi_0.15.0        abind_1.4-5              yaml_2.3.8               codetools_0.2-19        
[41] curl_5.1.0               lattice_0.22-5           withr_3.0.0              KEGGREST_1.42.0         
[45] isoband_0.2.7            BiocFileCache_2.10.1     xml2_1.3.6               pillar_1.9.0            
[49] filelock_1.0.3           generics_0.1.3           RCurl_1.98-1.14          hms_1.1.3               
[53] munsell_0.5.0            scales_1.3.0             glue_1.7.0               tools_4.3.1             
[57] BiocIO_1.12.0            BSgenome_1.70.1          GenomicAlignments_1.38.0 XML_3.99-0.16.1         
[61] grid_4.3.1               AnnotationDbi_1.64.1     colorspace_2.1-0         GenomeInfoDbData_1.2.11 
[65] beeswarm_0.4.0           restfulr_0.0.15          vipor_0.4.7              cli_3.6.2               
[69] rappdirs_0.3.3           fansi_1.0.6              S4Arrays_1.2.0           gtable_0.3.4            
[73] digest_0.6.34            SparseArray_1.2.2        farver_2.1.1             rjson_0.2.21            
[77] memoise_2.0.1            lifecycle_1.0.4          httr_1.4.7               MASS_7.3-60             
[81] bit64_4.0.5             
---
title: "QC and filtering of annotated VCF files"
output:
  html_notebook:
    toc: true
    toc_float: true
    toc_depth: 3
    code_folding: show
    theme: united
---

```{r setup}
suppressPackageStartupMessages({
  library(VariantAnnotation)
  library(tidyverse)
  library(RColorBrewer)
  library(ggrepel)
  library(cowplot)
})
```

# Load and process VCF files

First, we need to load the VCF files using `readVcf` function from `VariantAnnotation` package. We load both classic GATK and Parabricks GATK VCFs as objects into a named list.

```{r warning=FALSE}
vcf.filenames <- c("classic GATK" = "~/WES_data/WES_tumor.annotated.vcf",
                   "Parabricks GATK" = "~/WES_data/PB_WES_tumor.annotated.vcf")

vcf.list <- lapply(vcf.filenames, readVcf, genome = "hg38")
```

Here's what looking at dimensions of each element of this list gives us:

```{r warning=FALSE}
lapply(vcf.list, dim)
```

First value is the number of mutations and the second is the number of samples (normal and tumor). That is a lot of variants, luckly, we applied a Mutect2 filter. Let's subset the VCF objects leaving only variants which passed the filter:

```{r}
vcf.list.qc <-
  lapply(vcf.list, function(x)
    x[rowRanges(x)$FILTER == "PASS"])

# Also, filter only variants mapped correctly to a chromosome:
vcf.list.qc <-
  lapply(vcf.list.qc, function(x)
    x[!(grepl(names(x), pattern = "chrUn_")),])

lapply(vcf.list.qc, dim)
```

That's substantially less variants to worry about, but still a lot.

Next block of code will extract only the relevant fields and columns from the VCF objects, creating a data frame where each row is a detected variant. It will include positional information and annotations of interest (gene, type of SNV, allelic frequence) along with additional filter information which we will use to further prune the variants. 

```{r}
extractFUNCOTATION <-
  function(vcf,
           fields = c(
             "Gencode_34_hugoSymbol",
             "Gencode_34_variantClassification",
             "Gencode_34_variantType"
           )) {
    annotation_colnames <-
      info(vcf@metadata$header)["FUNCOTATION", "Description"] %>%
      stringr::str_remove("Functional annotation from the Funcotator tool.  Funcotation fields are: ") %>%
      stringr::str_split(pattern = "\\|") %>%
      unlist()
    
    as.data.frame(info(vcf)[["FUNCOTATION"]]) %>%
      dplyr::select("FUNCOTATION" = value) %>%
      dplyr::mutate(
        FUNCOTATION = word(FUNCOTATION, 1, sep = "\\]"),
        FUNCOTATION = str_remove_all(FUNCOTATION, "[\\[\\]]")
      ) %>%
      tidyr::separate(FUNCOTATION,
                      into = annotation_colnames,
                      sep = "\\|",
                      fill = "right") %>%
      dplyr::select(all_of(fields))
  }

vcf.list.anno <- lapply(vcf.list.qc, function(x) extractFUNCOTATION(x))

snv.list <- list()
for(sample.name in names(vcf.list.qc)) {
  vcf <- vcf.list.qc[[sample.name]]
  df.anno <- vcf.list.anno[[sample.name]]
  snv.list[[sample.name]] <- as.data.frame(geno(vcf)[["AF"]]) %>%
    
    # since the sample is the same in both VCF, we rename the AF columns equally
    setNames(., c("AF_CTRL", "AF_TUMOR")) %>%
    
    mutate_all( ~ unlist(.)) %>%
    cbind(., df.anno) %>%
    mutate(
      DP = info(vcf)[["DP"]],
      GERMQ = info(vcf)[["GERMQ"]],
      NALOD = info(vcf)[["NALOD"]] %>% unlist(),
      NLOD = info(vcf)[["NLOD"]] %>% unlist(),
      TLOD = info(vcf)[["TLOD"]] %>% unlist()
    ) %>%
    rownames_to_column("Variant") %>%
    mutate(Position_hg38 = word(Variant, 1, sep = "_"),
           .before = "Variant") %>%
    mutate(Variant = word(Variant, 2, sep = "_"))
}

lapply(snv.list, head, 10) # Print only first 10 rows
```

# Variant filters

GATK `FilterMutectCalls` gives the these values for each processed variant: 

* **DP** is the read depth. The bigger it is - the more statistical power we have to say that the observed read is real.

* **GERMQ** - The phred-scaled posterior probability that the alternate alleles are NOT germline variants. The higher the better.

* **NALOD** (Normal Allele Log Odds) Negative log 10 odds of the variant being an an artifact in the normal sample with the same allele fraction as the tumor sample for each alternate allele.

* **NLOD** (Normal Log Odds) is log odds that the variant is not present in the normal sample. Confidence that the variant is not a germline variant.

* **TLOD** (Tumor Log Odds) Log odds that the variant is present in the tumor sample relative to the expected noise.

We can threshold them to further filter the VCF, removing germline variants and sequencing artifacts. Along with it, we threshold by allelic frequence - variant should be present in more than 10% of tumor reads and less than 2.5% of normal DNA reads.

```{r, fig.width=6, fig.height=5}
filters = list(
  "DP" = 30,
  "GERMQ" = 90,
  "NALOD" = -log10(0.05),
  "NLOD" = 6,
  "TLOD" = 30,
  
  "AF_CTRL" = 0.025,
  "AF_TUMOR" = 0.1
)

lapply(names(snv.list),
       function(name)
         ggplot(snv.list[[name]],
                aes(y = AF_CTRL, x = DP)) +
         geom_point(
           aes(color = GERMQ),
           shape = 1,
           alpha = 0.75,
           size = 1.5,
           stroke = 1
         ) +
         scale_colour_steps2(
           high = "green",
           midpoint = 50,
           mid = "blue",
           low = "blue",
           breaks = filters[["GERMQ"]]
         ) +
         scale_x_continuous(trans = "log10") +
         scale_y_continuous(labels = scales::percent) +
         geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
         geom_vline(xintercept = filters[["DP"]], linetype = "dashed") +
         labs(y = "CTRL AF, %", title = name))
```

The first plots shows DP, normal allelic frequency and GERMQ thresholds. Variants in the lower right quadrant in green pass it.

```{r, fig.width=6, fig.height=5}
lapply(names(snv.list),
       function(name)
         snv.list[[name]] %>%
         dplyr::filter(GERMQ >= filters[["GERMQ"]],
                       DP >= filters[["DP"]]) %>%
         mutate(
           NALOD_threshold = case_when(
             NALOD > filters[["NALOD"]] ~ "PASS",
             TRUE ~ "Too low"
           )
         ) %>% 
         ggplot(.,
                aes(x = TLOD,
                    y = NLOD)) +
         geom_density_2d(color = "black") +
         geom_point(
           aes(color = NALOD_threshold),
           size = 2,
           stroke = 1
         ) +
         scale_color_manual(values = c("PASS" = "#5ec962", "Too low" = "#440154")) +
         scale_y_continuous(trans = "log10") +
         scale_x_continuous(trans = "log10") +
         geom_hline(yintercept = filters[["NLOD"]], linetype = 2) +
         geom_vline(xintercept = filters[["TLOD"]], linetype = 2) +
         labs(title = name))
```

The SNV which pass the first plot thresholds are shown here. They are thresholded using NLOD, NALOD and TLOD scores.

## Apply the variant filters

Rather than just filtering, we annotate the SNV based on first reason for filtering out.

```{r}
filtered.snv.list <- lapply(snv.list, function(x)
  mutate(
    x,
    second_filter = case_when(
      DP <= filters[["DP"]] ~ "low_DP",
      GERMQ <= filters[["GERMQ"]] ~ "low_GERMQ",
      AF_CTRL > filters[["AF_CTRL"]] ~ "high_CTRL_AF",
      AF_TUMOR < filters[["AF_TUMOR"]] ~ "low_Tumor_AF",
      NALOD < filters[["NALOD"]] ~   "low_NALOD",
      NLOD < filters[["NLOD"]] ~     "low_NLOD",
      TLOD < filters[["TLOD"]] ~     "low_TLOD",
      TRUE ~ "PASS"
    )
  ))

lapply(filtered.snv.list, filter, second_filter == "PASS") %>% 
  lapply(., dim)
```

```{r, fig.width=9, fig.height=5}
c(
  "low_DP" = "red",
  "low_GERMQ" = "orange",
  "low_NALOD" = "purple",
  "low_NLOD" = "violet",
  "low_TLOD" = "firebrick",
  "high_CTRL_AF" = "coral2",
  "low_Tumor_AF" = "blueviolet",
  "PASS" = "green"
) -> pal.colors

lapply(names(filtered.snv.list),
       function(name)
         pivot_longer(filtered.snv.list[[name]], cols = contains("AF"), values_to = "AF") %>%
         ggplot(aes(x = name, y = AF)) +
         ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
         geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
         theme_bw() + labs(x = NULL) +
         scale_color_manual(values = pal.colors) +
         scale_y_continuous(labels = scales::percent) +
         labs(title = name))
```

Beeswarm plots of allelic frequencies for all mutations (including filtered out).

```{r, fig.width=9, fig.height=5}
lapply(names(filtered.snv.list),
       function(name)
         filter(filtered.snv.list[[name]], second_filter == "PASS") %>%
         pivot_longer(cols = contains("AF"), values_to = "AF") %>%
         ggplot(aes(x = name, y = AF)) +
         ggbeeswarm::geom_quasirandom(aes(color = second_filter)) +
         geom_hline(yintercept = filters[["AF_CTRL"]], linetype = "dashed") +
         theme_bw() + labs(x = NULL) +
         scale_color_manual(values = pal.colors) +
         scale_y_continuous(labels = scales::percent) +
         labs(title = name))
```

Same plot but only SNV which passed the filters.

## Compare the allelic frequencies for SNV detected in both samples

Let's combine the `filtered.snv.list` into one dataframe and inspect the overlap of detected mutations:

```{r}
combined.snv.df <- filtered.snv.list %>% 
  bind_rows(.id = "pipeline") %>% 
  filter(second_filter == "PASS") %>% 
  dplyr::select(pipeline, Position_hg38, Variant, AF_TUMOR,
         Gencode_34_hugoSymbol, Gencode_34_variantClassification) %>% 
  pivot_wider(names_from = "pipeline", values_from = "AF_TUMOR")

combined.snv.df
```

Variants that were not detected by Parabricks but only by classic GATK:

```{r}
combined.snv.df %>% 
  filter(!is.na(`classic GATK`) & is.na(`Parabricks GATK`))
```

And vice versa:

```{r}
combined.snv.df %>% 
  filter(is.na(`classic GATK`) & !is.na(`Parabricks GATK`))
```

Parabricks GATK detected 53 more and 'missed' two mutations.

One of the most commonly mutated genes in tumors is *TP53*. Using filter by gene name we can check the presence of *TP53* mutations.

```{r}
filter(combined.snv.df, Gencode_34_hugoSymbol == "TP53")
```

Finally, lets compare the allelic frequencies for the variants detected by both pipelines:

```{r, fig.width=8, fig.height=5}
combined.snv.df %>%
  ggplot(aes(x = `classic GATK`, y = `Parabricks GATK`)) +
  geom_point(
    aes(
      fill = Gencode_34_variantClassification
    ),
    shape = 21,
    size = 2
  ) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(labels = scales::percent, limits = c(0, 1))
```

We can limit the variant types to show only the biologically revevant mutations:

```{r, fig.width=8, fig.height=5}
relevant.snv.df <- combined.snv.df %>%
  filter(Gencode_34_variantClassification %in% c("MISSENSE", "NONSENSE",
                                                 "IN_FRAME_DEL", "IN_FRAME_INS",
                                                 "FRAME_SHIFT_DEL", "FRAME_SHIFT_INS")
         )


  ggplot(relevant.snv.df,
         aes(x = `classic GATK`, y = `Parabricks GATK`)) +
  geom_point(
    aes(
      fill = Gencode_34_variantClassification
    ),
    shape = 21,
    size = 2
  ) + 
  geom_text_repel( # add text labels for some of the top AF mutations
    data = filter(relevant.snv.df, `classic GATK` > 0.9 & `Parabricks GATK` > 0.9),
    aes(label = Gencode_34_hugoSymbol), max.overlaps = 30, nudge_y = -0.2, nudge_x = -0.6
  ) +
  theme_bw() +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(labels = scales::percent, limits = c(0, 1))
```

# Session info

```{r}
sessionInfo()
```

