library(CSAMA2014RangesAnnotationLab)
vcf.file <- NA12878_pg.chr20.vcf.bgz

header <- scanVcfHeader(vcf.file)
header

samples(header)

geno(header)

info(header)["END",]

vcf <- readVcf(vcf.file, genome="hg19")

print(object.size(vcf), unit="auto")

print(object.size(vcf) * 40L, unit="auto")

library(BSgenome.Hsapiens.UCSC.hg19)
seqinfo(Hsapiens)

ranges.chr20 <- as(seqinfo(Hsapiens)["chr20"], "GRanges")
ranges.chr20

param <- ScanVcfParam(which=ranges.chr20)
vcf.chr20 <- readVcf(vcf.file, genome="hg19", param=param)

vcf

rowData(vcf)

head(geno(vcf)$GT)

variants <- vcf[geno(vcf)$GT[,1] != "0/0",]

alt(variants)

snvs <- variants[isSNV(variants),]

snvs <- expand(snvs)
head(alt(snvs))

prefilters <- FilterRules(list(restrictToVariants=function(text) {
  !grepl("0/0", text, fixed=TRUE)
}))

filters <- FilterRules(list(restrictToSNVs=function(vcf) isSNV(vcf)))

filterVcf(vcf.file, genome="hg19", "snvs.vcf", index=TRUE,
          prefilters=prefilters, filters=filters,
          param=ScanVcfParam(info=NA))

cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442",
               "#0072B2", "#D55E00", "#CC79A7")
color.scheme <- cbPalette
my.theme <- lattice::trellis.par.get()
my.theme <- within(my.theme, {
  box.rectangle$col <- "black"
  box.rectangle$fill <- "gray90"
  box.umbrella$col <- "black"
  box.umbrella$lty <- 1
  strip.background$col <- rep("gray90", 7)
  plot.symbol$col <- "black"
  plot.symbol$pch <- 19
  box.dot$cex <- 1
  box.dot$pch <- "|"
  add.line$col <- "red"
  plot.polygon$col <- "gray90"
  superpose.symbol$fill <- color.scheme
  superpose.symbol$col <- color.scheme
  superpose.line$fill <- color.scheme
  superpose.line$col <- color.scheme
  superpose.polygon$fill <- color.scheme
  superpose.polygon$col <- color.scheme
})
lattice.options(default.theme = my.theme)

pdf(file="fig/cov.pdf",height=4,width=4)
library(lattice)
densityplot(~ geno(snvs)$DP, xlim=c(0, 2*median(geno(snvs)$DP, na.rm=TRUE)),
            plot.points=FALSE)
dev.off()

pdf(file="fig/freq-cov-bin.pdf",height=4,width=8)
rowData(snvs)$coverage.bin <- cut(geno(snvs)$DP, c(0, 20, 80, Inf))
rowData(snvs)$variant.freq <- geno(snvs)$AD[,,2]/geno(snvs)$DP[,1]
densityplot(~ variant.freq | coverage.bin,
            as.data.frame(rowData(snvs)),
            plot.points=FALSE, layout=c(3,1),
            xlab="variant frequency by coverage bin")
dev.off()

fixed(header(snvs))$FILTER

table(unlist(strsplit(filt(snvs), ";", fixed=TRUE)))

snvs <- snvs[grep("PASS", filt(snvs), fixed=TRUE),]

table(rowData(snvs)$coverage.bin)

pdf(file="fig/freq-cov-bin-2.pdf",height=4,width=4)
densityplot(~ variant.freq, as.data.frame(rowData(snvs)),
            plot.points=FALSE,
            xlab="variant frequency")
dev.off()

indels <- variants[isIndel(variants)]

indel.windows <- rowData(indels) + 10

rowData(snvs)$near.indel <- snvs %over% indel.windows

xtabs(~ near.indel, mcols(rowData(snvs)))

chr20.sequence <- getSeq(Hsapiens, "chr20")

chr20.hp <- ranges(Rle(as.raw(chr20.sequence)))

chr20.hp <- chr20.hp[width(chr20.hp) > 6L]

rowData(snvs)$over.hp <- ranges(snvs) %over% chr20.hp
xtabs(~ over.hp, mcols(rowData(snvs)))

data(selfChains)

gene.models <- TxDb.Hsapiens.UCSC.hg19.knownGene
locations <- locateVariants(snvs, gene.models, CodingVariants())

colnames(mcols(locations))

rowData(snvs)$coding.tx <- NA_integer_
rowData(snvs)$coding.tx[locations$QUERYID] <- locations$TXID

syms <- unlist(mget(locations$GENEID[!is.na(locations$GENEID)],
                    org.Hs.egSYMBOL,
                    ifnotfound=NA))
locations$SYMBOL[!is.na(locations$GENEID)] <- syms

coding <- predictCoding(snvs, gene.models, Hsapiens)

colnames(mcols(coding))

table(coding$CONSEQUENCE)

CEUvcf <- file.path(system.file("extdata", package="VariantFiltering"),
                    "CEUtrio.vcf.bgz")
CEUped <- file.path(system.file("extdata", package="VariantFiltering"),
                    "CEUtrio.ped")
param <- VariantFilteringParam(vcfFilenames=CEUvcf, pedFilename=CEUped)

reHo <- autosomalRecessiveHomozygous(param)
reHo

aim <- reportVariants(reHo)

maxMAF(reHo) <- 0.05
MAFmask <- MAFpop(reHo)
MAFmask

MAFpop(reHo) <- !MAFmask
MAFpop(reHo, "ASN_AFKG") <- TRUE
MAFpop(reHo)

minCRYP5ss(reHo) <- 0
reHo

filteredVariants(reHo)

ranges.20 <- ranges.chr20
seqlevelsStyle(ranges.20) <- "NCBI"
broad.vcf <- readVcf("~/projects/papers/ScalableGenomics/work/CEUTrio.HiSeq.WGS.b37.bestPractices.phased.b37.vcf.gz",
                     param=ScanVcfParam(which=ranges.20),
                     genome="hg19")
broad.snvs <- broad.vcf[isSNV(broad.vcf) & geno(broad.vcf)$GT[,1] != "0/0",1]
seqlevelsStyle(broad.snvs) <- "UCSC"
snvs <- dropSeqlevels(snvs, "chrM")

broad.vr <- as(broad.snvs, "VRanges")
illumina.vr <- as(snvs, "VRanges")
illumina.vr$in.broad <- illumina.vr %in% broad.vr
mean(illumina.vr$in.broad)

illumina.vr$broad.freq <- altFraction(broad.vr)[match(illumina.vr, broad.vr)]

pdf(file="fig/scatter.pdf",height=4,width=4)
xyplot(broad.freq ~ altFraction(illumina.vr),
       as.data.frame(illumina.vr),
       panel=panel.smoothScatter,
       xlab="Illumina frequency", ylab="Broad frequency")
dev.off()

runs <- vcf[!is.na(info(vcf)$END),]
end(rowData(runs)) <- info(runs)$END

sum(width(runs)) / seqlengths(runs)["chr20"]