## ----global_options, include=FALSE--------------------------------------------
knitr::opts_chunk$set(fig.pos = 'H', fig.align = "center", warning = FALSE, message = FALSE)

## ----eval=FALSE---------------------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#  
#  BiocManager::install("GBScleanR")

## ----eval=FALSE---------------------------------------------------------------
#  if (!requireNamespace("devtools", quietly = TRUE))
#      install.packages("devtools")
#  devtools::install_github("tomoyukif/GBScleanR", build_vignettes = TRUE)

## ----warning=FALSE, message=FALSE---------------------------------------------
library("GBScleanR")

## -----------------------------------------------------------------------------
vcf_fn <- system.file("extdata", "sample.vcf", package = "GBScleanR")
gds_fn <- tempfile("sample", fileext = ".gds")

## -----------------------------------------------------------------------------
# `force = TRUE` allow the function to over write the GDS file,
# even if a GDS file exists at `out_fn`.
gbsrVCF2GDS(vcf_fn = vcf_fn, out_fn = gds_fn, force = TRUE, verbose = FALSE)

## -----------------------------------------------------------------------------
gds <- loadGDS(gds_fn, verbose = FALSE)

## -----------------------------------------------------------------------------
# Number of samples
nsam(gds)

## -----------------------------------------------------------------------------
# Number of SNPs
nmar(gds) 

## -----------------------------------------------------------------------------
# Indices of chromosome ID of all markers
head(getChromosome(gds)) 

## -----------------------------------------------------------------------------
# Chromosome names of all markers
head(getChromosome(gds)) 

## -----------------------------------------------------------------------------
# Position (bp) of all markers
head(getPosition(gds)) 

## -----------------------------------------------------------------------------
# Reference allele of all markers
head(getAllele(gds)) 

## -----------------------------------------------------------------------------
# SNP IDs
head(getMarID(gds)) 

## -----------------------------------------------------------------------------
# sample IDs
head(getSamID(gds)) 

## -----------------------------------------------------------------------------
geno <- getGenotype(gds)

## -----------------------------------------------------------------------------
geno <- getRead(gds)

## -----------------------------------------------------------------------------
gds <- countGenotype(gds)
gds <- countRead(gds)

## ----fig.alt="Missing rate per marker and per sample."------------------------
# Histgrams of missing rate
histGBSR(gds, stats = "missing") 

## ----fig.alt="Heterozygosity per marker and per sample."----------------------
# Histgrams of heterozygosity
histGBSR(gds, stats = "het") 

## ----fig.alt="Reference allele frequency per marker and per sample."----------
# Histgrams of reference allele frequency
histGBSR(gds, stats = "raf") 

## ----fig.alt="Total read depth per marker and per sample."--------------------
# Histgrams of total read depth
histGBSR(gds, stats = "dp") 

## ----fig.alt="Reference read depth per marker and per sample."----------------
# Histgrams of allelic read depth
histGBSR(gds, stats = "ad_ref") 

## ----fig.alt="Alternative read depth per marker and per sample."--------------
# Histgrams of allelic read depth
histGBSR(gds, stats = "ad_ref") 

## ----fig.alt="Reference read per marker and per sample."----------------------
# Histgrams of reference allele frequency
histGBSR(gds, stats = "rrf") 

## ----fig.alt="Mean of alternative read depth per marker and per sample."------
# Histgrams of mean allelic read depth
histGBSR(gds, stats = "mean_ref") 

## ----fig.alt="SD of reference read depth per marker and per sample."----------
# Histgrams of standard deviation of read depth
histGBSR(gds, stats = "sd_ref") 

## ----fig.alt="SD of alternative read depth per marker and per sample."--------
# Histgrams of standard deviation of read depth
histGBSR(gds, stats = "sd_ref") 

## -----------------------------------------------------------------------------
plotGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
plotGBSR(gds, stats = "geno")

## -----------------------------------------------------------------------------
pairsGBSR(gds, stats1 = "missing", stats2 = "dp")

## -----------------------------------------------------------------------------
# Reference genotype count per marker
head(getCountGenoRef(gds, target = "marker")) 
# Reference genotype count per sample
head(getCountGenoRef(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Heterozygote count per marker
head(getCountGenoHet(gds, target = "marker")) 
# Heterozygote count per sample
head(getCountGenoHet(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Alternative genotype count per marker
head(getCountGenoAlt(gds, target = "marker")) 
# Alternative genotype count per sample
head(getCountGenoAlt(gds, target = "sample"))

## -----------------------------------------------------------------------------
# Missing count per marker
head(getCountGenoMissing(gds, target = "marker")) 
# Missing count per sample
head(getCountGenoMissing(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Reference allele count per marker
head(getCountAlleleRef(gds, target = "marker")) 
# Reference allele count per sample
head(getCountAlleleRef(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Alternative allele count per marker
head(getCountAlleleAlt(gds, target = "marker")) 
# Alternative allele count per sample
head(getCountAlleleAlt(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Missing allele count per marker
head(getCountAlleleMissing(gds, target = "marker")) 
# Missing allele count per sample
head(getCountAlleleMissing(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Reference read count per marker
head(getCountReadRef(gds, target = "marker")) 
# Reference read count per sample
head(getCountReadRef(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Alternative read count per marker
head(getCountReadAlt(gds, target = "marker")) 
# Alternative read count per sample
head(getCountReadAlt(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Sum of reference and alternative read counts per marker
head(getCountRead(gds, target = "marker")) 
# Sum of reference and alternative read counts per sample
head(getCountRead(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Mean of reference allele read count per marker
head(getMeanReadRef(gds, target = "marker")) 
# Mean of reference allele read count per sample
head(getMeanReadRef(gds, target = "sample"))

## -----------------------------------------------------------------------------
# Mean of Alternative allele read count per marker
head(getMeanReadAlt(gds, target = "marker")) 
# Mean of Alternative allele read count per sample
head(getMeanReadAlt(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# SD of reference allele read count per marker
head(getSDReadRef(gds, target = "marker")) 
# SD of reference allele read count per sample
head(getSDReadRef(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# SD of Alternative allele read count per marker
head(getSDReadAlt(gds, target = "marker")) 
# SD of Alternative allele read count per sample
head(getSDReadAlt(gds, target = "sample"))

## -----------------------------------------------------------------------------
# Minor allele frequency per marker
head(getMAF(gds, target = "marker")) 
# Minor allele frequency per sample
head(getMAF(gds, target = "sample")) 

## -----------------------------------------------------------------------------
# Minor allele count per marker
head(getMAC(gds, target = "marker")) 
# Minor allele count per sample
head(getMAC(gds, target = "sample")) 

## -----------------------------------------------------------------------------
head(getCountGenoRef(gds, target = "marker", prop = TRUE))
head(getCountGenoHet(gds, target = "marker", prop = TRUE))
head(getCountGenoAlt(gds, target = "marker", prop = TRUE))
head(getCountGenoMissing(gds, target = "marker", prop = TRUE))

## -----------------------------------------------------------------------------
head(getCountAlleleRef(gds, target = "marker", prop = TRUE))
head(getCountAlleleAlt(gds, target = "marker", prop = TRUE))
head(getCountAlleleMissing(gds, target = "marker", prop = TRUE))

## -----------------------------------------------------------------------------
head(getCountReadRef(gds, target = "marker", prop = TRUE))
head(getCountReadAlt(gds, target = "marker", prop = TRUE))

## ----eval=FALSE---------------------------------------------------------------
#  gds <- setMarFilter(gds, missing = 0.2, het = c(0.1, 0.9), maf = 0.05)
#  gds <- setSamFilter(gds, missing = 0.8, het = c(0.25, 0.75))

## ----eval=FALSE---------------------------------------------------------------
#  gds <- setCallFilter(gds, dp_count = c(5, Inf))

## ----eval=FALSE---------------------------------------------------------------
#  # Filtering genotype calls based on total read counts
#  gds <- setCallFilter(gds, dp_qtile = c(0, 0.99))
#  # Filtering genotype calls based on reference read counts
#  # and alternative read counts separately.
#  gds <- setCallFilter(gds, ref_qtile = c(0, 0.99),
#                         alt_qtile = c(0, 0.99))

## -----------------------------------------------------------------------------
gds <- setCallFilter(gds, dp_count = c(5, Inf))
gds <- setMarFilter(gds, missing = 0.2)

## -----------------------------------------------------------------------------
# Here we select only one marker from each 150 bp stretch.
gds <- thinMarker(gds, range = 150) 

## -----------------------------------------------------------------------------
gds <- countGenotype(gds)
gds <- countRead(gds)

## -----------------------------------------------------------------------------
head(validMar(gds))
head(validSam(gds))

## -----------------------------------------------------------------------------
nmar(gds)
nmar(gds, valid = FALSE)

## -----------------------------------------------------------------------------
# Reset the filter on markers
gds <- resetMarFilter(gds) 
# Reset the filter on samples
gds <- resetSamFilter(gds) 
# Reset the filter on calls
gds <- resetCallFilter(gds) 
# Reset all filters
gds <- resetFilter(gds) 

## -----------------------------------------------------------------------------
p1 <- grep("Founder1", getSamID(gds), value = TRUE)
p2 <- grep("Founder2", getSamID(gds), value = TRUE)
gds <- setParents(gds, parents = c(p1, p2), mono = TRUE, bi = TRUE)

## -----------------------------------------------------------------------------
gds <- countGenotype(gds)

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "het")

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "raf")

## ----eval=FALSE---------------------------------------------------------------
#  # filter out markers with reference allele frequency
#  # less than 5% or more than 95%.
#  gds <- setMarFilter(gds, maf = 0.05)

## ----eval=FALSE---------------------------------------------------------------
#  # Filter out samples with more than 90% missing genotype calls,
#  # less than 5% heterozygosity, and less than 5% minor allele frequency.
#  gds <- setSamFilter(gds, missing = 0.9, het = 0.05, maf = 0.05)

## -----------------------------------------------------------------------------
# Filter out genotype calls supported by reads less than 2 reads.
gds <- setCallFilter(gds, dp_count = c(2, Inf))
# Filter out genotype calls supported by reads more than 100.
gds <- setCallFilter(gds, dp_count = c(0, 100))
# Filter out genotype calls based on quantile values 
# of read counts at markers in each sample.
gds <- setCallFilter(gds, ref_qtile = c(0, 0.9), alt_qtile = c(0, 0.9))

## -----------------------------------------------------------------------------
# Remove markers having more than 75% of missing genotype calls
gds <- setMarFilter(gds, missing = 0.2) 
nmar(gds)

## -----------------------------------------------------------------------------
gds <- countGenotype(gds, node = "filt")

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "het")

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "raf")

## -----------------------------------------------------------------------------
plotGBSR(gds, stats = "raf")

## -----------------------------------------------------------------------------
gds <- setMarFilter(gds, maf = 0.25)
nmar(gds)

## -----------------------------------------------------------------------------
gds <- countGenotype(gds)
histGBSR(gds, stats = "missing")

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "het")

## -----------------------------------------------------------------------------
histGBSR(gds, stats = "raf")

## -----------------------------------------------------------------------------
# Marker density
plotGBSR(gds, stats = "marker")

## -----------------------------------------------------------------------------
plotGBSR(gds, stats = "geno")

## -----------------------------------------------------------------------------
# As always the first step of breeding scheme would be "pairing" cross(es) of 
# founders, never be "selfing" and a "sibling" cross,
# the argument `crosstype` in initScheme() was deprecated on the update on April 6, 2023.
# gds <- initScheme(gds, crosstype = "pairing", mating = matrix(1:2, 2))
gds <- initScheme(gds, mating = matrix(1:2, 2))
gds <- addScheme(gds, crosstype = "selfing")

## -----------------------------------------------------------------------------
getParents(gds)

## ----eval=FALSE---------------------------------------------------------------
#  # As always the first step of breeding scheme would be "pairing" cross(es) of
#  # founders, never be "selfing" and a "sibling" cross,
#  # the argument `crosstype` in initScheme() was deprecated on the update on April 6, 2023.
#  # gds <- initScheme(gds, crosstype = "pair", mating = cbind(c(1:2), c(3:4), c(5:6), c(7:8)))
#  
#  gds <- initScheme(gds, mating = cbind(c(1:2), c(3:4), c(5:6), c(7:8)))

## ----eval=FALSE---------------------------------------------------------------
#  showScheme(gds)

## ----eval=FALSE---------------------------------------------------------------
#  gds <- addScheme(gds, crosstype = "pair", mating = cbind(c(9:10), c(11:12)))
#  
#  # Check IDs.
#  showScheme(gds)

## ----eval=FALSE---------------------------------------------------------------
#  gds <- addScheme(gds, crosstype = "pair", mating = cbind(c(13:14)))
#  
#  #' # Check IDs.
#  showScheme(gds)

## ----eval=FALSE---------------------------------------------------------------
#  # Inbreeding by five times selfing.
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")
#  gds <- addScheme(gds, crosstype = "self")

## ----eval=FALSE---------------------------------------------------------------
#  gds <- setParents(object = gds,
#                    parents = c("Founder1", "Founder2", "Founder3", "Founder4"))
#  gds <- initScheme(object = gds,
#                    mating = cbind(c(1, 2), c(1, 3), c(1,4)))
#  # The initScheme() function here automatically set 5, 6, and 7 as member ID to
#  # the progenies of the above maiting (pairing) combinations, respectively.
#  
#  # Then you have to assign member IDs to your samples to indicate which sample
#  # belongs to which pedigree.
#  gds <- assignScheme(object = gds,
#                      id = c(rep(5, 10), rep(6, 15), rep(7, 20)))

## ----eval=FALSE---------------------------------------------------------------
#  # Get sample ID
#  sample_id <- getSamID(object = gds)
#  
#  # Initialize the id vector
#  id <- integer(nsam(gds))
#  
#  # Assume your samples were named with prefixes that indicate which
#  # sample was derived from which combination of founders.
#  id[grepl("P1xP2", sample_id)] <- 5
#  id[grepl("P1xP3", sample_id)] <- 6
#  id[grepl("P1xP4", sample_id)] <- 7
#  gds <- assignScheme(object = gds, id = id)

## ----message=FALSE------------------------------------------------------------
gds <- estGeno(gds, iter = 4)

## ----eval=FALSE---------------------------------------------------------------
#  gds <- estGeno(gds, het_parent = TRUE, iter = 4)

## ----eval=FALSE---------------------------------------------------------------
#  # Following codes do the same.
#  gds <- estGeno(gds, iter = 1)
#  gds <- estGeno(gds, optim = FALSE)

## -----------------------------------------------------------------------------
est_geno <- getGenotype(gds, node = "cor")

## -----------------------------------------------------------------------------
founder_geno <- getGenotype(gds, node = "parents")

## -----------------------------------------------------------------------------
est_hap <- getHaplotype(gds)

## ----eval=FALSE---------------------------------------------------------------
#  out_fn <- tempfile("sample_est", fileext = ".vcf.gz")
#  gbsrGDS2VCF(gds, out_fn)
#  gbsrGDS2VCF(gds, out_fn, node = "cor")

## -----------------------------------------------------------------------------
gds <- reopenGDS(gds)

## -----------------------------------------------------------------------------
closeGDS(gds)

## -----------------------------------------------------------------------------
sessionInfo()