## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----message = FALSE, warning = FALSE-----------------------------------------
library(ASURAT)
library(SingleCellExperiment)
library(SummarizedExperiment)

## ----eval = FALSE-------------------------------------------------------------
# mat <- as.matrix(assay(sce, "logcounts"))
# assay(sce, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")

## ----message = FALSE----------------------------------------------------------
sce <- TENxPBMCData::TENxPBMCData(dataset = c("pbmc4k"))
pbmc_counts <- as.matrix(assay(sce, "counts"))
rownames(pbmc_counts) <- make.unique(rowData(sce)$Symbol_TENx)
colnames(pbmc_counts) <- make.unique(colData(sce)$Barcode)

## -----------------------------------------------------------------------------
pbmc_counts[1:5, 1:3]

## -----------------------------------------------------------------------------
pbmc <- SingleCellExperiment(assays = list(counts = pbmc_counts),
                             rowData = data.frame(gene = rownames(pbmc_counts)),
                             colData = data.frame(cell = colnames(pbmc_counts)))

## -----------------------------------------------------------------------------
dim(pbmc)

## -----------------------------------------------------------------------------
pbmc <- add_metadata(sce = pbmc, mitochondria_symbol = "^MT-")

## -----------------------------------------------------------------------------
pbmc <- remove_variables(sce = pbmc, min_nsamples = 10)

## -----------------------------------------------------------------------------
df <- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$nGenes)
ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
  ggplot2::labs(x = "Number of reads", y = "Number of genes") +
  ggplot2::theme_classic(base_size = 20)

## -----------------------------------------------------------------------------
df <- data.frame(x = colData(pbmc)$nReads, y = colData(pbmc)$percMT)
ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
  ggplot2::labs(x = "Number of reads", y = "Perc of MT reads") +
  ggplot2::theme_classic(base_size = 20)

## -----------------------------------------------------------------------------
pbmc <- remove_samples(sce = pbmc, min_nReads = 5000, max_nReads = 20000,
                       min_nGenes = 100, max_nGenes = 1e+10,
                       min_percMT = 0, max_percMT = 10)

## -----------------------------------------------------------------------------
df <- data.frame(x = 1:nrow(rowData(pbmc)),
                 y = sort(rowData(pbmc)$nSamples, decreasing = TRUE))
ggplot2::ggplot() + ggplot2::scale_y_log10() +
  ggplot2::geom_point(ggplot2::aes(x = df$x, y = df$y), size = 1, alpha = 1) +
  ggplot2::labs(x = "Rank of genes", y = "Mean read counts") +
  ggplot2::theme_classic(base_size = 20)

## -----------------------------------------------------------------------------
pbmc <- remove_variables_second(sce = pbmc, min_meannReads = 0.05)

## -----------------------------------------------------------------------------
assay(pbmc, "logcounts") <- log(assay(pbmc, "counts") + 1)

## -----------------------------------------------------------------------------
mat <- assay(pbmc, "logcounts")
assay(pbmc, "centered") <- sweep(mat, 1, apply(mat, 1, mean), FUN = "-")

## -----------------------------------------------------------------------------
sname <- "logcounts"
altExp(pbmc, sname) <- SummarizedExperiment(list(counts = assay(pbmc, sname)))

## ----message = FALSE----------------------------------------------------------
dictionary <- AnnotationDbi::select(org.Hs.eg.db::org.Hs.eg.db,
                                    key = rownames(pbmc),
                                    columns = "ENTREZID", keytype = "SYMBOL")
dictionary <- dictionary[!duplicated(dictionary$SYMBOL), ]
rowData(pbmc)$geneID <- dictionary$ENTREZID

## -----------------------------------------------------------------------------
set.seed(1)
nrand_samples <- 1000
mat <- t(as.matrix(assay(pbmc, "centered")))
ids <- sample(rownames(mat), nrand_samples, prob = NULL)
cormat <- cor(mat[ids, ], method = "spearman")

## -----------------------------------------------------------------------------
urlpath <- "https://github.com/keita-iida/ASURATDB/blob/main/genes2bioterm/"
load(url(paste0(urlpath, "20201213_human_CO.rda?raw=TRUE")))         # CO
load(url(paste0(urlpath, "20220308_human_MSigDB.rda?raw=TRUE")))     # MSigDB
load(url(paste0(urlpath, "20201213_human_GO_red.rda?raw=TRUE")))     # GO
load(url(paste0(urlpath, "20201213_human_KEGG.rda?raw=TRUE")))       # KEGG

## -----------------------------------------------------------------------------
d <- list(human_CO[["cell"]], human_MSigDB[["cell"]])
res <- do.call("rbind", d)
human_CB <- list(cell = res)

## -----------------------------------------------------------------------------
pbmcs <- list(CB = pbmc, GO = pbmc, KG = pbmc)
metadata(pbmcs$CB) <- list(sign = human_CB[["cell"]])
metadata(pbmcs$GO) <- list(sign = human_GO[["BP"]])
metadata(pbmcs$KG) <- list(sign = human_KEGG[["pathway"]])

## -----------------------------------------------------------------------------
pbmcs$CB <- remove_signs(sce = pbmcs$CB, min_ngenes = 2, max_ngenes = 1000)
pbmcs$GO <- remove_signs(sce = pbmcs$GO, min_ngenes = 2, max_ngenes = 1000)
pbmcs$KG <- remove_signs(sce = pbmcs$KG, min_ngenes = 2, max_ngenes = 1000)

## -----------------------------------------------------------------------------
set.seed(1)
pbmcs$CB <- cluster_genesets(sce = pbmcs$CB, cormat = cormat,
                             th_posi = 0.30, th_nega = -0.30)
set.seed(1)
pbmcs$GO <- cluster_genesets(sce = pbmcs$GO, cormat = cormat,
                             th_posi = 0.30, th_nega = -0.30)
set.seed(1)
pbmcs$KG <- cluster_genesets(sce = pbmcs$KG, cormat = cormat,
                             th_posi = 0.20, th_nega = -0.20)

## -----------------------------------------------------------------------------
pbmcs$CB <- create_signs(sce = pbmcs$CB, min_cnt_strg = 2, min_cnt_vari = 2)
pbmcs$GO <- create_signs(sce = pbmcs$GO, min_cnt_strg = 4, min_cnt_vari = 4)
pbmcs$KG <- create_signs(sce = pbmcs$KG, min_cnt_strg = 3, min_cnt_vari = 3)

## -----------------------------------------------------------------------------
pbmcs$GO <- remove_signs_redundant(sce = pbmcs$GO,
                                   similarity_matrix = human_GO$similarity_matrix$BP,
                                   threshold = 0.70, keep_rareID = TRUE)

## -----------------------------------------------------------------------------
keywords <- "Covid|COVID"
pbmcs$KG <- remove_signs_manually(sce = pbmcs$KG, keywords = keywords)

## ----eval = FALSE-------------------------------------------------------------
# keywords <- "cell|cyte"
# test <- select_signs_manually(sce = pbmcs$CB, keywords = keywords)

## -----------------------------------------------------------------------------
pbmcs$CB <- makeSignMatrix(sce = pbmcs$CB, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$GO <- makeSignMatrix(sce = pbmcs$GO, weight_strg = 0.5, weight_vari = 0.5)
pbmcs$KG <- makeSignMatrix(sce = pbmcs$KG, weight_strg = 0.5, weight_vari = 0.5)

## -----------------------------------------------------------------------------
rbind(head(assay(pbmcs$CB, "counts")[, 1:3], n = 4),
      tail(assay(pbmcs$CB, "counts")[, 1:3], n = 4))

## -----------------------------------------------------------------------------
pca_dims <- c(30, 30, 50)
tsne_dims <- c(2, 2, 3)
for(i in seq_along(pbmcs)){
  set.seed(1)
  mat <- t(as.matrix(assay(pbmcs[[i]], "counts")))
  res <- Rtsne::Rtsne(mat, dim = tsne_dims[i], pca = TRUE,
                      initial_dims = pca_dims[i])
  reducedDim(pbmcs[[i]], "TSNE") <- res[["Y"]]
}

## -----------------------------------------------------------------------------
df <- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
                                        color = "black", size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC (cell type)", x = "tSNE_1", y = "tSNE_2") +
  ggplot2::theme_classic(base_size = 15)

## -----------------------------------------------------------------------------
df <- as.data.frame(reducedDim(pbmcs$GO, "TSNE"))
ggplot2::ggplot() + ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2]),
                                        color = "black", size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC (function)", x = "tSNE_1", y = "tSNE_2") +
  ggplot2::theme_classic(base_size = 15)

## -----------------------------------------------------------------------------
df <- as.data.frame(reducedDim(pbmcs$KG, "TSNE"))
plot_dataframe3D(dataframe3D = df, theta = 45, phi = 30, title = "PBMC (pathway)",
                 xlabel = "tSNE_1", ylabel = "tSNE_2", zlabel = "tSNE_3")

## ----message = FALSE, warning = FALSE, results = "hide"-----------------------
resolutions <- c(0.20, 0.20, 0.10)
ds <- list(seq_len(20), seq_len(20), seq_len(20))
for(i in seq_along(pbmcs)){
  surt <- Seurat::as.Seurat(pbmcs[[i]], counts = "counts", data = "counts")
  mat <- as.matrix(assay(pbmcs[[i]], "counts"))
  surt[["SSM"]] <- Seurat::CreateAssayObject(counts = mat)
  Seurat::DefaultAssay(surt) <- "SSM"
  surt <- Seurat::ScaleData(surt, features = rownames(surt))
  surt <- Seurat::RunPCA(surt, features = rownames(surt))
  surt <- Seurat::FindNeighbors(surt, reduction = "pca", dims = ds[[i]])
  surt <- Seurat::FindClusters(surt, resolution = resolutions[i])
  temp <- Seurat::as.SingleCellExperiment(surt)
  colData(pbmcs[[i]])$seurat_clusters <- colData(temp)$seurat_clusters
}

## -----------------------------------------------------------------------------
labels <- colData(pbmcs$CB)$seurat_clusters
df <- as.data.frame(reducedDim(pbmcs$CB, "TSNE"))
ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC (cell type)", x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 15) + ggplot2::scale_colour_hue() +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))

## -----------------------------------------------------------------------------
labels <- colData(pbmcs$GO)$seurat_clusters
df <- as.data.frame(reducedDim(pbmcs$GO, "TSNE"))
ggplot2::ggplot() +
  ggplot2::geom_point(ggplot2::aes(x = df[, 1], y = df[, 2], color = labels),
                      size = 1, alpha = 1) +
  ggplot2::labs(title = "PBMC (function)", x = "tSNE_1", y = "tSNE_2", color = "") +
  ggplot2::theme_classic(base_size = 15) +
  ggplot2::scale_colour_brewer(palette = "Set1") +
  ggplot2::guides(colour = ggplot2::guide_legend(override.aes = list(size = 4)))

## -----------------------------------------------------------------------------
labels <- colData(pbmcs$KG)$seurat_clusters
colors <- scales::brewer_pal(palette = "Set2")(length(unique(labels)))[labels]
df <- as.data.frame(reducedDim(pbmcs$KG, "TSNE")[, seq_len(3)])
plot_dataframe3D(dataframe3D = df, labels = labels, colors = colors,
                 theta = 45, phi = 30, title = "PBMC (pathway)",
                 xlabel = "tSNE_1", ylabel = "tSNE_2", zlabel = "tSNE_3")

## ----message = FALSE, warning = FALSE-----------------------------------------
surt <- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(pbmcs$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::ScaleData(surt, features = rownames(surt))
surt <- Seurat::RunPCA(surt, features = rownames(surt))
surt <- Seurat::CellCycleScoring(surt, s.features = Seurat::cc.genes$s.genes,
                                 g2m.features = Seurat::cc.genes$g2m.genes)
temp <- Seurat::as.SingleCellExperiment(surt)
colData(pbmcs$CB)$Phase <- colData(temp)$Phase

## ----message = FALSE, results = "hide"----------------------------------------
for(i in seq_along(pbmcs)){
  set.seed(1)
  labels <- colData(pbmcs[[i]])$seurat_clusters
  pbmcs[[i]] <- compute_sepI_all(sce = pbmcs[[i]], labels = labels,
                                 nrand_samples = 200)
}

## ----message = FALSE----------------------------------------------------------
set.seed(1)
surt <- Seurat::as.Seurat(pbmcs$CB, counts = "counts", data = "counts")
mat <- as.matrix(assay(altExp(pbmcs$CB), "counts"))
surt[["GEM"]] <- Seurat::CreateAssayObject(counts = mat)
Seurat::DefaultAssay(surt) <- "GEM"
surt <- Seurat::SetIdent(surt, value = "seurat_clusters")
res <- Seurat::FindAllMarkers(surt, only.pos = TRUE,
                              min.pct = 0.50, logfc.threshold = 0.50)
metadata(pbmcs$CB)$marker_genes$all <- res

## -----------------------------------------------------------------------------
# Significant signs
marker_signs <- list()
keywords <- "MESENCHYMAL|LIMB|PANCREAS"
for(i in seq_along(pbmcs)){
  marker_signs[[i]] <- metadata(pbmcs[[i]])$marker_signs$all
  marker_signs[[i]] <-
    marker_signs[[i]][!grepl(keywords, marker_signs[[i]]$Description), ]
  marker_signs[[i]] <- dplyr::group_by(marker_signs[[i]], Ident_1)
  marker_signs[[i]] <- dplyr::slice_max(marker_signs[[i]], sepI, n = 2)
  marker_signs[[i]] <- dplyr::slice_min(marker_signs[[i]], Rank, n = 1)
}
# Significant genes
marker_genes_CB <- metadata(pbmcs$CB)$marker_genes$all
marker_genes_CB <- dplyr::group_by(marker_genes_CB, cluster)
marker_genes_CB <- dplyr::slice_min(marker_genes_CB, p_val_adj, n = 2)
marker_genes_CB <- dplyr::slice_max(marker_genes_CB, avg_log2FC, n = 1)

## -----------------------------------------------------------------------------
# ssm_list
sces_sub <- list() ; ssm_list <- list()
for(i in seq_along(pbmcs)){
  sces_sub[[i]] <- pbmcs[[i]][rownames(pbmcs[[i]]) %in% marker_signs[[i]]$SignID, ]
  ssm_list[[i]] <- assay(sces_sub[[i]], "counts")
}
names(ssm_list) <- c("SSM_cell", "SSM_function", "SSM_pathway")
# gem_list
expr_sub <- altExp(pbmcs$CB, "logcounts")
expr_sub <- expr_sub[rownames(expr_sub) %in% marker_genes_CB$gene]
gem_list <- list(GeneExpr = assay(expr_sub, "counts"))
# ssmlabel_list
labels <- list() ; ssmlabel_list <- list()
for(i in seq_along(pbmcs)){
  tmp <- colData(sces_sub[[i]])$seurat_clusters
  labels[[i]] <- data.frame(label = tmp)
  n_groups <- length(unique(tmp))
  if(i == 1){
    labels[[i]]$color <- scales::hue_pal()(n_groups)[tmp]
  }else if(i == 2){
    labels[[i]]$color <- scales::brewer_pal(palette = "Set1")(n_groups)[tmp]
  }else if(i == 3){
    labels[[i]]$color <- scales::brewer_pal(palette = "Set2")(n_groups)[tmp]
  }
  ssmlabel_list[[i]] <- labels[[i]]
}
names(ssmlabel_list) <- c("Label_cell", "Label_function", "Label_pathway")
# gemlabel_list
label_CC <- data.frame(label = colData(pbmcs$CB)$Phase, color = NA)
gemlabel_list <- list(CellCycle = label_CC)

## ----message = FALSE, warning = FALSE-----------------------------------------
set.seed(1)
plot_multiheatmaps(ssm_list = ssm_list, gem_list = gem_list,
                   ssmlabel_list = ssmlabel_list, gemlabel_list = gemlabel_list,
                   nrand_samples = 100, show_row_names = TRUE, title = "PBMC")

## -----------------------------------------------------------------------------
labels <- colData(pbmcs$CB)$seurat_clusters
variable <- "GO:0042100-V"
description <- "B cell proliferation"
subsce <- pbmcs$GO[which(rownames(pbmcs$GO) == variable), ]
df <- as.data.frame(t(as.matrix(assay(subsce, "counts"))))
ggplot2::ggplot() +
  ggplot2::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
                                    fill = labels), trim = FALSE, size = 0.5) +
  ggplot2::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
                        width = 0.15, alpha = 0.6) +
  ggplot2::labs(title = paste0(variable, "\n", description),
                x = "Cluster (cell type)", y = "Sign score") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(legend.position = "none") + ggplot2::scale_fill_hue()

## -----------------------------------------------------------------------------
vname <- "CD79A"
sub <- altExp(pbmcs$CB, "logcounts")
sub <- sub[rownames(sub) == vname, ]
labels <- colData(pbmcs$CB)$seurat_clusters
df <- as.data.frame(t(assay(sub, "counts")))
ggplot2::ggplot() +
  ggplot2::geom_violin(ggplot2::aes(x = as.factor(labels), y = df[, 1],
                                    fill = labels), trim = FALSE, size = 0.5) +
  ggplot2::geom_boxplot(ggplot2::aes(x = as.factor(labels), y = df[, 1]),
                        width = 0.15, alpha = 0.6) +
  ggplot2::labs(title = vname, x = "Cluster (cell type)",
                y = "Normalized expression") +
  ggplot2::theme_classic(base_size = 20) +
  ggplot2::theme(legend.position = "none") + ggplot2::scale_fill_hue()

## -----------------------------------------------------------------------------
sessionInfo()