## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  collapse = TRUE,
  comment = "#>"
)

## ----installation, eval=FALSE-------------------------------------------------
# if (!require("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# 
# BiocManager::install("CellMentor")

## ----load_packages------------------------------------------------------------
set.seed(100)

library(CellMentor)
library(Matrix)
library(ggplot2)
library(scRNAseq)
library(SingleCellExperiment)
library(scater) 

## ----load_data----------------------------------------------------------------
# Loading reference dataset (Baron)
baron <- h.baron_dataset()
reference_matrix <- baron$data
reference_celltypes <- baron$celltypes

# Loading query dataset (Muraro)
muraro <- muraro_dataset()
query_matrix <- muraro$data
query_celltypes <- muraro$celltypes # This would be unknown in a real application
# We keep it here for evaluation

## -----------------------------------------------------------------------------
# Function to create balanced subsets
create_subset <- function(matrix, celltypes, cells_per_type = 30) {
  # Get unique cell types
  unique_types <- unique(celltypes)

  # Select cells for each type
  selected_cells <- c()
  for (cell_type in unique_types) {
    # Get cells of this type
    type_cells <- names(celltypes)[celltypes == cell_type]

    # If fewer cells than requested, take all of them
    n_to_select <- min(cells_per_type, length(type_cells))

    # Randomly select cells
    selected <- sample(type_cells, n_to_select)
    selected_cells <- c(selected_cells, selected)
  }

  # Return subset
  list(
    matrix = matrix[, selected_cells],
    celltypes = celltypes[selected_cells]
  )
}

# Create balanced subsets with 30 cells per type
baron_subset <- create_subset(reference_matrix, reference_celltypes, 30)
muraro_subset <- create_subset(query_matrix, query_celltypes, 30)

# Update variable names for clarity
reference_matrix <- baron_subset$matrix
reference_celltypes <- baron_subset$celltypes
query_matrix <- muraro_subset$matrix
query_celltypes <- muraro_subset$celltypes # This would be unknown in a real application
# We keep it here for evaluation

## ----create_object------------------------------------------------------------
# Create the CSFNMF object
csfnmf_obj <- CreateCSFNMFobject(
  ref_matrix = reference_matrix,
  ref_celltype = reference_celltypes,
  data_matrix = query_matrix,
  norm = TRUE,
  most.variable = TRUE,
  scale = TRUE,
  scale_by = "cells",
  verbose = TRUE,
  num_cores = 1
)

## ----run_cellmentor-----------------------------------------------------------
# Run CellMentor with hyperparameter optimization
optimal_params <- CellMentor(
  csfnmf_obj,
  alpha_range = c(1, 5), # Limited alpha range
  beta_range = c(5), # use only one beta for speed
  gamma_range = c(0.1), # use only one gamma for speed
  delta_range = c(1), # use only one delta for speed
  num_cores = 1,
  verbose = TRUE
)

# Get best model
best_model <- optimal_params$best_model
K_VALUE <- cm_rank(best_model)

## ----project_data-------------------------------------------------------------
# Project query data onto the learned space
h_project <- project_data(
  W = W(best_model), # Learned gene weights
  X = data_matrix(matrices(best_model)), # Query data matrix
  num_cores = 5,
  verbose = TRUE
)

## -----------------------------------------------------------------------------
# Ensure unique rownames for genes
rownames(query_matrix) <- make.unique(rownames(query_matrix))

sce <- SingleCellExperiment(
  assays = list(counts = query_matrix)
)

# Store any per-cell annotations you have (e.g., CellMentor cell types)
colData(sce)$celltype <- query_celltypes

# Cell embeddings (cells x K)
H_cell <- t(as.matrix(h_project))      # ensure cells x K
colnames(H_cell) <- paste0("CM", seq_len(ncol(H_cell)))
reducedDim(sce, "CellMentor") <- H_cell

# Gene loadings (genes x K) — align to SCE row order
W_mat <- as.matrix(W(best_model))      # genes x K
# Make sure rownames are set and alignable
if (!is.null(rownames(W_mat))) {
  # Match to SCE rows; missing become NA
  W_mat <- W_mat[match(rownames(sce), rownames(W_mat)), , drop = FALSE]
  # Add each factor as a rowData column for convenience
  for (j in seq_len(ncol(W_mat))) {
    rowData(sce)[[paste0("CM_loading_", j)]] <- W_mat[, j]
  }
}

# UMAP using the precomputed CellMentor embedding
sce <- runUMAP(
  sce,
  dimred = "CellMentor",
  name   = "UMAP_CellMentor",
  ncomponents = 2
)

# Quick plots
plotReducedDim(sce, dimred = "UMAP_CellMentor", colour_by = "celltype")

## ----custom_data_workflow, eval=FALSE-----------------------------------------
# library(Matrix)
# library(CellMentor)
# 
# # 1) Build CSFNMF object
# csfnmf_obj <- CreateCSFNMFobject(
#   ref_matrix = ref_counts,
#   ref_celltype = ref_celltypes, # names(ref_celltypes) == colnames(ref_counts)
#   data_matrix = qry_counts,
#   norm = TRUE,
#   most.variable = TRUE,
#   scale = TRUE,
#   scale_by = "cells",
#   num_cores = 1,
#   verbose = TRUE
# )
# 
# # 2) Hyperparameter search & training
# optimal <- CellMentor(csfnmf_obj)
# 
# # 3) Get best model
# best_model <- optimal$best_model
# 
# # 4) Project data
# h_project <- project_data(
#   W = W(best_model),
#   X = data_matrix(matrices(best_model))
# )
# 
# # 5) Optional: Seurat integration & UMAP
# # (Follow the same steps as in the demo section above)

## ----session_info-------------------------------------------------------------
sessionInfo()