Overview

This notebook performs cell type marker analysis on a Seurat object, including: - Finding cell type specific markers - Calculating differential expression within cell types - Performing pseudo-bulk analysis

Load Required Libraries

library(jsonlite)
library(Matrix)
library(data.table)
library(Seurat)
library(tidyverse)

Set Parameters

seurat_obj_file <- "example_data/snRNAseq_MTG_10samples.rds" # Path to your Seurat object file
output_dir <- "example_data/snRNAseq_MTG_10samples" # Directory to save output files, usually the dataset name
cluster_col <- "MajorCellTypes" # Column name in metadata for clustering information
condition_col <- "case" # Column name in metadata for condition information
sample_col <- "sample_id" # Column name in metadata for sample information
seurat_type <- "snrnaseq" # Type of Seurat object: scrnaseq, snrnaseq, snatacseq, scatacseq, visiumst

# Create output directory
clustermarkers_folder = paste0(output_dir, "/clustermarkers")
if (!dir.exists(clustermarkers_folder)) {
  dir.create(clustermarkers_folder, recursive = TRUE)
}

Load and Validate Seurat Object

cat("Loading RDS data...\n")
## Loading RDS data...
seurat_obj <- readRDS(seurat_obj_file)

# Validation checks
if (!inherits(seurat_obj, "Seurat")) {
  stop("The provided file is not a valid Seurat object.")
}

if(seurat_type == "scrnaseq" | seurat_type == "snrnaseq"){
      # Check if the Seurat object has the necessary assay
    if (!"RNA" %in% names(seurat_obj@assays)) {
        stop("The Seurat object does not contain the 'RNA' assay.")
    }
    # Check if the Seurat object has the necessary assay data
    if (!"data" %in% slotNames(seurat_obj@assays$RNA)) {
        stop("The Seurat object does not contain the 'data' slot in the 'RNA' assay.")
    }
} else if (seurat_type == "snatacseq" | seurat_type == "scatacseq") {
    # Check if the Seurat object has the necessary assay
    if (!"ATAC" %in% names(seurat_obj@assays)) {
        stop("The Seurat object does not contain the 'ATAC' assay.")
    }
    # Check if the Seurat object has the necessary assay data
    if (!"counts" %in% slotNames(seurat_obj@assays$ATAC)) {
        stop("The Seurat object does not contain the 'counts' slot in the 'ATAC' assay.")
    }
} else if (seurat_type == "visiumst") {
    # Check if the Seurat object has the necessary assay
    if (!"Spatial" %in% names(seurat_obj@assays)) {
        stop("The Seurat object does not contain the 'Spatial' assay.")
    }
    # Check if the Seurat object has the necessary assay data
    if (!"data" %in% slotNames(seurat_obj@assays$Spatial)) {
        stop("The Seurat object does not contain the 'data' slot in the 'Spatial' assay.")
    }
}

# Check required columns exist in metadata
required_cols <- c(cluster_col, condition_col, sample_col)
missing_cols <- required_cols[!required_cols %in% colnames(seurat_obj@meta.data)]
if (length(missing_cols) > 0) {
  stop(paste("Missing required columns in metadata:", paste(missing_cols, collapse = ", ")))
}

Find Cell Type Markers

print("Extracting cell type specific markers...")
## [1] "Extracting cell type specific markers..."
# Find markers for each cell type
cell_type_markers <- FindAllMarkers(seurat_obj,group.by = cluster_col)

# Convert to data.table and save
cell_type_markers_dt <- as.data.table(cell_type_markers)
fwrite(cell_type_markers_dt, 
       paste0(clustermarkers_folder,"/cluster_FindAllMarkers.csv"), 
       row.names = FALSE)

Differential Expression Analysis

print("Calculating differential expression within each cell type...")
## [1] "Calculating differential expression within each cell type..."
# Define the cell types
cell_types <- unique(seurat_obj@meta.data[[cluster_col]])
# Initialize an empty list to store results
de_results_list <- list()
de_results_topN_list <- list()

# Loop through each cell type
for (cell_type in cell_types) {
    # Print the current cell type # nolint: whitespace_linter, indentation_linter.
    print(paste("Processing cell type:", cell_type))
    # Subset the Seurat object to the current cell type # nolint
    subset_obj <- seurat_obj[, seurat_obj@meta.data[[cluster_col]] == cell_type]

    # Calculate differential expression between conditions
    condition_ls <- unique(seurat_obj@meta.data[[condition_col]])
    # if there are more than 2 conditions, compare all possible combinations
    if (length(condition_ls) > 2) {
        combinations <- combn(condition_ls, 2)
        for (i in 1:ncol(combinations)) {
            c1 <- combinations[1, i]
            c2 <- combinations[2, i]
            de_results <- FindMarkers(subset_obj, ident.1 = c1, ident.2 = c2, group.by = condition_col)
            ## add a column for the gene names
            de_results$gene <- rownames(de_results)
            
            ## filter out genes with padj > 0.05
            # de_results <- de_results[de_results$p_val_adj < 0.05, ]
            
            ## get top 10 upregulated DE genes and downregulated, base on logFC
            de_results_topN <- rbind(de_results[order(de_results$avg_log2FC, decreasing = TRUE), ][1:10, ],de_results[order(de_results$avg_log2FC, decreasing = FALSE), ][1:10, ])
        
            # Store the results in the list
            de_results_list[[paste(cell_type, paste(c1, c2, sep = "vs"), sep = ".")]] <- de_results
            de_results_topN_list[[paste(cell_type, paste(c1, c2, sep = "vs"), sep = ".")]] <- de_results_topN
        }
    } else {
        de_results <- FindMarkers(subset_obj, ident.1 = condition_ls[1], ident.2 = condition_ls[2], group.by = condition_col)
        ## add a column for the gene names
        de_results$gene <- rownames(de_results)

        ## filter out genes with padj > 0.05
        # de_results <- de_results[de_results$p_val_adj < 0.05, ]
        
        ## get top 10 upregulated DE genes and downregulated, base on logFC
        de_results_topN <- rbind(de_results[order(de_results$avg_log2FC, decreasing = TRUE), ][1:10, ],de_results[order(de_results$avg_log2FC, decreasing = FALSE), ][1:10, ])
        
        de_results_list[[paste(cell_type, paste(condition_ls[1], condition_ls[2], sep = "vs"), sep = ".")]] <- de_results
        de_results_topN_list[[paste(cell_type, paste(condition_ls[1], condition_ls[2], sep = "vs"), sep = ".")]] <- de_results_topN

    }
}
## [1] "Processing cell type: GLU_Neurons"
## [1] "Processing cell type: GABA_Neurons"
## [1] "Processing cell type: Astrocytes"
## [1] "Processing cell type: Microglia"
## [1] "Processing cell type: Endothelial_Cells"
## [1] "Processing cell type: Pericytes"
## [1] "Processing cell type: Oligodendrocytes"
## [1] "Processing cell type: OPCs"
## [1] "Processing cell type: Fibroblast_Like_Cells"
## [1] "Processing cell type: T_Cells"
# Convert the list to a data.table
de_results_dt <- rbindlist(de_results_list, idcol = "cluster_DE")
de_results_topN_dt <- rbindlist(de_results_topN_list, idcol = "cluster_DE")
# Save to CSV
fwrite(de_results_dt, paste0(clustermarkers_folder, "/cluster_DEGs.csv"), row.names = FALSE)
fwrite(de_results_topN_dt, paste0(clustermarkers_folder, "/cluster_DEGs_topN.csv"), row.names = FALSE)

Pseudo-bulk Analysis

print("Calculating pseudo-bulk differential expression within each cell type...")
## [1] "Calculating pseudo-bulk differential expression within each cell type..."
# Set assay based on data type
assay <- switch(seurat_type,
               "scrnaseq" = "RNA",
               "snrnaseq" = "RNA",
               "snATACseq" = "ATAC",
               "visiumst" = "Spatial",
               stop("Invalid seurat_type"))

## use the counts assay
pb_obj <- PseudobulkExpression(seurat_obj,assays = assay, layer = "counts", method= "aggregate", return.seurat = T, group.by = c(sample_col, cluster_col, condition_col))

pb_obj@meta.data[[cluster_col]] <- gsub("-", "_", pb_obj@meta.data[[cluster_col]])
pb_obj@meta.data[["orig.ident"]] <- gsub("-", "_", pb_obj@meta.data[["orig.ident"]])

metadata = pb_obj@meta.data
## rename sample_id to sampleId
colnames(metadata)[colnames(metadata) == sample_col] <- "sampleId"
colnames(metadata)[colnames(metadata) == condition_col] <- "condition"

write.csv(metadata, paste0(clustermarkers_folder, "/metadata_sample_cluster_condition.csv"), row.names = FALSE)

expr_matrix <- GetAssayData(pb_obj, assay = assay, layer = "data")
colnames(expr_matrix) <- gsub("-", "_", colnames(expr_matrix))
write.csv(expr_matrix, paste0(clustermarkers_folder, "/pb_expr_matrix.csv"), row.names = TRUE)
# Define the cell types
cell_types <- unique(pb_obj@meta.data[[cluster_col]])

# Initialize an empty list to store results
pseudo_bulk_list <- list()
pseudo_bulk_topN_list <- list()
# Loop through each cell type
for (cell_type in cell_types) {
    # Print the current cell type # nolint: whitespace_linter, indentation_linter.
    print(paste("Processing cell type:", cell_type))
    # Subset the Seurat object to the current cell type # nolint
    subset_obj <- pb_obj[, pb_obj@meta.data[[cluster_col]] == cell_type]

    # Calculate differential expression between conditions
    # Here, we assume that the conditions are stored in the "Condition" metadata column 
    # You may need to adjust this based on your actual metadata structure
    condition_ls <- unique(subset_obj@meta.data[[condition_col]])
    # if there are more than 2 conditions, compare all possible combinations
    if (length(condition_ls) > 2) {
        combinations <- combn(condition_ls, 2)
        for (i in 1:ncol(combinations)) {
            c1 <- combinations[1, i]
            c2 <- combinations[2, i]
            de_results <- FindMarkers(subset_obj, ident.1 = c1, ident.2 = c2, group.by = condition_col, test.use = "wilcox")
            ## add a column for the gene names
            de_results$gene <- rownames(de_results)
            
            ## filter out genes with padj > 0.05
            # de_results <- de_results[de_results$p_val_adj < 0.1, ]

            ## get top 10 upregulated DE genes and downregulated, base on logFC
            de_results_topN <- rbind(de_results[order(de_results$avg_log2FC, decreasing = TRUE), ][1:10, ],de_results[order(de_results$avg_log2FC, decreasing = FALSE), ][1:10, ])
            
            # Store the results in the list
            pseudo_bulk_list[[paste(cell_type, paste(c1, c2, sep = "vs"), sep = ".")]] <- de_results
            pseudo_bulk_topN_list[[paste(cell_type, paste(c1, c2, sep = "vs"), sep = ".")]] <- de_results_topN
        }
    } else {
        de_results <- FindMarkers(subset_obj, ident.1 = condition_ls[1], ident.2 = condition_ls[2], group.by = condition_col,test.use = "wilcox")
        ## add a column for the gene names
        de_results$gene <- rownames(de_results)
        
        ## filter out genes with padj > 0.05
        # de_results <- de_results[de_results$p_val_adj < 0.05, ]
        
        ## get top 10 upregulated DE genes and downregulated, base on logFC
        de_results_topN <- rbind(de_results[order(de_results$avg_log2FC, decreasing = TRUE), ][1:10, ],de_results[order(de_results$avg_log2FC, decreasing = FALSE), ][1:10, ])
        
        # Store the results in the list
        pseudo_bulk_list[[paste(cell_type, paste(condition_ls[1], condition_ls[2], sep = "vs"), sep = ".")]] <- de_results
        pseudo_bulk_topN_list[[paste(cell_type, paste(condition_ls[1], condition_ls[2], sep = "vs"), sep = ".")]] <- de_results_topN
    }
}
## [1] "Processing cell type: Astrocytes"
## [1] "Processing cell type: Endothelial_Cells"
## [1] "Processing cell type: Fibroblast_Like_Cells"
## [1] "Processing cell type: GABA_Neurons"
## [1] "Processing cell type: GLU_Neurons"
## [1] "Processing cell type: Microglia"
## [1] "Processing cell type: Oligodendrocytes"
## [1] "Processing cell type: OPCs"
## [1] "Processing cell type: Pericytes"
## [1] "Processing cell type: T_Cells"
# Convert the list to a data.table
pseudo_bulk_dt <- rbindlist(pseudo_bulk_list, idcol = "cluster_DE")
pseudo_bulk_topN_dt <- rbindlist(pseudo_bulk_topN_list, idcol = "cluster_DE")
# Save to CSV
fwrite(pseudo_bulk_dt, paste0(clustermarkers_folder, "/cluster_pb_DEGs.csv"), row.names = FALSE)
fwrite(pseudo_bulk_topN_dt, paste0(clustermarkers_folder, "/cluster_pb_DEGs_topN.csv"), row.names = FALSE)
pooled_topN_DEGs = pseudo_bulk_topN_dt$gene
## remove duplicates
pooled_topN_DEGs <- unique(pooled_topN_DEGs)
## subset expr_matrix to only include pooled_topN_DEGs
expr_matrix_pooled_topN_DGEs <- expr_matrix[pooled_topN_DEGs, ]
## save the pooled_topN_DEGs expression matrix
write.csv(expr_matrix_pooled_topN_DGEs, paste0(clustermarkers_folder, "/pb_expr_matrix_DEGs_topN.csv"), row.names = TRUE)

cat("Done! cluster markers is done! ^_^ ...\n")
## Done! cluster markers is done! ^_^ ...