Overview

This notebook extracts data from a Seurat object and saves it in formats suitable for downstream analysis.

Usage

Run as an R script:

Rscript extract_SC_v5.R seurat_obj.rds output_directory cluster_column

Load Libraries

library(jsonlite)
library(Matrix)
library(data.table)
library(Seurat)
library(tidyverse)
library(jsonlite)
library(presto)

Set Arguments

You can set these manually for interactive use:

# Set these manually for notebook use
seurat_obj_file <- "example_data/snRNAseq_MTG_10samples.rds"  # Path to your Seurat object fil
output_dir <- "example_data/snRNAseq_MTG_10samples"  # Directory to save output files, usually the dataset name
cluster_col <- "MajorCellTypes"  # Column name in metadata for clustering information

Create Output Directory

if (!dir.exists(output_dir)) {
  dir.create(output_dir, recursive = TRUE)
}

Load Seurat Object

cat("Loading RDS data...\n")
## Loading RDS data...
seurat_obj <- readRDS(seurat_obj_file)
if (!inherits(seurat_obj, "Seurat")) {
  stop("The provided file is not a valid Seurat object.")
}
capture.output(str(seurat_obj), file = paste0(output_dir, "/seurat_obj_structure.txt"))

Check Seurat Object Structure

v5 Seurat objects have a different structure compared to v4, so we need to ensure the object contains the expected slots and layers.

if (!"RNA" %in% names(seurat_obj@assays)) {
  stop("The Seurat object does not contain the 'RNA' assay.")
}
if (!"data" %in% slotNames(seurat_obj@assays$RNA)) {
  stop("The Seurat object does not contain the 'data' slot in the 'RNA' assay.")
}
if (!"meta.data" %in% slotNames(seurat_obj)) {
  stop("The Seurat object does not contain the 'meta.data' slot.")
}
if (!"umap" %in% names(seurat_obj@reductions)) {
  stop("The Seurat object does not contain the 'umap' reduction.")
}
if (!cluster_col %in% colnames(seurat_obj@meta.data)) {
  stop(paste("The cluster column", cluster_col, "does not exist in the metadata."))
}

Extract and Save Metadata

cat("Saving metadata...\n")
## Saving metadata...
metadata <- seurat_obj@meta.data
write.csv(metadata, file = paste0(output_dir, "/raw_metadata.csv"), row.names = TRUE)

cat("Saving metadata feature names...\n")
## Saving metadata feature names...
column_names <- colnames(metadata)
write_json(column_names, path = file.path(output_dir, "raw_metadata_columns.json"), pretty = TRUE)

Save UMAP Embedding

cat("Saving UMAP embedding...\n")
## Saving UMAP embedding...
umap_embeddings <- seurat_obj@reductions$umap@cell.embeddings
colnames(umap_embeddings) <- toupper(colnames(umap_embeddings))
write.csv(umap_embeddings, file = paste0(output_dir, "/raw_umap_embeddings.csv"), row.names = TRUE)

Extract and Save Normalized Counts

cat("Saving normalized counts...\n")
## Saving normalized counts...
normalized_counts <- seurat_obj@assays$RNA@data # Sparse matrix

# Convert sparse matrix to triplet format (long format)
long_data <- summary(normalized_counts)

# Get row (gene) and column (cell) names
long_data$Gene <- rownames(normalized_counts)[long_data$i]
long_data$Cell <- colnames(normalized_counts)[long_data$j]
long_data$Expression <- long_data$x

# Keep only necessary columns
long_data <- long_data[, c("Gene", "Cell", "Expression")]

# Filter out zero values
nonzero_data <- long_data[long_data$Expression > 0, ]

# Convert to data.table for efficiency
nonzero_data <- as.data.table(nonzero_data)
# Save to CSV with index as the first column in a fast way
fwrite(nonzero_data, file = paste0(output_dir, "/raw_normalized_counts.csv"), row.names = FALSE)

cat("Done! Data extraction is complete!
You can find the output files in the directory:", output_dir, "\n")
## Done! Data extraction is complete!
## You can find the output files in the directory: example_data/snRNAseq_MTG_10samples