Single-cell metadata and data processing¶
This notebook processes single-cell metadata and expression data for visualization, including:
- Metadata processing and renaming
- UMAP embedding processing
- Expression data processing
- Pseudo-bulk calculation
Self-defined functions¶
In [1]:
import json
import re
def is_categorical(arr, unique_threshold=20):
    """
    Determine if a list of values behaves like a categorical variable.

    Parameters:
        arr (list): Input list.
        unique_threshold (int or float): Maximum number of unique values (or, if < 1,
            maximum ratio of unique values to list length) for the list to be
            considered categorical.

    Returns:
        bool: True if the list is considered categorical, False otherwise.
    """
    if len(arr) == 0:
        return True
    # Count unique values
    unique_values = len(set(arr))
    if unique_threshold < 1:
        is_few_uniques = unique_values / len(arr) <= unique_threshold
    else:
        is_few_uniques = unique_values <= unique_threshold
    return is_few_uniques

def dumps_compact_lists(obj, indent=4):
    pretty = json.dumps(obj, indent=indent)

    # Match any list that spans multiple lines
    def compact_list(match):
        # Extract the list content and remove newlines/extra spaces
        items = match.group(1).strip().splitlines()
        compacted_items = [item.strip().rstrip(',') for item in items if item.strip()]
        return '[' + ','.join(compacted_items) + ']'

    # Replace multi-line lists with compact single-line ones
    compacted = re.sub(r'\[\s*((?:.|\n)*?)\s*\]', compact_list, pretty)
    return compacted
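As a quick, hypothetical sanity check (not part of the pipeline), `is_categorical` flags low-cardinality columns and `dumps_compact_lists` keeps short lists on a single line when serializing to JSON:

# Hypothetical usage of the two helpers (values are illustrative only)
print(is_categorical(["Neuron", "Astrocyte", "Neuron"]))         # True: 2 unique values <= 20
print(is_categorical(list(range(1000)), unique_threshold=0.2))   # False: every value is unique
example = {"cell_type": {0: ["Astrocytes", 120], 1: ["Neurons", 340]}}
print(dumps_compact_lists(example, indent=4))                    # inner lists stay on one line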
Imports¶
In [2]:
import pandas as pd
import json
import os
import sys
import functools
print = functools.partial(print, flush=True)
Set Parameters¶
Set up the paths and parameters for the analysis.
In [3]:
dataset_path = "example_data/snRNAseq_MTG_10samples"
kept_features = ["nCount_RNA", "nFeature_RNA", "sex", "MajorCellTypes", "updrs", "Complex_Assignment", "mmse", "sample_id", "case"]
sample_col = "sample_id"
cluster_col = "MajorCellTypes"
condition_col = "case"
print("Dataset path: ", dataset_path)
print("Kept features: ", kept_features)
print("Sample column: ", sample_col)
print("Cluster column: ", cluster_col)
print("Condition column: ", condition_col)
Dataset path:  example_data/snRNAseq_MTG_10samples
Kept features:  ['nCount_RNA', 'nFeature_RNA', 'sex', 'MajorCellTypes', 'updrs', 'Complex_Assignment', 'mmse', 'sample_id', 'case']
Sample column:  sample_id
Cluster column:  MajorCellTypes
Condition column:  case
Load and process metadata¶
In [4]:
print("Checking inputs...")
if sample_col not in kept_features:
kept_features.append(sample_col)
if cluster_col not in kept_features:
kept_features.append(cluster_col)
if condition_col not in kept_features:
kept_features.append(condition_col)
print("Loading metadata...")
metadata = pd.read_csv(dataset_path + "/raw_metadata.csv", index_col=0, header=0)
metadata = metadata.loc[:, kept_features]
# Process condition column
metadata = metadata.rename(columns={condition_col: "Condition"})
kept_features.remove(condition_col)
kept_features.append("Condition")
## if the column data is float, keep 2 digits after the decimal point
# Round only float columns to 2 decimal places
metadata[metadata.select_dtypes(include=['float']).columns] = metadata.select_dtypes(include=['float']).round(2)
Checking inputs... Loading metadata...
/var/folders/fb/jjff_3bd7tvblm0pz1wc96tssf65ss/T/ipykernel_44163/1362571716.py:10: DtypeWarning: Columns (40) have mixed types. Specify dtype option on import or set low_memory=False. metadata = pd.read_csv(dataset_path + "/raw_metadata.csv", index_col=0, header=0)
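The DtypeWarning above is harmless here; if desired, it can be avoided by letting pandas read the whole file in one pass, as the warning itself suggests. A minimal sketch (not part of the original cell):

# Optional: read the full file at once so pandas infers a single dtype per column
metadata = pd.read_csv(dataset_path + "/raw_metadata.csv", index_col=0, header=0, low_memory=False)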
Process cell IDs¶
In [5]:
# Handle sample_id column
if "sample_id" != sample_col:
    print("Renaming sample id...")
    metadata.drop("sample_id", axis=1, inplace=True, errors="ignore")
    metadata = metadata.rename(columns={sample_col: "sample_id"})
    kept_features.remove(sample_col)
    kept_features.append("sample_id")

# Rename cell IDs: number cells sequentially within each sample, e.g. <sample_id>_c1, <sample_id>_c2, ...
print("Renaming cell id...")
new_ids = []
sample_cell_n = {}
for index, row in metadata.iterrows():
    sample_id = row["sample_id"]
    if sample_id not in sample_cell_n:
        sample_cell_n[sample_id] = 0
    sample_cell_n[sample_id] += 1
    c_id = sample_id + "_c" + str(sample_cell_n[sample_id])
    new_ids.append(c_id)
metadata["cs_id"] = new_ids

# Map the original barcodes (still the index at this point) to the new cell IDs
barcode_to_cid = metadata["cs_id"].to_dict()
metadata["barcode"] = metadata.index.tolist()
metadata = metadata.set_index("cs_id")
Renaming cell id...
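The row-by-row loop above is straightforward but slow on large tables; an equivalent vectorized sketch (not used in this notebook) numbers cells within each sample with `groupby(...).cumcount()`:

# Vectorized alternative (sketch): per-sample running count, starting at 1
# Assumes "sample_id" is a string column; produces the same <sample_id>_c<N> IDs
counts = metadata.groupby("sample_id").cumcount() + 1
metadata["cs_id"] = metadata["sample_id"] + "_c" + counts.astype(str)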
Process sample id¶
In [6]:
# Save sample list
all_samples = metadata["sample_id"].unique().tolist()
with open(dataset_path + "/sample_list.json", "w") as f:
    json.dump(sorted(all_samples), f)

# Save cell-to-sample mapping
cell_to_sample = metadata["sample_id"].to_dict()
with open(f"{dataset_path}/cellspot_to_sample.json", "w") as f:
    json.dump(cell_to_sample, f, indent=2)
Process cell metadata¶
In [7]:
print("Processing cell metadata...")
metadata.loc[:,kept_features].to_csv(dataset_path + "/cellspot_metadata_original.csv")
# Separate sample-level and cell-level features
sample_level_features = []
cell_level_features = []
sample_groups = metadata.groupby("sample_id")
for feature in kept_features:
is_sample_level = all(group[feature].nunique() <= 2 for _, group in sample_groups)
if is_sample_level:
sample_level_features.append(feature)
else:
cell_level_features.append(feature)
cell_meta_list = cell_level_features
metadata_lite = metadata.loc[:, cell_meta_list]
cell_meta_mapping = {}
for cell_meta in cell_meta_list:
# Check if the column is categorical
if is_categorical(metadata_lite[cell_meta], unique_threshold=0.2):
# Convert to categorical
cat_series = metadata_lite[cell_meta].astype("category")
cat_counts = cat_series.value_counts().to_dict()
# Replace original column with codes
metadata_lite[cell_meta] = cat_series.cat.codes
# Store mapping, and calculate the number of cells in each category
mapping = {i: [cat, cat_counts[cat]] for i, cat in enumerate(cat_series.cat.categories)} ## with counts
cell_meta_mapping[cell_meta] = mapping
# Save mapping to JSON
with open(dataset_path + "/cellspot_meta_mapping.json", "w") as f:
f.write(dumps_compact_lists(cell_meta_mapping, indent=4))
metadata_lite.to_csv(dataset_path + "/cellspot_metadata.csv")
Processing cell metadata...
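For orientation, each entry in `cell_meta_mapping` keys the integer category code to a `[label, cell count]` pair before it is written out by `dumps_compact_lists`; a hypothetical example (labels and counts are made up):

# Hypothetical structure of cell_meta_mapping (illustrative labels and counts only)
cell_meta_mapping_example = {
    "sex": {0: ["F", 21034], 1: ["M", 18992]},
    "MajorCellTypes": {0: ["Astrocytes", 5210], 1: ["Neurons", 15430]},
}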
Process sample metadata¶
In [8]:
print("Processing sample metadata...")
sample_meta_list = sample_level_features
sample_meta = metadata.loc[:, sample_meta_list]
sample_meta = sample_meta.drop_duplicates()
sample_meta = sample_meta.set_index("sample_id")
sample_meta.fillna("", inplace=True)
sample_meta.to_csv(dataset_path + "/sample_metadata.csv")
with open(dataset_path + "/meta_list.json", "w") as f:
    json.dump(sorted(cell_meta_list + sample_meta_list), f)
Processing sample metadata...
Process UMAP Embeddings¶
In [9]:
print("Loading embedding....")
embeddings_data = pd.read_csv(dataset_path + "/raw_umap_embeddings.csv", index_col=0, header=0)
embeddings_data["UMAP_1"] = embeddings_data["UMAP_1"].round(2)
embeddings_data["UMAP_2"] = embeddings_data["UMAP_2"].round(2)
# Rename embeddings using barcode to cell ID mapping
print("Renaming embeddings....")
embeddings_data = embeddings_data.reset_index()
embeddings_data["index"] = embeddings_data["index"].map(barcode_to_cid)
embeddings_data = embeddings_data.set_index("index")
embeddings_data.to_csv(dataset_path + "/umap_embeddings.csv", index_label="cs_id")
# Subsample the UMAP embeddings, keeping at most 100k and 50k cells
print("Sampling umap...")
n_rows = embeddings_data.shape[0]
sample_rows = 100000 if n_rows > 100000 else n_rows
embeddings_data_nk = embeddings_data.sample(n=sample_rows, random_state=42)
embeddings_data_nk.to_csv(dataset_path + "/umap_embeddings_100k.csv", index_label="cs_id")
sample_rows = 50000 if n_rows > 50000 else n_rows
embeddings_data_nk = embeddings_data.sample(n=sample_rows, random_state=42)
embeddings_data_nk.to_csv(dataset_path + "/umap_embeddings_50k.csv", index_label="cs_id")
Loading embedding.... Renaming embeddings.... Sampling umap...
Process expression data¶
In [10]:
print("Loading expression data...")
expression_data = pd.read_csv(dataset_path + "/raw_normalized_counts.csv", index_col=None, header=0)
print("Renaming expression....")
expression_data["cs_id"] = expression_data["Cell"].map(barcode_to_cid)
expression_data.drop("Cell", axis=1, inplace=True)
expression_data["Expression"] = expression_data["Expression"].round(2)
Loading expression data... Renaming expression....
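The cell above assumes `raw_normalized_counts.csv` is in long (tidy) format, one row per cell–gene pair with columns `Cell`, `Gene`, and `Expression`; a made-up illustration of that shape:

# Hypothetical rows in the expected long format (barcodes, genes, and values are made up)
example_long = pd.DataFrame({
    "Cell": ["AAACCTG-1", "AAACCTG-1", "AAACGGT-1"],
    "Gene": ["GAPDH", "SNAP25", "GAPDH"],
    "Expression": [2.31, 1.07, 0.85],
})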
In [11]:
# Group data by gene
print("Grouping by gene... be patient...")
grouped_by_gene = expression_data.groupby("Gene")

# Save gene list
print("Saving gene jsons...")
all_genes = grouped_by_gene.groups.keys()
all_genes = [gene_i.replace("/", "_") for gene_i in list(set(all_genes))]
with open(dataset_path + "/gene_list.json", "w") as f:
    json.dump(sorted(all_genes), f)

# Create directory for genes
os.makedirs(dataset_path + "/gene_jsons", exist_ok=True)

# Save each gene as a separate JSON file
print("Saving gene... be patient...")
i = 0
total_n = len(all_genes)
for gene, df in grouped_by_gene:
    try:
        i += 1
        if i % 1000 == 0:
            print(f"{i}/{total_n}")
        gene_dict = dict(zip(df["cs_id"], df["Expression"]))
        safe_gene_name = gene.replace("/", "_")
        # Create JSON file
        file_name = f"{dataset_path}/gene_jsons/{safe_gene_name}.json"
        with open(file_name, "w") as f:
            json.dump(gene_dict, f, indent=4)
    except Exception as e:
        print(f"Error in processing {gene} !!! Check error_gene_json.txt")
        with open(dataset_path + "/error_gene_json.txt", "a") as f_err:
            f_err.write(gene + "\n")
Grouping by gene... be patient... Saving gene jsons... Saving gene... be patient... 1000/34018 2000/34018 3000/34018 4000/34018 5000/34018 6000/34018 7000/34018 8000/34018 9000/34018 10000/34018 11000/34018 12000/34018 13000/34018 14000/34018 15000/34018 16000/34018 17000/34018 18000/34018 19000/34018 20000/34018 21000/34018 22000/34018 23000/34018 24000/34018 25000/34018 26000/34018 27000/34018 28000/34018 29000/34018 30000/34018 31000/34018 32000/34018 33000/34018 34000/34018
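Each file written to `gene_jsons/` is a flat mapping from the new cell ID to that gene's expression value; a hypothetical example (IDs and values are illustrative):

# Hypothetical content of gene_jsons/<gene>.json (illustrative IDs and values)
gene_json_example = {"SAMPLE1_c1": 2.31, "SAMPLE1_c2": 1.07, "SAMPLE2_c1": 0.85}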
Calculate pseudo-bulk expression¶
In [12]:
print("Calculating pseudo count...")
os.makedirs(dataset_path + "/gene_pseudobulk", exist_ok=True)
expression_data["sample_id"] = expression_data["cs_id"].map(cell_to_sample)
print("Grouping by gene and sample... be patient...")
# Compute pseudo-bulk counts by summing expression values per (sample, gene)
pseudo_bulk = expression_data.groupby(["sample_id", "Gene"])["Expression"].sum().reset_index()
# Rename the expression value column
pseudo_bulk.rename(columns={"Expression": "pseudobulk_expr"}, inplace=True)
# Save each gene's data to a separate JSON file
i = 0
for gene, df_gene in pseudo_bulk.groupby("Gene"):
i += 1
if i % 1000 == 0:
print(f"{i}/{total_n}")
gene_dict = df_gene.set_index("sample_id")["pseudobulk_expr"].to_dict()
safe_gene_name = gene.replace("/", "_")
with open(f"{dataset_path}/gene_pseudobulk/{safe_gene_name}.json", "w") as f:
json.dump(gene_dict, f, indent=4)
print("Done! Feature/Gene data processed and saved.")
Calculating pseudo count... Grouping by gene and sample... be patient... 1000/34018 2000/34018 3000/34018 4000/34018 5000/34018 6000/34018 7000/34018 8000/34018 9000/34018 10000/34018 11000/34018 12000/34018 13000/34018 14000/34018 15000/34018 16000/34018 17000/34018 18000/34018 19000/34018 20000/34018 21000/34018 22000/34018 23000/34018 24000/34018 25000/34018 26000/34018 27000/34018 28000/34018 29000/34018 30000/34018 31000/34018 32000/34018 33000/34018 34000/34018 Done! Feature/Gene data processed and saved.
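By construction, the pseudo-bulk value is the sum of the normalized per-cell expression values within each sample, so each file in `gene_pseudobulk/` maps sample IDs to a single number per gene; a hypothetical example (sample IDs and sums are illustrative):

# Hypothetical content of gene_pseudobulk/<gene>.json (illustrative sample IDs and sums)
pseudobulk_json_example = {"SAMPLE1": 1530.42, "SAMPLE2": 1712.88}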