Single-cell metadata and data processing¶
This notebook processes single-cell metadata and expression data for visualization, including:
- Metadata processing and renaming
- UMAP embedding processing
- Expression data processing
- Pseudo-bulk calculation
Self-defined functions¶
In [1]:
import json
import re
def is_categorical(arr, unique_threshold=20):
    """
    Determine if a list of values behaves like a categorical variable.

    Parameters:
        arr (list): Input list.
        unique_threshold (int or float): Maximum number of unique values (or, if < 1,
            maximum ratio of unique values to list length) for the list to be
            considered categorical.

    Returns:
        bool: True if the list is considered categorical, False otherwise.
    """
    if len(arr) == 0:
        return True
    # Count unique values
    unique_values = len(set(arr))
    if unique_threshold < 1:
        is_few_uniques = unique_values / len(arr) <= unique_threshold
    else:
        is_few_uniques = unique_values <= unique_threshold
    return is_few_uniques

def dumps_compact_lists(obj, indent=4):
    pretty = json.dumps(obj, indent=indent)

    # Match any list that spans multiple lines
    def compact_list(match):
        # Extract the list content and remove newlines/extra spaces
        items = match.group(1).strip().splitlines()
        compacted_items = [item.strip().rstrip(',') for item in items if item.strip()]
        return '[' + ','.join(compacted_items) + ']'

    # Replace multi-line lists with compact single-line ones
    compacted = re.sub(r'\[\s*((?:.|\n)*?)\s*\]', compact_list, pretty)
    return compacted
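As a quick, hypothetical sanity check (not part of the pipeline), `is_categorical` flags low-cardinality columns and `dumps_compact_lists` keeps short lists on a single line when serializing to JSON:

# Hypothetical usage of the two helpers (values are illustrative only)
print(is_categorical(["Neuron", "Astrocyte", "Neuron"]))         # True: 2 unique values <= 20
print(is_categorical(list(range(1000)), unique_threshold=0.2))   # False: every value is unique
example = {"cell_type": {0: ["Astrocytes", 120], 1: ["Neurons", 340]}}
print(dumps_compact_lists(example, indent=4))                    # inner lists stay on one line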
Imports¶
In [2]:
import pandas as pd
import json
import os
import sys
import functools
print = functools.partial(print, flush=True)
Set Parameters¶
Set up the paths and parameters for the analysis.
In [3]:
dataset_path = "example_data/snRNAseq_MTG_10samples"
kept_features = ["nCount_RNA", "nFeature_RNA", "sex", "MajorCellTypes", "updrs", "Complex_Assignment", "mmse", "sample_id", "case"]
sample_col = "sample_id"
cluster_col = "MajorCellTypes"
condition_col = "case"
print("Dataset path: ", dataset_path)
print("Kept features: ", kept_features)
print("Sample column: ", sample_col)
print("Cluster column: ", cluster_col)
print("Condition column: ", condition_col)
Dataset path:  example_data/snRNAseq_MTG_10samples
Kept features:  ['nCount_RNA', 'nFeature_RNA', 'sex', 'MajorCellTypes', 'updrs', 'Complex_Assignment', 'mmse', 'sample_id', 'case']
Sample column:  sample_id
Cluster column:  MajorCellTypes
Condition column:  case
Load and process metadata¶
In [4]:
print("Checking inputs...")
if sample_col not in kept_features:
kept_features.append(sample_col)
if cluster_col not in kept_features:
kept_features.append(cluster_col)
if condition_col not in kept_features:
kept_features.append(condition_col)
print("Loading metadata...")
metadata = pd.read_csv(dataset_path + "/raw_metadata.csv", index_col=0, header=0)
metadata = metadata.loc[:, kept_features]
# Process condition column
metadata = metadata.rename(columns={condition_col: "Condition"})
kept_features.remove(condition_col)
kept_features.append("Condition")
## if the column data is float, keep 2 digits after the decimal point
# Round only float columns to 2 decimal places
metadata[metadata.select_dtypes(include=['float']).columns] = metadata.select_dtypes(include=['float']).round(2)
Checking inputs... Loading metadata...
/var/folders/fb/jjff_3bd7tvblm0pz1wc96tssf65ss/T/ipykernel_44163/1362571716.py:10: DtypeWarning: Columns (40) have mixed types. Specify dtype option on import or set low_memory=False. metadata = pd.read_csv(dataset_path + "/raw_metadata.csv", index_col=0, header=0)
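The DtypeWarning above is harmless here; if desired, it can be avoided by letting pandas read the whole file in one pass, as the warning itself suggests. A minimal sketch (not part of the original cell):

# Optional: read the full file at once so pandas infers a single dtype per column
metadata = pd.read_csv(dataset_path + "/raw_metadata.csv", index_col=0, header=0, low_memory=False)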
Process cell IDs¶
In [5]:
# Handle sample_id column
if "sample_id" != sample_col:
    print("Renaming sample id...")
    metadata.drop("sample_id", axis=1, inplace=True, errors="ignore")
    metadata = metadata.rename(columns={sample_col: "sample_id"})
    kept_features.remove(sample_col)
    kept_features.append("sample_id")

# Rename cell IDs: number cells sequentially within each sample, e.g. <sample_id>_c1, <sample_id>_c2, ...
print("Renaming cell id...")
new_ids = []
sample_cell_n = {}
for index, row in metadata.iterrows():
    sample_id = row["sample_id"]
    if sample_id not in sample_cell_n:
        sample_cell_n[sample_id] = 0
    sample_cell_n[sample_id] += 1
    c_id = sample_id + "_c" + str(sample_cell_n[sample_id])
    new_ids.append(c_id)
metadata["cs_id"] = new_ids

# Map the original barcodes (still the index at this point) to the new cell IDs
barcode_to_cid = metadata["cs_id"].to_dict()
metadata["barcode"] = metadata.index.tolist()
metadata = metadata.set_index("cs_id")
Renaming cell id...
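The row-by-row loop above is straightforward but slow on large tables; an equivalent vectorized sketch (not used in this notebook) numbers cells within each sample with `groupby(...).cumcount()`:

# Vectorized alternative (sketch): per-sample running count, starting at 1
# Assumes "sample_id" is a string column; produces the same <sample_id>_c<N> IDs
counts = metadata.groupby("sample_id").cumcount() + 1
metadata["cs_id"] = metadata["sample_id"] + "_c" + counts.astype(str)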
Process sample id¶
In [6]:
# Save sample list
all_samples = metadata["sample_id"].unique().tolist()
with open(dataset_path + "/sample_list.json", "w") as f:
    json.dump(sorted(all_samples), f)

# Save cell-to-sample mapping
cell_to_sample = metadata["sample_id"].to_dict()
with open(f"{dataset_path}/cellspot_to_sample.json", "w") as f:
    json.dump(cell_to_sample, f, indent=2)
Process cell metadata¶
In [7]:
print("Processing cell metadata...")
metadata.loc[:,kept_features].to_csv(dataset_path + "/cellspot_metadata_original.csv")
# Separate sample-level and cell-level features
sample_level_features = []
cell_level_features = []
sample_groups = metadata.groupby("sample_id")
for feature in kept_features:
is_sample_level = all(group[feature].nunique() <= 2 for _, group in sample_groups)
if is_sample_level:
sample_level_features.append(feature)
else:
cell_level_features.append(feature)
cell_meta_list = cell_level_features
metadata_lite = metadata.loc[:, cell_meta_list]
cell_meta_mapping = {}
for cell_meta in cell_meta_list:
# Check if the column is categorical
if is_categorical(metadata_lite[cell_meta], unique_threshold=0.2):
# Convert to categorical
cat_series = metadata_lite[cell_meta].astype("category")
cat_counts = cat_series.value_counts().to_dict()
# Replace original column with codes
metadata_lite[cell_meta] = cat_series.cat.codes
# Store mapping, and calculate the number of cells in each category
mapping = {i: [cat, cat_counts[cat]] for i, cat in enumerate(cat_series.cat.categories)} ## with counts
cell_meta_mapping[cell_meta] = mapping
# Save mapping to JSON
with open(dataset_path + "/cellspot_meta_mapping.json", "w") as f:
f.write(dumps_compact_lists(cell_meta_mapping, indent=4))
metadata_lite.to_csv(dataset_path + "/cellspot_metadata.csv")
Processing cell metadata...
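For orientation, each entry in `cell_meta_mapping` keys the integer category code to a `[label, cell count]` pair before it is written out by `dumps_compact_lists`; a hypothetical example (labels and counts are made up):

# Hypothetical structure of cell_meta_mapping (illustrative labels and counts only)
cell_meta_mapping_example = {
    "sex": {0: ["F", 21034], 1: ["M", 18992]},
    "MajorCellTypes": {0: ["Astrocytes", 5210], 1: ["Neurons", 15430]},
}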
Process sample metadata¶
In [8]:
print("Processing sample metadata...")
sample_meta_list = sample_level_features
sample_meta = metadata.loc[:, sample_meta_list]
sample_meta = sample_meta.drop_duplicates()
sample_meta = sample_meta.set_index("sample_id")
sample_meta.fillna("", inplace=True)
sample_meta.to_csv(dataset_path + "/sample_metadata.csv")
with open(dataset_path + "/meta_list.json", "w") as f:
    json.dump(sorted(cell_meta_list + sample_meta_list), f)
Processing sample metadata...
Process UMAP Embeddings¶
In [9]:
print("Loading embedding....")
embeddings_data = pd.read_csv(dataset_path + "/raw_umap_embeddings.csv", index_col=0, header=0)
embeddings_data["UMAP_1"] = embeddings_data["UMAP_1"].round(2)
embeddings_data["UMAP_2"] = embeddings_data["UMAP_2"].round(2)
# Rename embeddings using barcode to cell ID mapping
print("Renaming embeddings....")
embeddings_data = embeddings_data.reset_index()
embeddings_data["index"] = embeddings_data["index"].map(barcode_to_cid)
embeddings_data = embeddings_data.set_index("index")
embeddings_data.to_csv(dataset_path + "/umap_embeddings.csv", index_label="cs_id")
# Subsample the UMAP embeddings, keeping at most 100k and 50k cells
print("Sampling umap...")
n_rows = embeddings_data.shape[0]
sample_rows = 100000 if n_rows > 100000 else n_rows
embeddings_data_nk = embeddings_data.sample(n=sample_rows, random_state=42)
embeddings_data_nk.to_csv(dataset_path + "/umap_embeddings_100k.csv", index_label="cs_id")
sample_rows = 50000 if n_rows > 50000 else n_rows
embeddings_data_nk = embeddings_data.sample(n=sample_rows, random_state=42)
embeddings_data_nk.to_csv(dataset_path + "/umap_embeddings_50k.csv", index_label="cs_id")
Loading embedding.... Renaming embeddings.... Sampling umap...
Process expression data¶
In [10]:
print("Loading expression data...")
expression_data = pd.read_csv(dataset_path + "/raw_normalized_counts.csv", index_col=None, header=0)
print("Renaming expression....")
expression_data["cs_id"] = expression_data["Cell"].map(barcode_to_cid)
expression_data.drop("Cell", axis=1, inplace=True)
expression_data["Expression"] = expression_data["Expression"].round(2)
Loading expression data... Renaming expression....
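The cell above assumes `raw_normalized_counts.csv` is in long (tidy) format, one row per cell–gene pair with columns `Cell`, `Gene`, and `Expression`; a made-up illustration of that shape:

# Hypothetical rows in the expected long format (barcodes, genes, and values are made up)
example_long = pd.DataFrame({
    "Cell": ["AAACCTG-1", "AAACCTG-1", "AAACGGT-1"],
    "Gene": ["GAPDH", "SNAP25", "GAPDH"],
    "Expression": [2.31, 1.07, 0.85],
})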
In [11]:
# Group data by gene
print("Grouping by gene... be patient...")
grouped_by_gene = expression_data.groupby("Gene")

# Save gene list
print("Saving gene jsons...")
all_genes = grouped_by_gene.groups.keys()
all_genes = [gene_i.replace("/", "_") for gene_i in list(set(all_genes))]
with open(dataset_path + "/gene_list.json", "w") as f:
    json.dump(sorted(all_genes), f)

# Create directory for genes
os.makedirs(dataset_path + "/gene_jsons", exist_ok=True)

# Save each gene as a separate JSON file
print("Saving gene... be patient...")
i = 0
total_n = len(all_genes)
for gene, df in grouped_by_gene:
    try:
        i += 1
        if i % 1000 == 0:
            print(f"{i}/{total_n}")
        gene_dict = dict(zip(df["cs_id"], df["Expression"]))
        safe_gene_name = gene.replace("/", "_")
        # Create JSON file
        file_name = f"{dataset_path}/gene_jsons/{safe_gene_name}.json"
        with open(file_name, "w") as f:
            json.dump(gene_dict, f, indent=4)
    except Exception as e:
        print(f"Error in processing {gene} !!! Check error_gene_json.txt")
        with open(dataset_path + "/error_gene_json.txt", "a") as f_err:
            f_err.write(gene + "\n")
Grouping by gene... be patient... Saving gene jsons... Saving gene... be patient... 1000/34018 2000/34018 3000/34018 4000/34018 5000/34018 6000/34018 7000/34018 8000/34018 9000/34018 10000/34018 11000/34018 12000/34018 13000/34018 14000/34018 15000/34018 16000/34018 17000/34018 18000/34018 19000/34018 20000/34018 21000/34018 22000/34018 23000/34018 24000/34018 25000/34018 26000/34018 27000/34018 28000/34018 29000/34018 30000/34018 31000/34018 32000/34018 33000/34018 34000/34018
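Each file written to `gene_jsons/` is a flat mapping from the new cell ID to that gene's expression value; a hypothetical example (IDs and values are illustrative):

# Hypothetical content of gene_jsons/<gene>.json (illustrative IDs and values)
gene_json_example = {"SAMPLE1_c1": 2.31, "SAMPLE1_c2": 1.07, "SAMPLE2_c1": 0.85}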
Calculate pseudo-bulk expression¶
In [12]:
print("Calculating pseudo count...")
os.makedirs(dataset_path + "/gene_pseudobulk", exist_ok=True)
expression_data["sample_id"] = expression_data["cs_id"].map(cell_to_sample)
print("Grouping by gene and sample... be patient...")
# Compute pseudo-bulk counts by summing expression values per (sample, gene)
pseudo_bulk = expression_data.groupby(["sample_id", "Gene"])["Expression"].sum().reset_index()
# Rename the expression value column
pseudo_bulk.rename(columns={"Expression": "pseudobulk_expr"}, inplace=True)
# Save each gene's data to a separate JSON file
i = 0
for gene, df_gene in pseudo_bulk.groupby("Gene"):
i += 1
if i % 1000 == 0:
print(f"{i}/{total_n}")
gene_dict = df_gene.set_index("sample_id")["pseudobulk_expr"].to_dict()
safe_gene_name = gene.replace("/", "_")
with open(f"{dataset_path}/gene_pseudobulk/{safe_gene_name}.json", "w") as f:
json.dump(gene_dict, f, indent=4)
print("Done! Feature/Gene data processed and saved.")
Calculating pseudo count... Grouping by gene and sample... be patient... 1000/34018 2000/34018 3000/34018 4000/34018 5000/34018 6000/34018 7000/34018 8000/34018 9000/34018 10000/34018 11000/34018 12000/34018 13000/34018 14000/34018 15000/34018 16000/34018 17000/34018 18000/34018 19000/34018 20000/34018 21000/34018 22000/34018 23000/34018 24000/34018 25000/34018 26000/34018 27000/34018 28000/34018 29000/34018 30000/34018 31000/34018 32000/34018 33000/34018 34000/34018 Done! Feature/Gene data processed and saved.
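By construction, the pseudo-bulk value is the sum of the normalized per-cell expression values within each sample, so each file in `gene_pseudobulk/` maps sample IDs to a single number per gene; a hypothetical example (sample IDs and sums are illustrative):

# Hypothetical content of gene_pseudobulk/<gene>.json (illustrative sample IDs and sums)
pseudobulk_json_example = {"SAMPLE1": 1530.42, "SAMPLE2": 1712.88}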