Data Processing: Convert to CDE Format

Convert all datasets from their original formats to the CDE format.
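Every converter below writes one or more *-nodes.csv files that share a minimal schema: x and y coordinates in micrometers (plus z for the 3D datasets) and a Cell Type label. As a quick illustration of the target format (a sketch with made-up values, not part of the pipeline):

import pandas as pd

# Hypothetical two-row example of the node schema produced by each converter.
example_nodes = pd.DataFrame({
    "x": [12.4, 98.1],                  # micrometers
    "y": [33.0, 71.6],                  # micrometers
    "Cell Type": ["B", "Endothelial"],
})
example_nodes.to_csv("example-nodes.csv", index=False)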
import numpy as np
import pandas as pd
import os
import json
import requests
import scanpy as sc
import shutil

pd.set_option('display.max_columns', None)

# suppress warnings
import warnings
warnings.filterwarnings("ignore")
basepath = "/u/yashjain/hra-cell-distance-analysis/data"
orig_filedir = "data-original"
dest_filedir = "data-processed-nodes"
# Load a CSV file into a pandas DataFrame
def load_data(path):
    data = pd.read_csv(path)
    return data
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory '{directory}' created successfully.")
    else:
        print(f"Directory '{directory}' already exists.")
def get_hubmap_uuid(hubmap_id):
    # Construct the API URL
    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    url = base_url + hubmap_id

    try:
        # Send GET request to the API
        response = requests.get(url)
        
        # Check if the request was successful
        response.raise_for_status()
        
        # Parse the JSON response
        data = response.json()
        
        # Extract the UUID
        uuid = data.get("uuid")
        
        if uuid:
            return uuid
        else:
            return "UUID not found in the response"
    
    except requests.RequestException as e:
        return f"An error occurred: {str(e)}"
print(len(os.listdir(os.path.join(basepath, orig_filedir))), os.listdir(os.path.join(basepath, orig_filedir)))
14 ['bonemarrow-codex-chop', 'colon-cycif-sorgerlab', 'colon-xenium-stanford', 'esophagus-codex-stanford', 'intestine-codex-stanford', 'lung-codex-urmc', 'lymphnode-codex-yale', 'maternalfetalinterface-mibitof-stanford', 'oralcavity-codex-czi', 'pancreas-geomx-ufl', 'skin-celldive-ge', 'skin-confocal-sorgerlab', 'spleen-codex-ufl', 'tonsil-codex-stanford']
# Create destination directory. Overwrite if it exists.
if os.path.exists(os.path.join(basepath, dest_filedir)):
    shutil.rmtree(os.path.join(basepath, dest_filedir))
    print(f"Directory '{dest_filedir}' already exists and has been removed. New directory will be created.")
else:
    print(f"Directory '{dest_filedir}' does not exist and will be created.")
os.makedirs(os.path.join(basepath, dest_filedir), exist_ok=False)
Directory 'data-processed-nodes' already exists and has been removed. New directory will be created.

Processing Individual Datasets

intestine-codex-stanford

dataset_name = "intestine-codex-stanford"

raw_filepath = os.path.join(basepath, orig_filedir, dataset_name, "23_09_CODEX_HuBMAP_alldata_Dryad_merged.csv")

data_new = load_data(raw_filepath)

# Filename to HuBMAP ID mapping.
mapping_file = os.path.join(basepath, orig_filedir, dataset_name, "filename-to-hubmap-mapping.csv")
mapping_df = load_data(mapping_file)

# Convert the mapping dataframe to dictionary. Key is Filename and value is HuBMAP ID.
filename_to_id = dict(zip(mapping_df["Filename"], mapping_df["HuBMAP ID"]))

# Store the unique regions before splitting (column "unique_region")
unique_regions = data_new["unique_region"].unique()

data_new.rename(columns={"cell_type": "Cell Type"}, inplace=True)

# Store the cell types before splitting
cell_types = data_new["Cell Type"].unique()
print(cell_types)

# Split the data on the "unique_region" column
column_to_split = "unique_region"

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for label in unique_regions:
    # Skip extra regions (labels containing "extra").
    if "extra" in label:
        continue

    # Create a sub-dataframe for each region.
    df_label = data_new[data_new[column_to_split] == label]

    # Copy so the scaling below does not modify a view of data_new.
    df_Region_1 = df_label[["x", "y", "Cell Type"]].copy()

    # Convert pixel coordinates to micrometers.
    micro_per_pixel = 0.37742
    scale = micro_per_pixel  # µm per px
    df_Region_1["x"] = scale * df_Region_1["x"]
    df_Region_1["y"] = scale * df_Region_1["y"]

    label_clean = label.replace("-", "_")
    label_clean = label_clean.replace(" ", "")
    # Write to the file using pandas to_csv
    df_Region_1.to_csv(f"{target_dir}/{label_clean}-nodes.csv", index=False, header=True, mode="w")

    # Generate dataset.json file for HRAPoP. Get the UUID for the HuBMAP ID from Entity API.
    id = "https://entity.api.hubmapconsortium.org/entities/" + get_hubmap_uuid(filename_to_id[f"{label_clean}-nodes"])
    dataset_json = {
        "@id": id
    }

    with open(f"{target_dir}/{label_clean}-dataset.json", "w") as f:
        json.dump(dataset_json, f, indent=4)
['NK' 'Enterocyte' 'MUC1+ Enterocyte' 'TA' 'CD66+ Enterocyte' 'Paneth'
 'Smooth muscle' 'M1 Macrophage' 'Goblet' 'Neuroendocrine'
 'CD57+ Enterocyte' 'Lymphatic' 'CD8+ T' 'DC' 'M2 Macrophage' 'B'
 'Neutrophil' 'Endothelial' 'Cycling TA' 'Plasma' 'CD4+ T cell' 'Stroma'
 'Nerve' 'ICC' 'CD7+ Immune']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/intestine-codex-stanford' created successfully.
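The per-region steps above (subset the columns, scale pixel coordinates to micrometers, write a nodes CSV) recur in most sections below. A reusable helper could factor this out; a sketch (not used by the code in this notebook):

def write_nodes_csv(df, out_path, scale=1.0, columns=("x", "y", "Cell Type")):
    # Subset the node columns, scale coordinates to micrometers, and write the CSV.
    nodes = df[list(columns)].copy()
    nodes["x"] = scale * nodes["x"]
    nodes["y"] = scale * nodes["y"]
    nodes.to_csv(out_path, index=False, header=True, mode="w")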

tonsil-codex-stanford

dataset_name = "tonsil-codex-stanford"

raw_filepath = os.path.join(basepath, orig_filedir, dataset_name, "BE_Tonsil_l3_dryad.csv")

data_new = load_data(raw_filepath)

# Store the unique sample names before splitting (column "sample_name")
unique_regions = data_new["sample_name"].unique()

data_new.rename(columns={"cell_type": "Cell Type"}, inplace=True)

# Store the cell types before splitting
cell_types = data_new["Cell Type"].unique()
print(cell_types)

# Split the data on the "sample_name" column
column_to_split = "sample_name"

for label in unique_regions:
    # Keep only the tonsil sample.
    if label == "tonsil":
        df_label = data_new[data_new[column_to_split] == label]

        # Copy so the scaling below does not modify a view of data_new.
        df_Region_1 = df_label[["x", "y", "Cell Type"]].copy()

        # Convert pixel coordinates to micrometers.
        micro_per_pixel = 0.377
        scale = micro_per_pixel  # µm per px
        df_Region_1["x"] = scale * df_Region_1["x"]
        df_Region_1["y"] = scale * df_Region_1["y"]

        target_dir = os.path.join(basepath, dest_filedir, dataset_name)
        create_directory(target_dir)
        label_clean = label.replace("-", "_")
        label_clean = label_clean.replace(" ", "")
        # Write to the file using pandas to_csv
        df_Region_1.to_csv(f"{target_dir}/{label_clean}-nodes.csv", index=False, header=True, mode="w")
['Innate' 'PDPN' 'Endothelial' 'B' 'T' 'Squamous_epithelial' 'Stroma'
 'SmoothMuscle' 'Plasma' 'Nerve' 'Glandular_epi' 'Secretory_epithelial'
 'Paneth']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/tonsil-codex-stanford' created successfully.

esophagus-codex-stanford

dataset_name = "esophagus-codex-stanford"

raw_filepath = os.path.join(basepath, orig_filedir, dataset_name, "BE_Tonsil_l3_dryad.csv")

data_new = load_data(raw_filepath)

# Store the unique sample names before splitting (column "sample_name")
unique_regions = data_new["sample_name"].unique()

data_new.rename(columns={"cell_type": "Cell Type"}, inplace=True)

# Store the cell types before splitting
cell_types = data_new["Cell Type"].unique()
print(cell_types)

# Split the data on the "sample_name" column
column_to_split = "sample_name"

for label in unique_regions:
    # Keep only the Barrett's esophagus sample.
    if label == "Barretts Esophagus":
        df_label = data_new[data_new[column_to_split] == label]

        # Copy so the scaling below does not modify a view of data_new.
        df_Region_1 = df_label[["x", "y", "Cell Type"]].copy()

        # Convert pixel coordinates to micrometers.
        micro_per_pixel = 0.377
        scale = micro_per_pixel  # µm per px
        df_Region_1["x"] = scale * df_Region_1["x"]
        df_Region_1["y"] = scale * df_Region_1["y"]

        target_dir = os.path.join(basepath, dest_filedir, dataset_name)
        create_directory(target_dir)
        
        # Write to the file using pandas to_csv
        df_Region_1.to_csv(f"{target_dir}/esophagus-nodes.csv", index=False, header=True, mode="w")
['Innate' 'PDPN' 'Endothelial' 'B' 'T' 'Squamous_epithelial' 'Stroma'
 'SmoothMuscle' 'Plasma' 'Nerve' 'Glandular_epi' 'Secretory_epithelial'
 'Paneth']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/esophagus-codex-stanford' created successfully.

colon-cycif-sorgerlab

dataset_name = "colon-cycif-sorgerlab"

raw_filedir = os.path.join(basepath, orig_filedir, dataset_name)

# Preprocess: combine the per-sample files and attach cell type labels from the reference table.
# List of per-sample filenames
filenames = [
    "Reg_Celltype_CRC01002.csv", "Reg_Celltype_CRC01007.csv", "Reg_Celltype_CRC01014.csv", 
    "Reg_Celltype_CRC01020.csv", "Reg_Celltype_CRC01025.csv", "Reg_Celltype_CRC01029.csv", 
    "Reg_Celltype_CRC01034.csv", "Reg_Celltype_CRC01039.csv", "Reg_Celltype_CRC01044.csv", 
    "Reg_Celltype_CRC01049.csv", "Reg_Celltype_CRC01050.csv", "Reg_Celltype_CRC01051.csv", 
    "Reg_Celltype_CRC01052.csv", "Reg_Celltype_CRC01054.csv", "Reg_Celltype_CRC01059.csv", 
    "Reg_Celltype_CRC01064.csv", "Reg_Celltype_CRC01069.csv", "Reg_Celltype_CRC01074.csv", 
    "Reg_Celltype_CRC01078.csv", "Reg_Celltype_CRC01084.csv", "Reg_Celltype_CRC01086.csv", 
    "Reg_Celltype_CRC01091.csv", "Reg_Celltype_CRC01097.csv", "Reg_Celltype_CRC01102.csv", 
    "Reg_Celltype_CRC01106.csv"
]
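The explicit list pins the exact inputs; an equivalent glob-based version (assuming the directory contains exactly these Reg_Celltype_CRC*.csv files) would be:

import glob

# Build the same list without hardcoding; matches only the Reg_Celltype_CRC*.csv files.
filenames = sorted(os.path.basename(p) for p in glob.glob(os.path.join(raw_filedir, "Reg_Celltype_CRC*.csv")))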

# Read CSV files, add filename column, and combine
df_list = []
for filename in filenames:
    df = pd.read_csv(os.path.join(raw_filedir, filename))
    # Extract CRCXXXXX part from the filename
    crc_code = filename.split('.')[0].split('_')[2]
    df['Layer'] = crc_code
    df_list.append(df)

combined_df = pd.concat(df_list, ignore_index=True)

# Read Celltype_reference_table.csv
ref_df = pd.read_csv(os.path.join(raw_filedir, "Celltype_reference_table.csv"))

# Create a dictionary for mapping NewType to Name and Category
ref_dict = ref_df.set_index('NewType')[['Name', 'Category']].to_dict('index')

# Add new columns "Cell Type" and "Category" to the combined csv file
combined_df['Cell Type'] = combined_df['NewType'].map(lambda x: ref_dict.get(x, {}).get('Name', ''))
combined_df['Category'] = combined_df['NewType'].map(lambda x: ref_dict.get(x, {}).get('Category', ''))

combined_df.rename(columns={"Xr": "x", "Yr": "y"}, inplace=True)

data_new = combined_df 

unique_regions = data_new["Layer"].unique()

cell_types = data_new["Cell Type"].unique()
print(cell_types)

column_to_split = "Layer"

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for label in unique_regions:
    # Create a sub-dataframe for each sample.
    df_label = data_new[data_new[column_to_split] == label]

    df_Region_1 = df_label[["x", "y", "Cell Type"]]

    # Write to the file using pandas to_csv
    df_Region_1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

    # Generate dataset.json file for HRAPoP. Paper DOI: https://doi.org/10.1016/j.cell.2022.12.028
    id = f"https://doi.org/10.1016/j.cell.2022.12.028#{label}"
    dataset_json = {
        "@id": id
    }

    with open(f"{target_dir}/{label}-dataset.json", "w") as f:
        json.dump(dataset_json, f, indent=4)
['Other' 'PDL1+ Macrophage' 'Tumor/Epithelial' 'Lymphocyte(III)' 'Treg'
 'Endothelial' 'Muscle/Fibroblast' 'Tc cell' 'PDL1+ Tumor/Epithelial'
 'Macrophage(IV)' 'B cells' 'PD1+ Tc' 'Macrophage(III)' 'DN Lymphocyte'
 'DP Lymphocyte' 'T helper' 'Macrophage(II)' 'Macrophage(I)'
 'Ki67+ Tumor/Epithelial' 'PD1+ T helper' 'PDL1+ lymphocyte']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/colon-cycif-sorgerlab' created successfully.

colon-xenium-stanford

dataset_name = "colon-xenium-stanford"

raw_filepath = os.path.join(basepath, orig_filedir, dataset_name, "xenium_polyp_29sections.txt")

data_new = pd.read_csv(raw_filepath, sep='\t')

# Store the unique layers before splitting (column "layer")
unique_regions = data_new["layer"].unique()

data_new.rename(columns={"cell_type": "Cell Type", "x_align": "x", "y_align": "y"}, inplace=True)

# Store the cell types before splitting
cell_types = data_new["Cell Type"].unique()
print(cell_types)

# Split the data on the "layer" column
column_to_split = "layer"

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for label in unique_regions:
    # Create a sub-dataframe for each layer.
    df_label = data_new[data_new[column_to_split] == label]

    df_Region_1 = df_label[["x", "y", "Cell Type"]]

    # Write to the file using pandas to_csv
    df_Region_1.to_csv(f"{target_dir}/layer_{label}-nodes.csv", index=False, header=True, mode="w")
['Immature Goblet' 'Tuft' 'TA1' 'CD4+' 'Pericytes' 'Macrophages'
 'CyclingTA' 'Cancer Associated Fibroblasts' 'Best4+ Enterocytes' 'Stem'
 'CD8+' 'Endothelial' 'TA2' 'Myofibroblasts/Smooth Muscle 3'
 'Unknown_lowcount' 'Myofibroblasts/Smooth Muscle 1' 'Crypt Fibroblasts 3'
 'Crypt Fibroblasts 1' 'Glia' 'Crypt Fibroblasts 4' 'Adipocytes' 'Plasma'
 'GC' 'Mast' 'Enteroendocrine' 'Tregs' 'Lymphatic endothelial cells'
 'Goblet' 'Immature Enterocytes' 'Enterocyte Progenitors' 'Enterocytes'
 'Myofibroblasts/Smooth Muscle 2' 'Naive T' 'Villus Fibroblasts WNT5B+'
 'Naive B' 'Neurons' 'Memory B' 'Crypt Fibroblasts 2' 'ILCs' 'Unknown'
 'DC']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/colon-xenium-stanford' created successfully.

lymphnode-codex-yale

dataset_name = "lymphnode-codex-yale"

raw_filedir = os.path.join(basepath, orig_filedir, dataset_name)

ct = set()

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for raw_filename in os.listdir(raw_filedir):
    data_new = load_data(os.path.join(raw_filedir, raw_filename))

    data_new.rename(columns={"celltype": "Cell Type"}, inplace=True)

    data_new["Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('Endo', 'Endothelial'))

    cell_types = data_new["Cell Type"].unique()
    
    for i in cell_types:
        ct.add(i)

    df_Region_1 = data_new[["x", "y", "Cell Type"]]
    
    # Write to the file using pandas to_csv
    label = raw_filename.split(".")[0].split("_")[0]
    df_Region_1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/lymphnode-codex-yale' created successfully.
{'DC_CCR7+', 'B_plasma', 'Mast', 'T_CD8+_cytotoxic', 'Macrophages_M2', 'T_CD8+_naive', 'DC_pDC', 'VSMC', 'T_CD4+_TfH_GC', 'B_GC_DZ', 'B_naive', 'B_GC_LZ', 'T_CD4+', 'DC_cDC1', 'NK', 'B_mem', 'T_CD4+_naive', 'B_preGC', 'B_IFN', 'B_activated', 'T_CD4+_TfH', 'B_GC_prePB', 'Monocytes', 'NKT', 'Endothelial', 'T_CD8+_CD161+', 'T_TfR', 'T_TIM3+', 'DC_cDC2', 'FDC', 'T_Treg', 'Macrophages_M1', 'B_Cycling', 'ILC'}

maternalfetalinterface-mibitof-stanford

dataset_name = "maternalfetalinterface-mibitof-stanford"

raw_filepath = os.path.join(basepath, orig_filedir, dataset_name, "Supplementary_table_3_single_cells_updated.csv")

data = load_data(raw_filepath)

data_new = data[data['overlap_decidua'] == 1.0].copy()  # keep only cells overlapping the decidua; copy for in-place edits below

# Store the unique points (regions) before splitting (column "Point")
unique_regions = data_new["Point"].unique()

data_new.rename(columns={'centroid0': 'x', 'centroid1': 'y', 'lineage': 'Cell Type' }, inplace=True)

# Store the cell types before splitting
cell_types = data_new["Cell Type"].unique()
print(cell_types)

# Split the data on the "Point" column
column_to_split = "Point"

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for label in unique_regions:
    # Create a sub-dataframe for each point (region).
    df_label = data_new[data_new[column_to_split] == label]

    # Copy so the scaling below does not modify a view of data_new.
    df_Region_1 = df_label[["x", "y", "Cell Type"]].copy()

    # Convert pixel coordinates to micrometers.
    micro_per_pixel = 0.391
    scale = micro_per_pixel  # µm per px
    df_Region_1["x"] = scale * df_Region_1["x"]
    df_Region_1["y"] = scale * df_Region_1["y"]

    # Write to the file using pandas to_csv
    df_Region_1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
['Mac2a' 'other' 'NK1' 'Fibroblasts' 'NKT' 'Endothelial' 'Myofibroblasts'
 'Mac1a' 'EVT1a' 'Mac1b' 'CD8T' 'EVT1b' 'Mac2c' 'NK2' 'muscle' 'NK3'
 'EVT2' 'Mac2b' 'DC' 'Glandular' 'CD4T' 'EVT1c' 'NK4' 'Mast' 'Treg'
 'Placental_Mac']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/maternalfetalinterface-mibitof-stanford' created successfully.

oralcavity-codex-czi

dataset_name = "oralcavity-codex-czi"

raw_filedir = os.path.join(basepath, orig_filedir, dataset_name)

ct = set()

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for raw_filename in os.listdir(raw_filedir):
    data_new = load_data(os.path.join(raw_filedir, raw_filename))

    data_new.rename(columns={"TACIT": "Cell Type"}, inplace=True)
    data_new.rename(columns={"X": "x"}, inplace=True)
    data_new.rename(columns={"Y": "y"}, inplace=True)

    # If "Cell Type" contains "VECs", replace with "Endothelial (Vascular)". Else if, "Cell Type" contains "VEC", replace with "Endothelial (Vascular)"
    # Else if "Cell Type" contains "Endothelial Cells", replace with "Endothelial (Vascular)". Else, keep the same.
    data_new["Cell Type"] = data_new["Cell Type"].replace('VECs', 'Vascular Endothelial Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('VEC', 'Vascular Endothelial Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Endothelial Cells', 'Vascular Endothelial Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('LECs', 'Lymphatic Endothelial Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Melanocyte', 'Melanocytes')
    data_new["Cell Type"] = data_new["Cell Type"].replace('B cells', 'B Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('B Cell', 'B Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('NK cells', 'NK Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Merkel cells', 'Merkel Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Neutrophil', 'Neutrophils')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Fibroblast', 'Fibroblasts')
    data_new["Cell Type"] = data_new["Cell Type"].replace('DC cells', 'Dendritic Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Myoepithelial', 'Myoepithelial Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Ductal Epithelial  Cells', 'Ductal Epithelial Cells')
    data_new["Cell Type"] = data_new["Cell Type"].replace('myfibroblast', 'myofibroblast')
    data_new["Cell Type"] = data_new["Cell Type"].replace('Myfibroblast', 'myofibroblast')

    cell_types = data_new["Cell Type"].unique()

    for i in cell_types:
        ct.add(i)

    # Copy so the scaling below does not modify a view of data_new.
    df_Region_1 = data_new[["x", "y", "Cell Type"]].copy()

    # Pixel size is 0.5 µm per px. (The source coordinates are reported to
    # already be in micrometers, so this scaling may be redundant.)
    micro_per_pixel = 0.5
    scale = micro_per_pixel
    df_Region_1["x"] = scale * df_Region_1["x"]
    df_Region_1["y"] = scale * df_Region_1["y"]

    # Write to the file using pandas to_csv
    label = raw_filename.split(".")[0]
    df_Region_1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/oralcavity-codex-czi' already exists.
{'Others', 'Melanocytes', 'myofibroblast', 'Plasma Cells', 'Skeletal Myocytes', 'Suprabasal Keratinocytes', 'Dendritic Cells', 'Macrophage', 'Vascular Endothelial Cells', 'CD8 T Cells', 'Monocyte-Macrophage', 'Glial/Neuron', 'Basal Keratincytes', 'Ducts', 'B Cells', 'Epithelial', 'Tc', 'Lymphatic Endothelial Cells', 'Adipocytes', 'Myoepithelial Cells', 'Neutrophils', 'Acinar Cells', 'Lymphatic Vascular Cells', 'DP', 'Mural Cells', 'CD4 T Cells', 'gd T Cells', 'VEC Progen', 'Ionocytes', 'Langerhans Cells', 'Fibroblasts', 'Th', 'Acini', 'Mast Cells', 'Merkel Cells', 'NK Cells', 'Ductal Epithelial Cells', 'Keratinocyte', 'Treg'}
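The chain of replace calls above can be written more compactly as a single dict-based replace; a sketch with a few of the mappings:

# A subset of the label fixes above, applied in one pass.
label_fixes = {
    "VECs": "Vascular Endothelial Cells",
    "VEC": "Vascular Endothelial Cells",
    "Endothelial Cells": "Vascular Endothelial Cells",
    "LECs": "Lymphatic Endothelial Cells",
    "B cells": "B Cells",
}
data_new["Cell Type"] = data_new["Cell Type"].replace(label_fixes)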

pancreas-geomx-ufl

dataset_name = "pancreas-geomx-ufl"

# Loop through all CSV files in the dataset's "original-unprocessed" directory,
# derive cell types from the marker columns, and save the annotated files to "cell-type-annotated".
for file in os.listdir(os.path.join(basepath, orig_filedir, dataset_name, "original-unprocessed")):
    if file.endswith(".csv"):
        # read csv. Coordinates are in micrometers.
        data = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "original-unprocessed", file))

        # Rename column CD3134 to CD31/34
        data = data.rename(columns={"CD3134": "CD31/34"})
        # Replace "Negative" with "-" and "Positive" with "+" in column "Insulin"
        data["Insulin"] = data["Insulin"].replace("Negative", "Insulin-")
        data["Insulin"] = data["Insulin"].replace("Positive", "Insulin+")

        # Replace "Negative" with "-" and "Positive" with "+" in column "PanCK"
        data["PanCK"] = data["PanCK"].replace("Negative", "PanCK-")
        data["PanCK"] = data["PanCK"].replace("Positive", "PanCK+")

        # Replace "Negative" with "-" and "Positive" with "+" in column "CD31/34"
        data["CD31/34"] = data["CD31/34"].replace("Negative", "CD31/34-")
        data["CD31/34"] = data["CD31/34"].replace("Positive", "CD31/34+")

        # Merge columns "Insulin", "PanCK" and "CD31/34" into a new column "CellType"
        data["CellType"] = data["Insulin"] + " " + data["PanCK"] + " " + data["CD31/34"]

        # drop columns "Insulin", "PanCK" and "CD31/34"
        data = data.drop(columns=["Insulin", "PanCK", "CD31/34"])

        # Replace markers with cell types.
        data["CellType"] = data["CellType"].replace("Insulin- PanCK- CD31/34+", "Endothelial")
        data["CellType"] = data["CellType"].replace("Insulin- PanCK+ CD31/34-", "Ductal cell")
        data["CellType"] = data["CellType"].replace("Insulin+ PanCK- CD31/34-", "Beta cell")

        # Replace all other marker combinations with "unknown"
        data["CellType"] = data["CellType"].replace("Insulin+ PanCK+ CD31/34-", "unknown")
        data["CellType"] = data["CellType"].replace("Insulin- PanCK+ CD31/34+", "unknown")
        data["CellType"] = data["CellType"].replace("Insulin+ PanCK- CD31/34+", "unknown")
        data["CellType"] = data["CellType"].replace("Insulin- PanCK- CD31/34-", "unknown")
        data["CellType"] = data["CellType"].replace("Insulin+ PanCK+ CD31/34+", "unknown")

        # Rename xcoord to x, ycoord to y, and CellType to Cell Type.
        data = data.rename(columns={"xcoord": "x", "ycoord": "y", "CellType": "Cell Type"})

        # Save the new dataframe to a new csv file.
        data.to_csv(os.path.join(basepath, orig_filedir, dataset_name, "cell-type-annotated", file), index=False)

raw_filedir = os.path.join(basepath, orig_filedir, dataset_name, "cell-type-annotated")

ct = set()

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for raw_filename in os.listdir(raw_filedir):
    data_new = load_data(os.path.join(raw_filedir, raw_filename))
    cell_types = data_new["Cell Type"].unique()
    for i in cell_types:
        ct.add(i)
    df_Region_1 = data_new[["x", "y", "Cell Type"]]
    # Write to the file using pandas to_csv
    label = raw_filename.split(".")[0].split("_")[0]
    df_Region_1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/pancreas-geomx-ufl' created successfully.
{'Beta cell', 'Ductal cell', 'unknown', 'Endothelial'}

skin-celldive-ge

dataset_name = "skin-celldive-ge"

raw_filedir = os.path.join(basepath, orig_filedir, dataset_name, "regions")

# Filename to HuBMAP ID dictionary.
# This is used to map the filename to the HuBMAP ID. Source: https://www.biorxiv.org/content/10.1101/2023.10.05.560733v2 
filename_to_id = {
    "region_1": "HBM732.FZVZ.656",
    "region_2": "HBM747.SPWK.779",
    "region_3": "HBM398.NCVN.256",
    "region_4": "HBM746.VTDZ.959",
    "region_5": "HBM875.SBHJ.939",
    "region_6": "HBM867.NMXL.794", # Excluded in original analysis
    "region_7": "HBM666.JCGS.862",
    "region_8": "HBM592.JGSQ.253",
    "region_9": "HBM494.XDQW.356",
    "region_10": "HBM238.ZKPC.934",
    "region_11": "HBM975.FVCG.922",
    "region_12": "HBM674.XQFQ.364", # Excluded in original analysis
}

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

ct = set()
for region in os.listdir(raw_filedir):
    # Exclude regions 6 and 12 as they were excluded in the original analysis.
    if region == "region_6" or region == "region_12":
        continue

    raw_filename = os.path.join(raw_filedir, region, "centroids.csv")

    data_new = pd.read_csv(raw_filename)

    data_new.rename(columns={"cell_type": "Cell Type"}, inplace=True)
    data_new.rename(columns={"X": "x"}, inplace=True)
    data_new.rename(columns={"Y": "y"}, inplace=True)
    data_new.rename(columns={"Z": "z"}, inplace=True)

    # For consistency. Note that all endothelial cells in this dataset are blood endothelial
    data_new["Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('CD31', 'Endothelial'))

    # Drop "skin" coordinates.
    data_new = data_new[data_new['Cell Type'] != 'Skin']

    cell_types = data_new["Cell Type"].unique()
    for i in cell_types:
        ct.add(i)
    # Copy so the scaling below does not modify a view of data_new.
    df_Region_1 = data_new[["x", "y", "z", "Cell Type"]].copy()

    # Scale factor is 1 (coordinates are treated as already in micrometers).
    micro_per_pixel = 1
    scale = micro_per_pixel  # µm per px
    df_Region_1["x"] = scale * df_Region_1["x"]
    df_Region_1["y"] = scale * df_Region_1["y"]
    df_Region_1["z"] = scale * df_Region_1["z"]

    # Write to the file using pandas to_csv
    df_Region_1.to_csv(f"{target_dir}/{region}-nodes.csv", index=False, header=True, mode="w")

    # Generate dataset.json file for HRAPoP. Get the UUID for the HuBMAP ID from Entity API.
    id = "https://entity.api.hubmapconsortium.org/entities/" + get_hubmap_uuid(filename_to_id[region])
    dataset_json = {
        "@id": id
    }

    with open(f"{target_dir}/{region}-dataset.json", "w") as f:
        json.dump(dataset_json, f, indent=4)
print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/skin-celldive-ge' created successfully.
{'KI67', 'Endothelial', 'P53', 'DDB2', 'CD68', 'T-Reg', 'T-Killer', 'T-Helper'}

skin-confocal-sorgerlab

dataset_name = "skin-confocal-sorgerlab"

raw_filedir = os.path.join(basepath, orig_filedir, dataset_name)

ct = set()

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for raw_filename in os.listdir(raw_filedir):

    data_new = load_data(os.path.join(raw_filedir, raw_filename))

    data_new.rename(columns={"phenotype": "Cell Type", "X_centroid": "x", "Y_centroid": "y", "Z_centroid": "z"}, inplace=True)

    data_new["Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('endothelial', 'Endothelial'))

    cell_types = data_new["Cell Type"].unique()

    for i in cell_types:
        ct.add(i)

    df_Region_1 = data_new[["x", "y", "z", "Cell Type"]]
    
    # Write to the file using pandas to_csv
    label = raw_filename.split(".")[0]

    df_Region_1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

    # Generate dataset.json file for HRAPoP. Paper DOI: https://doi.org/10.1101/2023.11.10.566670
    id = f"https://doi.org/10.1101/2023.11.10.566670#{label}"
    dataset_json = {
        "@id": id
    }

    with open(f"{target_dir}/{label}-dataset.json", "w") as f:
        json.dump(dataset_json, f, indent=4)

print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/skin-confocal-sorgerlab' created successfully.
{'Unknown', 'Dendritic cells', 'Tumor', 'Endothelial', 'Macrophage', 'Other Immune', 'B cells', 'Myeloid', 'Tissue T', 'Langerhan cells', 'CD11B+ CD11C- cells', 'CD8 T', 'CD4 T', 'keratinocytes', 'T reg'}

spleen-codex-ufl

dataset_name = "spleen-codex-ufl"
ct = set()
raw_filedir = os.path.join(basepath, orig_filedir, dataset_name)

# Filename to HuBMAP ID dictionary.
# This is used to map the filename to the HuBMAP ID. Source: https://www.biorxiv.org/content/10.1101/2023.10.05.560733v2 
filename_to_id = {
    "FSLD": "HBM342.FSLD.938",
    "KSFB": "HBM556.KSFB.592",
    "NGPL": "HBM568.NGPL.345",
    "PBVN": "HBM825.PBVN.284",
    "PKHL": "HBM389.PKHL.936",
    "XXCD": "HBM772.XXCD.697",
}

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for raw_filename in os.listdir(raw_filedir):

    data_new = pd.read_csv(os.path.join(raw_filedir, raw_filename), sep=';')

    data_new.rename(columns={"celltypes_folBcombined": "Cell Type"}, inplace=True)

    # For consistency. Note that all endothelial cells in this dataset are blood endothelial
    data_new["Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('Blood endothelial', 'blood endothelial'))

    cell_types = data_new["Cell Type"].unique()
    
    # Add all elements in the cell_types list to the ct set
    for i in cell_types:
        ct.add(i)

    # Copy so the scaling below does not modify a view of data_new.
    df_Region_1 = data_new[["x", "y", "Cell Type"]].copy()

    # Convert pixel coordinates to micrometers.
    micro_per_pixel = 0.377
    scale = micro_per_pixel  # µm per px
    df_Region_1["x"] = scale * df_Region_1["x"]
    df_Region_1["y"] = scale * df_Region_1["y"]

    # Find how far each column extends below zero (0 if already non-negative).
    shift_x = max(0, -df_Region_1['x'].min())
    shift_y = max(0, -df_Region_1['y'].min())

    # Shift all values in the columns to be non-negative.
    df_Region_1['x'] = df_Region_1['x'] + shift_x
    df_Region_1['y'] = df_Region_1['y'] + shift_y
    
    # Write to the file using pandas to_csv
    label = raw_filename.split('.')[0]
    df_Region_1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

    # Generate dataset.json file for HRAPoP. Get the UUID for the HuBMAP ID from Entity API.
    id = "https://entity.api.hubmapconsortium.org/entities/" + get_hubmap_uuid(filename_to_id[label])
    dataset_json = {
        "@id": id
    }

    with open(f"{target_dir}/{label}-dataset.json", "w") as f:
        json.dump(dataset_json, f, indent=4)
    
print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/spleen-codex-ufl' created successfully.
{'Myeloid cells', 'Sinusoidal cells', 'CD8 Memory T cells', 'Macrophages', 'indistinct', 'B cells, red pulp', 'Ki67 proliferating', 'Fol B cells', 'blood endothelial', 'Neutrophils/Monocytes', 'Podoplanin', 'CD4 Memory T cells'}

lung-codex-urmc

dataset_name = "lung-codex-urmc"

# Process the D265 dataset: read the h5ad file and export cell-level data to CSV.
h5ad_path = os.path.join(basepath, orig_filedir, dataset_name, "D265_final_annotated.h5ad")
output_csv_path = os.path.join(basepath, orig_filedir, dataset_name, "D265_final_annotated_cells.csv")

print(f"Reading file: {h5ad_path}")
adata = sc.read_h5ad(h5ad_path)

# Initialize dictionary to store the data
data_dict = {}

# Extract spatial coordinates and split into x, y
if 'spatial' in adata.obsm_keys():
    spatial_coords = adata.obsm['spatial']
    data_dict['x'] = spatial_coords[:, 0]
    data_dict['y'] = spatial_coords[:, 1]
else:
    raise KeyError("'spatial' coordinates not found in obsm")

# Extract cell type calls from obs
if 'cell_type_calls' in adata.obs:
    data_dict['Cell Type'] = adata.obs['cell_type_calls'].tolist()
else:
    raise KeyError("'cell_type_calls' column not found in obs")

# Extract ontology mappings from uns
if 'ontology_mappings' in adata.uns:
    # Create a mapping dictionary from uns data
    data_dict['Cell Ontology ID'] = adata.uns['ontology_mappings']["D265_ont_1"].tolist()
    data_dict['Cell Ontology ID_2'] = adata.uns['ontology_mappings']["D265_ont_2"].tolist()
 
else:
    raise KeyError("'ontology_mappings' not found in uns")

# Create DataFrame
df = pd.DataFrame(data_dict)
# Strip stray leading spaces from some D265 Cell Type values (' ENDO_1', ' macrophage').
df['Cell Type'] = df['Cell Type'].replace(' ENDO_1', 'ENDO_1')
df['Cell Type'] = df['Cell Type'].replace(' macrophage', 'macrophage')

# Save to CSV
df.to_csv(output_csv_path, index=False)
print(f"\nData successfully saved to: {output_csv_path}")

del adata
del df
del data_dict
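The D115 block below repeats the same extraction. When adapting this to another h5ad export, the relevant keys can be inspected first; a minimal sketch (the key names in the comments are the ones used here):

adata = sc.read_h5ad(h5ad_path)
print(adata.obs.columns.tolist())  # per-cell annotations, e.g. 'cell_type_calls'
print(adata.obsm_keys())           # per-cell arrays, e.g. 'spatial'
print(list(adata.uns.keys()))      # unstructured metadata, e.g. 'ontology_mappings'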

# Process the D115 dataset: read the h5ad file and export cell-level data to CSV.
h5ad_path = os.path.join(basepath, orig_filedir, dataset_name, "D115_final_annotated.h5ad")
output_csv_path = os.path.join(basepath, orig_filedir, dataset_name, "D115_final_annotated_cells.csv")

print(f"Reading file: {h5ad_path}")
adata = sc.read_h5ad(h5ad_path)

# Initialize dictionary to store the data
data_dict = {}

# Extract spatial coordinates and split into x, y
if 'spatial' in adata.obsm_keys():
    spatial_coords = adata.obsm['spatial']
    data_dict['x'] = spatial_coords[:, 0]
    data_dict['y'] = spatial_coords[:, 1]
else:
    raise KeyError("'spatial' coordinates not found in obsm")

# Extract cell type calls from obs
if 'cell_type_calls' in adata.obs:
    data_dict['Cell Type'] = adata.obs['cell_type_calls'].tolist()
else:
    raise KeyError("'cell_type_calls' column not found in obs")

# Extract ontology mappings from uns
if 'ontology_mappings' in adata.uns:
    # Create a mapping dictionary from uns data
    # Note: Adjust this based on the actual structure of your ontology_mappings
    data_dict['Cell Ontology ID'] = adata.uns['ontology_mappings']["D115_ont_1"].tolist()
    data_dict['Cell Ontology ID_2'] = adata.uns['ontology_mappings']["D115_ont_2"].tolist()

else:
    raise KeyError("'ontology_mappings' not found in uns")

# Create DataFrame
df = pd.DataFrame(data_dict)

# Save to CSV
df.to_csv(output_csv_path, index=False)
print(f"\nData successfully saved to: {output_csv_path}")

del adata
del df
del data_dict

# Generate the cell-nodes file for D265. (The crosswalk-based relabeling below is retained but disabled.)
# Read D265 csv file.
df1 = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "D265_final_annotated_cells.csv"))

# Drop duplicate rows in D265 (based on x, y, and Cell Type).
df1 = df1.drop_duplicates(subset=['x', 'y', 'Cell Type'])

# # Read crosswalk file.
# crosswalk = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "extras", "combined_unique_ct_list_with_cl_labels.csv"))
# # Keep unique rows in crosswalk file. Check uniqueness based on Cell Type.
# crosswalk = crosswalk.drop_duplicates(subset=['Cell Type'])

# # For each row in D265, replace Cell Type with the corresponding CL Label from crosswalk file. In case of no match, keep the original Cell Type. 
# # In case the Cell Type matches but the CL Label field in the crosswalk file is empty, keep the original Cell Type.
# df1 = pd.merge(df1, crosswalk, on='Cell Type', how='left')
# df1['Cell Type'] = np.where(df1['CL Label'].isnull(), df1['Cell Type'], df1['CL Label'])
# df1 = df1.drop(columns=['CL Label'])

# Drop Cell Ontology ID and Cell Ontology ID_2 columns.
df1 = df1.drop(columns=['Cell Ontology ID', 'Cell Ontology ID_2'])

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

label = "D265-LLL-7A7-12"
# Save the updated D265 csv file.
df1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

id = f"https://entity.api.hubmapconsortium.org/entities/0f1ddcb41a484adbda759c0c79097a02#{label}"

dataset_json = {
    "@id": id
}

with open(f"{target_dir}/{label}-dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

del df1
# Generate the cell-nodes file for D115. (The crosswalk-based relabeling below is retained but disabled.)

# Read D115 csv file.
df1 = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "D115_final_annotated_cells.csv"))

# Drop duplicate rows in D115 (based on x, y, and Cell Type).
df1 = df1.drop_duplicates(subset=['x', 'y', 'Cell Type'])

# # Read crosswalk file.
# crosswalk = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "extras", "combined_unique_ct_list_with_cl_labels.csv"))
# # Keep unique rows in crosswalk file. Check uniqueness based on Cell Type.
# crosswalk = crosswalk.drop_duplicates(subset=['Cell Type'])

# # For each row in D115, replace Cell Type with the corresponding CL Label from crosswalk file. In case of no match, keep the original Cell Type. 
# # In case the Cell Type matches but the CL Label field in the crosswalk file is empty, keep the original Cell Type.
# df1 = pd.merge(df1, crosswalk, on='Cell Type', how='left')
# df1['Cell Type'] = np.where(df1['CL Label'].isnull(), df1['Cell Type'], df1['CL Label'])
# df1 = df1.drop(columns=['CL Label'])

# Drop Cell Ontology ID and Cell Ontology ID_2 columns.
df1 = df1.drop(columns=['Cell Ontology ID', 'Cell Ontology ID_2'])

label = "D115-RLL-10A3-40"
# Save the updated D115 csv file.
df1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

id = f"https://entity.api.hubmapconsortium.org/entities/0f1ddcb41a484adbda759c0c79097a02#{label}"

dataset_json = {
    "@id": id
}

with open(f"{target_dir}/{label}-dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

# Print the unique cell types in both datasets combined.
# (Reload the saved D265 nodes file, since its DataFrame was deleted above.)
d265_cell_types = pd.read_csv(f"{target_dir}/D265-LLL-7A7-12-nodes.csv")["Cell Type"]
unique_cell_types = pd.concat([d265_cell_types, df1["Cell Type"]]).unique()
print(unique_cell_types)
Reading file: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D265_final_annotated.h5ad

Data successfully saved to: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D265_final_annotated_cells.csv
Reading file: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D115_final_annotated.h5ad

Data successfully saved to: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D115_final_annotated_cells.csv
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/lung-codex-urmc' created successfully.
['CD4_+_Tcell/macrophage' 'CD8+_T_cell_1' 'macrophage_3' 'ENDO_1'
 'ENDO_CD8+_T_Cell' 'CD4+_T_cell_1' 'Lung_Epithelial_1' 'CAP_ENDO'
 'macrophage_CD1c+_myeloidDC' 'ENDO_SMC' 'Endo_p'
 'Lung_Epithelil_2_CD4+_T_cell' 'AT2_2' 'AT2_1' 'CD8+_T_cell_2'
 'macrophage_2' 'macrophage_p' 'CD4+_T_cell_2' 'AT2_p' 'SMC_2' 'SMC_1'
 'CD8+_T_cell_3' 'B_cell_1' 'Lung_Epithelial_p' 'CD8+_T_cell_CD_4+_T_cell'
 'Lung_Epithelial_4' 'UNK_5_ambiguous' 'UNK_1_APC' 'CD4+_T_cell_3'
 'B_cell_macrophage_p?' 'UNK_3_(col1a1-driven_cluster)'
 'Lymphatic_Endothelium' 'UNK_4_(col1a1_driven_cluster)']

bonemarrow-codex-chop

dataset_name = "bonemarrow-codex-chop"

metadata1 = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "AML_NSM_RefMap_Seurat_v2_seurat_metadata.csv"))

metadata2 = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "Normal_Bone_Marrow_CODEX_Atlas_Seurat_v2_seurat_metadata.csv"))

# Extract x, y coords, cell type, and filename from metadata1 into a new dataframe.
metadata1['x'] = metadata1['x.coord']
metadata1['y'] = metadata1['y.coord']
metadata1['Cell Type'] = metadata1['classified_cluster_anno_l2']
metadata1['filename'] = metadata1['orig.ident']
metadata1 = metadata1[['x', 'y', 'Cell Type', 'filename']]

# Extract x, y coords, cell type, and filename from metadata2 into a new dataframe.
metadata2['x'] = metadata2['x.coord']
metadata2['y'] = metadata2['y.coord']
metadata2['Cell Type'] = metadata2['cluster_anno_l2']
metadata2['filename'] = metadata2['orig.ident']
metadata2 = metadata2[['x', 'y', 'Cell Type', 'filename']]

# Merge metadata1 and metadata2.
data_merged = pd.concat([metadata1, metadata2], axis=0)
data_merged = data_merged.reset_index(drop=True)

# Remove "_CODEX_Mesmer" from filename.
data_merged['filename'] = data_merged['filename'].str.replace('_CODEX_Mesmer', '')

# Write data_merged to a csv file.
data_merged.to_csv(os.path.join(basepath, orig_filedir, dataset_name, "data_merged.csv"), index=False)

# Print unique cell types in the data_merged dataframe.
print(data_merged['Cell Type'].unique())

# Iterate over unique values in the data_merged 'filename' column and write each subset to the bonemarrow-codex-chop directory under data-processed-nodes, dropping the filename column.
target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

for filename in data_merged['filename'].unique():
    data_subset = data_merged[data_merged['filename'] == filename]
    data_subset = data_subset.drop(columns=['filename'])
    data_subset.to_csv(f'{target_dir}/{filename}-nodes.csv', index=False)
['Erythroid' 'B-Cells' 'AEC' 'Early Myeloid Progenitor' 'SEC'
 'Intermediate Myeloid' 'Mature Myeloid' 'CD8+ T-Cell' 'Plasma Cells'
 'Erythroblast' 'Adipocyte' 'Monocytes' 'Adipo-MSC' 'Endosteal'
 'THY1+ MSC' 'CD4+ T-Cell' 'GMP/Myeloblast' 'GATA1pos_Mks'
 'Immature_B_Cell' 'Macrophages' 'SPINK2+ HSPC' 'pDC'
 'Non-Classical Monocyte' 'GATA1neg_Mks' 'HSPC' 'VSMC' 'GMP'
 'MEP/Early Erythroblast' 'CLP' 'CD34+ CD61+' 'HSC' 'NPM1 Mutant Blast'
 'Schwann Cells' 'Artifact' 'Undetermined' 'Autofluorescent'
 'CD44+ Undetermined']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/bonemarrow-codex-chop' created successfully.
# Print final message
print("All datasets processed and saved.")
All datasets processed and saved.
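As a final sanity check, every generated nodes file can be verified against the shared schema; a sketch (assumes the directory layout created above):

# Verify that every *-nodes.csv has the required x, y, and Cell Type columns.
for ds in sorted(os.listdir(os.path.join(basepath, dest_filedir))):
    ds_dir = os.path.join(basepath, dest_filedir, ds)
    for fname in os.listdir(ds_dir):
        if fname.endswith("-nodes.csv"):
            cols = set(pd.read_csv(os.path.join(ds_dir, fname), nrows=0).columns)
            assert {"x", "y", "Cell Type"}.issubset(cols), f"{ds}/{fname} is missing required columns"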