import numpy as np
import pandas as pd
import os
import json
import requests
import scanpy as sc
import shutil
pd.set_option('display.max_columns', None)
# suppress warnings
import warnings
"ignore") warnings.filterwarnings(
Data Processing: Convert to CDE Format
Convert all datasets from their original formats to the CDE (Cell Distance Explorer) format.
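Every dataset below ends up in the same node-centric layout: one {label}-nodes.csv per region or section, holding micrometer-scale x and y (and, for 3D datasets, z) coordinates plus a Cell Type label; most sections also emit a companion {label}-dataset.json carrying a persistent @id. A minimal sketch of a nodes file (values hypothetical):

# x,y,Cell Type
# 123.45,678.90,Endothelial
# 124.01,680.12,B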
= "/u/yashjain/hra-cell-distance-analysis/data"
basepath = "data-original"
orig_filedir = "data-processed-nodes" dest_filedir
# Function to load your data
def load_data(path):
    data = pd.read_csv(path)
    return data
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory '{directory}' created successfully.")
    else:
        print(f"Directory '{directory}' already exists.")
def get_hubmap_uuid(hubmap_id):
    # Construct the API URL
    base_url = "https://entity.api.hubmapconsortium.org/entities/"
    url = base_url + hubmap_id

    try:
        # Send a GET request to the API
        response = requests.get(url)

        # Check if the request was successful
        response.raise_for_status()

        # Parse the JSON response
        data = response.json()

        # Extract the UUID
        uuid = data.get("uuid")

        if uuid:
            return uuid
        else:
            return "UUID not found in the response"
    except requests.RequestException as e:
        return f"An error occurred: {str(e)}"
print(len(os.listdir(os.path.join(basepath, orig_filedir))), os.listdir(os.path.join(basepath, orig_filedir)))
14 ['bonemarrow-codex-chop', 'colon-cycif-sorgerlab', 'colon-xenium-stanford', 'esophagus-codex-stanford', 'intestine-codex-stanford', 'lung-codex-urmc', 'lymphnode-codex-yale', 'maternalfetalinterface-mibitof-stanford', 'oralcavity-codex-czi', 'pancreas-geomx-ufl', 'skin-celldive-ge', 'skin-confocal-sorgerlab', 'spleen-codex-ufl', 'tonsil-codex-stanford']
# Create destination directory. Overwrite if it exists.
if os.path.exists(os.path.join(basepath, dest_filedir)):
    shutil.rmtree(os.path.join(basepath, dest_filedir))
    print(f"Directory '{dest_filedir}' already exists and has been removed. New directory will be created.")
else:
    print(f"Directory '{dest_filedir}' does not exist and will be created.")
os.makedirs(os.path.join(basepath, dest_filedir), exist_ok=False)
Directory 'data-processed-nodes' already exists and has been removed. New directory will be created.
Processing Individual Datasets
intestine-codex-stanford
= "intestine-codex-stanford"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name, "23_09_CODEX_HuBMAP_alldata_Dryad_merged.csv")
raw_filepath
= load_data(raw_filepath)
data_new
# Filename to HuBMAP ID mapping.
= os.path.join(basepath, orig_filedir, dataset_name, "filename-to-hubmap-mapping.csv")
mapping_file = load_data(mapping_file)
mapping_df
# Convert the mapping dataframe to dictionary. Key is Filename and value is HuBMAP ID.
= dict(zip(mapping_df["Filename"], mapping_df["HuBMAP ID"]))
filename_to_id
# Store types of unique regions before splitting
# column is "unique_region"
= data_new["unique_region"].unique()
unique_regions
={"cell_type": "Cell Type"}, inplace=True)
data_new.rename(columns
# Store types of cell before splitting
= data_new["Cell Type"].unique()
cell_types print(cell_types)
# Take the column 'unique_region' to split from the actual column names of data frame
= "unique_region"
column_to_split
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for label in unique_regions:
    # Skip regions whose label contains "extra" (e.g., extra nodes).
    if "extra" in label:
        continue

    # Create a sub-data frame for the current region
    df_label = data_new[data_new[column_to_split] == label]
    df_Region_1 = df_label[["x", "y", "Cell Type"]]

    # Convert pixel coordinates to micrometers (0.37742 μm per px)
    micro_per_pixel = 0.37742
    scale = micro_per_pixel
    df_Region_1["x"] = scale * df_Region_1["x"]
    df_Region_1["y"] = scale * df_Region_1["y"]

    label_clean = label.replace("-", "_")
    label_clean = label_clean.replace(" ", "")
    # Write the nodes to a CSV file
    df_Region_1.to_csv(f"{target_dir}/{label_clean}-nodes.csv", index=False, header=True, mode="w")

    # Generate the dataset.json file for HRAPoP. Get the UUID for the HuBMAP ID from the Entity API.
    id = "https://entity.api.hubmapconsortium.org/entities/" + get_hubmap_uuid(filename_to_id[f"{label_clean}-nodes"])
    dataset_json = {
        "@id": id
    }
    with open(f"{target_dir}/{label_clean}-dataset.json", "w") as f:
        json.dump(dataset_json, f, indent=4)
['NK' 'Enterocyte' 'MUC1+ Enterocyte' 'TA' 'CD66+ Enterocyte' 'Paneth'
'Smooth muscle' 'M1 Macrophage' 'Goblet' 'Neuroendocrine'
'CD57+ Enterocyte' 'Lymphatic' 'CD8+ T' 'DC' 'M2 Macrophage' 'B'
'Neutrophil' 'Endothelial' 'Cycling TA' 'Plasma' 'CD4+ T cell' 'Stroma'
'Nerve' 'ICC' 'CD7+ Immune']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/intestine-codex-stanford' created successfully.
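For reference, each {label_clean}-dataset.json written above holds a single @id that resolves through the HuBMAP Entity API; a minimal sketch of its contents (the UUID is a placeholder, not a real value):

# {
#     "@id": "https://entity.api.hubmapconsortium.org/entities/<uuid>"
# }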
tonsil-codex-stanford
= "tonsil-codex-stanford"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name, "BE_Tonsil_l3_dryad.csv")
raw_filepath
= load_data(raw_filepath)
data_new
# Store types of unique regions before splitting
# column is "unique_region"
= data_new["sample_name"].unique()
unique_regions
={"cell_type": "Cell Type"}, inplace=True)
data_new.rename(columns
# Store types of cell before splitting
= data_new["Cell Type"].unique()
cell_types print(cell_types)
# Take the column 'unique_region' to split from the actual column names of data frame
= "sample_name"
column_to_split
for label in unique_regions:
# Create another sub data frame using the value for the value of the column each time
if label == "tonsil":
= data_new[data_new[column_to_split] == label]
df_label
= df_label[["x", "y", "Cell Type"]]
df_Region_1
# Calculate μm per px
= 0.377
micro_per_pixel = micro_per_pixel # to convert given pixel in micro meter unit
scale "x"] = scale * df_Region_1["x"]
df_Region_1["y"] = scale * df_Region_1["y"]
df_Region_1[
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)= label.replace("-", "_")
label_clean = label_clean.replace(" ", "")
label_clean # Write to the file using pandas to_csv
f"{target_dir}/{label_clean}-nodes.csv", index=False, header=True, mode="w") df_Region_1.to_csv(
['Innate' 'PDPN' 'Endothelial' 'B' 'T' 'Squamous_epithelial' 'Stroma'
'SmoothMuscle' 'Plasma' 'Nerve' 'Glandular_epi' 'Secretory_epithelial'
'Paneth']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/tonsil-codex-stanford' created successfully.
esophagus-codex-stanford
= "esophagus-codex-stanford"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name, "BE_Tonsil_l3_dryad.csv")
raw_filepath
= load_data(raw_filepath)
data_new
# Store types of unique regions before splitting
# column is "unique_region"
= data_new["sample_name"].unique()
unique_regions
={"cell_type": "Cell Type"}, inplace=True)
data_new.rename(columns
# Store types of cell before splitting
= data_new["Cell Type"].unique()
cell_types print(cell_types)
# Take the column 'unique_region' to split from the actual column names of data frame
= "sample_name"
column_to_split
for label in unique_regions:
# Create another sub data frame using the value for the value of the column each time
if label == "Barretts Esophagus":
= data_new[data_new[column_to_split] == label]
df_label
= df_label[["x", "y", "Cell Type"]]
df_Region_1
# Calculate μm per px
= 0.377
micro_per_pixel = micro_per_pixel # to convert given pixel in micro meter unit
scale "x"] = scale * df_Region_1["x"]
df_Region_1["y"] = scale * df_Region_1["y"]
df_Region_1[
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
# Write to the file using pandas to_csv
f"{target_dir}/esophagus-nodes.csv", index=False, header=True, mode="w") df_Region_1.to_csv(
['Innate' 'PDPN' 'Endothelial' 'B' 'T' 'Squamous_epithelial' 'Stroma'
'SmoothMuscle' 'Plasma' 'Nerve' 'Glandular_epi' 'Secretory_epithelial'
'Paneth']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/esophagus-codex-stanford' created successfully.
colon-cycif-sorgerlab
= "colon-cycif-sorgerlab"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name)
raw_filedir
# Preprocess to merge CT labels.
# List of filenames
= [
filenames "Reg_Celltype_CRC01002.csv", "Reg_Celltype_CRC01007.csv", "Reg_Celltype_CRC01014.csv",
"Reg_Celltype_CRC01020.csv", "Reg_Celltype_CRC01025.csv", "Reg_Celltype_CRC01029.csv",
"Reg_Celltype_CRC01034.csv", "Reg_Celltype_CRC01039.csv", "Reg_Celltype_CRC01044.csv",
"Reg_Celltype_CRC01049.csv", "Reg_Celltype_CRC01050.csv", "Reg_Celltype_CRC01051.csv",
"Reg_Celltype_CRC01052.csv", "Reg_Celltype_CRC01054.csv", "Reg_Celltype_CRC01059.csv",
"Reg_Celltype_CRC01064.csv", "Reg_Celltype_CRC01069.csv", "Reg_Celltype_CRC01074.csv",
"Reg_Celltype_CRC01078.csv", "Reg_Celltype_CRC01084.csv", "Reg_Celltype_CRC01086.csv",
"Reg_Celltype_CRC01091.csv", "Reg_Celltype_CRC01097.csv", "Reg_Celltype_CRC01102.csv",
"Reg_Celltype_CRC01106.csv"
]
# Read CSV files, add filename column, and combine
= []
df_list for filename in filenames:
= pd.read_csv(os.path.join(raw_filedir, filename))
df # Extract CRCXXXXX part from the filename
= filename.split('.')[0].split('_')[2]
crc_code 'Layer'] = crc_code
df[
df_list.append(df)
= pd.concat(df_list, ignore_index=True)
combined_df
# Read Celltype_reference_table.csv
= pd.read_csv(os.path.join(raw_filedir, "Celltype_reference_table.csv"))
ref_df
# Create a dictionary for mapping NewType to Name and Category
= ref_df.set_index('NewType')[['Name', 'Category']].to_dict('index')
ref_dict
# Add new columns "Cell Type" and "Category" to the combined csv file
'Cell Type'] = combined_df['NewType'].map(lambda x: ref_dict.get(x, {}).get('Name', ''))
combined_df['Category'] = combined_df['NewType'].map(lambda x: ref_dict.get(x, {}).get('Category', ''))
combined_df[
={"Xr": "x", "Yr": "y"}, inplace=True)
combined_df.rename(columns
= combined_df
data_new
= data_new["Layer"].unique()
unique_regions
= data_new["Cell Type"].unique()
cell_types print(cell_types)
= "Layer"
column_to_split
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for label in unique_regions:
# Create another sub data frame using the value for the value of the column each time
= data_new[data_new[column_to_split] == label]
df_label
= df_label[["x", "y", "Cell Type"]]
df_Region_1
# Write to the file using pandas to_csv
f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
df_Region_1.to_csv(
# Generate dataset.json file for HRAPoP. Paper DOI: https://doi.org/10.1016/j.cell.2022.12.028
id = f"https://doi.org/10.1016/j.cell.2022.12.028#{label}"
= {
dataset_json "@id": id
}
with open(f"{target_dir}/{label}-dataset.json", "w") as f:
=4) json.dump(dataset_json, f, indent
['Other' 'PDL1+ Macrophage' 'Tumor/Epithelial' 'Lymphocyte(III)' 'Treg'
'Endothelial' 'Muscle/Fibroblast' 'Tc cell' 'PDL1+ Tumor/Epithelial'
'Macrophage(IV)' 'B cells' 'PD1+ Tc' 'Macrophage(III)' 'DN Lymphocyte'
'DP Lymphocyte' 'T helper' 'Macrophage(II)' 'Macrophage(I)'
'Ki67+ Tumor/Epithelial' 'PD1+ T helper' 'PDL1+ lymphocyte']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/colon-cycif-sorgerlab' created successfully.
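As an aside, ref_df.set_index('NewType')[['Name', 'Category']].to_dict('index') yields a nested dictionary keyed by NewType, which is exactly what the .map(...) lambdas above consume. A minimal sketch with hypothetical reference rows:

# If ref_df were:
#   NewType              Name    Category
#   1        Tumor/Epithelial  Epithelial
#   2                    Treg    Lymphoid
# then ref_dict would be:
#   {1: {'Name': 'Tumor/Epithelial', 'Category': 'Epithelial'},
#    2: {'Name': 'Treg', 'Category': 'Lymphoid'}}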
colon-xenium-stanford
= "colon-xenium-stanford"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name, "xenium_polyp_29sections.txt")
raw_filepath
= pd.read_csv(raw_filepath, sep='\t')
data_new
# Store types of unique regions before splitting
# column is "unique_region"
= data_new["layer"].unique()
unique_regions
={"cell_type": "Cell Type", "x_align": "x", "y_align": "y"}, inplace=True)
data_new.rename(columns
# Store types of cell before splitting
= data_new["Cell Type"].unique()
cell_types print(cell_types)
# Take the column 'unique_region' to split from the actual column names of data frame
= "layer"
column_to_split
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for label in unique_regions:
# Create another sub data frame using the value for the value of the column each time
= data_new[data_new[column_to_split] == label]
df_label
= df_label[["x", "y", "Cell Type"]]
df_Region_1
# Write to the file using pandas to_csv
f"{target_dir}/layer_{label}-nodes.csv", index=False, header=True, mode="w") df_Region_1.to_csv(
['Immature Goblet' 'Tuft' 'TA1' 'CD4+' 'Pericytes' 'Macrophages'
'CyclingTA' 'Cancer Associated Fibroblasts' 'Best4+ Enterocytes' 'Stem'
'CD8+' 'Endothelial' 'TA2' 'Myofibroblasts/Smooth Muscle 3'
'Unknown_lowcount' 'Myofibroblasts/Smooth Muscle 1' 'Crypt Fibroblasts 3'
'Crypt Fibroblasts 1' 'Glia' 'Crypt Fibroblasts 4' 'Adipocytes' 'Plasma'
'GC' 'Mast' 'Enteroendocrine' 'Tregs' 'Lymphatic endothelial cells'
'Goblet' 'Immature Enterocytes' 'Enterocyte Progenitors' 'Enterocytes'
'Myofibroblasts/Smooth Muscle 2' 'Naive T' 'Villus Fibroblasts WNT5B+'
'Naive B' 'Neurons' 'Memory B' 'Crypt Fibroblasts 2' 'ILCs' 'Unknown'
'DC']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/colon-xenium-stanford' created successfully.
lymphnode-codex-yale
= "lymphnode-codex-yale"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name)
raw_filedir
= set()
ct
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for raw_filename in os.listdir(raw_filedir):
= load_data(os.path.join(raw_filedir, raw_filename))
data_new
={"celltype": "Cell Type"}, inplace=True)
data_new.rename(columns
"Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('Endo', 'Endothelial'))
data_new[
= data_new["Cell Type"].unique()
cell_types
for i in cell_types:
ct.add(i)
= data_new[["x", "y", "Cell Type"]]
df_Region_1
# Write to the file using pandas to_csv
= raw_filename.split(".")[0].split("_")[0]
label f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
df_Region_1.to_csv(
print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/lymphnode-codex-yale' created successfully.
{'DC_CCR7+', 'B_plasma', 'Mast', 'T_CD8+_cytotoxic', 'Macrophages_M2', 'T_CD8+_naive', 'DC_pDC', 'VSMC', 'T_CD4+_TfH_GC', 'B_GC_DZ', 'B_naive', 'B_GC_LZ', 'T_CD4+', 'DC_cDC1', 'NK', 'B_mem', 'T_CD4+_naive', 'B_preGC', 'B_IFN', 'B_activated', 'T_CD4+_TfH', 'B_GC_prePB', 'Monocytes', 'NKT', 'Endothelial', 'T_CD8+_CD161+', 'T_TfR', 'T_TIM3+', 'DC_cDC2', 'FDC', 'T_Treg', 'Macrophages_M1', 'B_Cycling', 'ILC'}
maternalfetalinterface-mibitof-stanford
= "maternalfetalinterface-mibitof-stanford"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name, "Supplementary_table_3_single_cells_updated.csv")
raw_filepath
= load_data(raw_filepath)
data
= data[data['overlap_decidua'] == 1.0]
data_new
# Store types of unique regions before splitting
# column is "unique_region"
= data_new["Point"].unique()
unique_regions
={'centroid0': 'x', 'centroid1': 'y', 'lineage': 'Cell Type' }, inplace=True)
data_new.rename(columns
# Store types of cell before splitting
= data_new["Cell Type"].unique()
cell_types print(cell_types)
# Take the column 'unique_region' to split from the actual column names of data frame
= "Point"
column_to_split
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for label in unique_regions:
# Create another sub data frame using the value for the value of the column each time
= data_new[data_new[column_to_split] == label]
df_label
= df_label[["x", "y", "Cell Type"]]
df_Region_1
# Calculate μm per px
= 0.391
micro_per_pixel = micro_per_pixel # to convert given pixel in micro meter unit
scale "x"] = scale * df_Region_1["x"]
df_Region_1["y"] = scale * df_Region_1["y"]
df_Region_1[
# Write to the file using pandas to_csv
f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w") df_Region_1.to_csv(
['Mac2a' 'other' 'NK1' 'Fibroblasts' 'NKT' 'Endothelial' 'Myofibroblasts'
'Mac1a' 'EVT1a' 'Mac1b' 'CD8T' 'EVT1b' 'Mac2c' 'NK2' 'muscle' 'NK3'
'EVT2' 'Mac2b' 'DC' 'Glandular' 'CD4T' 'EVT1c' 'NK4' 'Mast' 'Treg'
'Placental_Mac']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/maternalfetalinterface-mibitof-stanford' created successfully.
oralcavity-codex-czi
= "oralcavity-codex-czi"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name)
raw_filedir
= set()
ct
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for raw_filename in os.listdir(raw_filedir):
= load_data(os.path.join(raw_filedir, raw_filename))
data_new
={"TACIT": "Cell Type"}, inplace=True)
data_new.rename(columns={"X": "x"}, inplace=True)
data_new.rename(columns={"Y": "y"}, inplace=True)
data_new.rename(columns
# If "Cell Type" contains "VECs", replace with "Endothelial (Vascular)". Else if, "Cell Type" contains "VEC", replace with "Endothelial (Vascular)"
# Else if "Cell Type" contains "Endothelial Cells", replace with "Endothelial (Vascular)". Else, keep the same.
"Cell Type"] = data_new["Cell Type"].replace('VECs', 'Vascular Endothelial Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('VEC', 'Vascular Endothelial Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('Endothelial Cells', 'Vascular Endothelial Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('LECs', 'Lymphatic Endothelial Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('Melanocyte', 'Melanocytes')
data_new["Cell Type"] = data_new["Cell Type"].replace('B cells', 'B Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('B Cell', 'B Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('NK cells', 'NK Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('Merkel cells', 'Merkel Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('Neutrophil', 'Neutrophils')
data_new["Cell Type"] = data_new["Cell Type"].replace('Fibroblast', 'Fibroblasts')
data_new["Cell Type"] = data_new["Cell Type"].replace('DC cells', 'Dendritic Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('Myoepithelial', 'Myoepithelial Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('Ductal Epithelial Cells', 'Ductal Epithelial Cells')
data_new["Cell Type"] = data_new["Cell Type"].replace('myfibroblast', 'myofibroblast')
data_new["Cell Type"] = data_new["Cell Type"].replace('Myfibroblast', 'myofibroblast')
data_new[
= data_new["Cell Type"].unique()
cell_types
for i in cell_types:
ct.add(i)
= data_new[["x", "y", "Cell Type"]]
df_Region_1
# Calculate μm per px
= 0.5 # This is the pixel size but no need to scale since data is already given in micrometers.
micro_per_pixel = micro_per_pixel # to convert given pixel in micro meter unit
scale "x"] = scale * df_Region_1["x"]
df_Region_1["y"] = scale * df_Region_1["y"]
df_Region_1[
# Write to the file using pandas to_csv
= raw_filename.split(".")[0]
label f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
df_Region_1.to_csv(print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/oralcavity-codex-czi' already exists.
{'Others', 'Melanocytes', 'myofibroblast', 'Plasma Cells', 'Skeletal Myocytes', 'Suprabasal Keratinocytes', 'Dendritic Cells', 'Macrophage', 'Vascular Endothelial Cells', 'CD8 T Cells', 'Monocyte-Macrophage', 'Glial/Neuron', 'Basal Keratincytes', 'Ducts', 'B Cells', 'Epithelial', 'Tc', 'Lymphatic Endothelial Cells', 'Adipocytes', 'Myoepithelial Cells', 'Neutrophils', 'Acinar Cells', 'Lymphatic Vascular Cells', 'DP', 'Mural Cells', 'CD4 T Cells', 'gd T Cells', 'VEC Progen', 'Ionocytes', 'Langerhans Cells', 'Fibroblasts', 'Th', 'Acini', 'Mast Cells', 'Merkel Cells', 'NK Cells', 'Ductal Epithelial Cells', 'Keratinocyte', 'Treg'}
pancreas-geomx-ufl
= "pancreas-geomx-ufl"
dataset_name
# Loop through all csv files in the directory "vccf-data-original/unpublished/pancreas-geomx-ufl/original-unprocessed/".
# For each file, read it into a pandas dataframe and print the first few rows.
for file in os.listdir(os.path.join(basepath, orig_filedir, dataset_name, "original-unprocessed")):
if file.endswith(".csv"):
# read csv. Coordinates are in micrometers.
= pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "original-unprocessed", file))
data
# Rename column CD3134 with CD31/34
= data.rename(columns={"CD3134": "CD31/34"})
data # Replace "Negative" with "-" and "Positive" with "+" in column "Insulin"
"Insulin"] = data["Insulin"].replace("Negative", "Insulin-")
data["Insulin"] = data["Insulin"].replace("Positive", "Insulin+")
data[
# Replace "Negative" with "-" and "Positive" with "+" in column "PanCK"
"PanCK"] = data["PanCK"].replace("Negative", "PanCK-")
data["PanCK"] = data["PanCK"].replace("Positive", "PanCK+")
data[
# Replace "Negative" with "-" and "Positive" with "+" in column "CD31/34"
"CD31/34"] = data["CD31/34"].replace("Negative", "CD31/34-")
data["CD31/34"] = data["CD31/34"].replace("Positive", "CD31/34+")
data[
# Merge columns "Insulin", "PanCK" and "CD31/34" into a new column "CellType"
"CellType"] = data["Insulin"] + " " + data["PanCK"] + " " + data["CD31/34"]
data[
# drop columns "Insulin", "PanCK" and "CD31/34"
= data.drop(columns=["Insulin", "PanCK", "CD31/34"])
data
# Replace markers with cell types.
"CellType"] = data["CellType"].replace("Insulin- PanCK- CD31/34+", "Endothelial")
data["CellType"] = data["CellType"].replace("Insulin- PanCK+ CD31/34-", "Ductal cell")
data["CellType"] = data["CellType"].replace("Insulin+ PanCK- CD31/34-", "Beta cell")
data[
# Replace all other marker combinations with "unknown"
"CellType"] = data["CellType"].replace("Insulin+ PanCK+ CD31/34-", "unknown")
data["CellType"] = data["CellType"].replace("Insulin- PanCK+ CD31/34+", "unknown")
data["CellType"] = data["CellType"].replace("Insulin+ PanCK- CD31/34+", "unknown")
data["CellType"] = data["CellType"].replace("Insulin- PanCK- CD31/34-", "unknown")
data["CellType"] = data["CellType"].replace("Insulin+ PanCK+ CD31/34+", "unknown")
data[
# Rename Cell X Coordinate to x, Cell Y Coordinate to y, and CellType to Cell Type.
= data.rename(columns={"xcoord": "x", "ycoord": "y", "CellType": "Cell Type"})
data
# Save the new dataframe to a new csv file.
"cell-type-annotated", file), index=False)
data.to_csv(os.path.join(basepath, orig_filedir, dataset_name,
= os.path.join(basepath, orig_filedir, dataset_name, "cell-type-annotated")
raw_filedir
= set()
ct
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for raw_filename in os.listdir(raw_filedir):
= load_data(os.path.join(raw_filedir, raw_filename))
data_new = data_new["Cell Type"].unique()
cell_types for i in cell_types:
ct.add(i)= data_new[["x", "y", "Cell Type"]]
df_Region_1 # Write to the file using pandas to_csv
= raw_filename.split(".")[0].split("_")[0]
label f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
df_Region_1.to_csv(print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/pancreas-geomx-ufl' created successfully.
{'Beta cell', 'Ductal cell', 'unknown', 'Endothelial'}
skin-celldive-ge
= "skin-celldive-ge"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name, "regions")
raw_filedir
# Filename to HuBMAP ID dictionary.
# This is used to map the filename to the HuBMAP ID. Source: https://www.biorxiv.org/content/10.1101/2023.10.05.560733v2
= {
filename_to_id "region_1": "HBM732.FZVZ.656",
"region_2": "HBM747.SPWK.779",
"region_3": "HBM398.NCVN.256",
"region_4": "HBM746.VTDZ.959",
"region_5": "HBM875.SBHJ.939",
"region_6": "HBM867.NMXL.794", # Excluded in original analysis
"region_7": "HBM666.JCGS.862",
"region_8": "HBM592.JGSQ.253",
"region_9": "HBM494.XDQW.356",
"region_10": "HBM238.ZKPC.934",
"region_11": "HBM975.FVCG.922",
"region_12": "HBM674.XQFQ.364", # Excluded in original analysis
}
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
= set()
ct for region in os.listdir(raw_filedir):
# Exclude regions 6 and 12 as they were excluded in the original analysis.
if region == "region_6" or region == "region_12":
continue
= os.path.join(raw_filedir, region, "centroids.csv")
raw_filename
= pd.read_csv(raw_filename)
data_new
={"cell_type": "Cell Type"}, inplace=True)
data_new.rename(columns={"X": "x"}, inplace=True)
data_new.rename(columns={"Y": "y"}, inplace=True)
data_new.rename(columns={"Z": "z"}, inplace=True)
data_new.rename(columns
# For consistency. Note that all endothelial cells in this dataset are blood endothelial
"Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('CD31', 'Endothelial'))
data_new[
# Drop "skin" coordinates.
= data_new[data_new['Cell Type'] != 'Skin']
data_new
= data_new["Cell Type"].unique()
cell_types for i in cell_types:
ct.add(i)= data_new[["x", "y", "z", "Cell Type"]]
df_Region_1
# Calculate μm per px
= 1
micro_per_pixel = micro_per_pixel # to convert given pixel in micro meter unit
scale "x"] = scale * df_Region_1["x"]
df_Region_1["y"] = scale * df_Region_1["y"]
df_Region_1["z"] = scale * df_Region_1["z"]
df_Region_1[
# Write to the file using pandas to_csv
f"{target_dir}/{region}-nodes.csv", index=False, header=True, mode="w")
df_Region_1.to_csv(
# Generate dataset.json file for HRAPoP. Get the UUID for the HuBMAP ID from Entity API.
id = "https://entity.api.hubmapconsortium.org/entities/" + get_hubmap_uuid(filename_to_id[region])
= {
dataset_json "@id": id
}
with open(f"{target_dir}/{region}-dataset.json", "w") as f:
=4)
json.dump(dataset_json, f, indentprint(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/skin-celldive-ge' created successfully.
{'KI67', 'Endothelial', 'P53', 'DDB2', 'CD68', 'T-Reg', 'T-Killer', 'T-Helper'}
skin-confocal-sorgerlab
= "skin-confocal-sorgerlab"
dataset_name
= os.path.join(basepath, orig_filedir, dataset_name)
raw_filedir
= set()
ct
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for raw_filename in os.listdir(raw_filedir):
= load_data(os.path.join(raw_filedir, raw_filename))
data_new
={"phenotype": "Cell Type", "X_centroid": "x", "Y_centroid": "y", "Z_centroid": "z"}, inplace=True)
data_new.rename(columns
"Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('endothelial', 'Endothelial'))
data_new[
= data_new["Cell Type"].unique()
cell_types
for i in cell_types:
ct.add(i)
= data_new[["x", "y", "z", "Cell Type"]]
df_Region_1
# Write to the file using pandas to_csv
= raw_filename.split(".")[0]
label
f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
df_Region_1.to_csv(
# Generate dataset.json file for HRAPoP. Paper DOI: https://doi.org/10.1101/2023.11.10.566670
id = f"https://doi.org/10.1101/2023.11.10.566670#{label}"
= {
dataset_json "@id": id
}
with open(f"{target_dir}/{label}-dataset.json", "w") as f:
=4)
json.dump(dataset_json, f, indent
print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/skin-confocal-sorgerlab' created successfully.
{'Unknown', 'Dendritic cells', 'Tumor', 'Endothelial', 'Macrophage', 'Other Immune', 'B cells', 'Myeloid', 'Tissue T', 'Langerhan cells', 'CD11B+ CD11C- cells', 'CD8 T', 'CD4 T', 'keratinocytes', 'T reg'}
spleen-codex-ufl
= "spleen-codex-ufl"
dataset_name = set()
ct = os.path.join(basepath, orig_filedir, dataset_name)
raw_filedir
# Filename to HuBMAP ID dictionary.
# This is used to map the filename to the HuBMAP ID. Source: https://www.biorxiv.org/content/10.1101/2023.10.05.560733v2
= {
filename_to_id "FSLD": "HBM342.FSLD.938",
"KSFB": "HBM556.KSFB.592",
"NGPL": "HBM568.NGPL.345",
"PBVN": "HBM825.PBVN.284",
"PKHL": "HBM389.PKHL.936",
"XXCD": "HBM772.XXCD.697",
}
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for raw_filename in os.listdir(raw_filedir):
= pd.read_csv(os.path.join(raw_filedir, raw_filename), sep=';')
data_new
={"celltypes_folBcombined": "Cell Type"}, inplace=True)
data_new.rename(columns
# For consistency. Note that all endothelial cells in this dataset are blood endothelial
"Cell Type"] = data_new["Cell Type"].apply(lambda x: x.replace('Blood endothelial', 'blood endothelial'))
data_new[
= data_new["Cell Type"].unique()
cell_types
# Add all elements in cell_types list to teh ct set
for i in cell_types:
ct.add(i)
= data_new[["x", "y", "Cell Type"]]
df_Region_1
# Calculate μm per px
= 0.377
micro_per_pixel = micro_per_pixel # to convert given pixel in micro meter unit
scale "x"] = scale * df_Region_1["x"]
df_Region_1["y"] = scale * df_Region_1["y"]
df_Region_1[
# Find the most negative value in each column
= abs(df_Region_1['x'].min())
shift_x = abs(df_Region_1['y'].min())
shift_y
# Shift all values in the columns to be positive
'x'] = df_Region_1['x'] + shift_x
df_Region_1['y'] = df_Region_1['y'] + shift_y
df_Region_1[
# Write to the file using pandas to_csv
= raw_filename.split('.')[0]
label f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")
df_Region_1.to_csv(
# Generate dataset.json file for HRAPoP. Get the UUID for the HuBMAP ID from Entity API.
id = "https://entity.api.hubmapconsortium.org/entities/" + get_hubmap_uuid(filename_to_id[label])
= {
dataset_json "@id": id
}
with open(f"{target_dir}/{label}-dataset.json", "w") as f:
=4)
json.dump(dataset_json, f, indent
print(ct)
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/spleen-codex-ufl' created successfully.
{'Myeloid cells', 'Sinusoidal cells', 'CD8 Memory T cells', 'Macrophages', 'indistinct', 'B cells, red pulp', 'Ki67 proliferating', 'Fol B cells', 'blood endothelial', 'Neutrophils/Monocytes', 'Podoplanin', 'CD4 Memory T cells'}
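A quick worked example of the shift applied above (hypothetical values): if the scaled x column were [-10.0, 5.0, 20.0], then shift_x = abs(-10.0) = 10.0, and after shifting the column becomes [0.0, 15.0, 30.0], so all coordinates end up non-negative. (The shift assumes the column minimum is negative; if it were already positive, the code would move coordinates further in the positive direction.)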
lung-codex-urmc
= "lung-codex-urmc"
dataset_name
# Process D265 Dataset.
# Read the h5ad file
# Specify your input and output file paths
= os.path.join(basepath, orig_filedir, dataset_name, "D265_final_annotated.h5ad")
h5ad_path = os.path.join(basepath, orig_filedir, dataset_name, "D265_final_annotated_cells.csv")
output_csv_path
print(f"Reading file: {h5ad_path}")
= sc.read_h5ad(h5ad_path)
adata
# Initialize dictionary to store the data
= {}
data_dict
# Extract spatial coordinates and split into x, y
if 'spatial' in adata.obsm_keys():
= adata.obsm['spatial']
spatial_coords 'x'] = spatial_coords[:, 0]
data_dict['y'] = spatial_coords[:, 1]
data_dict[else:
raise KeyError("'spatial' coordinates not found in obsm")
# Extract cell type calls from obs
if 'cell_type_calls' in adata.obs:
'Cell Type'] = adata.obs['cell_type_calls'].tolist()
data_dict[else:
raise KeyError("'cell_type_calls' column not found in obs")
# Extract ontology mappings from uns
if 'ontology_mappings' in adata.uns:
# Create a mapping dictionary from uns data
'Cell Ontology ID'] = adata.uns['ontology_mappings']["D265_ont_1"].tolist()
data_dict['Cell Ontology ID_2'] = adata.uns['ontology_mappings']["D265_ont_2"].tolist()
data_dict[
else:
raise KeyError("'ontology_mappings' not found in uns")
# Create DataFrame
= pd.DataFrame(data_dict)
df # In D265 dataset, remove space in Cell Type column where value is ENDO_1.
'Cell Type'] = df['Cell Type'].replace(' ENDO_1', 'ENDO_1')
df['Cell Type'] = df['Cell Type'].replace(' macrophage', 'macrophage')
df[
# Save to CSV
=False)
df.to_csv(output_csv_path, indexprint(f"\nData successfully saved to: {output_csv_path}")
del adata
del df
del data_dict
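For reference, the AnnData layout this extraction expects (a sketch based on the keys accessed above):

# adata.obsm['spatial']            -> (n_cells, 2) array of x/y centroids
# adata.obs['cell_type_calls']     -> per-cell cell type labels
# adata.uns['ontology_mappings']   -> table with per-cell ontology columns, e.g. "D265_ont_1" and "D265_ont_2"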
# Process the D115 dataset.
# Read the h5ad file.
# Specify the input and output file paths.
h5ad_path = os.path.join(basepath, orig_filedir, dataset_name, "D115_final_annotated.h5ad")
output_csv_path = os.path.join(basepath, orig_filedir, dataset_name, "D115_final_annotated_cells.csv")

print(f"Reading file: {h5ad_path}")
adata = sc.read_h5ad(h5ad_path)

# Initialize a dictionary to store the data
data_dict = {}

# Extract spatial coordinates and split into x, y
if 'spatial' in adata.obsm_keys():
    spatial_coords = adata.obsm['spatial']
    data_dict['x'] = spatial_coords[:, 0]
    data_dict['y'] = spatial_coords[:, 1]
else:
    raise KeyError("'spatial' coordinates not found in obsm")

# Extract cell type calls from obs
if 'cell_type_calls' in adata.obs:
    data_dict['Cell Type'] = adata.obs['cell_type_calls'].tolist()
else:
    raise KeyError("'cell_type_calls' column not found in obs")

# Extract ontology mappings from uns
if 'ontology_mappings' in adata.uns:
    # Note: adjust these keys to match the actual structure of ontology_mappings
    data_dict['Cell Ontology ID'] = adata.uns['ontology_mappings']["D115_ont_1"].tolist()
    data_dict['Cell Ontology ID_2'] = adata.uns['ontology_mappings']["D115_ont_2"].tolist()
else:
    raise KeyError("'ontology_mappings' not found in uns")

# Create a DataFrame
df = pd.DataFrame(data_dict)

# Save to CSV
df.to_csv(output_csv_path, index=False)
print(f"\nData successfully saved to: {output_csv_path}")
del adata
del df
del data_dict
# Generate the cell-nodes file for D265. (An earlier crosswalk step that replaced
# cell type names with CL labels is kept below, commented out.)
# Read the D265 CSV file.
df1 = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "D265_final_annotated_cells.csv"))

# Drop duplicate rows in D265 (checked on x, y, and Cell Type).
df1 = df1.drop_duplicates(subset=['x', 'y', 'Cell Type'])

# # Read crosswalk file.
# crosswalk = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "extras", "combined_unique_ct_list_with_cl_labels.csv"))
# # Keep unique rows in the crosswalk file, checking uniqueness on Cell Type.
# crosswalk = crosswalk.drop_duplicates(subset=['Cell Type'])
# # For each row in D265, replace Cell Type with the corresponding CL Label from the crosswalk file; keep the
# # original Cell Type when there is no match or the matched CL Label field is empty.
# df1 = pd.merge(df1, crosswalk, on='Cell Type', how='left')
# df1['Cell Type'] = np.where(df1['CL Label'].isnull(), df1['Cell Type'], df1['CL Label'])
# df1 = df1.drop(columns=['CL Label'])

# Drop the Cell Ontology ID and Cell Ontology ID_2 columns.
df1 = df1.drop(columns=['Cell Ontology ID', 'Cell Ontology ID_2'])

target_dir = os.path.join(basepath, dest_filedir, dataset_name)
create_directory(target_dir)

label = "D265-LLL-7A7-12"
# Save the updated D265 CSV file.
df1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

id = f"https://entity.api.hubmapconsortium.org/entities/0f1ddcb41a484adbda759c0c79097a02#{label}"
dataset_json = {
    "@id": id
}
with open(f"{target_dir}/{label}-dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)
del df1
# Generate the cell-nodes file for D115, mirroring the D265 step above.
# Read the D115 CSV file.
df1 = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "D115_final_annotated_cells.csv"))

# Drop duplicate rows in D115 (checked on x, y, and Cell Type).
df1 = df1.drop_duplicates(subset=['x', 'y', 'Cell Type'])

# # Read crosswalk file.
# crosswalk = pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "extras", "combined_unique_ct_list_with_cl_labels.csv"))
# # Keep unique rows in the crosswalk file, checking uniqueness on Cell Type.
# crosswalk = crosswalk.drop_duplicates(subset=['Cell Type'])
# # For each row in D115, replace Cell Type with the corresponding CL Label from the crosswalk file; keep the
# # original Cell Type when there is no match or the matched CL Label field is empty.
# df1 = pd.merge(df1, crosswalk, on='Cell Type', how='left')
# df1['Cell Type'] = np.where(df1['CL Label'].isnull(), df1['Cell Type'], df1['CL Label'])
# df1 = df1.drop(columns=['CL Label'])

# Drop the Cell Ontology ID and Cell Ontology ID_2 columns.
df1 = df1.drop(columns=['Cell Ontology ID', 'Cell Ontology ID_2'])

label = "D115-RLL-10A3-40"
# Save the updated D115 CSV file.
df1.to_csv(f"{target_dir}/{label}-nodes.csv", index=False, header=True, mode="w")

id = f"https://entity.api.hubmapconsortium.org/entities/0f1ddcb41a484adbda759c0c79097a02#{label}"
dataset_json = {
    "@id": id
}
with open(f"{target_dir}/{label}-dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

# Print the unique cell types in both datasets combined. df1 currently holds D115,
# so re-read the saved D265 nodes file to include both datasets.
df_d265 = pd.read_csv(f"{target_dir}/D265-LLL-7A7-12-nodes.csv")
unique_cell_types = pd.concat([df_d265['Cell Type'], df1['Cell Type']]).unique()
print(unique_cell_types)
Reading file: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D265_final_annotated.h5ad
Data successfully saved to: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D265_final_annotated_cells.csv
Reading file: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D115_final_annotated.h5ad
Data successfully saved to: /u/yashjain/hra-cell-distance-analysis/data/data-original/lung-codex-urmc/D115_final_annotated_cells.csv
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/lung-codex-urmc' created successfully.
['CD4_+_Tcell/macrophage' 'CD8+_T_cell_1' 'macrophage_3' 'ENDO_1'
'ENDO_CD8+_T_Cell' 'CD4+_T_cell_1' 'Lung_Epithelial_1' 'CAP_ENDO'
'macrophage_CD1c+_myeloidDC' 'ENDO_SMC' 'Endo_p'
'Lung_Epithelil_2_CD4+_T_cell' 'AT2_2' 'AT2_1' 'CD8+_T_cell_2'
'macrophage_2' 'macrophage_p' 'CD4+_T_cell_2' 'AT2_p' 'SMC_2' 'SMC_1'
'CD8+_T_cell_3' 'B_cell_1' 'Lung_Epithelial_p' 'CD8+_T_cell_CD_4+_T_cell'
'Lung_Epithelial_4' 'UNK_5_ambiguous' 'UNK_1_APC' 'CD4+_T_cell_3'
'B_cell_macrophage_p?' 'UNK_3_(col1a1-driven_cluster)'
'Lymphatic_Endothelium' 'UNK_4_(col1a1_driven_cluster)']
bonemarrow-codex-chop
= "bonemarrow-codex-chop"
dataset_name
= pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "AML_NSM_RefMap_Seurat_v2_seurat_metadata.csv"))
metadata1
= pd.read_csv(os.path.join(basepath, orig_filedir, dataset_name, "Normal_Bone_Marrow_CODEX_Atlas_Seurat_v2_seurat_metadata.csv"))
metadata2
# Extract x, y coords, cell type, and filename from metadata1 into a new dataframe.
'x'] = metadata1['x.coord']
metadata1['y'] = metadata1['y.coord']
metadata1['Cell Type'] = metadata1['classified_cluster_anno_l2']
metadata1['filename'] = metadata1['orig.ident']
metadata1[= metadata1[['x', 'y', 'Cell Type', 'filename']]
metadata1
# Extract x, y coords, cell type, and filename from metadata2 into a new dataframe.
'x'] = metadata2['x.coord']
metadata2['y'] = metadata2['y.coord']
metadata2['Cell Type'] = metadata2['cluster_anno_l2']
metadata2['filename'] = metadata2['orig.ident']
metadata2[= metadata2[['x', 'y', 'Cell Type', 'filename']]
metadata2
# Merge metadata1 and metadata2.
= pd.concat([metadata1, metadata2], axis=0)
data_merged = data_merged.reset_index(drop=True)
data_merged
# Remove "_CODEX_Mesmer" from filename.
'filename'] = data_merged['filename'].str.replace('_CODEX_Mesmer', '')
data_merged[
# Write data_merged to a csv file.
"data_merged.csv"), index=False)
data_merged.to_csv(os.path.join(basepath, orig_filedir, dataset_name,
# Print unique cell types in the data_merged dataframe.
print(data_merged['Cell Type'].unique())
# Iterate over unique values in data_merged column filename and write each subset to a csv file in vccf-data-cell-nodes/published/bonemarrow-codex-chop directory. Drop filename column.
= os.path.join(basepath, dest_filedir, dataset_name)
target_dir
create_directory(target_dir)
for filename in data_merged['filename'].unique():
= data_merged[data_merged['filename'] == filename]
data_subset = data_subset.drop(columns=['filename'])
data_subset f'{target_dir}/{filename}-nodes.csv', index=False) data_subset.to_csv(
['Erythroid' 'B-Cells' 'AEC' 'Early Myeloid Progenitor' 'SEC'
'Intermediate Myeloid' 'Mature Myeloid' 'CD8+ T-Cell' 'Plasma Cells'
'Erythroblast' 'Adipocyte' 'Monocytes' 'Adipo-MSC' 'Endosteal'
'THY1+ MSC' 'CD4+ T-Cell' 'GMP/Myeloblast' 'GATA1pos_Mks'
'Immature_B_Cell' 'Macrophages' 'SPINK2+ HSPC' 'pDC'
'Non-Classical Monocyte' 'GATA1neg_Mks' 'HSPC' 'VSMC' 'GMP'
'MEP/Early Erythroblast' 'CLP' 'CD34+ CD61+' 'HSC' 'NPM1 Mutant Blast'
'Schwann Cells' 'Artifact' 'Undetermined' 'Autofluorescent'
'CD44+ Undetermined']
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes/bonemarrow-codex-chop' created successfully.
# Print final message
print("All datasets processed and saved.")
All datasets processed and saved.
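As an optional last step, a quick sanity check (a sketch built only from the variables defined above) can count the node files produced per dataset:

# Count the -nodes.csv files produced for each dataset.
processed_dir = os.path.join(basepath, dest_filedir)
for d in sorted(os.listdir(processed_dir)):
    n = len([f for f in os.listdir(os.path.join(processed_dir, d)) if f.endswith("-nodes.csv")])
    print(f"{d}: {n} node files")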