Distance Analysis: maternalfetalinterface-mibitof-stanford

Analyze and visualize cell-to-nearest-endothelial-cell distance distributions for the maternalfetalinterface-mibitof-stanford dataset.

import numpy as np
import pandas as pd
import os
import json
import requests
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import plotly.express as px

from _cde_compute_edges_from_nodes import *

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

basepath = "/u/yashjain/hra-cell-distance-analysis/data"
dataset_dir = "maternalfetalinterface-mibitof-stanford" 
data_filedir = os.path.join("data-processed-nodes-with-harmonized-cell-types", dataset_dir)
output_edge_dir = os.path.join("data-processed-edges", dataset_dir)
figures_output_dir = "generated-figures"

# Function to load your data
def load_data(path, edges=False):
    if edges:
        column_names = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
        data = pd.read_csv(path, header=None, names=column_names)
    else:
        data = pd.read_csv(path)
    return data

# Function to read all files ending with "-nodes.csv" in the `data_filedir` directory into a single DataFrame. 
# Another additional column `Dataset` is added to identify the dataset name which comes from the filename before the `-nodes.csv` suffix.

# Additionally, function reads all files ending with "-edges.csv" in the `output_edge_dir` directory into a single DataFrame. 
# Three additional columns are added "Dataset", "Anchor Cell Type", and "Anchor Cell Type Level" to identify the dataset name, anchor cell type, and anchor cell type level respectively which come from the filename before the `.csv` suffix.
# The three additional columns are created by splitting the filename on the `-` character, and extracting the relevant parts.
# On splitting, the first part is the dataset name, second part is the anchor cell type level, and third part is the anchor cell type, and last part is the `edges` suffix.
# When reading files, check if the file has the correct format (i.e., ends with `-edges.csv`).

# Additionally, the function merges the edges DataFrame with the nodes DataFrame to get the cell type information for the anchor cells.
# This is done by reading the corresponding nodes file from the `data_filedir` directory for each edges file, and merging it with the edges DataFrame on the `cell_id` column.
# The merged DataFrame contains the edges with additional columns for the cell type information.

# The function returns three DataFrames:
# 1. `merged_nodes`: DataFrame containing all nodes with an additional column `Dataset`.
# 2. `merged_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, and `Anchor Cell Type Level`.
# 3. `merged_nodes_for_all_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, `Anchor Cell Type Level`, and the cell type information for cells.
def read_all_edge_datasets(basepath, data_filedir, output_edge_dir):
    all_nodes_files = []
    all_edges_files = []
    all_nodes_edges_files = []
    for file in os.listdir(os.path.join(basepath, output_edge_dir)):
        if file.endswith("-edges.csv"):
            file_path = os.path.join(basepath, output_edge_dir, file)
            dataset_name, anchor_cell_type_level, anchor_cell_type = file.replace("-edges.csv", "").split('-')
            edges_df = load_data(file_path, edges=False)
            edges_df['Dataset'] = dataset_name
            edges_df['Anchor Cell Type'] = anchor_cell_type
            edges_df['Anchor Cell Type Level'] = anchor_cell_type_level
            edges_df.rename(columns={"distance": "Distance"}, inplace=True) # Rename column "distance" to "Distance".
            all_edges_files.append(edges_df)

            # Read the corresponding nodes file from data_filedir to get the cell type information
            nodes_file_path = os.path.join(basepath, data_filedir, f"{dataset_name}-nodes.csv")
            nodes_df = load_data(nodes_file_path)
            nodes_df['Dataset'] = dataset_name
            all_nodes_files.append(nodes_df)

            # Add a new 'cell_id' column to nodes_df
            nodes_df['cell_id'] = range(len(nodes_df))
            # Set 'cell_id' column as index for nodes_df
            nodes_df.set_index('cell_id', inplace=True)
            # Merge edges_df with nodes_df to get the cell type information for the anchor cells
            edges_nodes_df = pd.merge(edges_df, nodes_df[['Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']], how='left', left_on='cell_id', right_index=True)
            all_nodes_edges_files.append(edges_nodes_df)

    
    merged_edges = pd.concat(all_edges_files, ignore_index=True)
    merged_nodes = pd.concat(all_nodes_files, ignore_index=True)
    merged_nodes_for_all_edges = pd.concat(all_nodes_edges_files, ignore_index=True) 

    return merged_nodes, merged_edges, merged_nodes_for_all_edges

def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory '{directory}' created successfully.")
    else:
        print(f"Directory '{directory}' already exists.")

Get initial statistics and identify endothelial cell categories for dataset.

df_all_nodes, df_all_edges, df_all_edges_with_cell_types = read_all_edge_datasets(basepath, data_filedir, output_edge_dir)

df_all_nodes.head(5)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset
0	448.086	13.685	Mac2a	macrophage:2a	macrophage:2a	CL:0000235	skos:narrowMatch	macrophage	macrophage	CL:0000235	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	10_31742_1_2
1	486.795	245.939	other	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	10_31742_1_2
2	297.160	464.899	Mac2a	macrophage:2a	macrophage:2a	CL:0000235	skos:narrowMatch	macrophage	macrophage	CL:0000235	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	10_31742_1_2
3	450.041	748.374	Mac2a	macrophage:2a	macrophage:2a	CL:0000235	skos:narrowMatch	macrophage	macrophage	CL:0000235	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	10_31742_1_2
4	3.910	125.902	other	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	10_31742_1_2

# Print the total number of unique cell types per dataset. Compute separately for each cell type column (Level One Cell Type, Level Two Cell Type, Level Three Cell Type, Original Cell Type).
print("Total number of unique cell types per cell type annnotation level:")
unique_cell_types = {
    'Original Cell Type': df_all_nodes['Original Cell Type'].nunique(),
    'Level Three Cell Type': df_all_nodes['Level Three Cell Type'].nunique(),
    'Level Two Cell Type': df_all_nodes['Level Two Cell Type'].nunique(),
    'Level One Cell Type': df_all_nodes['Level One Cell Type'].nunique()
}
for cell_type, count in unique_cell_types.items():
    print(f"{cell_type}: {count}")

Total number of unique cell types per cell type annnotation level:
Original Cell Type: 26
Level Three Cell Type: 23
Level Two Cell Type: 11
Level One Cell Type: 5

# Save the unique cell types containing "endothelial" in name per cell type column (Level One Cell Type, Level Two Cell Type, Level Three Cell Type, Original Cell Type) to a dictionary where the key is the level and the value is a list of unique cell types.
endothelial_cell_types = {
    'Original Cell Type': df_all_nodes[df_all_nodes['Original Cell Type'].str.contains("endothelial", case=False, na=False)]['Original Cell Type'].unique().tolist(),
    'Level Three Cell Type': df_all_nodes[df_all_nodes['Level Three Cell Type'].str.contains("endothelial", case=False, na=False)]['Level Three Cell Type'].unique().tolist(),
    'Level Two Cell Type': df_all_nodes[df_all_nodes['Level Two Cell Type'].str.contains("endothelial", case=False, na=False)]['Level Two Cell Type'].unique().tolist(),
    'Level One Cell Type': df_all_nodes[df_all_nodes['Level One Cell Type'].str.contains("endothelial", case=False, na=False)]['Level One Cell Type'].unique().tolist()
}

print("\nEndothelial cell types per cell type annotation level:")
for level, cell_types in endothelial_cell_types.items():
    print(f"\n{level}:")
    for cell in cell_types:
        print(f"  - {cell}")


Endothelial cell types per cell type annotation level:

Original Cell Type:
  - Endothelial

Level Three Cell Type:
  - endothelial cell

Level Two Cell Type:
  - endothelial cell

Level One Cell Type:
  - endothelial cell

type_field_list = ["Level Three Cell Type", "Level Two Cell Type", "Level One Cell Type"] # Skipping Original Cell Type as it is not a hierarchical level.

# Define the anchor cell type (type of endothelial cell) for each level in type_field_list based on available categories in the previous cell. The distance analysis at all three levels will be limited to the specified anchor cell type.
anchor_cell_type_dict = {
    'Level Three Cell Type': 'endothelial cell',
    'Level Two Cell Type': 'endothelial cell',
    'Level One Cell Type': 'endothelial cell'
}

Process datasets to add region information to Nodes files.

df_all_nodes.head()

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset
0	448.086	13.685	Mac2a	macrophage:2a	macrophage:2a	CL:0000235	skos:narrowMatch	macrophage	macrophage	CL:0000235	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	10_31742_1_2
1	486.795	245.939	other	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	10_31742_1_2
2	297.160	464.899	Mac2a	macrophage:2a	macrophage:2a	CL:0000235	skos:narrowMatch	macrophage	macrophage	CL:0000235	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	10_31742_1_2
3	450.041	748.374	Mac2a	macrophage:2a	macrophage:2a	CL:0000235	skos:narrowMatch	macrophage	macrophage	CL:0000235	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	10_31742_1_2
4	3.910	125.902	other	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	10_31742_1_2

# The "Unique Region" column is created by mapping the region names based on the full dataset name. For this dataset (maternalfetalinterface-mibitof-stanford), the region is the dataset name itself.
df_all_nodes['Unique Region'] = dataset_dir
# df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].map(region_map)
# df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].str.split('-').str[1].map(region_map)

# Check if the new columns are created correctly.
df_all_nodes[['Dataset', 'Unique Region']].head(5)

	Dataset	Unique Region
0	10_31742_1_2	maternalfetalinterface-mibitof-stanford
1	10_31742_1_2	maternalfetalinterface-mibitof-stanford
2	10_31742_1_2	maternalfetalinterface-mibitof-stanford
3	10_31742_1_2	maternalfetalinterface-mibitof-stanford
4	10_31742_1_2	maternalfetalinterface-mibitof-stanford

# Print all unique regions in the data.
print("\nUnique Regions in the data:")
print(df_all_nodes['Unique Region'].unique())

# Print the total number of unique regions.
print(f"Total number of unique regions: {df_all_nodes['Unique Region'].nunique()}")

# Print number of unique datasets per unique region.
print("\nNumber of unique datasets per unique region:")
for region in df_all_nodes['Unique Region'].unique():
    num_datasets = df_all_nodes[df_all_nodes['Unique Region'] == region]['Dataset'].nunique()
    print(f"{region}: {num_datasets}")


Unique Regions in the data:
['maternalfetalinterface-mibitof-stanford']
Total number of unique regions: 1

Number of unique datasets per unique region:
maternalfetalinterface-mibitof-stanford: 209

Process datasets to add region information to Edges files.

df_all_edges.head(5)

	cell_id	x1	y1	x2	y2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level
0	0	448.086	13.685	475.456	50.439	45.825467	10_31742_1_2	endothelial cell	Level Three Cell Type
1	7	465.290	133.722	469.982	70.380	63.515540	10_31742_1_2	endothelial cell	Level Three Cell Type
2	59	450.041	1.173	475.456	50.439	55.435196	10_31742_1_2	endothelial cell	Level Three Cell Type
3	82	477.411	163.438	469.982	70.380	93.354065	10_31742_1_2	endothelial cell	Level Three Cell Type
4	99	420.325	31.671	389.045	42.619	33.140566	10_31742_1_2	endothelial cell	Level Three Cell Type

# The "Unique Region" column is created by mapping the region names based on the full dataset name. For this dataset (maternalfetalinterface-mibitof-stanford), the region is the dataset name itself.
df_all_edges['Unique Region'] = dataset_dir


# Check if the new columns are created correctly.
df_all_edges[['Dataset', 'Unique Region']].head(5)

	Dataset	Unique Region
0	10_31742_1_2	maternalfetalinterface-mibitof-stanford
1	10_31742_1_2	maternalfetalinterface-mibitof-stanford
2	10_31742_1_2	maternalfetalinterface-mibitof-stanford
3	10_31742_1_2	maternalfetalinterface-mibitof-stanford
4	10_31742_1_2	maternalfetalinterface-mibitof-stanford

# Print all unique regions in the data.
print("\nUnique Regions in the data:")
print(df_all_edges['Unique Region'].unique())

# Print the total number of unique regions.
print(f"Total number of unique regions: {df_all_edges['Unique Region'].nunique()}")

# Print number of unique datasets per unique region.
print("\nNumber of unique datasets per unique region:")
for region in df_all_edges['Unique Region'].unique():
    num_datasets = df_all_edges[df_all_edges['Unique Region'] == region]['Dataset'].nunique()
    print(f"{region}: {num_datasets}")


Unique Regions in the data:
['maternalfetalinterface-mibitof-stanford']
Total number of unique regions: 1

Number of unique datasets per unique region:
maternalfetalinterface-mibitof-stanford: 201

df_all_edges_with_cell_types['Unique Region'] = dataset_dir
# df_all_edges_with_cell_types['Unique Region'] = df_all_edges_with_cell_types['Dataset'].map(region_map)

# Check if the new columns are created correctly.
df_all_edges_with_cell_types[['Dataset', 'Unique Region']].head(5)

	Dataset	Unique Region
0	10_31742_1_2	maternalfetalinterface-mibitof-stanford
1	10_31742_1_2	maternalfetalinterface-mibitof-stanford
2	10_31742_1_2	maternalfetalinterface-mibitof-stanford
3	10_31742_1_2	maternalfetalinterface-mibitof-stanford
4	10_31742_1_2	maternalfetalinterface-mibitof-stanford

df_all_nodes.head(1)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset	Unique Region
0	448.086	13.685	Mac2a	macrophage:2a	macrophage:2a	CL:0000235	skos:narrowMatch	macrophage	macrophage	CL:0000235	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	10_31742_1_2	maternalfetalinterface-mibitof-stanford

df_all_edges.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Unique Region
0	0	448.086	13.685	0	475.456	50.439	0	45.825467	10_31742_1_2	endothelial cell	Level Three Cell Type	maternalfetalinterface-mibitof-stanford

df_all_edges_with_cell_types.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Level Three Cell Type	Level Two Cell Type	Level One Cell Type	Unique Region
0	0	448.086	13.685	0	475.456	50.439	0	45.825467	10_31742_1_2	endothelial cell	Level Three Cell Type	macrophage:2a	macrophage	immune cell	maternalfetalinterface-mibitof-stanford

Node Analysis

# Plot number of cells per cell type in the same plot. Color by cell type and unique region. Output figure saved in existing `figures_output_dir`.
def plot_cells_per_celltype(df, type_field, output_dir):
    plt.figure(figsize=(20, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    sns.countplot(data=df, x=type_field, palette='Spectral', hue='Unique Region')
    plt.title(f'Number of Cells per {type_field} in `{dataset_dir}`')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}.png'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}.svg'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.legend(title='Unique Region', bbox_to_anchor=(0.85, 1), loc='upper left')
    plt.xlabel(type_field)

    # For numbers on y-axis, do not use scientific notation.
    plt.ticklabel_format(style='plain', axis='y')
    # Set y-axis label
    plt.ylabel('Number of Cells')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    # Show the plot
    plt.show()
    plt.close()
for type_field in type_field_list:
    plot_cells_per_celltype(df_all_nodes, type_field, os.path.join(basepath, figures_output_dir))

Distance Analysis

# Get mean, median, minimum, maximum distance per unique region per anchor cell type.
df_distance_stats = df_all_edges_with_cell_types.groupby(['Unique Region', 'Anchor Cell Type', 'Anchor Cell Type Level']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
# Print the first few rows of the distance statistics DataFrame.
df_distance_stats

	Unique Region	Anchor Cell Type	Anchor Cell Type Level	mean_distance	median_distance	min_distance	max_distance
0	maternalfetalinterface-mibitof-stanford	endothelial cell	Level One Cell Type	67.89132	57.54213	3.49721	199.994111
1	maternalfetalinterface-mibitof-stanford	endothelial cell	Level Three Cell Type	67.89132	57.54213	3.49721	199.994111
2	maternalfetalinterface-mibitof-stanford	endothelial cell	Level Two Cell Type	67.89132	57.54213	3.49721	199.994111

Level One Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level One Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[(df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level) & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])]

df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level One Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	epithelial cell	maternalfetalinterface-mibitof-stanford	71.729685	60.817785	5.083000	199.994111
1	immune cell	maternalfetalinterface-mibitof-stanford	69.292915	59.560480	3.497210	199.992200
2	mesenchymal cell	maternalfetalinterface-mibitof-stanford	67.481963	57.231769	3.910000	199.987996
3	unknown cell	maternalfetalinterface-mibitof-stanford	58.887208	47.444914	4.211199	199.992200

# Get top five and bottom five cell types with respect to mean distance in each unique region separately.
def get_top_bottom_cell_types_by_mean(df, cell_type_level, unique_region, top_n=5):
    # Filter the DataFrame for the specified unique region and cell type level
    df_filtered = df[df['Unique Region'] == unique_region]

    # Group by the specified cell type level and calculate mean distance
    df_grouped = df_filtered.groupby(cell_type_level).agg(mean_distance=('Distance', 'mean')).reset_index()
    
    # Sort by mean distance to get top and bottom cell types
    df_sorted = df_grouped.sort_values(by='mean_distance', ascending=False)
    
    # Get top N and bottom N cell types
    top_cell_types = df_sorted.head(top_n)
    bottom_cell_types = df_sorted.tail(top_n)
    
    return top_cell_types, bottom_cell_types

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in maternalfetalinterface-mibitof-stanford:
  Level One Cell Type  mean_distance
0     epithelial cell      71.729685
1         immune cell      69.292915
2    mesenchymal cell      67.481963
3        unknown cell      58.887208

Bottom 5 cell types in maternalfetalinterface-mibitof-stanford:
  Level One Cell Type  mean_distance
0     epithelial cell      71.729685
1         immune cell      69.292915
2    mesenchymal cell      67.481963
3        unknown cell      58.887208

# Get top five and bottom five cell types with respect to median distance in each unique region separately.
def get_top_bottom_cell_types_by_median(df, cell_type_level, unique_region, top_n=5):
    # Filter the DataFrame for the specified unique region and cell type level
    df_filtered = df[df['Unique Region'] == unique_region]

    # Group by the specified cell type level and calculate median distance
    df_grouped = df_filtered.groupby(cell_type_level).agg(median_distance=('Distance', 'median')).reset_index()

    # Sort by median distance to get top and bottom cell types
    df_sorted = df_grouped.sort_values(by='median_distance', ascending=False)

    # Get top N and bottom N cell types
    top_cell_types = df_sorted.head(top_n)
    bottom_cell_types = df_sorted.tail(top_n)
    
    return top_cell_types, bottom_cell_types

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in maternalfetalinterface-mibitof-stanford:
  Level One Cell Type  median_distance
0     epithelial cell        60.817785
1         immune cell        59.560480
2    mesenchymal cell        57.231769
3        unknown cell        47.444914

Bottom 5 cell types in maternalfetalinterface-mibitof-stanford:
  Level One Cell Type  median_distance
0     epithelial cell        60.817785
1         immune cell        59.560480
2    mesenchymal cell        57.231769
3        unknown cell        47.444914

# Calculate regional variability
def calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level):
    """    Calculate regional variability for distances in the given DataFrame.
    """
    regional_variability = df_all_edges_with_cell_type_level.groupby('Unique Region')['Distance'].agg([
        ('mean', 'mean'),
        ('std', 'std')
    ]).round(2)

    # Add CV as percentage
    regional_variability['CV (%)'] = (regional_variability['std'] / regional_variability['mean'] * 100).round(1)

    print("\nRegional Variability Analysis:")
    print("Mean: Average distance in each region")
    print("Std: Standard deviation of distances")
    print("CV: Coefficient of Variation (std/mean * 100%)")
    print(regional_variability)

    # Calculate variability for each cell type
    cell_type_variability = df_all_edges_with_cell_type_level.groupby(cell_type_level)['Distance'].agg([
        ('mean', 'mean'),
        ('std', 'std')
    ]).round(2)

    # Add CV as percentage
    cell_type_variability['CV (%)'] = (cell_type_variability['std'] / cell_type_variability['mean'] * 100).round(1)

    print("\nCell Type Variability Analysis (sorted by CV):")
    print(cell_type_variability.sort_values('CV (%)', ascending=False))

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                                          mean    std  CV (%)
Unique Region                                                
maternalfetalinterface-mibitof-stanford  67.89  44.96    66.2

Cell Type Variability Analysis (sorted by CV):
                      mean    std  CV (%)
Level One Cell Type                      
unknown cell         58.89  41.57    70.6
mesenchymal cell     67.48  45.43    67.3
epithelial cell      71.73  46.54    64.9
immune cell          69.29  44.09    63.6

# Define the standard region sequence for plots (Get all unique values in Unique Region column as a list.)
regions = df_all_edges_with_cell_types['Unique Region'].unique().tolist()

print("\nRegions in the data:")
print(regions)


Regions in the data:
['maternalfetalinterface-mibitof-stanford']

# Generate Violin Plot
def plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm='area'):
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(10, 6))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    sns.violinplot(data=df_all_edges_with_cell_type_level, x=cell_type_level, y="Distance", density_norm=density_norm, common_norm=True, cut=0, inner="box", split=False, palette='Spectral', alpha=.9)

    sns.set_theme(style="whitegrid")
    sns.set_context("paper")


    font_size = 10
    plt.legend(fontsize=font_size)

    plt.title(f'Violin Plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=font_size)

    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)

    # Increase font size for all text in the figure
    plt.xticks(fontsize=font_size)
    plt.xticks(rotation=90)
    plt.yticks(fontsize=font_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.png'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.svg'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.show()

plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    plt.figure(figsize=(16, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    # Create categorical type with only the regions that exist in the data
    available_regions = [r for r in regions if r in df_all_edges_with_cell_type_level['Unique Region'].unique()]
    df_all_edges_with_cell_type_level['Unique Region'] = pd.Categorical(
        df_all_edges_with_cell_type_level['Unique Region'],
        categories=available_regions,
        ordered=True
    )

    # Make box plot.
    sns.boxplot(data=df_all_edges_with_cell_type_level, x=cell_type_level, y='Distance', hue='Unique Region', showfliers=False, palette='Spectral') # viridis or Spectral palette for better color distinction
    font_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.title(f'Distribution of distances by {cell_type_level} and region', fontsize=font_size)
    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.show()

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    pivot_data = df_all_edges_with_cell_type_level.pivot_table(
    values='Distance',
    index=cell_type_level,
    columns='Unique Region',
    aggfunc='median'
    )

    plt.figure(figsize=(15, 10))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    sns.heatmap(pivot_data, annot=True, fmt='.1f', cmap='Spectral')
    plt.title(f'Heatmap of median distances by {cell_type_level}', fontsize=12)

    font_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)

    plt.xlabel('Unique Region', fontsize=font_size)
    plt.ylabel(f'{cell_type_level}', fontsize=font_size)
    
    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.show()

plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Generate Violin Plot per unique region.
def plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 1})
    font_size = 10
    plt.figure(figsize=(15, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # Keep the sequence of Cell Types consistent across plots.
    cell_types = sorted(df_all_edges_with_cell_type_level[cell_type_level].unique())

    # Create a color palette based on the number of unique classes
    color_palette = sns.color_palette("Spectral", n_colors=len(cell_types))

    # Create a dictionary mapping class to color
    class_color_dict = dict(zip(cell_types, color_palette))

    # Create the violin plot for the entire dataset
    sns.violinplot(
        data=df_all_edges_with_cell_type_level, 
        x=cell_type_level, 
        y="Distance", 
        density_norm=density_norm, 
        common_norm=True, 
        cut=0, 
        inner="box", 
        split=False, 
        palette=class_color_dict, 
        alpha=.9, 
        hue=cell_type_level, 
        legend=False, 
        order=cell_types, 
        fill=True
    )

    # Set title and labels
    plt.title(f'Distance distribution per {cell_type_level} in `{dataset_dir}` (density normalization = {density_norm})', fontsize=font_size)
    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (μm)', fontsize=font_size)
    
    # Rotate x-axis labels and adjust font size
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    
    plt.show()

plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

Level Two Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level Two Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[(df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level) & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])]

df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level Two Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	dendritic cell	maternalfetalinterface-mibitof-stanford	58.324142	47.317462	5.098016	199.666668
1	fibroblast	maternalfetalinterface-mibitof-stanford	67.813447	57.627086	3.910000	199.987996
2	gland epithelium cell	maternalfetalinterface-mibitof-stanford	51.287235	42.921044	5.981150	199.885151
3	macrophage	maternalfetalinterface-mibitof-stanford	72.110494	62.600309	3.604842	199.992200
4	mast cell	maternalfetalinterface-mibitof-stanford	63.148860	54.321564	6.448537	195.472628
5	muscle cell	maternalfetalinterface-mibitof-stanford	45.641366	33.661210	5.487947	197.687141
6	natural killer cell	maternalfetalinterface-mibitof-stanford	67.902679	58.362562	3.497210	199.930654
7	t cell	maternalfetalinterface-mibitof-stanford	62.509286	52.500396	4.576538	199.463659
8	trophoblast	maternalfetalinterface-mibitof-stanford	81.257066	72.895027	5.083000	199.994111
9	unknown cell	maternalfetalinterface-mibitof-stanford	58.887208	47.444914	4.211199	199.992200

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in maternalfetalinterface-mibitof-stanford:
   Level Two Cell Type  mean_distance
8         trophoblast       81.257066
3           macrophage      72.110494
6  natural killer cell      67.902679
1           fibroblast      67.813447
4            mast cell      63.148860

Bottom 5 cell types in maternalfetalinterface-mibitof-stanford:
     Level Two Cell Type  mean_distance
7                 t cell      62.509286
9           unknown cell      58.887208
0         dendritic cell      58.324142
2  gland epithelium cell      51.287235
5            muscle cell      45.641366

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in maternalfetalinterface-mibitof-stanford:
   Level Two Cell Type  median_distance
8         trophoblast         72.895027
3           macrophage        62.600309
6  natural killer cell        58.362562
1           fibroblast        57.627086
4            mast cell        54.321564

Bottom 5 cell types in maternalfetalinterface-mibitof-stanford:
     Level Two Cell Type  median_distance
7                 t cell        52.500396
9           unknown cell        47.444914
0         dendritic cell        47.317462
2  gland epithelium cell        42.921044
5            muscle cell        33.661210

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                                          mean    std  CV (%)
Unique Region                                                
maternalfetalinterface-mibitof-stanford  67.89  44.96    66.2

Cell Type Variability Analysis (sorted by CV):
                        mean    std  CV (%)
Level Two Cell Type                        
muscle cell            45.64  35.99    78.9
dendritic cell         58.32  41.41    71.0
unknown cell           58.89  41.57    70.6
gland epithelium cell  51.29  34.72    67.7
fibroblast             67.81  45.48    67.1
t cell                 62.51  41.77    66.8
mast cell              63.15  41.34    65.5
natural killer cell    67.90  42.91    63.2
macrophage             72.11  45.38    62.9
trophoblast            81.26  48.25    59.4

plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # Or, density_norm="count" or "area" based on preference.

Level Three Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level Three Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[(df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level) & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])]

df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level Three Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	dendritic cell	maternalfetalinterface-mibitof-stanford	58.324142	47.317462	5.098016	199.666668
1	fibroblast	maternalfetalinterface-mibitof-stanford	77.889958	69.612277	3.910000	199.987996
2	glandular cell of placenta	maternalfetalinterface-mibitof-stanford	51.287235	42.921044	5.981150	199.885151
3	hofbauer cell	maternalfetalinterface-mibitof-stanford	51.969181	51.969181	51.969181	51.969181
4	macrophage:1a	maternalfetalinterface-mibitof-stanford	75.795827	68.247143	4.724471	199.456378
5	macrophage:1b	maternalfetalinterface-mibitof-stanford	69.105499	58.802292	5.431946	199.992200
6	macrophage:2a	maternalfetalinterface-mibitof-stanford	71.011379	61.357107	3.604842	199.978822
7	macrophage:2b	maternalfetalinterface-mibitof-stanford	76.811644	67.782982	6.839707	199.575914
8	macrophage:2c	maternalfetalinterface-mibitof-stanford	79.310856	70.081228	5.098016	199.924919
9	mast cell	maternalfetalinterface-mibitof-stanford	63.148860	54.321564	6.448537	195.472628
10	muscle cell	maternalfetalinterface-mibitof-stanford	45.641366	33.661210	5.487947	197.687141
11	myofibroblast	maternalfetalinterface-mibitof-stanford	44.639942	33.840141	4.423660	199.978440
12	natural killer cell	maternalfetalinterface-mibitof-stanford	67.902679	58.362562	3.497210	199.930654
13	t cell:cd4+ alpha-beta	maternalfetalinterface-mibitof-stanford	57.463278	47.325539	4.976618	198.665326
14	t cell:cd8+ alpha-beta	maternalfetalinterface-mibitof-stanford	61.399550	51.377460	4.576538	198.576424
15	t cell:mature natural killer	maternalfetalinterface-mibitof-stanford	67.510450	58.018392	5.878019	199.463659
16	t cell:regulatory	maternalfetalinterface-mibitof-stanford	67.053392	56.678818	8.924741	179.373567
17	trophoblast:hla-g minus, cd57- ck7 low extravi...	maternalfetalinterface-mibitof-stanford	72.329049	59.481490	5.968356	198.836091
18	trophoblast:hla-g+ cd56+ extravillous	maternalfetalinterface-mibitof-stanford	62.454379	51.417615	5.083000	199.396583
19	trophoblast:hla-g+ ck7+ extravillous	maternalfetalinterface-mibitof-stanford	83.334643	75.236264	5.981150	199.994111
20	trophoblast:hla-g+ ck7- extravillous	maternalfetalinterface-mibitof-stanford	83.224385	75.506038	5.529575	199.973471
21	unknown cell	maternalfetalinterface-mibitof-stanford	58.887208	47.444914	4.211199	199.992200

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in maternalfetalinterface-mibitof-stanford:
                   Level Three Cell Type  mean_distance
19  trophoblast:hla-g+ ck7+ extravillous      83.334643
20  trophoblast:hla-g+ ck7- extravillous      83.224385
8                          macrophage:2c      79.310856
1                             fibroblast      77.889958
7                          macrophage:2b      76.811644

Bottom 5 cell types in maternalfetalinterface-mibitof-stanford:
         Level Three Cell Type  mean_distance
13      t cell:cd4+ alpha-beta      57.463278
3                hofbauer cell      51.969181
2   glandular cell of placenta      51.287235
10                 muscle cell      45.641366
11               myofibroblast      44.639942

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in maternalfetalinterface-mibitof-stanford:
                   Level Three Cell Type  median_distance
20  trophoblast:hla-g+ ck7- extravillous        75.506038
19  trophoblast:hla-g+ ck7+ extravillous        75.236264
8                          macrophage:2c        70.081228
1                             fibroblast        69.612277
4                          macrophage:1a        68.247143

Bottom 5 cell types in maternalfetalinterface-mibitof-stanford:
         Level Three Cell Type  median_distance
13      t cell:cd4+ alpha-beta        47.325539
0               dendritic cell        47.317462
2   glandular cell of placenta        42.921044
11               myofibroblast        33.840141
10                 muscle cell        33.661210

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                                          mean    std  CV (%)
Unique Region                                                
maternalfetalinterface-mibitof-stanford  67.89  44.96    66.2

Cell Type Variability Analysis (sorted by CV):
                                                     mean    std  CV (%)
Level Three Cell Type                                                   
myofibroblast                                       44.64  35.31    79.1
muscle cell                                         45.64  35.99    78.9
dendritic cell                                      58.32  41.41    71.0
unknown cell                                        58.89  41.57    70.6
t cell:cd4+ alpha-beta                              57.46  40.41    70.3
trophoblast:hla-g+ cd56+ extravillous               62.45  42.43    67.9
glandular cell of placenta                          51.29  34.72    67.7
t cell:regulatory                                   67.05  45.09    67.2
trophoblast:hla-g minus, cd57- ck7 low extravil...  72.33  48.30    66.8
t cell:cd8+ alpha-beta                              61.40  40.96    66.7
mast cell                                           63.15  41.34    65.5
t cell:mature natural killer                        67.51  43.43    64.3
macrophage:1b                                       69.11  44.25    64.0
macrophage:2a                                       71.01  45.29    63.8
natural killer cell                                 67.90  42.91    63.2
macrophage:2b                                       76.81  46.70    60.8
macrophage:2c                                       79.31  47.08    59.4
macrophage:1a                                       75.80  44.50    58.7
fibroblast                                          77.89  45.73    58.7
trophoblast:hla-g+ ck7- extravillous                83.22  48.60    58.4
trophoblast:hla-g+ ck7+ extravillous                83.33  48.07    57.7
hofbauer cell                                       51.97    NaN     NaN

plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # Or, density_norm="count" or "area" based on preference.