Distance Analysis: tonsil-codex-stanford and esophagus-codex-stanford

Analyze and visualize cell-to-nearest-endothelial-cell distance distributions for the tonsil-codex-stanford and esophagus-codex-stanford datasets.

import numpy as np
import pandas as pd
import os
import json
import requests
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import plotly.express as px

from _cde_compute_edges_from_nodes import *

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# suppress warnings
# NOTE(review): this filter ignores every warning category from here on, so
# deprecation notices raised by the plotting/data libraries are hidden as well.
import warnings
warnings.filterwarnings("ignore")

# Root directory that holds the processed data and the generated figures.
basepath = "/u/yashjain/hra-cell-distance-analysis/data"

# Dataset identifiers.
dataset_dir = "tonsil-esophagus-codex-stanford"  # Combined name; used only in figure titles and file names.
dataset_dir_1 = "tonsil-codex-stanford"  # Dataset 1 (used for data loading).
dataset_dir_2 = "esophagus-codex-stanford"  # Dataset 2 (used for data loading).

# Sub-directories (joined with `basepath` when used) for node files, edge files and figures.
nodes_root = "data-processed-nodes-with-harmonized-cell-types"
edges_root = "data-processed-edges"
data_filedir_1 = os.path.join(nodes_root, dataset_dir_1)
data_filedir_2 = os.path.join(nodes_root, dataset_dir_2)
output_edge_dir_1 = os.path.join(edges_root, dataset_dir_1)
output_edge_dir_2 = os.path.join(edges_root, dataset_dir_2)
figures_output_dir = "generated-figures"

# Function to load your data
def load_data(path, edges=False):
    """Read a CSV file into a DataFrame.

    Args:
        path: Location of the CSV file.
        edges: When truthy, the file is read as header-less and its columns are
            named 'cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2'. Otherwise the
            first row of the file is used as the header.

    Returns:
        The loaded pandas DataFrame.
    """
    if not edges:
        return pd.read_csv(path)

    edge_columns = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
    return pd.read_csv(path, header=None, names=edge_columns)

# Function to read all files ending with "-nodes.csv" in the `data_filedir` directory into a single DataFrame. 
# Another additional column `Dataset` is added to identify the dataset name which comes from the filename before the `-nodes.csv` suffix.

# Additionally, function reads all files ending with "-edges.csv" in the `output_edge_dir` directory into a single DataFrame. 
# Three additional columns are added "Dataset", "Anchor Cell Type", and "Anchor Cell Type Level" to identify the dataset name, anchor cell type, and anchor cell type level respectively which come from the filename before the `.csv` suffix.
# The three additional columns are created by splitting the filename on the `-` character, and extracting the relevant parts.
# On splitting, the first part is the dataset name, second part is the anchor cell type level, and third part is the anchor cell type, and last part is the `edges` suffix.
# When reading files, check if the file has the correct format (i.e., ends with `-edges.csv`).

# Additionally, the function merges the edges DataFrame with the nodes DataFrame to get the cell type information for the anchor cells.
# This is done by reading the corresponding nodes file from the `data_filedir` directory for each edges file, and merging it with the edges DataFrame on the `cell_id` column.
# The merged DataFrame contains the edges with additional columns for the cell type information.

# The function returns three DataFrames:
# 1. `merged_nodes`: DataFrame containing all nodes with an additional column `Dataset`.
# 2. `merged_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, and `Anchor Cell Type Level`.
# 3. `merged_nodes_for_all_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, `Anchor Cell Type Level`, and the cell type information for cells.
def read_all_edge_datasets(basepath, data_filedir, output_edge_dir):
    """Load all edge files of one dataset collection together with their node files.

    Every file in ``basepath/output_edge_dir`` whose name ends with ``-edges.csv``
    is read. The file name (without that suffix) must split on ``-`` into exactly
    three parts: dataset name, anchor cell type level, anchor cell type. For each
    dataset name the file ``basepath/data_filedir/<dataset name>-nodes.csv`` is
    read once, and its cell type columns are attached to the edges by matching the
    edges' ``cell_id`` to the row position of the node in the nodes file.

    Args:
        basepath: Root data directory.
        data_filedir: Directory (relative to ``basepath``) holding the nodes files.
        output_edge_dir: Directory (relative to ``basepath``) holding the edges files.

    Returns:
        A tuple ``(merged_nodes, merged_edges, merged_nodes_for_all_edges)``:
        all nodes (one copy per dataset) with a ``Dataset`` column; all edges with
        ``Dataset``, ``Anchor Cell Type`` and ``Anchor Cell Type Level`` columns;
        and the edges again with the three cell type level columns added.

    Raises:
        ValueError: If no ``-edges.csv`` file is found, or if an edges file name
            does not consist of exactly three ``-``-separated parts.
    """
    edge_suffix = "-edges.csv"
    cell_type_columns = ['Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']
    edges_dir = os.path.join(basepath, output_edge_dir)

    # Nodes are cached per dataset: several edges files (one per anchor level)
    # refer to the same nodes file, and appending it once per edges file would
    # duplicate every node in `merged_nodes`.
    nodes_by_dataset = {}
    all_edges_files = []
    all_nodes_edges_files = []

    for file in os.listdir(edges_dir):
        if not file.endswith(edge_suffix):
            continue

        name_parts = file[:-len(edge_suffix)].split('-')
        if len(name_parts) != 3:
            raise ValueError(
                f"Cannot parse edges file name '{file}': expected "
                f"'<dataset>-<anchor cell type level>-<anchor cell type>{edge_suffix}'."
            )
        dataset_name, anchor_cell_type_level, anchor_cell_type = name_parts

        edges_df = load_data(os.path.join(edges_dir, file), edges=False)
        edges_df['Dataset'] = dataset_name
        edges_df['Anchor Cell Type'] = anchor_cell_type
        edges_df['Anchor Cell Type Level'] = anchor_cell_type_level
        edges_df.rename(columns={"distance": "Distance"}, inplace=True)  # Rename column "distance" to "Distance".
        all_edges_files.append(edges_df)

        # Read the corresponding nodes file (once) to get the cell type information.
        if dataset_name not in nodes_by_dataset:
            nodes_file_path = os.path.join(basepath, data_filedir, f"{dataset_name}-nodes.csv")
            nodes_df = load_data(nodes_file_path)
            nodes_df['Dataset'] = dataset_name
            nodes_by_dataset[dataset_name] = nodes_df
        nodes_df = nodes_by_dataset[dataset_name]

        # `cell_id` in the edges refers to the row position of the node, so the
        # lookup table is indexed 0..n-1. A separate frame is used so the nodes
        # frame returned to the caller is not modified.
        cell_types_by_id = nodes_df[cell_type_columns].reset_index(drop=True)
        edges_nodes_df = pd.merge(edges_df, cell_types_by_id, how='left', left_on='cell_id', right_index=True)
        all_nodes_edges_files.append(edges_nodes_df)

    if not all_edges_files:
        raise ValueError(f"No files ending with '{edge_suffix}' found in '{edges_dir}'.")

    merged_edges = pd.concat(all_edges_files, ignore_index=True)
    merged_nodes = pd.concat(list(nodes_by_dataset.values()), ignore_index=True)
    merged_nodes_for_all_edges = pd.concat(all_nodes_edges_files, ignore_index=True)

    return merged_nodes, merged_edges, merged_nodes_for_all_edges

def create_directory(directory):
    """Create `directory` (including missing parents) unless the path already exists.

    Prints a message stating whether the directory was created or was already present.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

Get initial statistics and identify the endothelial cell categories in each dataset.

# Dataset 1: read nodes, edges and edges-with-cell-types, then tag every frame with its tissue type.
df_all_nodes_1, df_all_edges_1, df_all_edges_with_cell_types_1 = read_all_edge_datasets(
    basepath, data_filedir_1, output_edge_dir_1
)
for frame in (df_all_nodes_1, df_all_edges_1, df_all_edges_with_cell_types_1):
    frame['Tissue Type'] = dataset_dir_1

# Dataset 2: same procedure.
df_all_nodes_2, df_all_edges_2, df_all_edges_with_cell_types_2 = read_all_edge_datasets(
    basepath, data_filedir_2, output_edge_dir_2
)
for frame in (df_all_nodes_2, df_all_edges_2, df_all_edges_with_cell_types_2):
    frame['Tissue Type'] = dataset_dir_2

# Stack the two datasets row-wise (dataset 1 first) with a fresh 0..n-1 index.
df_all_nodes, df_all_edges, df_all_edges_with_cell_types = (
    pd.concat(pair, ignore_index=True)
    for pair in (
        [df_all_nodes_1, df_all_nodes_2],
        [df_all_edges_1, df_all_edges_2],
        [df_all_edges_with_cell_types_1, df_all_edges_with_cell_types_2],
    )
)

df_all_nodes.head(5)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset	Tissue Type
0	148.538	1325.532	Innate	lymphoid cell:innate	innate lymphoid cell	CL:0001065	skos:exactMatch	lymphoid cell	lymphocyte	CL:0000542	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	tonsil	tonsil-codex-stanford
1	2061.813	928.551	Innate	lymphoid cell:innate	innate lymphoid cell	CL:0001065	skos:exactMatch	lymphoid cell	lymphocyte	CL:0000542	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	tonsil	tonsil-codex-stanford
2	944.762	1806.961	PDPN	stromal cell:podoplanin+	stromal cell:podoplanin-positive	CL:0000499	skos:narrowMatch	stromal cell	stromal cell	CL:0000499	skos:exactMatch	mesenchymal cell	mesenchymal cell	CL:0008019	skos:exactMatch	tonsil	tonsil-codex-stanford
3	3367.741	300.092	Endothelial	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	tonsil	tonsil-codex-stanford
4	1806.961	710.645	Innate	lymphoid cell:innate	innate lymphoid cell	CL:0001065	skos:exactMatch	lymphoid cell	lymphocyte	CL:0000542	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	tonsil	tonsil-codex-stanford

# Count the distinct labels in each cell type column of the combined nodes table
# (both datasets together) and print one line per column.
print("Total number of unique cell types per cell type annnotation level:")
unique_cell_types = {
    column: df_all_nodes[column].nunique()
    for column in (
        'Original Cell Type',
        'Level Three Cell Type',
        'Level Two Cell Type',
        'Level One Cell Type',
    )
}
for cell_type, count in unique_cell_types.items():
    print(f"{cell_type}: {count}")

Total number of unique cell types per cell type annnotation level:
Original Cell Type: 13
Level Three Cell Type: 13
Level Two Cell Type: 11
Level One Cell Type: 5

# For every cell type column, collect the distinct labels whose name contains
# "endothelial" (case-insensitive; missing labels never match).
# Result: {column name: list of matching labels}.
endothelial_cell_types = {}
for column in ('Original Cell Type', 'Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type'):
    labels = df_all_nodes[column]
    is_endothelial = labels.str.contains("endothelial", case=False, na=False)
    endothelial_cell_types[column] = labels[is_endothelial].unique().tolist()

print("\nEndothelial cell types per cell type annotation level:")
for level, cell_types in endothelial_cell_types.items():
    print(f"\n{level}:")
    for cell in cell_types:
        print(f"  - {cell}")


Endothelial cell types per cell type annotation level:

Original Cell Type:
  - Endothelial

Level Three Cell Type:
  - endothelial cell

Level Two Cell Type:
  - endothelial cell

Level One Cell Type:
  - endothelial cell

# Hierarchical cell type levels to analyze. "Original Cell Type" is skipped as it is not a hierarchical level.
type_field_list = ["Level Three Cell Type", "Level Two Cell Type", "Level One Cell Type"]

# Anchor cell type (type of endothelial cell) per level, chosen from the categories listed in the previous cell.
# The distance analysis at every level is limited to this anchor cell type; here it is the same label at all levels.
anchor_cell_type_dict = dict.fromkeys(type_field_list, 'endothelial cell')

Process datasets to add region information to Nodes files.

# Standard region sequence for plots.
regions = ['tonsil-codex-stanford', 'esophagus-codex-stanford']

# Map each dataset's "Tissue Type" value to the region label used in the analysis.
region_map = dict(zip((dataset_dir_1, dataset_dir_2), regions))

df_all_nodes.head()

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset	Tissue Type
0	148.538	1325.532	Innate	lymphoid cell:innate	innate lymphoid cell	CL:0001065	skos:exactMatch	lymphoid cell	lymphocyte	CL:0000542	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	tonsil	tonsil-codex-stanford
1	2061.813	928.551	Innate	lymphoid cell:innate	innate lymphoid cell	CL:0001065	skos:exactMatch	lymphoid cell	lymphocyte	CL:0000542	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	tonsil	tonsil-codex-stanford
2	944.762	1806.961	PDPN	stromal cell:podoplanin+	stromal cell:podoplanin-positive	CL:0000499	skos:narrowMatch	stromal cell	stromal cell	CL:0000499	skos:exactMatch	mesenchymal cell	mesenchymal cell	CL:0008019	skos:exactMatch	tonsil	tonsil-codex-stanford
3	3367.741	300.092	Endothelial	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	tonsil	tonsil-codex-stanford
4	1806.961	710.645	Innate	lymphoid cell:innate	innate lymphoid cell	CL:0001065	skos:exactMatch	lymphoid cell	lymphocyte	CL:0000542	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	tonsil	tonsil-codex-stanford

# Add a "Unique Region" column to the nodes table by looking up each row's
# "Tissue Type" in region_map (values missing from the map become NaN).
node_region_labels = df_all_nodes['Tissue Type'].map(region_map)
df_all_nodes['Unique Region'] = node_region_labels

# Check that the new column was created correctly.
df_all_nodes.loc[:, ['Tissue Type', 'Unique Region']].head(5)

	Tissue Type	Unique Region
0	tonsil-codex-stanford	tonsil-codex-stanford
1	tonsil-codex-stanford	tonsil-codex-stanford
2	tonsil-codex-stanford	tonsil-codex-stanford
3	tonsil-codex-stanford	tonsil-codex-stanford
4	tonsil-codex-stanford	tonsil-codex-stanford

# Summarize the regions present in the nodes table.
node_regions = df_all_nodes['Unique Region']
distinct_node_regions = node_regions.unique()

print("\nUnique Regions in the data:")
print(distinct_node_regions)

print(f"Total number of unique regions: {node_regions.nunique()}")

# For each region, count how many distinct datasets contribute rows to it.
print("\nNumber of unique datasets per unique region:")
for region in distinct_node_regions:
    num_datasets = df_all_nodes.loc[node_regions == region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")


Unique Regions in the data:
['tonsil-codex-stanford' 'esophagus-codex-stanford']
Total number of unique regions: 2

Number of unique datasets per unique region:
tonsil-codex-stanford: 1
esophagus-codex-stanford: 1

Process datasets to add region information to Edges files.

# Preview the first five rows of the combined edges table.
df_all_edges.head(5)

	cell_id	x1	y1	x2	y2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Tissue Type
0	0	148.538	1325.532	96.889	1333.449	52.252254	tonsil	endothelial cell	Level Three Cell Type	tonsil-codex-stanford
1	190	183.599	1254.656	96.889	1333.449	117.162114	tonsil	endothelial cell	Level Three Cell Type	tonsil-codex-stanford
2	242	180.583	1235.052	109.330	1150.227	110.780281	tonsil	endothelial cell	Level Three Cell Type	tonsil-codex-stanford
3	609	163.618	1238.445	109.330	1150.227	103.583794	tonsil	endothelial cell	Level Three Cell Type	tonsil-codex-stanford
4	836	88.595	1252.394	56.550	1316.484	71.654798	tonsil	endothelial cell	Level Three Cell Type	tonsil-codex-stanford

# Add a "Unique Region" column to the edges table the same way as for the nodes:
# look up each row's "Tissue Type" in region_map.
edge_region_labels = df_all_edges['Tissue Type'].map(region_map)
df_all_edges['Unique Region'] = edge_region_labels

# Check that the new column was created correctly.
df_all_edges.loc[:, ['Tissue Type', 'Unique Region']].head(5)

	Tissue Type	Unique Region
0	tonsil-codex-stanford	tonsil-codex-stanford
1	tonsil-codex-stanford	tonsil-codex-stanford
2	tonsil-codex-stanford	tonsil-codex-stanford
3	tonsil-codex-stanford	tonsil-codex-stanford
4	tonsil-codex-stanford	tonsil-codex-stanford

# Summarize the regions present in the edges table.
edge_regions = df_all_edges['Unique Region']
distinct_edge_regions = edge_regions.unique()

print("\nUnique Regions in the data:")
print(distinct_edge_regions)

print(f"Total number of unique regions: {edge_regions.nunique()}")

# For each region, count how many distinct datasets contribute rows to it.
print("\nNumber of unique datasets per unique region:")
for region in distinct_edge_regions:
    num_datasets = df_all_edges.loc[edge_regions == region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")


Unique Regions in the data:
['tonsil-codex-stanford' 'esophagus-codex-stanford']
Total number of unique regions: 2

Number of unique datasets per unique region:
tonsil-codex-stanford: 1
esophagus-codex-stanford: 1

# Add the "Unique Region" column to the edges-with-cell-types table as well.
typed_edge_region_labels = df_all_edges_with_cell_types['Tissue Type'].map(region_map)
df_all_edges_with_cell_types['Unique Region'] = typed_edge_region_labels

# Check that the new column was created correctly.
df_all_edges_with_cell_types.loc[:, ['Tissue Type', 'Unique Region']].head(5)

	Tissue Type	Unique Region
0	tonsil-codex-stanford	tonsil-codex-stanford
1	tonsil-codex-stanford	tonsil-codex-stanford
2	tonsil-codex-stanford	tonsil-codex-stanford
3	tonsil-codex-stanford	tonsil-codex-stanford
4	tonsil-codex-stanford	tonsil-codex-stanford

# Show one row of the nodes table to confirm its final set of columns.
df_all_nodes.head(1)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset	Tissue Type	Unique Region
0	148.538	1325.532	Innate	lymphoid cell:innate	innate lymphoid cell	CL:0001065	skos:exactMatch	lymphoid cell	lymphocyte	CL:0000542	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	tonsil	tonsil-codex-stanford	tonsil-codex-stanford

# Show one row of the edges table to confirm its final set of columns.
df_all_edges.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Tissue Type	Unique Region
0	0	148.538	1325.532	0	96.889	1333.449	0	52.252254	tonsil	endothelial cell	Level Three Cell Type	tonsil-codex-stanford	tonsil-codex-stanford

# Show one row of the edges-with-cell-types table to confirm its final set of columns.
df_all_edges_with_cell_types.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Level Three Cell Type	Level Two Cell Type	Level One Cell Type	Tissue Type	Unique Region
0	0	148.538	1325.532	0	96.889	1333.449	0	52.252254	tonsil	endothelial cell	Level Three Cell Type	lymphoid cell:innate	lymphoid cell	immune cell	tonsil-codex-stanford	tonsil-codex-stanford

Node Analysis

# Plot the number of cells per cell type in a single plot, with bars colored by unique region. The figure is saved as PNG and SVG in `output_dir`, which must already exist.
def plot_cells_per_celltype(df, type_field, output_dir):
    """Plot and save the number of cells per cell type, with one bar color per region.

    Args:
        df: DataFrame with one row per cell; must contain the `type_field`
            column and a 'Unique Region' column.
        type_field: Name of the cell type column shown on the x-axis.
        output_dir: Directory the PNG and SVG files are written to.

    The file names are built from the module-level `dataset_dir` and `type_field`.
    """
    plt.figure(figsize=(20, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    sns.countplot(data=df, x=type_field, palette='Spectral', hue='Unique Region')
    plt.title(f'Number of Cells per {type_field} in `{dataset_dir}`')
    plt.xticks(rotation=90)
    plt.xlabel(type_field)
    plt.ylabel('Number of Cells')

    # For numbers on y-axis, do not use scientific notation.
    plt.ticklabel_format(style='plain', axis='y')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Unique Region', bbox_to_anchor=(0.85, 1), loc='upper left')
    plt.tight_layout()

    # Save only after all labels, the grid and the legend are in place, so the
    # files on disk match the figure that is displayed.
    for extension in ('png', 'svg'):
        plt.savefig(os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}.{extension}'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)

    # Show the plot
    plt.show()
    plt.close()
# Draw one cell-count figure per hierarchical cell type level.
cell_count_figures_dir = os.path.join(basepath, figures_output_dir)
for type_field in type_field_list:
    plot_cells_per_celltype(df_all_nodes, type_field, cell_count_figures_dir)

Distance Analysis

# Mean, median, minimum and maximum distance for every combination of
# unique region, anchor cell type and anchor cell type level.
distance_group_keys = ['Unique Region', 'Anchor Cell Type', 'Anchor Cell Type Level']
distance_summaries = {
    f'{statistic}_distance': ('Distance', statistic)
    for statistic in ('mean', 'median', 'min', 'max')
}
df_distance_stats = (
    df_all_edges_with_cell_types
    .groupby(distance_group_keys)
    .agg(**distance_summaries)
    .reset_index()
)
# Display the distance statistics DataFrame.
df_distance_stats

	Unique Region	Anchor Cell Type	Anchor Cell Type Level	mean_distance	median_distance	min_distance	max_distance
0	esophagus-codex-stanford	endothelial cell	Level One Cell Type	26.201365	21.751953	2.384357	193.046457
1	esophagus-codex-stanford	endothelial cell	Level Three Cell Type	26.201365	21.751953	2.384357	193.046457
2	esophagus-codex-stanford	endothelial cell	Level Two Cell Type	26.201365	21.751953	2.384357	193.046457
3	tonsil-codex-stanford	endothelial cell	Level One Cell Type	56.899226	45.210144	2.639000	199.993437
4	tonsil-codex-stanford	endothelial cell	Level Three Cell Type	56.899226	45.210144	2.639000	199.993437
5	tonsil-codex-stanford	endothelial cell	Level Two Cell Type	56.899226	45.210144	2.639000	199.993437

Level One Cell Type Analysis

# Restrict the edges to the Level One anchor (level and anchor cell type must both match),
# then compute mean, median, minimum and maximum distance per cell type and unique region.
cell_type_level = 'Level One Cell Type'
is_selected_level = df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level
is_selected_anchor = df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level]
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[is_selected_level & is_selected_anchor]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])
    .agg(
        mean_distance=('Distance', 'mean'),
        median_distance=('Distance', 'median'),
        min_distance=('Distance', 'min'),
        max_distance=('Distance', 'max'),
    )
    .reset_index()
)
df_distance_stats_cell_type_level

	Level One Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	epithelial cell	esophagus-codex-stanford	25.325414	21.492307	2.528993	193.046457
1	epithelial cell	tonsil-codex-stanford	65.469626	55.553479	4.060414	199.673736
2	immune cell	esophagus-codex-stanford	21.482974	15.712354	2.639000	143.075846
3	immune cell	tonsil-codex-stanford	56.521626	44.457236	2.639000	199.993437
4	mesenchymal cell	esophagus-codex-stanford	29.463088	25.393686	2.384357	142.577791
5	mesenchymal cell	tonsil-codex-stanford	57.159447	47.676714	3.108822	199.931243
6	neural cell	esophagus-codex-stanford	26.639884	22.024674	3.788803	107.653801
7	neural cell	tonsil-codex-stanford	66.127466	57.077850	6.409000	194.409948

# Get top five and bottom five cell types with respect to mean distance in each unique region separately.
def get_top_bottom_cell_types_by_mean(df, cell_type_level, unique_region, top_n=5):
    """Rank the cell types of one region by mean 'Distance'.

    Args:
        df: DataFrame with 'Unique Region', 'Distance' and `cell_type_level` columns.
        cell_type_level: Name of the cell type column to group by.
        unique_region: Only rows whose 'Unique Region' equals this value are used.
        top_n: Number of rows taken from each end of the ranking.

    Returns:
        A tuple ``(top, bottom)`` of DataFrames with the columns
        `cell_type_level` and 'mean_distance'. Both are slices of the same
        ranking sorted from largest to smallest mean, so they overlap when the
        region has fewer than ``2 * top_n`` cell types.
    """
    rows_in_region = df.loc[df['Unique Region'] == unique_region]

    # One row per cell type, ordered from the largest to the smallest mean distance.
    ranking = (
        rows_in_region
        .groupby(cell_type_level)['Distance']
        .mean()
        .rename('mean_distance')
        .reset_index()
        .sort_values(by='mean_distance', ascending=False)
    )

    return ranking.head(top_n), ranking.tail(top_n)

# Print the mean-distance ranking (top and bottom cell types) for every region present in the data.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_cell_types, bottom_cell_types = get_top_bottom_cell_types_by_mean(
        df_all_edges_with_cell_type_level, cell_type_level, region
    )
    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)


Top 5 cell types in tonsil-codex-stanford:
  Level One Cell Type  mean_distance
3         neural cell      66.127466
0     epithelial cell      65.469626
2    mesenchymal cell      57.159447
1         immune cell      56.521626

Bottom 5 cell types in tonsil-codex-stanford:
  Level One Cell Type  mean_distance
3         neural cell      66.127466
0     epithelial cell      65.469626
2    mesenchymal cell      57.159447
1         immune cell      56.521626

Top 5 cell types in esophagus-codex-stanford:
  Level One Cell Type  mean_distance
2    mesenchymal cell      29.463088
3         neural cell      26.639884
0     epithelial cell      25.325414
1         immune cell      21.482974

Bottom 5 cell types in esophagus-codex-stanford:
  Level One Cell Type  mean_distance
2    mesenchymal cell      29.463088
3         neural cell      26.639884
0     epithelial cell      25.325414
1         immune cell      21.482974

# Get top five and bottom five cell types with respect to median distance in each unique region separately.
def get_top_bottom_cell_types_by_median(df, cell_type_level, unique_region, top_n=5):
    """Rank the cell types of one region by median 'Distance'.

    Args:
        df: DataFrame with 'Unique Region', 'Distance' and `cell_type_level` columns.
        cell_type_level: Name of the cell type column to group by.
        unique_region: Only rows whose 'Unique Region' equals this value are used.
        top_n: Number of rows taken from each end of the ranking.

    Returns:
        A tuple ``(top, bottom)`` of DataFrames with the columns
        `cell_type_level` and 'median_distance'. Both are slices of the same
        ranking sorted from largest to smallest median, so they overlap when
        the region has fewer than ``2 * top_n`` cell types.
    """
    region_mask = df['Unique Region'] == unique_region
    medians_per_cell_type = (
        df[region_mask]
        .groupby(cell_type_level)['Distance']
        .median()
        .rename('median_distance')
        .reset_index()
    )

    # Largest median first; head/tail then give the two ends of the ranking.
    ordered = medians_per_cell_type.sort_values(by='median_distance', ascending=False)
    highest = ordered.head(top_n)
    lowest = ordered.tail(top_n)
    return highest, lowest

# Print the median-distance ranking (top and bottom cell types) for every region present in the data.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_cell_types, bottom_cell_types = get_top_bottom_cell_types_by_median(
        df_all_edges_with_cell_type_level, cell_type_level, region
    )
    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)


Top 5 cell types in tonsil-codex-stanford:
  Level One Cell Type  median_distance
3         neural cell        57.077850
0     epithelial cell        55.553479
2    mesenchymal cell        47.676714
1         immune cell        44.457236

Bottom 5 cell types in tonsil-codex-stanford:
  Level One Cell Type  median_distance
3         neural cell        57.077850
0     epithelial cell        55.553479
2    mesenchymal cell        47.676714
1         immune cell        44.457236

Top 5 cell types in esophagus-codex-stanford:
  Level One Cell Type  median_distance
2    mesenchymal cell        25.393686
3         neural cell        22.024674
0     epithelial cell        21.492307
1         immune cell        15.712354

Bottom 5 cell types in esophagus-codex-stanford:
  Level One Cell Type  median_distance
2    mesenchymal cell        25.393686
3         neural cell        22.024674
0     epithelial cell        21.492307
1         immune cell        15.712354

# Calculate regional variability
def calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level):
    """Print the variability of 'Distance' per region and per cell type.

    For each grouping the mean and standard deviation (rounded to 2 decimals)
    and the coefficient of variation in percent (computed from the rounded
    values, then rounded to 1 decimal) are printed. Nothing is returned.

    Args:
        df_all_edges_with_cell_type_level: DataFrame with 'Unique Region',
            'Distance' and `cell_type_level` columns.
        cell_type_level: Name of the cell type column used for the second table.
    """
    def summarise_by(group_column):
        # Mean/std per group plus CV (%) = std / mean * 100.
        summary = (
            df_all_edges_with_cell_type_level
            .groupby(group_column)['Distance']
            .agg(['mean', 'std'])
            .round(2)
        )
        summary['CV (%)'] = (summary['std'] / summary['mean'] * 100).round(1)
        return summary

    regional_variability = summarise_by('Unique Region')
    for line in (
        "\nRegional Variability Analysis:",
        "Mean: Average distance in each region",
        "Std: Standard deviation of distances",
        "CV: Coefficient of Variation (std/mean * 100%)",
    ):
        print(line)
    print(regional_variability)

    cell_type_variability = summarise_by(cell_type_level)
    print("\nCell Type Variability Analysis (sorted by CV):")
    print(cell_type_variability.sort_values('CV (%)', ascending=False))

# Print the variability tables for the currently selected cell type level.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                          mean    std  CV (%)
Unique Region                                
esophagus-codex-stanford  26.2  17.46    66.6
tonsil-codex-stanford     56.9  42.08    74.0

Cell Type Variability Analysis (sorted by CV):
                      mean    std  CV (%)
Level One Cell Type                      
epithelial cell      34.36  29.16    84.9
neural cell          31.35  24.62    78.5
mesenchymal cell     45.67  35.37    77.4
immune cell          54.77  42.25    77.1

# Generate Violin Plot
def plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm='area'):
    """Draw one violin of 'Distance' per cell type and save the figure as PNG and SVG.

    Args:
        df_all_edges_with_cell_type_level: DataFrame with a 'Distance' column
            and the `cell_type_level` column.
        cell_type_level: Name of the cell type column shown on the x-axis.
        output_dir: Directory the PNG and SVG files are written to.
        density_norm: Passed through to ``sns.violinplot`` (violin width normalization).

    The file names are built from the module-level `dataset_dir` and `cell_type_level`.
    """
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(10, 6))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # `palette` needs a `hue` variable: passing it alone is deprecated in
    # seaborn. Assigning the x variable to `hue` with `legend=False` is the
    # documented replacement and colors each violin exactly as before.
    sns.violinplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y="Distance",
        hue=cell_type_level,
        legend=False,
        density_norm=density_norm,
        common_norm=True,
        cut=0,
        inner="box",
        split=False,
        palette='Spectral',
        alpha=.9,
    )

    # Kept in place: these change the global seaborn theme/context used for the
    # text created below and for figures drawn afterwards.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper")

    font_size = 10
    # No plt.legend() call: the violins carry no labels, so it could only add an empty legend.

    plt.title(f'Violin Plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=font_size)

    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)

    # Same font size for the tick labels; cell type names are rotated to avoid overlap.
    plt.xticks(rotation=90, fontsize=font_size)
    plt.yticks(fontsize=font_size)

    plt.tight_layout()

    for extension in ('png', 'svg'):
        plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.{extension}'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.show()

# Draw and save the violin plot for the currently selected cell type level.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Draw box plots of 'Distance' per cell type, split by region, and save them as PNG and SVG.

    Args:
        df_all_edges_with_cell_type_level: DataFrame with 'Distance',
            'Unique Region' and `cell_type_level` columns.
        cell_type_level: Name of the cell type column shown on the x-axis.
        output_dir: Directory the PNG and SVG files are written to.

    NOTE(review): the 'Unique Region' column of the DataFrame that is passed in
    is replaced by an ordered categorical (order taken from the module-level
    `regions` list), so the caller's frame is modified. Code that runs later on
    the same frame sees this categorical ordering.
    """
    plt.figure(figsize=(16, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # Keep the standard region order, but only for regions that occur in the data.
    regions_in_data = df_all_edges_with_cell_type_level['Unique Region'].unique()
    available_regions = [name for name in regions if name in regions_in_data]
    ordered_region_column = pd.Categorical(
        df_all_edges_with_cell_type_level['Unique Region'],
        categories=available_regions,
        ordered=True,
    )
    df_all_edges_with_cell_type_level['Unique Region'] = ordered_region_column

    # One box per cell type and region; outliers are not drawn.
    sns.boxplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y='Distance',
        hue='Unique Region',
        showfliers=False,
        palette='Spectral',
    )

    font_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.title(f'Distribution of distances by {cell_type_level} and region', fontsize=font_size)
    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()

    figure_stem = os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}')
    for extension in ('.png', '.svg'):
        plt.savefig(figure_stem + extension, dpi=300, bbox_inches='tight', pad_inches=0.5)
    plt.show()

# Draw and save the per-region box plots for the currently selected cell type level.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distances by cell type and region.
def plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Plot an annotated heatmap of the median distance per cell type and region.

    Rows are the values of the `cell_type_level` column, columns are the values of
    'Unique Region', and each cell holds the median of 'Distance'. The figure is
    written to `output_dir` as PNG and SVG (file names start with the `dataset_dir`
    name from the enclosing scope) and then shown.
    """
    tick_size = 10
    export_kwargs = {'dpi': 300, 'bbox_inches': 'tight', 'pad_inches': 0.5}
    target = os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}')

    # Median distance for every (cell type, region) combination.
    median_table = pd.pivot_table(
        df_all_edges_with_cell_type_level,
        values='Distance',
        index=cell_type_level,
        columns='Unique Region',
        aggfunc='median',
    )

    plt.figure(figsize=(15, 10))
    plt.rcParams["svg.fonttype"] = 'none'  # keep SVG text as text instead of paths
    sns.heatmap(median_table, annot=True, fmt='.1f', cmap='Spectral')
    plt.title(f'Heatmap of median distances by {cell_type_level}', fontsize=12)

    plt.xticks(rotation=90, ha='right', fontsize=tick_size)
    plt.yticks(fontsize=tick_size)

    plt.xlabel('Unique Region', fontsize=tick_size)
    plt.ylabel(f'{cell_type_level}', fontsize=tick_size)

    plt.tight_layout()

    # Same figure exported twice: raster first, then vector.
    for extension in ('.png', '.svg'):
        plt.savefig(target + extension, **export_kwargs)
    plt.show()

# Draw and save the median-distance heatmap for the current cell type level.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Generate one violin plot per unique region (tonsil and esophagus), stacked vertically with one subplot per region.
def plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    """Draw one violin subplot per region (stacked vertically) and save the figure as PNG and SVG.

    Parameters
    ----------
    df_all_edges_with_cell_type_level : pandas.DataFrame
        Table holding at least the columns named by `cell_type_level`, 'Distance'
        and 'Unique Region'.
    cell_type_level : str
        Column whose values form the x-axis categories; also used in the title
        and the output file names.
    output_dir : str
        Directory the two image files are written to.
    density_norm : str, optional
        Forwarded to `sns.violinplot` as its `density_norm` argument.

    Notes
    -----
    Reads the names `regions` and `dataset_dir` from the enclosing scope. The
    subplot grid is sized from `len(regions)`; the previous version always created
    exactly two rows, which failed for more than two regions.
    """
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 1})
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    font_size = 10

    # One row per region; 7.5 inches per row reproduces the former (10, 15) size for two regions.
    # At least one row is requested so an empty `regions` still yields a (blank) figure.
    n_rows = max(len(regions), 1)
    fig, axs = plt.subplots(n_rows, 1, figsize=(10, 7.5 * n_rows), squeeze=False)
    axs = axs.ravel()  # squeeze=False always returns a 2-D array; flatten it for simple indexing
    fig.suptitle(f'Distance distribution per {cell_type_level} in `{dataset_dir}` (density normalization = {density_norm})', fontsize=font_size, y=1)

    # Keep the sequence of Cell Types consistent across plots.
    cell_types = sorted(df_all_edges_with_cell_type_level[cell_type_level].unique())

    # Create a color palette based on the number of unique classes
    color_palette = sns.color_palette("Spectral", n_colors=len(cell_types))

    # Create a dictionary mapping class to color
    class_color_dict = dict(zip(cell_types, color_palette))

    last_index = len(regions) - 1
    for i, region in enumerate(regions):
        ax = axs[i]
        data_reg = df_all_edges_with_cell_type_level[df_all_edges_with_cell_type_level['Unique Region'] == region]
        sns.violinplot(data=data_reg, x=cell_type_level, y="Distance", density_norm=density_norm, common_norm=True, cut=0, inner="box", split=False, palette=class_color_dict, alpha=.9, ax=ax, hue=cell_type_level, legend=False, order=cell_types, fill=True)
        ax.set_title(region, fontsize=font_size)
        ax.set_xlabel('', fontsize=font_size)
        ax.set_ylabel('Distance (\u03bcm)', fontsize=font_size)
        # only show xtick labels for the last subplot
        if i < last_index:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(cell_types, fontsize=font_size, rotation=90, ha='right')
        ax.tick_params(axis='both', labelsize=font_size)

    # Shared x-axis label placed in figure coordinates, just below the bottom subplot.
    fig.text(0.5, -0.02, f'{cell_type_level}', ha='center', va='bottom', fontsize=font_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)

    plt.show()

# Draw and save the per-region violin subplots for the current cell type level.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

# Generate Split Violin Plot
def plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    """Draw a split violin plot of distance per cell type, with 'Unique Region' as hue.

    The hue order comes from the `regions` name in the enclosing scope, and
    `density_norm` is forwarded to `sns.violinplot`. The figure is written to
    `output_dir` as PNG and SVG (file names start with `dataset_dir`) and then shown.
    """
    label_size = 10
    export_kwargs = {'dpi': 300, 'bbox_inches': 'tight', 'pad_inches': 0.5}

    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(15, 10))
    plt.rcParams["svg.fonttype"] = 'none'  # keep SVG text as text instead of paths

    sns.violinplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y="Distance",
        hue="Unique Region",
        density_norm=density_norm,
        common_norm=True,
        cut=0,
        inner="box",
        split=True,
        palette='Spectral',
        alpha=.9,
        hue_order=regions,
    )

    # NOTE: these global style calls come after the violins are drawn; the order is kept as is.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper")

    plt.legend(fontsize=label_size, loc='upper right', bbox_to_anchor=(1.15, 1))
    plt.title(f'Split violin plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=label_size)
    plt.xlabel('Cell Type', fontsize=label_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=label_size)

    # Uniform tick label size on both axes; x labels are rotated to vertical.
    plt.xticks(fontsize=label_size, rotation=90)
    plt.yticks(fontsize=label_size)

    plt.tight_layout()

    target = os.path.join(output_dir, f'{dataset_dir}_plot_violin_cells_per_celltype_split_by_condition_{cell_type_level}')
    for extension in ('.png', '.svg'):
        plt.savefig(target + extension, **export_kwargs)
    plt.show()

# Draw and save the split violin plot (hue = region) for the current cell type level.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

Level Two Cell Type Analysis

# Summarise the distances (mean, median, min, max) for every cell type / region pair at level two.
cell_type_level = 'Level Two Cell Type'

# Keep only the edges recorded for this level whose anchor matches the anchor cell type configured for it.
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types.loc[
    df_all_edges_with_cell_types['Anchor Cell Type Level'].eq(cell_type_level)
    & df_all_edges_with_cell_types['Anchor Cell Type'].eq(anchor_cell_type_dict[cell_type_level])
]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])['Distance']
    .agg(mean_distance='mean', median_distance='median', min_distance='min', max_distance='max')
    .reset_index()
)
df_distance_stats_cell_type_level

	Level Two Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	b cell	esophagus-codex-stanford	18.969648	16.016955	3.881453	126.909646
1	b cell	tonsil-codex-stanford	53.795103	45.373324	3.016000	199.787236
2	gland epithelium cell	esophagus-codex-stanford	24.161285	20.704132	2.528993	193.046457
3	lymphoid cell	esophagus-codex-stanford	22.645679	15.703303	2.639000	143.075846
4	lymphoid cell	tonsil-codex-stanford	71.002055	53.958434	3.016000	199.993437
5	muscle cell	esophagus-codex-stanford	29.075721	25.112275	3.371991	121.626183
6	muscle cell	tonsil-codex-stanford	14.724496	11.403859	4.214988	59.595818
7	neuron	esophagus-codex-stanford	26.639884	22.024674	3.788803	107.653801
8	neuron	tonsil-codex-stanford	66.127466	57.077850	6.409000	194.409948
9	paneth cell	esophagus-codex-stanford	40.358060	42.616011	4.586403	75.125232
10	secretory cell of esophagus	esophagus-codex-stanford	24.726692	22.005202	2.871146	87.171824
11	squamous epithelial cell	esophagus-codex-stanford	37.731198	35.148048	4.723796	127.522444
12	squamous epithelial cell	tonsil-codex-stanford	65.469626	55.553479	4.060414	199.673736
13	stromal cell	esophagus-codex-stanford	30.144150	26.062121	2.384357	142.577791
14	stromal cell	tonsil-codex-stanford	57.484964	47.978342	3.108822	199.931243
15	t cell	esophagus-codex-stanford	20.056056	15.309171	3.413880	111.112076
16	t cell	tonsil-codex-stanford	44.040178	38.233451	2.639000	199.934442

# For every region present in the data, print the top and bottom cell types ranked by mean distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    # Element 0 is printed under the "Top" heading, element 1 under the "Bottom" heading.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in tonsil-codex-stanford:
        Level Two Cell Type  mean_distance
1             lymphoid cell      71.002055
3                    neuron      66.127466
4  squamous epithelial cell      65.469626
5              stromal cell      57.484964
0                    b cell      53.795103

Bottom 5 cell types in tonsil-codex-stanford:
        Level Two Cell Type  mean_distance
4  squamous epithelial cell      65.469626
5              stromal cell      57.484964
0                    b cell      53.795103
6                    t cell      44.040178
2               muscle cell      14.724496

Top 5 cell types in esophagus-codex-stanford:
        Level Two Cell Type  mean_distance
5               paneth cell      40.358060
7  squamous epithelial cell      37.731198
8              stromal cell      30.144150
3               muscle cell      29.075721
4                    neuron      26.639884

Bottom 5 cell types in esophagus-codex-stanford:
           Level Two Cell Type  mean_distance
6  secretory cell of esophagus      24.726692
1        gland epithelium cell      24.161285
2                lymphoid cell      22.645679
9                       t cell      20.056056
0                       b cell      18.969648

# For every region present in the data, print the top and bottom cell types ranked by median distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    # Element 0 is printed under the "Top" heading, element 1 under the "Bottom" heading.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in tonsil-codex-stanford:
        Level Two Cell Type  median_distance
3                    neuron        57.077850
4  squamous epithelial cell        55.553479
1             lymphoid cell        53.958434
5              stromal cell        47.978342
0                    b cell        45.373324

Bottom 5 cell types in tonsil-codex-stanford:
  Level Two Cell Type  median_distance
1       lymphoid cell        53.958434
5        stromal cell        47.978342
0              b cell        45.373324
6              t cell        38.233451
2         muscle cell        11.403859

Top 5 cell types in esophagus-codex-stanford:
        Level Two Cell Type  median_distance
5               paneth cell        42.616011
7  squamous epithelial cell        35.148048
8              stromal cell        26.062121
3               muscle cell        25.112275
4                    neuron        22.024674

Bottom 5 cell types in esophagus-codex-stanford:
           Level Two Cell Type  median_distance
6  secretory cell of esophagus        22.005202
1        gland epithelium cell        20.704132
0                       b cell        16.016955
2                lymphoid cell        15.703303
9                       t cell        15.309171

# Print the per-region and per-cell-type variability tables (mean, std, CV) for level two.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                          mean    std  CV (%)
Unique Region                                
esophagus-codex-stanford  26.2  17.46    66.6
tonsil-codex-stanford     56.9  42.08    74.0

Cell Type Variability Analysis (sorted by CV):
                              mean    std  CV (%)
Level Two Cell Type                              
neuron                       31.35  24.62    78.5
lymphoid cell                67.06  52.45    78.2
stromal cell                 51.86  38.02    73.3
b cell                       52.62  36.47    69.3
squamous epithelial cell     60.43  39.56    65.5
t cell                       43.36  28.24    65.1
muscle cell                  28.84  18.34    63.6
gland epithelium cell        24.16  14.80    61.3
secretory cell of esophagus  24.73  14.31    57.9
paneth cell                  40.36  16.31    40.4

# Regenerate every figure for the level-two cell types; all files go to the figures directory under `basepath`.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots of distance per cell type, grouped by region.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distance per cell type and region.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# One violin subplot per region.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

# Split violin plot with region as hue.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

Level Three Cell Type Analysis

# Summarise the distances (mean, median, min, max) for every cell type / region pair at level three.
cell_type_level = 'Level Three Cell Type'

# Keep only the edges recorded for this level whose anchor matches the anchor cell type configured for it.
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types.loc[
    df_all_edges_with_cell_types['Anchor Cell Type Level'].eq(cell_type_level)
    & df_all_edges_with_cell_types['Anchor Cell Type'].eq(anchor_cell_type_dict[cell_type_level])
]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])['Distance']
    .agg(mean_distance='mean', median_distance='median', min_distance='min', max_distance='max')
    .reset_index()
)
df_distance_stats_cell_type_level

	Level Three Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	b cell	tonsil-codex-stanford	55.759563	47.530416	3.016000	199.787236
1	epithelial cell:secretory	esophagus-codex-stanford	24.726692	22.005202	2.871146	87.171824
2	glandular cell of esophagus	esophagus-codex-stanford	24.161285	20.704132	2.528993	193.046457
3	lymphoid cell:innate	esophagus-codex-stanford	22.645679	15.703303	2.639000	143.075846
4	lymphoid cell:innate	tonsil-codex-stanford	71.002055	53.958434	3.016000	199.993437
5	muscle cell:smooth	esophagus-codex-stanford	29.075721	25.112275	3.371991	121.626183
6	muscle cell:smooth	tonsil-codex-stanford	14.724496	11.403859	4.214988	59.595818
7	neuron	esophagus-codex-stanford	26.639884	22.024674	3.788803	107.653801
8	neuron	tonsil-codex-stanford	66.127466	57.077850	6.409000	194.409948
9	paneth cell	esophagus-codex-stanford	40.358060	42.616011	4.586403	75.125232
10	plasma cell	esophagus-codex-stanford	18.969648	16.016955	3.881453	126.909646
11	plasma cell	tonsil-codex-stanford	26.122452	18.626242	3.039471	195.892407
12	squamous epithelial cell	esophagus-codex-stanford	37.731198	35.148048	4.723796	127.522444
13	squamous epithelial cell	tonsil-codex-stanford	65.469626	55.553479	4.060414	199.673736
14	stromal cell	esophagus-codex-stanford	29.426691	24.946174	2.384357	142.577791
15	stromal cell	tonsil-codex-stanford	58.536073	46.646038	3.108822	199.931243
16	stromal cell:podoplanin+	esophagus-codex-stanford	33.455139	30.538157	3.713019	107.070655
17	stromal cell:podoplanin+	tonsil-codex-stanford	56.732338	48.751217	3.221089	199.928400
18	t cell	esophagus-codex-stanford	20.056056	15.309171	3.413880	111.112076
19	t cell	tonsil-codex-stanford	44.040178	38.233451	2.639000	199.934442

# For every region present in the data, print the top and bottom cell types ranked by mean distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    # Element 0 is printed under the "Top" heading, element 1 under the "Bottom" heading.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in tonsil-codex-stanford:
      Level Three Cell Type  mean_distance
1      lymphoid cell:innate      71.002055
3                    neuron      66.127466
5  squamous epithelial cell      65.469626
6              stromal cell      58.536073
7  stromal cell:podoplanin+      56.732338

Bottom 5 cell types in tonsil-codex-stanford:
      Level Three Cell Type  mean_distance
7  stromal cell:podoplanin+      56.732338
0                    b cell      55.759563
8                    t cell      44.040178
4               plasma cell      26.122452
2        muscle cell:smooth      14.724496

Top 5 cell types in esophagus-codex-stanford:
      Level Three Cell Type  mean_distance
5               paneth cell      40.358060
7  squamous epithelial cell      37.731198
9  stromal cell:podoplanin+      33.455139
8              stromal cell      29.426691
3        muscle cell:smooth      29.075721

Bottom 5 cell types in esophagus-codex-stanford:
          Level Three Cell Type  mean_distance
0     epithelial cell:secretory      24.726692
1   glandular cell of esophagus      24.161285
2          lymphoid cell:innate      22.645679
10                       t cell      20.056056
6                   plasma cell      18.969648

# For every region present in the data, print the top and bottom cell types ranked by median distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    # Element 0 is printed under the "Top" heading, element 1 under the "Bottom" heading.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in tonsil-codex-stanford:
      Level Three Cell Type  median_distance
3                    neuron        57.077850
5  squamous epithelial cell        55.553479
1      lymphoid cell:innate        53.958434
7  stromal cell:podoplanin+        48.751217
0                    b cell        47.530416

Bottom 5 cell types in tonsil-codex-stanford:
  Level Three Cell Type  median_distance
0                b cell        47.530416
6          stromal cell        46.646038
8                t cell        38.233451
4           plasma cell        18.626242
2    muscle cell:smooth        11.403859

Top 5 cell types in esophagus-codex-stanford:
      Level Three Cell Type  median_distance
5               paneth cell        42.616011
7  squamous epithelial cell        35.148048
9  stromal cell:podoplanin+        30.538157
3        muscle cell:smooth        25.112275
8              stromal cell        24.946174

Bottom 5 cell types in esophagus-codex-stanford:
          Level Three Cell Type  median_distance
0     epithelial cell:secretory        22.005202
1   glandular cell of esophagus        20.704132
6                   plasma cell        16.016955
2          lymphoid cell:innate        15.703303
10                       t cell        15.309171

# Print the per-region and per-cell-type variability tables (mean, std, CV) for level three.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                          mean    std  CV (%)
Unique Region                                
esophagus-codex-stanford  26.2  17.46    66.6
tonsil-codex-stanford     56.9  42.08    74.0

Cell Type Variability Analysis (sorted by CV):
                              mean    std  CV (%)
Level Three Cell Type                            
plasma cell                  23.66  20.60    87.1
stromal cell                 48.70  39.75    81.6
neuron                       31.35  24.62    78.5
lymphoid cell:innate         67.06  52.45    78.2
squamous epithelial cell     60.43  39.56    65.5
b cell                       55.76  36.44    65.4
stromal cell:podoplanin+     55.02  35.92    65.3
t cell                       43.36  28.24    65.1
muscle cell:smooth           28.84  18.34    63.6
glandular cell of esophagus  24.16  14.80    61.3
epithelial cell:secretory    24.73  14.31    57.9
paneth cell                  40.36  16.31    40.4

# Regenerate every figure for the level-three cell types; all files go to the figures directory under `basepath`.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots of distance per cell type, grouped by region.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distance per cell type and region.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# One violin subplot per region.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

# Split violin plot with region as hue.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.