Distance Analysis: lung-codex-urmc

Analyze and visualize cell-to-nearest-endothelial-cell distance distributions for the lung-codex-urmc dataset.

import numpy as np
import pandas as pd
import os
import json
import requests
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import plotly.express as px

from _cde_compute_edges_from_nodes import *

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

basepath = "/u/yashjain/hra-cell-distance-analysis/data"
dataset_dir = "lung-codex-urmc"
data_filedir = os.path.join("data-processed-nodes-with-harmonized-cell-types", dataset_dir)
output_edge_dir = os.path.join("data-processed-edges", dataset_dir)
figures_output_dir = "generated-figures"

# Function to load your data
def load_data(path, edges=False):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : location of the CSV file.
    edges : when true, the file is read as a header-less edge list and its
        columns are named cell_id, x1, y1, z1, x2, y2, z2. Otherwise the
        file's own header row supplies the column names.

    Returns
    -------
    pandas.DataFrame holding the file contents.
    """
    if not edges:
        return pd.read_csv(path)

    edge_columns = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
    return pd.read_csv(path, names=edge_columns, header=None)

# Function to read all files ending with "-nodes.csv" in the `data_filedir` directory into a single DataFrame. 
# Another additional column `Dataset` is added to identify the dataset name which comes from the filename before the `-nodes.csv` suffix.

# Additionally, function reads all files ending with "-edges.csv" in the `output_edge_dir` directory into a single DataFrame. 
# Three additional columns are added "Dataset", "Anchor Cell Type", and "Anchor Cell Type Level" to identify the dataset name, anchor cell type, and anchor cell type level respectively which come from the filename before the `.csv` suffix.
# The three additional columns are created by splitting the filename on the `-` character, and extracting the relevant parts.
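# On splitting, the last part is the anchor cell type and the part before it is the anchor cell type level; everything before those two parts is the dataset name (for this dataset, the first four hyphen-separated parts, e.g. `D265-LLL-7A7-12`). The `-edges.csv` suffix is removed before splitting.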
# When reading files, check if the file has the correct format (i.e., ends with `-edges.csv`).

# Additionally, the function merges the edges DataFrame with the nodes DataFrame to get the cell type information for the anchor cells.
# This is done by reading the corresponding nodes file from the `data_filedir` directory for each edges file, and merging it with the edges DataFrame on the `cell_id` column.
# The merged DataFrame contains the edges with additional columns for the cell type information.

# The function returns three DataFrames:
# 1. `merged_nodes`: DataFrame containing all nodes with an additional column `Dataset`.
# 2. `merged_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, and `Anchor Cell Type Level`.
# 3. `merged_nodes_for_all_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, `Anchor Cell Type Level`, and the cell type information for cells.
def read_all_edge_datasets(basepath, data_filedir, output_edge_dir):
    """Load every `*-edges.csv` file together with its matching nodes file.

    Edge files are expected to be named
    `<dataset>-<anchor cell type level>-<anchor cell type>-edges.csv`.
    The dataset name may itself contain hyphens (e.g. `D265-LLL-7A7-12`),
    so the name is split from the right: the last part is the anchor cell
    type, the one before it the anchor cell type level, and everything
    else is the dataset name.

    Parameters
    ----------
    basepath : root data directory.
    data_filedir : sub-directory of `basepath` holding `<dataset>-nodes.csv`.
    output_edge_dir : sub-directory of `basepath` holding the edge files.

    Returns
    -------
    merged_nodes : all nodes with an extra `Dataset` column. Each dataset
        contributes its nodes exactly once, no matter how many edge files
        refer to it.
    merged_edges : all edges with extra `Dataset`, `Anchor Cell Type` and
        `Anchor Cell Type Level` columns (`distance` renamed to `Distance`).
    merged_nodes_for_all_edges : the edges plus the three harmonized cell
        type columns of the cell referenced by `cell_id`.

    Raises
    ------
    ValueError : if no `*-edges.csv` file is found, or if a file name has
        fewer than three hyphen-separated parts before the suffix.
    """
    suffix = "-edges.csv"
    edges_dir = os.path.join(basepath, output_edge_dir)
    cell_type_columns = ['Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']

    # One nodes DataFrame per dataset. Several edge files (one per anchor
    # level / anchor type) share a dataset, and appending the nodes once per
    # edge file would duplicate every cell in `merged_nodes`.
    nodes_by_dataset = {}
    all_edges_files = []
    all_nodes_edges_files = []

    for file in os.listdir(edges_dir):
        if not file.endswith(suffix):
            continue

        parts = file[:-len(suffix)].rsplit('-', 2)
        if len(parts) != 3:
            raise ValueError(
                f"Cannot parse dataset, anchor level and anchor cell type from '{file}'."
            )
        dataset_name, anchor_cell_type_level, anchor_cell_type = parts

        edges_df = pd.read_csv(os.path.join(edges_dir, file))
        edges_df['Dataset'] = dataset_name
        edges_df['Anchor Cell Type'] = anchor_cell_type
        edges_df['Anchor Cell Type Level'] = anchor_cell_type_level
        edges_df = edges_df.rename(columns={"distance": "Distance"})
        all_edges_files.append(edges_df)

        if dataset_name not in nodes_by_dataset:
            nodes_file_path = os.path.join(basepath, data_filedir, f"{dataset_name}-nodes.csv")
            nodes_df = pd.read_csv(nodes_file_path)
            nodes_df['Dataset'] = dataset_name
            nodes_by_dataset[dataset_name] = nodes_df
        nodes_df = nodes_by_dataset[dataset_name]

        # `cell_id` in an edge file is the zero-based row position of the cell
        # in the nodes file, so a positional index is the join key.
        cell_types = nodes_df[cell_type_columns].reset_index(drop=True)
        edges_nodes_df = pd.merge(edges_df, cell_types, how='left', left_on='cell_id', right_index=True)
        all_nodes_edges_files.append(edges_nodes_df)

    if not all_edges_files:
        raise ValueError(f"No '*{suffix}' files found in '{edges_dir}'.")

    merged_edges = pd.concat(all_edges_files, ignore_index=True)
    merged_nodes = pd.concat(list(nodes_by_dataset.values()), ignore_index=True)
    merged_nodes_for_all_edges = pd.concat(all_nodes_edges_files, ignore_index=True)

    return merged_nodes, merged_edges, merged_nodes_for_all_edges

def create_directory(directory):
    """Create `directory` (including missing parents) unless it already exists.

    Prints a one-line message saying whether the directory was created or
    was already present.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

Get initial statistics and identify endothelial cell categories for dataset.

df_all_nodes, df_all_edges, df_all_edges_with_cell_types = read_all_edge_datasets(basepath, data_filedir, output_edge_dir)

df_all_nodes.head(5)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset
0	55.000000	4633.074074	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
1	794.777778	3643.197531	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
2	848.313953	3606.447674	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
3	1046.877061	2890.736132	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
4	2406.666667	1298.307692	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12

# Count the distinct cell type labels at each annotation level (the original labels plus the three harmonized levels) across all loaded nodes, then print one line per level.
print("Total number of unique cell types per cell type annnotation level:")
annotation_levels = ['Original Cell Type', 'Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']
unique_cell_types = {column: df_all_nodes[column].nunique() for column in annotation_levels}
for cell_type, count in unique_cell_types.items():
    print(f"{cell_type}: {count}")

Total number of unique cell types per cell type annnotation level:
Original Cell Type: 54
Level Three Cell Type: 20
Level Two Cell Type: 17
Level One Cell Type: 7

# For every annotation level, collect the distinct labels whose text contains "endothelial" (case-insensitive; missing labels never match).
# The result maps the level's column name to the list of matching labels.
endothelial_cell_types = {
    level_name: df_all_nodes.loc[
        df_all_nodes[level_name].str.contains("endothelial", case=False, na=False),
        level_name,
    ].unique().tolist()
    for level_name in ('Original Cell Type', 'Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type')
}

print("\nEndothelial cell types per cell type annotation level:")
for level, cell_types in endothelial_cell_types.items():
    # One heading per level followed by one bullet line per matching label.
    print(f"\n{level}:", *[f"  - {cell}" for cell in cell_types], sep="\n")


Endothelial cell types per cell type annotation level:

Original Cell Type:

Level Three Cell Type:
  - endothelial cell
  - endothelial cell of lymphatic vessel
  - endothelial cell of capillary

Level Two Cell Type:
  - endothelial cell
  - endothelial cell of lymphatic vessel
  - endothelial cell of capillary

Level One Cell Type:
  - endothelial cell

type_field_list = ["Level Three Cell Type", "Level Two Cell Type", "Level One Cell Type"] # Skipping Original Cell Type as it is not a hierarchical level.

# Define the anchor cell type (type of endothelial cell) for each level in type_field_list based on available categories in the previous cell. The distance analysis at all three levels will be limited to the specified anchor cell type.
anchor_cell_type_dict = {
    'Level Three Cell Type': 'endothelial cell',
    'Level Two Cell Type': 'endothelial cell',
    'Level One Cell Type': 'endothelial cell'
}

Process datasets to add region information to Nodes files.

# Create a dictionary to map lung regions to correct Normal/Disease condition.
region_map = {'D115-RLL-10A3-40':'Disease',
 'D265-LLL-7A7-12':'Normal', 
}

# Define the standard region sequence for plots
regions = ['Normal', 'Disease']

df_all_nodes.head()

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset
0	55.000000	4633.074074	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
1	794.777778	3643.197531	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
2	848.313953	3606.447674	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
3	1046.877061	2890.736132	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12
4	2406.666667	1298.307692	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12

# Create a new column "Unique Region" in df_all_nodes by mapping each row's "Dataset" value through region_map.
# The mapping is keyed on the full dataset name (e.g. 'D265-LLL-7A7-12').
df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].map(region_map)
# df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].str.split('-').str[1].map(region_map)

# Check if the new columns are created correctly.
df_all_nodes[['Dataset', 'Unique Region']].head(5)

	Dataset	Unique Region
0	D265-LLL-7A7-12	Normal
1	D265-LLL-7A7-12	Normal
2	D265-LLL-7A7-12	Normal
3	D265-LLL-7A7-12	Normal
4	D265-LLL-7A7-12	Normal

# Summarize the regions found in the node table: the distinct region labels, how many there are, and the number of datasets belonging to each.
node_regions = df_all_nodes['Unique Region']

print("\nUnique Regions in the data:")
print(node_regions.unique())
print(f"Total number of unique regions: {node_regions.nunique()}")

print("\nNumber of unique datasets per unique region:")
for region in node_regions.unique():
    num_datasets = df_all_nodes.loc[node_regions == region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")


Unique Regions in the data:
['Normal' 'Disease']
Total number of unique regions: 2

Number of unique datasets per unique region:
Normal: 1
Disease: 1

Process datasets to add region information to Edges files.

df_all_edges.head(5)

	cell_id	x1	y1	x2	y2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level
0	9600	54.891304	14218.369565	164.375000	14329.000000	155.646306	D265-LLL-7A7-12	endothelial cell	Level Three Cell Type
1	9604	148.519685	14344.433071	164.375000	14329.000000	22.126244	D265-LLL-7A7-12	endothelial cell	Level Three Cell Type
2	9603	145.042553	13454.680851	150.838235	13477.191176	23.244455	D265-LLL-7A7-12	endothelial cell	Level Three Cell Type
3	9605	151.107143	13463.553571	150.838235	13477.191176	13.640256	D265-LLL-7A7-12	endothelial cell	Level Three Cell Type
4	9607	160.127168	13502.421965	150.838235	13477.191176	26.886371	D265-LLL-7A7-12	endothelial cell	Level Three Cell Type

# Process the edge data to create new columns "Unique Region" based on the "Dataset" column, similar to how it was done for the node data.
df_all_edges['Unique Region'] = df_all_edges['Dataset'].map(region_map)

# Check if the new columns are created correctly.
df_all_edges[['Dataset', 'Unique Region']].head(5)

	Dataset	Unique Region
0	D265-LLL-7A7-12	Normal
1	D265-LLL-7A7-12	Normal
2	D265-LLL-7A7-12	Normal
3	D265-LLL-7A7-12	Normal
4	D265-LLL-7A7-12	Normal

# Summarize the regions found in the edge table: the distinct region labels, how many there are, and the number of datasets belonging to each.
edge_regions = df_all_edges['Unique Region']

print("\nUnique Regions in the data:")
print(edge_regions.unique())
print(f"Total number of unique regions: {edge_regions.nunique()}")

print("\nNumber of unique datasets per unique region:")
for region in edge_regions.unique():
    num_datasets = df_all_edges.loc[edge_regions == region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")


Unique Regions in the data:
['Normal' 'Disease']
Total number of unique regions: 2

Number of unique datasets per unique region:
Normal: 1
Disease: 1

df_all_edges_with_cell_types['Unique Region'] = df_all_edges_with_cell_types['Dataset'].map(region_map)

# Check if the new columns are created correctly.
df_all_edges_with_cell_types[['Dataset', 'Unique Region']].head(5)

	Dataset	Unique Region
0	D265-LLL-7A7-12	Normal
1	D265-LLL-7A7-12	Normal
2	D265-LLL-7A7-12	Normal
3	D265-LLL-7A7-12	Normal
4	D265-LLL-7A7-12	Normal

df_all_nodes.head(1)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset	Unique Region
0	55.0	4633.074074	CD8 + T cell_1	t cell:cd8+	lung resident memory CD8-positive, alpha-beta ...	CL:4033039	skos:exactMatch	t cell	T cell	CL:0000084	skos:exactMatch	immune cell	leukocyte	CL:0000738	skos:exactMatch	D265-LLL-7A7-12	Normal

df_all_edges.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Unique Region
0	9600	54.891304	14218.369565	0	164.375	14329.0	0	155.646306	D265-LLL-7A7-12	endothelial cell	Level Three Cell Type	Normal

df_all_edges_with_cell_types.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Level Three Cell Type	Level Two Cell Type	Level One Cell Type	Unique Region
0	9600	54.891304	14218.369565	0	164.375	14329.0	0	155.646306	D265-LLL-7A7-12	endothelial cell	Level Three Cell Type	t cell:cd8+	t cell	immune cell	Normal

Node Analysis

# Plot the number of cells per cell type in a single plot, with bars colored by unique region. The figure is saved as PNG and SVG in `output_dir`, which must already exist.
def plot_cells_per_celltype(df, type_field, output_dir):
    """Draw a count plot of cells per cell type, split by region, and save it.

    Parameters
    ----------
    df : DataFrame with one row per cell; must contain `type_field` and
        'Unique Region' columns.
    type_field : name of the cell type column shown on the x-axis.
    output_dir : directory that receives the PNG and SVG files.

    The file names and the title use the module-level `dataset_dir`.
    """
    plt.figure(figsize=(20, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    sns.countplot(data=df, x=type_field, palette='Spectral', hue='Unique Region')

    plt.title(f'Number of Cells per {type_field} in `{dataset_dir}`')
    plt.xlabel(type_field)
    plt.ylabel('Number of Cells')
    plt.xticks(rotation=90)
    # For numbers on y-axis, do not use scientific notation.
    plt.ticklabel_format(style='plain', axis='y')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Unique Region', bbox_to_anchor=(0.85, 1), loc='upper left')
    plt.tight_layout()

    # Save only after all styling has been applied: savefig renders the figure
    # as it is at call time, so saving earlier would leave the legend, axis
    # labels and grid out of the files.
    for extension in ('png', 'svg'):
        plt.savefig(
            os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}.{extension}'),
            dpi=300,
            bbox_inches='tight',
            pad_inches=0.5,
        )

    plt.show()
    plt.close()
for type_field in type_field_list:
    plot_cells_per_celltype(df_all_nodes, type_field, os.path.join(basepath, figures_output_dir))

Distance Analysis

# Summarize the distances (mean, median, minimum, maximum) for every combination of unique region, anchor cell type and anchor cell type level.
df_distance_stats = (
    df_all_edges_with_cell_types
    .groupby(['Unique Region', 'Anchor Cell Type', 'Anchor Cell Type Level'])
    .agg(**{f'{stat}_distance': ('Distance', stat) for stat in ('mean', 'median', 'min', 'max')})
    .reset_index()
)
# Display the full distance statistics DataFrame.
df_distance_stats

	Unique Region	Anchor Cell Type	Anchor Cell Type Level	mean_distance	median_distance	min_distance	max_distance
0	Disease	endothelial cell	Level One Cell Type	42.723724	30.471857	4.826959	199.996537
1	Disease	endothelial cell	Level Three Cell Type	46.507558	34.814730	4.317932	199.976085
2	Disease	endothelial cell	Level Two Cell Type	46.507558	34.814730	4.317932	199.976085
3	Disease	endothelial cell of capillary	Level Three Cell Type	62.225686	49.163320	4.317932	199.998891
4	Disease	endothelial cell of capillary	Level Two Cell Type	62.225686	49.163320	4.317932	199.998891
5	Normal	endothelial cell	Level One Cell Type	38.755288	31.499763	4.467862	199.906751
6	Normal	endothelial cell	Level Three Cell Type	45.655140	35.412244	4.467862	199.967962
7	Normal	endothelial cell	Level Two Cell Type	45.655140	35.412244	4.467862	199.967962
8	Normal	endothelial cell of lymphatic vessel	Level Three Cell Type	100.380353	98.949395	5.158630	199.998081
9	Normal	endothelial cell of lymphatic vessel	Level Two Cell Type	100.380353	98.949395	5.158630	199.998081

Level One Cell Type Analysis

# Keep only the edges computed for this annotation level and its configured anchor cell type, then summarize the distances per cell type and unique region.
cell_type_level = 'Level One Cell Type'

matches_level = df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level
matches_anchor = df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level]
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[matches_level & matches_anchor]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])
    .agg(**{f'{stat}_distance': ('Distance', stat) for stat in ('mean', 'median', 'min', 'max')})
    .reset_index()
)
df_distance_stats_cell_type_level

	Level One Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	epithelial cell	Disease	32.455094	25.588249	4.826959	199.648377
1	epithelial cell	Normal	35.403572	28.856798	4.647661	199.778494
2	hematopoietic precursor cell	Normal	34.836844	27.595297	6.647334	183.441998
3	immune cell	Disease	52.244046	38.866237	5.341610	199.996537
4	immune cell	Normal	39.351442	31.840836	5.332036	199.829270
5	mesenchymal cell	Disease	46.334527	33.301188	5.314126	199.976085
6	mesenchymal cell	Normal	49.582849	43.837154	6.818182	199.215219
7	mixed immune/epithelial cell population	Disease	29.528454	23.648768	6.334930	199.968661
8	unknown cell	Disease	21.070750	15.768757	5.697526	190.229178
9	unknown cell	Normal	37.909030	30.412302	4.467862	199.906751

# Get top five and bottom five cell types with respect to mean distance in each unique region separately.
def get_top_bottom_cell_types_by_mean(df, cell_type_level, unique_region, top_n=5):
    """Rank the cell types of one region by mean distance.

    Parameters
    ----------
    df : edge table with 'Unique Region', 'Distance' and `cell_type_level`
        columns.
    cell_type_level : name of the cell type column to group by.
    unique_region : value of 'Unique Region' to keep.
    top_n : number of rows taken from each end of the ranking.

    Returns
    -------
    (top, bottom) : two DataFrames with columns `cell_type_level` and
        'mean_distance', both sorted by descending mean distance. When the
        region has fewer than 2 * top_n cell types the two frames overlap.
    """
    in_region = df.loc[df['Unique Region'] == unique_region]

    ranked = (
        in_region.groupby(cell_type_level)['Distance']
        .mean()
        .rename('mean_distance')
        .reset_index()
        .sort_values(by='mean_distance', ascending=False)
    )

    return ranked.head(top_n), ranked.tail(top_n)

# Print the highest- and lowest-ranked cell types (by mean distance) for every region present in the filtered edge table.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    # top_bottom is (top, bottom); print each half under its own heading.
    for heading, ranked_cell_types in zip(("Top", "Bottom"), top_bottom):
        print(f"\n{heading} 5 cell types in {region}:")
        print(ranked_cell_types)


Top 5 cell types in Normal:
            Level One Cell Type  mean_distance
3              mesenchymal cell      49.582849
2                   immune cell      39.351442
4                  unknown cell      37.909030
0               epithelial cell      35.403572
1  hematopoietic precursor cell      34.836844

Bottom 5 cell types in Normal:
            Level One Cell Type  mean_distance
3              mesenchymal cell      49.582849
2                   immune cell      39.351442
4                  unknown cell      37.909030
0               epithelial cell      35.403572
1  hematopoietic precursor cell      34.836844

Top 5 cell types in Disease:
                       Level One Cell Type  mean_distance
1                              immune cell      52.244046
2                         mesenchymal cell      46.334527
0                          epithelial cell      32.455094
3  mixed immune/epithelial cell population      29.528454
4                             unknown cell      21.070750

Bottom 5 cell types in Disease:
                       Level One Cell Type  mean_distance
1                              immune cell      52.244046
2                         mesenchymal cell      46.334527
0                          epithelial cell      32.455094
3  mixed immune/epithelial cell population      29.528454
4                             unknown cell      21.070750

# Get top five and bottom five cell types with respect to median distance in each unique region separately.
def get_top_bottom_cell_types_by_median(df, cell_type_level, unique_region, top_n=5):
    """Rank the cell types of one region by median distance.

    Parameters
    ----------
    df : edge table with 'Unique Region', 'Distance' and `cell_type_level`
        columns.
    cell_type_level : name of the cell type column to group by.
    unique_region : value of 'Unique Region' to keep.
    top_n : number of rows taken from each end of the ranking.

    Returns
    -------
    (top, bottom) : two DataFrames with columns `cell_type_level` and
        'median_distance', both sorted by descending median distance. When
        the region has fewer than 2 * top_n cell types the two frames overlap.
    """
    region_rows = df.loc[df['Unique Region'] == unique_region]

    medians = region_rows.groupby(cell_type_level)['Distance'].median()
    ranking = medians.rename('median_distance').reset_index()
    ranking = ranking.sort_values(by='median_distance', ascending=False)

    return ranking.head(top_n), ranking.tail(top_n)

# Print the highest- and lowest-ranked cell types (by median distance) for every region present in the filtered edge table.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    # top_bottom is (top, bottom); print each half under its own heading.
    for heading, ranked_cell_types in zip(("Top", "Bottom"), top_bottom):
        print(f"\n{heading} 5 cell types in {region}:")
        print(ranked_cell_types)


Top 5 cell types in Normal:
            Level One Cell Type  median_distance
3              mesenchymal cell        43.837154
2                   immune cell        31.840836
4                  unknown cell        30.412302
0               epithelial cell        28.856798
1  hematopoietic precursor cell        27.595297

Bottom 5 cell types in Normal:
            Level One Cell Type  median_distance
3              mesenchymal cell        43.837154
2                   immune cell        31.840836
4                  unknown cell        30.412302
0               epithelial cell        28.856798
1  hematopoietic precursor cell        27.595297

Top 5 cell types in Disease:
                       Level One Cell Type  median_distance
1                              immune cell        38.866237
2                         mesenchymal cell        33.301188
0                          epithelial cell        25.588249
3  mixed immune/epithelial cell population        23.648768
4                             unknown cell        15.768757

Bottom 5 cell types in Disease:
                       Level One Cell Type  median_distance
1                              immune cell        38.866237
2                         mesenchymal cell        33.301188
0                          epithelial cell        25.588249
3  mixed immune/epithelial cell population        23.648768
4                             unknown cell        15.768757

# Calculate regional variability
def calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level):
    """Print distance variability per region and per cell type.

    For each grouping the mean and standard deviation of 'Distance' are
    rounded to two decimals, and the coefficient of variation
    (std / mean * 100, one decimal) is derived from those rounded values.
    The per-cell-type table is printed sorted by descending CV.

    Parameters
    ----------
    df_all_edges_with_cell_type_level : edge table with 'Unique Region',
        'Distance' and `cell_type_level` columns.
    cell_type_level : name of the cell type column to group by.

    Returns
    -------
    None; the tables are only printed.
    """
    def summarise_by(group_column):
        # Mean / std per group, plus CV in percent computed from the rounded values.
        summary = (
            df_all_edges_with_cell_type_level
            .groupby(group_column)['Distance']
            .agg(['mean', 'std'])
            .round(2)
        )
        summary['CV (%)'] = (summary['std'] / summary['mean'] * 100).round(1)
        return summary

    regional_variability = summarise_by('Unique Region')
    for line in (
        "\nRegional Variability Analysis:",
        "Mean: Average distance in each region",
        "Std: Standard deviation of distances",
        "CV: Coefficient of Variation (std/mean * 100%)",
    ):
        print(line)
    print(regional_variability)

    cell_type_variability = summarise_by(cell_type_level)
    print("\nCell Type Variability Analysis (sorted by CV):")
    print(cell_type_variability.sort_values('CV (%)', ascending=False))

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Disease        42.72  34.40    80.5
Normal         38.76  26.44    68.2

Cell Type Variability Analysis (sorted by CV):
                                          mean    std  CV (%)
Level One Cell Type                                          
unknown cell                             34.17  25.86    75.7
immune cell                              46.17  34.71    75.2
mesenchymal cell                         47.92  33.60    70.1
hematopoietic precursor cell             34.84  24.25    69.6
epithelial cell                          33.93  23.11    68.1
mixed immune/epithelial cell population  29.53  19.66    66.6

# Generate Violin Plot
def plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm='area'):
    """Draw one violin per cell type showing its distance distribution, and save it.

    Parameters
    ----------
    df_all_edges_with_cell_type_level : edge table with 'Distance' and
        `cell_type_level` columns.
    cell_type_level : name of the cell type column shown on the x-axis.
    output_dir : directory that receives the PNG and SVG files.
    density_norm : forwarded to `seaborn.violinplot` and shown in the title.

    The file names use the module-level `dataset_dir`.
    """
    label_size = 10

    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(10, 6))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    sns.violinplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y="Distance",
        density_norm=density_norm,
        common_norm=True,
        cut=0,
        inner="box",
        split=False,
        palette='Spectral',
        alpha=.9,
    )

    # Theme and context are set again once the violins have been drawn.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper")

    plt.legend(fontsize=label_size)
    plt.title(f'Violin Plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=label_size)
    plt.xlabel(f'{cell_type_level}', fontsize=label_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=label_size)

    # Same font size on both axes; x tick labels are rotated to stay readable.
    plt.xticks(fontsize=label_size)
    plt.xticks(rotation=90)
    plt.yticks(fontsize=label_size)

    plt.tight_layout()

    for extension in ('png', 'svg'):
        plt.savefig(
            os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.{extension}'),
            dpi=300,
            bbox_inches='tight',
            pad_inches=0.5,
        )
    plt.show()

plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Draw box plots of distance per cell type, split by region, and save them.

    Parameters
    ----------
    df_all_edges_with_cell_type_level : edge table with 'Distance',
        'Unique Region' and `cell_type_level` columns. It is not modified.
    cell_type_level : name of the cell type column shown on the x-axis.
    output_dir : directory that receives the PNG and SVG files.

    Regions are drawn in the order given by the module-level `regions` list,
    restricted to the regions present in the data. Rows whose region is not
    in `regions` become NaN in the plotting copy and are therefore not drawn.
    The file names use the module-level `dataset_dir`.
    """
    plt.figure(figsize=(16, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # Create categorical type with only the regions that exist in the data.
    present_regions = set(df_all_edges_with_cell_type_level['Unique Region'].dropna().unique())
    available_regions = [r for r in regions if r in present_regions]

    # Plot from a copy: assigning the categorical on the argument itself would
    # silently change the caller's DataFrame (which is a filtered slice of a
    # larger table) and affect every later analysis step that uses it.
    plot_df = df_all_edges_with_cell_type_level.assign(**{
        'Unique Region': pd.Categorical(
            df_all_edges_with_cell_type_level['Unique Region'],
            categories=available_regions,
            ordered=True,
        )
    })

    # Make box plot.
    sns.boxplot(data=plot_df, x=cell_type_level, y='Distance', hue='Unique Region', showfliers=False, palette='Spectral') # viridis or Spectral palette for better color distinction
    font_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.title(f'Distribution of distances by {cell_type_level} and region', fontsize=font_size)
    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()

    for extension in ('png', 'svg'):
        plt.savefig(
            os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.{extension}'),
            dpi=300,
            bbox_inches='tight',
            pad_inches=0.5,
        )
    plt.show()

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distances by cell type and region.
def plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Draw an annotated heatmap of the median 'Distance' per cell type and region.

    Rows are the values of the `cell_type_level` column, columns are the values
    of 'Unique Region'. The figure is saved to `output_dir` as PNG and SVG (file
    names start with the module-level `dataset_dir`) and is then shown.

    Args:
        df_all_edges_with_cell_type_level: Edge table holding the columns
            named by `cell_type_level`, 'Distance' and 'Unique Region'.
        cell_type_level: Column whose values become the heatmap rows.
        output_dir: Directory that receives the PNG and SVG files.
    """
    # Median distance for every (cell type, region) combination.
    median_table = df_all_edges_with_cell_type_level.pivot_table(
        values='Distance', index=cell_type_level, columns='Unique Region', aggfunc='median'
    )

    plt.figure(figsize=(15, 10))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # Each cell is annotated with its value rounded to one decimal.
    sns.heatmap(median_table, cmap='Spectral', annot=True, fmt='.1f')
    plt.title(f'Heatmap of median distances by {cell_type_level}', fontsize=12)

    label_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=label_size)
    plt.yticks(fontsize=label_size)
    plt.xlabel('Unique Region', fontsize=label_size)
    plt.ylabel(f'{cell_type_level}', fontsize=label_size)

    plt.tight_layout()

    # Write the raster and the vector version with identical settings.
    file_names = (
        f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.png',
        f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.svg',
    )
    for file_name in file_names:
        plt.savefig(os.path.join(output_dir, file_name), dpi=300, bbox_inches='tight', pad_inches=0.5)
    plt.show()

# Render and save the median-distance heatmap for the current `cell_type_level`.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Generate violin plots per unique region, one subplot per region (this dataset has two regions: Normal and Disease).
def plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    """Draw one violin-plot panel per region, stacked vertically.

    Each panel shows the 'Distance' distribution per cell type for one entry
    of the module-level `regions`. All panels share the same cell-type order
    and colors. The figure is saved to `output_dir` as PNG and SVG (file names
    start with the module-level `dataset_dir`) and is then shown.

    Args:
        df_all_edges_with_cell_type_level: Edge table holding the columns
            named by `cell_type_level`, 'Distance' and 'Unique Region'.
        cell_type_level: Column whose values form the x-axis categories.
        output_dir: Directory that receives the PNG and SVG files.
        density_norm: Forwarded to `seaborn.violinplot` (the call sites in this
            file use "area" or "count").

    Note:
        Calls `sns.set_style` / `sns.set_context`, which change seaborn's
        global plotting style.
    """
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 1})
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    font_size = 10

    # One row per region instead of a fixed two rows, so the loop below can
    # never index past the axes array. 7.5 in per row keeps the original
    # 10 x 15 in canvas for the two-region case.
    n_rows = max(len(regions), 1)
    fig, axs = plt.subplots(n_rows, 1, figsize=(10, 7.5 * n_rows), squeeze=False)
    axs = axs[:, 0]  # squeeze=False always returns a 2-D array, even for one row
    fig.suptitle(f'Distance distribution per {cell_type_level} in `{dataset_dir}` (density normalization = {density_norm})', fontsize=font_size, y=1)

    # Keep the sequence of Cell Types consistent across plots.
    cell_types = sorted(df_all_edges_with_cell_type_level[cell_type_level].unique())

    # Create a color palette based on the number of unique classes
    color_palette = sns.color_palette("Spectral", n_colors=len(cell_types))

    # Create a dictionary mapping class to color
    class_color_dict = dict(zip(cell_types, color_palette))

    last_row = len(regions) - 1
    for i, region in enumerate(regions):
        ax = axs[i]
        data_reg = df_all_edges_with_cell_type_level[df_all_edges_with_cell_type_level['Unique Region'] == region]
        sns.violinplot(data=data_reg, x=cell_type_level, y="Distance", density_norm=density_norm, common_norm=True, cut=0, inner="box", split=False, palette=class_color_dict, alpha=.9, ax=ax, hue=cell_type_level, legend=False, order=cell_types, fill=True)
        ax.set_title(region, fontsize=font_size)
        ax.set_xlabel('', fontsize=font_size)
        ax.set_ylabel('Distance (\u03bcm)', fontsize=font_size)
        # only show xtick labels for the last subplot
        if i < last_row:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(cell_types, fontsize=font_size, rotation=90, ha='right')
        ax.tick_params(axis='both', labelsize=font_size)

    # Shared x-axis label, placed in figure coordinates just below the panels.
    fig.text(0.5, -0.02, f'{cell_type_level}', ha='center', va='bottom', fontsize=font_size)

    plt.tight_layout()

    # Same settings for both output formats.
    file_names = (
        f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.png',
        f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.svg',
    )
    for file_name in file_names:
        plt.savefig(os.path.join(output_dir, file_name), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)

    plt.show()

# Render and save the per-region violin plots for the current `cell_type_level`.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm accepts "count" (used here) or "area", depending on preference.

# Generate Split Violin Plot
def plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    """Draw a split violin plot of 'Distance' per cell type, one half per region.

    The hue is 'Unique Region', ordered by the module-level `regions`. The
    figure is saved to `output_dir` as PNG and SVG (file names start with the
    module-level `dataset_dir`) and is then shown.

    Args:
        df_all_edges_with_cell_type_level: Edge table holding the columns
            named by `cell_type_level`, 'Distance' and 'Unique Region'.
        cell_type_level: Column whose values form the x-axis categories.
        output_dir: Directory that receives the PNG and SVG files.
        density_norm: Forwarded to `seaborn.violinplot`; also shown in the title.

    Note:
        seaborn's global style/context is set before drawing and set again
        (theme "whitegrid", context "paper") after the violins are drawn.
    """
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(15, 10))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    sns.violinplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y="Distance",
        hue="Unique Region",
        hue_order=regions,
        split=True,
        inner="box",
        cut=0,
        density_norm=density_norm,
        common_norm=True,
        palette='Spectral',
        alpha=.9,
    )

    # Applied after the violins are drawn, so it only influences what is created from here on.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper")

    text_size = 10
    plt.legend(loc='upper right', bbox_to_anchor=(1.15, 1), fontsize=text_size)
    plt.title(f'Split violin plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=text_size)
    plt.xlabel('Cell Type', fontsize=text_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=text_size)

    # Uniform tick-label size; x labels are rotated to vertical.
    plt.xticks(fontsize=text_size, rotation=90)
    plt.yticks(fontsize=text_size)

    plt.tight_layout()

    # Write the raster and the vector version with identical settings.
    file_names = (
        f'{dataset_dir}_plot_violin_cells_per_celltype_split_by_condition_{cell_type_level}.png',
        f'{dataset_dir}_plot_violin_cells_per_celltype_split_by_condition_{cell_type_level}.svg',
    )
    for file_name in file_names:
        plt.savefig(os.path.join(output_dir, file_name), dpi=300, bbox_inches='tight', pad_inches=0.5)
    plt.show()

# Render and save the split violin plot (one half per region) for the current `cell_type_level`.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm accepts "count" (used here) or "area", depending on preference.

Level Two Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level Two Cell Type'

# Keep only the edges computed at this level against the anchor cell type
# configured for it in `anchor_cell_type_dict`.
# .copy() makes the subset an independent frame: plot_distance_distribution_boxplots_by_region
# later assigns to its 'Unique Region' column, which on a bare filter result is a
# chained assignment (pandas SettingWithCopy).
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[
    (df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level)
    & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])
].copy()

# One row per (cell type, region) pair with summary statistics of 'Distance'.
df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level Two Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	b cell	Disease	37.449808	25.025741	8.194906	191.217795
1	b cell	Normal	36.288294	26.858117	8.676844	197.378220
2	endothelial cell of capillary	Disease	28.936591	22.843498	4.317932	197.145756
3	endothelial cell of lymphatic vessel	Normal	90.450213	82.516556	6.820439	199.965441
4	epithelial cell	Disease	47.290109	39.116006	7.217011	199.626286
5	immune cell	Disease	199.429368	199.429368	199.429368	199.429368
6	leukocyte	Normal	35.224795	27.520283	6.271830	198.055232
7	macrophage	Disease	72.456390	62.749724	6.409083	199.948392
8	macrophage	Normal	43.693437	35.902059	5.764018	199.800005
9	mast cell	Normal	56.037289	45.889204	6.549645	199.829270
10	megakaryocyte	Normal	35.709374	28.120303	6.647334	183.441998
11	mixed t cell/epithelial cell population	Disease	35.456705	28.416673	7.589362	199.630836
12	muscle cell	Disease	50.141764	37.800183	5.314126	199.976085
13	muscle cell	Normal	74.768813	67.226262	7.202741	199.967962
14	neutrophil	Normal	34.791835	27.134180	5.332036	199.638840
15	t cell	Disease	49.913913	37.638729	5.341610	199.924082
16	t cell	Normal	52.112910	41.077107	5.873783	199.929988
17	type 1 pneumocyte	Normal	36.197262	26.538157	5.164310	199.815662
18	type 2 pneumocyte	Disease	35.069124	29.533140	7.236066	199.648377
19	type 2 pneumocyte	Normal	40.193411	31.636089	4.647661	199.955656
20	unknown cell	Disease	27.554498	20.212155	5.697526	196.945710
21	unknown cell	Normal	43.607941	34.399009	4.467862	199.906751

# Print, for every unique region in the dataset, the top and bottom cell types ranked by mean distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(
        df_all_edges_with_cell_type_level, cell_type_level, region
    )
    # Element 0 is printed as the "top" table, element 1 as the "bottom" table.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in Normal:
                    Level Two Cell Type  mean_distance
1  endothelial cell of lymphatic vessel      90.450213
6                           muscle cell      74.768813
4                             mast cell      56.037289
8                                t cell      52.112910
3                            macrophage      43.693437

Bottom 5 cell types in Normal:
  Level Two Cell Type  mean_distance
0              b cell      36.288294
9   type 1 pneumocyte      36.197262
5       megakaryocyte      35.709374
2           leukocyte      35.224795
7          neutrophil      34.791835

Top 5 cell types in Disease:
  Level Two Cell Type  mean_distance
3         immune cell     199.429368
4          macrophage      72.456390
6         muscle cell      50.141764
7              t cell      49.913913
2     epithelial cell      47.290109

Bottom 5 cell types in Disease:
                       Level Two Cell Type  mean_distance
0                                   b cell      37.449808
5  mixed t cell/epithelial cell population      35.456705
8                        type 2 pneumocyte      35.069124
1            endothelial cell of capillary      28.936591
9                             unknown cell      27.554498

# Print, for every unique region in the dataset, the top and bottom cell types ranked by median distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(
        df_all_edges_with_cell_type_level, cell_type_level, region
    )
    # Element 0 is printed as the "top" table, element 1 as the "bottom" table.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in Normal:
                    Level Two Cell Type  median_distance
1  endothelial cell of lymphatic vessel        82.516556
6                           muscle cell        67.226262
4                             mast cell        45.889204
8                                t cell        41.077107
3                            macrophage        35.902059

Bottom 5 cell types in Normal:
  Level Two Cell Type  median_distance
5       megakaryocyte        28.120303
2           leukocyte        27.520283
7          neutrophil        27.134180
0              b cell        26.858117
9   type 1 pneumocyte        26.538157

Top 5 cell types in Disease:
  Level Two Cell Type  median_distance
3         immune cell       199.429368
4          macrophage        62.749724
2     epithelial cell        39.116006
6         muscle cell        37.800183
7              t cell        37.638729

Bottom 5 cell types in Disease:
                       Level Two Cell Type  median_distance
8                        type 2 pneumocyte        29.533140
5  mixed t cell/epithelial cell population        28.416673
0                                   b cell        25.025741
1            endothelial cell of capillary        22.843498
9                             unknown cell        20.212155

# Print the mean / std / CV tables per region and per cell type for this level
# (helper defined earlier in the file).
# NOTE(review): a NaN std/CV row in the output (e.g. 'immune cell') presumably means
# that cell type has a single edge, so no spread can be estimated — confirm.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Disease        46.51  35.32    75.9
Normal         45.66  34.14    74.8

Cell Type Variability Analysis (sorted by CV):
                                           mean    std  CV (%)
Level Two Cell Type                                           
type 1 pneumocyte                         36.20  28.99    80.1
b cell                                    36.89  29.26    79.3
unknown cell                              40.03  30.98    77.4
t cell                                    50.39  37.60    74.6
neutrophil                                34.79  24.90    71.6
leukocyte                                 35.22  25.04    71.1
megakaryocyte                             35.71  25.36    71.0
macrophage                                55.66  39.15    70.3
type 2 pneumocyte                         37.89  25.70    67.8
mixed t cell/epithelial cell population   35.46  23.98    67.6
endothelial cell of capillary             28.94  19.51    67.4
muscle cell                               61.52  41.15    66.9
mast cell                                 56.04  36.88    65.8
epithelial cell                           47.29  30.10    63.6
endothelial cell of lymphatic vessel      90.45  48.80    54.0
immune cell                              199.43    NaN     NaN

# Produce the full set of figures for the current `cell_type_level`; every call
# receives the same filtered edge table and the figures directory under `basepath`.

# Violin plot per cell type (helper defined earlier in the file).
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Box plots per cell type, grouped by region.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distance per cell type and region.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# One violin-plot panel per region.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm accepts "count" (used here) or "area", depending on preference.

# Split violin plot, one half per region.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm accepts "count" (used here) or "area", depending on preference.

Level Three Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level Three Cell Type'

# Keep only the edges computed at this level against the anchor cell type
# configured for it in `anchor_cell_type_dict`.
# .copy() makes the subset an independent frame: plot_distance_distribution_boxplots_by_region
# later assigns to its 'Unique Region' column, which on a bare filter result is a
# chained assignment (pandas SettingWithCopy).
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[
    (df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level)
    & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])
].copy()

# One row per (cell type, region) pair with summary statistics of 'Distance'.
df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level Three Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	b cell	Disease	37.449808	25.025741	8.194906	191.217795
1	b cell	Normal	36.288294	26.858117	8.676844	197.378220
2	endothelial cell of capillary	Disease	28.936591	22.843498	4.317932	197.145756
3	endothelial cell of lymphatic vessel	Normal	90.450213	82.516556	6.820439	199.965441
4	epithelial cell	Disease	47.290109	39.116006	7.217011	199.626286
5	immune cell	Disease	199.429368	199.429368	199.429368	199.429368
6	leukocyte	Normal	35.224795	27.520283	6.271830	198.055232
7	macrophage	Disease	72.456390	62.749724	6.409083	199.948392
8	macrophage	Normal	43.683982	35.893711	5.764018	199.800005
9	macrophage:interstitial	Normal	47.581049	39.836315	9.818172	172.605327
10	mast cell	Normal	56.037289	45.889204	6.549645	199.829270
11	megakaryocyte	Normal	35.709374	28.120303	6.647334	183.441998
12	mixed t cell/epithelial cell population	Disease	35.456705	28.416673	7.589362	199.630836
13	muscle cell:smooth	Disease	50.141764	37.800183	5.314126	199.976085
14	muscle cell:smooth	Normal	74.768813	67.226262	7.202741	199.967962
15	neutrophil	Normal	34.791835	27.134180	5.332036	199.638840
16	pneumocyte:type 1	Normal	36.197262	26.538157	5.164310	199.815662
17	pneumocyte:type 2	Disease	35.069124	29.533140	7.236066	199.648377
18	pneumocyte:type 2	Normal	40.193411	31.636089	4.647661	199.955656
19	t cell	Disease	52.833976	43.817389	7.516588	199.586367
20	t cell:cd4+ alpha-beta	Disease	39.855009	30.664763	5.341610	199.765323
21	t cell:cd4+ alpha-beta	Normal	61.190621	50.726455	6.368590	199.929988
22	t cell:cd8+	Disease	72.166680	63.643480	7.237513	199.924082
23	t cell:cd8+	Normal	45.077855	35.291540	5.873783	199.742345
24	unknown cell	Disease	27.554498	20.212155	5.697526	196.945710
25	unknown cell	Normal	43.607941	34.399009	4.467862	199.906751

# Print, for every unique region in the dataset, the top and bottom cell types ranked by mean distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(
        df_all_edges_with_cell_type_level, cell_type_level, region
    )
    # Element 0 is printed as the "top" table, element 1 as the "bottom" table.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in Normal:
                   Level Three Cell Type  mean_distance
1   endothelial cell of lymphatic vessel      90.450213
7                     muscle cell:smooth      74.768813
11                t cell:cd4+ alpha-beta      61.190621
5                              mast cell      56.037289
4                macrophage:interstitial      47.581049

Bottom 5 cell types in Normal:
  Level Three Cell Type  mean_distance
0                b cell      36.288294
9     pneumocyte:type 1      36.197262
6         megakaryocyte      35.709374
2             leukocyte      35.224795
8            neutrophil      34.791835

Top 5 cell types in Disease:
   Level Three Cell Type  mean_distance
3            immune cell     199.429368
4             macrophage      72.456390
10           t cell:cd8+      72.166680
8                 t cell      52.833976
6     muscle cell:smooth      50.141764

Bottom 5 cell types in Disease:
                      Level Three Cell Type  mean_distance
0                                    b cell      37.449808
5   mixed t cell/epithelial cell population      35.456705
7                        pneumocyte:type 2       35.069124
1             endothelial cell of capillary      28.936591
11                             unknown cell      27.554498

# Print, for every unique region in the dataset, the top and bottom cell types ranked by median distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(
        df_all_edges_with_cell_type_level, cell_type_level, region
    )
    # Element 0 is printed as the "top" table, element 1 as the "bottom" table.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")


Top 5 cell types in Normal:
                   Level Three Cell Type  median_distance
1   endothelial cell of lymphatic vessel        82.516556
7                     muscle cell:smooth        67.226262
11                t cell:cd4+ alpha-beta        50.726455
5                              mast cell        45.889204
4                macrophage:interstitial        39.836315

Bottom 5 cell types in Normal:
  Level Three Cell Type  median_distance
6         megakaryocyte        28.120303
2             leukocyte        27.520283
8            neutrophil        27.134180
0                b cell        26.858117
9     pneumocyte:type 1        26.538157

Top 5 cell types in Disease:
   Level Three Cell Type  median_distance
3            immune cell       199.429368
10           t cell:cd8+        63.643480
4             macrophage        62.749724
8                 t cell        43.817389
2        epithelial cell        39.116006

Bottom 5 cell types in Disease:
                      Level Three Cell Type  median_distance
7                        pneumocyte:type 2         29.533140
5   mixed t cell/epithelial cell population        28.416673
0                                    b cell        25.025741
1             endothelial cell of capillary        22.843498
11                             unknown cell        20.212155

# Print the mean / std / CV tables per region and per cell type for this level
# (helper defined earlier in the file).
# NOTE(review): a NaN std/CV row in the output (e.g. 'immune cell') presumably means
# that cell type has a single edge, so no spread can be estimated — confirm.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Disease        46.51  35.32    75.9
Normal         45.66  34.14    74.8

Cell Type Variability Analysis (sorted by CV):
                                           mean    std  CV (%)
Level Three Cell Type                                         
pneumocyte:type 1                         36.20  28.99    80.1
b cell                                    36.89  29.26    79.3
unknown cell                              40.03  30.98    77.4
t cell:cd4+ alpha-beta                    43.13  32.55    75.5
neutrophil                                34.79  24.90    71.6
leukocyte                                 35.22  25.04    71.1
megakaryocyte                             35.71  25.36    71.0
macrophage                                55.67  39.15    70.3
macrophage:interstitial                   47.58  32.59    68.5
t cell:cd8+                               62.80  42.56    67.8
pneumocyte:type 2                         37.89  25.70    67.8
mixed t cell/epithelial cell population   35.46  23.98    67.6
endothelial cell of capillary             28.94  19.51    67.4
muscle cell:smooth                        61.52  41.15    66.9
mast cell                                 56.04  36.88    65.8
t cell                                    52.83  34.08    64.5
epithelial cell                           47.29  30.10    63.6
endothelial cell of lymphatic vessel      90.45  48.80    54.0
immune cell                              199.43    NaN     NaN

# Produce the full set of figures for the current `cell_type_level`; every call
# receives the same filtered edge table and the figures directory under `basepath`.

# Violin plot per cell type (helper defined earlier in the file).
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Box plots per cell type, grouped by region.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distance per cell type and region.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# One violin-plot panel per region.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm accepts "count" (used here) or "area", depending on preference.

# Split violin plot, one half per region.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm accepts "count" (used here) or "area", depending on preference.