Distance Analysis: pancreas-geomx-ufl

Analyze and visualize cell-to-nearest-endothelial-cell distance distributions for the pancreas-geomx-ufl dataset.

import numpy as np
import pandas as pd
import os
import json
import requests
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import plotly.express as px

from _cde_compute_edges_from_nodes import *

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# suppress warnings
import warnings
warnings.filterwarnings("ignore")

basepath = "/u/yashjain/hra-cell-distance-analysis/data"
dataset_dir = "pancreas-geomx-ufl"
data_filedir = os.path.join("data-processed-nodes-with-harmonized-cell-types", dataset_dir)
output_edge_dir = os.path.join("data-processed-edges", dataset_dir)
figures_output_dir = "generated-figures"

# Function to load your data
def load_data(path, edges=False):
    if edges:
        column_names = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
        data = pd.read_csv(path, header=None, names=column_names)
    else:
        data = pd.read_csv(path)
    return data

# Function to read all files ending with "-nodes.csv" in the `data_filedir` directory into a single DataFrame. 
# Another additional column `Dataset` is added to identify the dataset name which comes from the filename before the `-nodes.csv` suffix.

# Additionally, function reads all files ending with "-edges.csv" in the `output_edge_dir` directory into a single DataFrame. 
# Three additional columns are added "Dataset", "Anchor Cell Type", and "Anchor Cell Type Level" to identify the dataset name, anchor cell type, and anchor cell type level respectively which come from the filename before the `.csv` suffix.
# The three additional columns are created by splitting the filename on the `-` character, and extracting the relevant parts.
# On splitting, the first part is the dataset name, second part is the anchor cell type level, and third part is the anchor cell type, and last part is the `edges` suffix.
# When reading files, check if the file has the correct format (i.e., ends with `-edges.csv`).

# Additionally, the function merges the edges DataFrame with the nodes DataFrame to get the cell type information for the anchor cells.
# This is done by reading the corresponding nodes file from the `data_filedir` directory for each edges file, and merging it with the edges DataFrame on the `cell_id` column.
# The merged DataFrame contains the edges with additional columns for the cell type information.

# The function returns three DataFrames:
# 1. `merged_nodes`: DataFrame containing all nodes with an additional column `Dataset`.
# 2. `merged_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, and `Anchor Cell Type Level`.
# 3. `merged_nodes_for_all_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, `Anchor Cell Type Level`, and the cell type information for cells.
def read_all_edge_datasets(basepath, data_filedir, output_edge_dir):
    all_nodes_files = []
    all_edges_files = []
    all_nodes_edges_files = []
    for file in os.listdir(os.path.join(basepath, output_edge_dir)):
        if file.endswith("-edges.csv"):
            file_path = os.path.join(basepath, output_edge_dir, file)
            donor_name, dataset_id, anchor_cell_type_level, anchor_cell_type = file.replace("-edges.csv", "").split('-')
            dataset_name = f"{donor_name}-{dataset_id}" # Considering the dataset name as the combination of donor name and dataset id for pancreas-geomx-ufl dataset.
            edges_df = load_data(file_path, edges=False)
            edges_df['Dataset'] = dataset_name
            edges_df['Anchor Cell Type'] = anchor_cell_type
            edges_df['Anchor Cell Type Level'] = anchor_cell_type_level
            edges_df.rename(columns={"distance": "Distance"}, inplace=True) # Rename column "distance" to "Distance".
            all_edges_files.append(edges_df)

            # Read the corresponding nodes file from data_filedir to get the cell type information
            nodes_file_path = os.path.join(basepath, data_filedir, f"{dataset_name}-nodes.csv")
            nodes_df = load_data(nodes_file_path)
            nodes_df['Dataset'] = dataset_name
            all_nodes_files.append(nodes_df)

            # Add a new 'cell_id' column to nodes_df
            nodes_df['cell_id'] = range(len(nodes_df))
            # Set 'cell_id' column as index for nodes_df
            nodes_df.set_index('cell_id', inplace=True)
            # Merge edges_df with nodes_df to get the cell type information for the anchor cells
            edges_nodes_df = pd.merge(edges_df, nodes_df[['Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']], how='left', left_on='cell_id', right_index=True)
            all_nodes_edges_files.append(edges_nodes_df)

    
    merged_edges = pd.concat(all_edges_files, ignore_index=True)
    merged_nodes = pd.concat(all_nodes_files, ignore_index=True)
    merged_nodes_for_all_edges = pd.concat(all_nodes_edges_files, ignore_index=True) 

    return merged_nodes, merged_edges, merged_nodes_for_all_edges

def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Directory '{directory}' created successfully.")
    else:
        print(f"Directory '{directory}' already exists.")

Get initial statistics and identify endothelial cell categories for dataset.

df_all_nodes, df_all_edges, df_all_edges_with_cell_types = read_all_edge_datasets(basepath, data_filedir, output_edge_dir)

df_all_nodes.head(5)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset
0	10774.2	7856.6	unknown	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	P2-13
1	10790.9	7852.3	unknown	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	P2-13
2	10805.0	7859.6	Endothelial	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	P2-13
3	10858.9	7861.8	unknown	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	P2-13
4	10758.2	7862.8	Endothelial	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	P2-13

Following aspiration of islet beta cells, duct cells, and endothelial cells, mainly acinar cells remain to be aspirated by DSP instrument. Hence, most unknown cells in pancreas-geomx-ufl dataset are likely to be acinar cells.

# Print the total number of unique cell types per dataset. Compute separately for each cell type column (Level One Cell Type, Level Two Cell Type, Level Three Cell Type, Original Cell Type).
print("Total number of unique cell types per cell type annnotation level:")
unique_cell_types = {
    'Original Cell Type': df_all_nodes['Original Cell Type'].nunique(),
    'Level Three Cell Type': df_all_nodes['Level Three Cell Type'].nunique(),
    'Level Two Cell Type': df_all_nodes['Level Two Cell Type'].nunique(),
    'Level One Cell Type': df_all_nodes['Level One Cell Type'].nunique()
}
for cell_type, count in unique_cell_types.items():
    print(f"{cell_type}: {count}")

Total number of unique cell types per cell type annnotation level:
Original Cell Type: 4
Level Three Cell Type: 4
Level Two Cell Type: 4
Level One Cell Type: 3

# Save the unique cell types containing "endothelial" in name per cell type column (Level One Cell Type, Level Two Cell Type, Level Three Cell Type, Original Cell Type) to a dictionary where the key is the level and the value is a list of unique cell types.
endothelial_cell_types = {
    'Original Cell Type': df_all_nodes[df_all_nodes['Original Cell Type'].str.contains("endothelial", case=False, na=False)]['Original Cell Type'].unique().tolist(),
    'Level Three Cell Type': df_all_nodes[df_all_nodes['Level Three Cell Type'].str.contains("endothelial", case=False, na=False)]['Level Three Cell Type'].unique().tolist(),
    'Level Two Cell Type': df_all_nodes[df_all_nodes['Level Two Cell Type'].str.contains("endothelial", case=False, na=False)]['Level Two Cell Type'].unique().tolist(),
    'Level One Cell Type': df_all_nodes[df_all_nodes['Level One Cell Type'].str.contains("endothelial", case=False, na=False)]['Level One Cell Type'].unique().tolist()
}

print("\nEndothelial cell types per cell type annotation level:")
for level, cell_types in endothelial_cell_types.items():
    print(f"\n{level}:")
    for cell in cell_types:
        print(f"  - {cell}")


Endothelial cell types per cell type annotation level:

Original Cell Type:
  - Endothelial

Level Three Cell Type:
  - endothelial cell

Level Two Cell Type:
  - endothelial cell

Level One Cell Type:
  - endothelial cell

type_field_list = ["Level Three Cell Type", "Level Two Cell Type", "Level One Cell Type"] # Skipping Original Cell Type as it is not a hierarchical level.

# Define the anchor cell type (type of endothelial cell) for each level in type_field_list based on available categories in the previous cell. The distance analysis at all three levels will be limited to the specified anchor cell type.
anchor_cell_type_dict = {
    'Level Three Cell Type': 'endothelial cell',
    'Level Two Cell Type': 'endothelial cell',
    'Level One Cell Type': 'endothelial cell'
}

Process datasets to add region information to Nodes files.

# Create a dictionary to map pancreas regions to correct region names.
region_map = {
    'P2-4AD': 'Head',
    'P2-7A': 'Neck',
    'P2-13': 'Body',
    'P2-19': 'Tail',
    'P3-3A': 'Head',
    'P3-7A': 'Neck',
    'P3-9A': 'Body',
    'P3-13A': 'Tail',
    'P4-3A': 'Head',
    'P4-6A': 'Neck',
    'P4-12A': 'Body',
    'P4-22': 'Tail'
}

df_all_nodes.head()

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset
0	10774.2	7856.6	unknown	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	P2-13
1	10790.9	7852.3	unknown	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	P2-13
2	10805.0	7859.6	Endothelial	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	P2-13
3	10858.9	7861.8	unknown	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	P2-13
4	10758.2	7862.8	Endothelial	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	endothelial cell	endothelial cell	CL:0000115	skos:exactMatch	P2-13

# Iterate through the df_all_data dataframe to create new columns "Donor" and "Unique Region" based on the "Dataset" column.
# The "Donor" column is created by extracting the donor name from the "Dataset" column.
# The "Unique Region" column is created by mapping the region names based on the full dataset name.
df_all_nodes['Donor'] = df_all_nodes['Dataset'].str.split('-').str[0]
df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].map(region_map)
# df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].str.split('-').str[1].map(region_map)

# Check if the new columns are created correctly.
df_all_nodes[['Dataset', 'Donor', 'Unique Region']].head(5)

	Dataset	Donor	Unique Region
0	P2-13	P2	Body
1	P2-13	P2	Body
2	P2-13	P2	Body
3	P2-13	P2	Body
4	P2-13	P2	Body

# Print all unique regions in the data.
print("\nUnique Regions in the data:")
print(df_all_nodes['Unique Region'].unique())

# Print all unique donors in the data.
print("\nUnique Donors in the data:")
print(df_all_nodes['Donor'].unique())

# Print the total number of unique donors and unique regions.
print(f"\nTotal number of unique donors: {df_all_nodes['Donor'].nunique()}")
print(f"Total number of unique regions: {df_all_nodes['Unique Region'].nunique()}")

# Print number of unique donors per unique region.
print("\nNumber of unique donors per unique region:")
for region in df_all_nodes['Unique Region'].unique():
    num_donors = df_all_nodes[df_all_nodes['Unique Region'] == region]['Donor'].nunique()
    print(f"{region}: {num_donors}")


Unique Regions in the data:
['Body' 'Tail' 'Head' 'Neck']

Unique Donors in the data:
['P2' 'P3' 'P4']

Total number of unique donors: 3
Total number of unique regions: 4

Number of unique donors per unique region:
Body: 3
Tail: 3
Head: 3
Neck: 3

Process datasets to add region information to Edges files.

df_all_edges.head(5)

	cell_id	x1	y1	x2	y2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level
0	0	10774.2	7856.6	10758.2	7862.8	17.159254	P2-13	endothelial cell	Level Three Cell Type
1	1	10790.9	7852.3	10805.0	7859.6	15.877657	P2-13	endothelial cell	Level Three Cell Type
2	5	10788.8	7867.4	10805.0	7859.6	17.979989	P2-13	endothelial cell	Level Three Cell Type
3	6	10778.4	7872.1	10763.9	7874.8	14.749237	P2-13	endothelial cell	Level Three Cell Type
4	7	10683.4	7871.0	10690.7	7883.8	14.735332	P2-13	endothelial cell	Level Three Cell Type

# Process the edge data to create new columns "Donor", "Unique Region" based on the "Dataset" column, similar to how it was done for the node data.
df_all_edges['Donor'] = df_all_edges['Dataset'].str.split('-').str[0]
df_all_edges['Unique Region'] = df_all_edges['Dataset'].map(region_map)

# Check if the new columns are created correctly.
df_all_edges[['Dataset', 'Donor', 'Unique Region']].head(5)

	Dataset	Donor	Unique Region
0	P2-13	P2	Body
1	P2-13	P2	Body
2	P2-13	P2	Body
3	P2-13	P2	Body
4	P2-13	P2	Body

# Print all unique regions in the data.
print("\nUnique Regions in the data:")
print(df_all_edges['Unique Region'].unique())

# Print all unique donors in the data.
print("\nUnique Donors in the data:")
print(df_all_edges['Donor'].unique())

# Print the total number of unique donors and unique regions.
print(f"\nTotal number of unique donors: {df_all_edges['Donor'].nunique()}")
print(f"Total number of unique regions: {df_all_edges['Unique Region'].nunique()}")

# Print number of unique donors per unique region.
print("\nNumber of unique donors per unique region:")
for region in df_all_edges['Unique Region'].unique():
    num_donors = df_all_edges[df_all_edges['Unique Region'] == region]['Donor'].nunique()
    print(f"{region}: {num_donors}")


Unique Regions in the data:
['Body' 'Tail' 'Head' 'Neck']

Unique Donors in the data:
['P2' 'P3' 'P4']

Total number of unique donors: 3
Total number of unique regions: 4

Number of unique donors per unique region:
Body: 3
Tail: 3
Head: 3
Neck: 3

df_all_edges_with_cell_types['Donor'] = df_all_edges_with_cell_types['Dataset'].str.split('-').str[0]
df_all_edges_with_cell_types['Unique Region'] = df_all_edges_with_cell_types['Dataset'].map(region_map)

# Check if the new columns are created correctly.
df_all_edges_with_cell_types[['Dataset', 'Donor', 'Unique Region']].head(5)

	Dataset	Donor	Unique Region
0	P2-13	P2	Body
1	P2-13	P2	Body
2	P2-13	P2	Body
3	P2-13	P2	Body
4	P2-13	P2	Body

df_all_nodes.head(1)

	x	y	Original Cell Type	Level Three Cell Type	Level Three CL Label	Level Three CL ID	CL_Match/3	Level Two Cell Type	Level Two CL Label	Level Two CL ID	CL_Match/2	Level One Cell Type	Level One CL Label	Level One CL ID	CL_Match/1	Dataset	Donor	Unique Region
0	10774.2	7856.6	unknown	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	unknown cell	cell:unknown	CL:0000000	skos:narrowMatch	P2-13	P2	Body

df_all_edges.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Donor	Unique Region
0	0	10774.2	7856.6	0	10758.2	7862.8	0	17.159254	P2-13	endothelial cell	Level Three Cell Type	P2	Body

df_all_edges_with_cell_types.head(1)

	cell_id	x1	y1	z1	x2	y2	z2	Distance	Dataset	Anchor Cell Type	Anchor Cell Type Level	Level Three Cell Type	Level Two Cell Type	Level One Cell Type	Donor	Unique Region
0	0	10774.2	7856.6	0	10758.2	7862.8	0	17.159254	P2-13	endothelial cell	Level Three Cell Type	unknown cell	unknown cell	unknown cell	P2	Body

Node Analysis

# Plot number of cells per cell type in the same plot. Color by cell type and unique region. Output figure saved in existing `figures_output_dir`.
def plot_cells_per_celltype(df, type_field, output_dir):
    plt.figure(figsize=(14, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    sns.countplot(data=df, x=type_field, palette='Spectral', hue='Unique Region', width=0.2)
    plt.title(f'Number of Cells per {type_field} in `{dataset_dir}`')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}.png'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}.svg'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.legend(title='Unique Region', bbox_to_anchor=(0.85, 1), loc='upper left')
    plt.xlabel(type_field)

    # For numbers on y-axis, do not use scientific notation.
    plt.ticklabel_format(style='plain', axis='y')
    # Set y-axis label
    plt.ylabel('Number of Cells')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    # Show the plot
    plt.show()
    plt.close()
for type_field in type_field_list:
    plot_cells_per_celltype(df_all_nodes, type_field, os.path.join(basepath, figures_output_dir))

Distance Analysis

# Get mean, median, minimum, maximum distance per unique region per anchor cell type.
df_distance_stats = df_all_edges_with_cell_types.groupby(['Unique Region', 'Anchor Cell Type', 'Anchor Cell Type Level']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
# Print the first few rows of the distance statistics DataFrame.
df_distance_stats

	Unique Region	Anchor Cell Type	Anchor Cell Type Level	mean_distance	median_distance	min_distance	max_distance
0	Body	endothelial cell	Level One Cell Type	18.319087	14.700340	4.031129	199.990725
1	Body	endothelial cell	Level Three Cell Type	18.319087	14.700340	4.031129	199.990725
2	Body	endothelial cell	Level Two Cell Type	18.319087	14.700340	4.031129	199.990725
3	Head	endothelial cell	Level One Cell Type	16.526058	14.060228	3.721559	199.841938
4	Head	endothelial cell	Level Three Cell Type	16.526058	14.060228	3.721559	199.841938
5	Head	endothelial cell	Level Two Cell Type	16.526058	14.060228	3.721559	199.841938
6	Neck	endothelial cell	Level One Cell Type	17.067487	13.726616	4.001250	199.960921
7	Neck	endothelial cell	Level Three Cell Type	17.067487	13.726616	4.001250	199.960921
8	Neck	endothelial cell	Level Two Cell Type	17.067487	13.726616	4.001250	199.960921
9	Tail	endothelial cell	Level One Cell Type	17.707408	14.534442	4.252058	199.950819
10	Tail	endothelial cell	Level Three Cell Type	17.707408	14.534442	4.252058	199.950819
11	Tail	endothelial cell	Level Two Cell Type	17.707408	14.534442	4.252058	199.950819

Level One Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level One Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[(df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level) & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])]

df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level One Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	epithelial cell	Body	18.965929	16.368873	4.310452	199.981049
1	epithelial cell	Head	17.352177	15.519343	3.900000	197.600860
2	epithelial cell	Neck	17.860771	15.337862	4.622770	199.862203
3	epithelial cell	Tail	18.714225	16.245923	4.386342	199.910605
4	unknown cell	Body	18.036458	13.952061	4.031129	199.990725
5	unknown cell	Head	16.176917	13.418271	3.721559	199.841938
6	unknown cell	Neck	16.725727	13.083195	4.001250	199.960921
7	unknown cell	Tail	17.291476	13.764447	4.252058	199.950819

# Get top five and bottom five cell types with respect to mean distance in each unique region separately.
def get_top_bottom_cell_types_by_mean(df, cell_type_level, unique_region, top_n=5):
    # Filter the DataFrame for the specified unique region and cell type level
    df_filtered = df[df['Unique Region'] == unique_region]

    # Group by the specified cell type level and calculate mean distance
    df_grouped = df_filtered.groupby(cell_type_level).agg(mean_distance=('Distance', 'mean')).reset_index()
    
    # Sort by mean distance to get top and bottom cell types
    df_sorted = df_grouped.sort_values(by='mean_distance', ascending=False)
    
    # Get top N and bottom N cell types
    top_cell_types = df_sorted.head(top_n)
    bottom_cell_types = df_sorted.tail(top_n)
    
    return top_cell_types, bottom_cell_types

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in Body:
  Level One Cell Type  mean_distance
0     epithelial cell      18.965929
1        unknown cell      18.036458

Bottom 5 cell types in Body:
  Level One Cell Type  mean_distance
0     epithelial cell      18.965929
1        unknown cell      18.036458

Top 5 cell types in Tail:
  Level One Cell Type  mean_distance
0     epithelial cell      18.714225
1        unknown cell      17.291476

Bottom 5 cell types in Tail:
  Level One Cell Type  mean_distance
0     epithelial cell      18.714225
1        unknown cell      17.291476

Top 5 cell types in Head:
  Level One Cell Type  mean_distance
0     epithelial cell      17.352177
1        unknown cell      16.176917

Bottom 5 cell types in Head:
  Level One Cell Type  mean_distance
0     epithelial cell      17.352177
1        unknown cell      16.176917

Top 5 cell types in Neck:
  Level One Cell Type  mean_distance
0     epithelial cell      17.860771
1        unknown cell      16.725727

Bottom 5 cell types in Neck:
  Level One Cell Type  mean_distance
0     epithelial cell      17.860771
1        unknown cell      16.725727

# Get top five and bottom five cell types with respect to median distance in each unique region separately.
def get_top_bottom_cell_types_by_median(df, cell_type_level, unique_region, top_n=5):
    # Filter the DataFrame for the specified unique region and cell type level
    df_filtered = df[df['Unique Region'] == unique_region]

    # Group by the specified cell type level and calculate median distance
    df_grouped = df_filtered.groupby(cell_type_level).agg(median_distance=('Distance', 'median')).reset_index()

    # Sort by median distance to get top and bottom cell types
    df_sorted = df_grouped.sort_values(by='median_distance', ascending=False)

    # Get top N and bottom N cell types
    top_cell_types = df_sorted.head(top_n)
    bottom_cell_types = df_sorted.tail(top_n)
    
    return top_cell_types, bottom_cell_types

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in Body:
  Level One Cell Type  median_distance
0     epithelial cell        16.368873
1        unknown cell        13.952061

Bottom 5 cell types in Body:
  Level One Cell Type  median_distance
0     epithelial cell        16.368873
1        unknown cell        13.952061

Top 5 cell types in Tail:
  Level One Cell Type  median_distance
0     epithelial cell        16.245923
1        unknown cell        13.764447

Bottom 5 cell types in Tail:
  Level One Cell Type  median_distance
0     epithelial cell        16.245923
1        unknown cell        13.764447

Top 5 cell types in Head:
  Level One Cell Type  median_distance
0     epithelial cell        15.519343
1        unknown cell        13.418271

Bottom 5 cell types in Head:
  Level One Cell Type  median_distance
0     epithelial cell        15.519343
1        unknown cell        13.418271

Top 5 cell types in Neck:
  Level One Cell Type  median_distance
0     epithelial cell        15.337862
1        unknown cell        13.083195

Bottom 5 cell types in Neck:
  Level One Cell Type  median_distance
0     epithelial cell        15.337862
1        unknown cell        13.083195

# Calculate regional variability
def calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level):
    """    Calculate regional variability for distances in the given DataFrame.
    """
    regional_variability = df_all_edges_with_cell_type_level.groupby('Unique Region')['Distance'].agg([
        ('mean', 'mean'),
        ('std', 'std')
    ]).round(2)

    # Add CV as percentage
    regional_variability['CV (%)'] = (regional_variability['std'] / regional_variability['mean'] * 100).round(1)

    print("\nRegional Variability Analysis:")
    print("Mean: Average distance in each region")
    print("Std: Standard deviation of distances")
    print("CV: Coefficient of Variation (std/mean * 100%)")
    print(regional_variability)

    # Calculate variability for each cell type
    cell_type_variability = df_all_edges_with_cell_type_level.groupby(cell_type_level)['Distance'].agg([
        ('mean', 'mean'),
        ('std', 'std')
    ]).round(2)

    # Add CV as percentage
    cell_type_variability['CV (%)'] = (cell_type_variability['std'] / cell_type_variability['mean'] * 100).round(1)

    print("\nCell Type Variability Analysis (sorted by CV):")
    print(cell_type_variability.sort_values('CV (%)', ascending=False))

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Body           18.32  13.52    73.8
Head           16.53   9.06    54.8
Neck           17.07  12.05    70.6
Tail           17.71  11.43    64.5

Cell Type Variability Analysis (sorted by CV):
                      mean    std  CV (%)
Level One Cell Type                      
unknown cell         17.07  11.92    69.8
epithelial cell      18.25  10.88    59.6

# Define the standard region sequence for plots
regions = ['Head', 'Neck', 'Body', 'Tail']

# Generate Violin Plot
def plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm='area'):
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(10, 5))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    sns.violinplot(data=df_all_edges_with_cell_type_level, x=cell_type_level, y="Distance", density_norm=density_norm, common_norm=True, cut=0, inner="box", split=False, palette='Spectral', alpha=.9)

    sns.set_theme(style="whitegrid")
    sns.set_context("paper")


    font_size = 10
    plt.legend(fontsize=font_size)

    plt.title(f'Violin Plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=font_size)

    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)

    # Increase font size for all text in the figure
    plt.xticks(fontsize=font_size)
    plt.xticks(rotation=90)
    plt.yticks(fontsize=font_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.png'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.svg'), dpi=300,
                bbox_inches='tight',
                pad_inches=0.5)
    plt.show()

plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    plt.figure(figsize=(10, 5))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    # Create categorical type with only the regions that exist in the data
    available_regions = [r for r in regions if r in df_all_edges_with_cell_type_level['Unique Region'].unique()]
    df_all_edges_with_cell_type_level['Unique Region'] = pd.Categorical(
        df_all_edges_with_cell_type_level['Unique Region'],
        categories=available_regions,
        ordered=True
    )

    # Make box plot.
    sns.boxplot(data=df_all_edges_with_cell_type_level, x=cell_type_level, y='Distance', hue='Unique Region', showfliers=False, palette='Spectral', width=0.2) # viridis or Spectral palette for better color distinction
    font_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.title(f'Distribution of distances by {cell_type_level} and region', fontsize=font_size)
    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.show()

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    pivot_data = df_all_edges_with_cell_type_level.pivot_table(
    values='Distance',
    index=cell_type_level,
    columns='Unique Region',
    aggfunc='median'
    )

    plt.figure(figsize=(5, 5))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    sns.heatmap(pivot_data, annot=True, fmt='.1f', cmap='Spectral')
    plt.title(f'Heatmap of median distances by {cell_type_level}', fontsize=12)

    font_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)

    plt.xlabel('Unique Region', fontsize=font_size)
    plt.ylabel(f'{cell_type_level}', fontsize=font_size)
    
    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.show()

plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Generate Violin Plot per unique region in both small intestine and large intestine. Create for all 8 regions as 8 subplots.
def plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 1})
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    font_size = 10
    fig, axs = plt.subplots(1, 4, figsize=(15, 5)) # Adjusted figsize for horizontal layout
    fig.suptitle(f'Distance distribution per {cell_type_level} in `{dataset_dir}` (density normalization = {density_norm})', fontsize=font_size)

    # Keep the sequence of Cell Types consistent across plots.
    cell_types = sorted(df_all_edges_with_cell_type_level[cell_type_level].unique())

    # Create a color palette based on the number of unique classes
    color_palette = sns.color_palette("Spectral", n_colors=len(cell_types))

    # Create a dictionary mapping class to color
    class_color_dict = dict(zip(cell_types, color_palette))

    for i, region in enumerate(regions):
        data_reg = df_all_edges_with_cell_type_level[df_all_edges_with_cell_type_level['Unique Region'] == region]
        sns.violinplot(data=data_reg, x=cell_type_level, y="Distance", density_norm=density_norm, common_norm=True, cut=0, inner="box", split=False, palette=class_color_dict, alpha=.9, ax=axs[i], hue=cell_type_level, legend=False, order=cell_types, fill=True)
        axs[i].set_title(region, fontsize=font_size)
        axs[i].set_xlabel('', fontsize=font_size)
        axs[i].set_ylabel('Distance (\u03bcm)', fontsize=font_size)
        axs[i].tick_params(axis='x', labelrotation=90, labelsize=font_size)
        axs[i].tick_params(axis='both', labelsize=font_size)

    # Use fig.text for precise label positioning
    fig.figure.text(0.5, -0.02, f'{cell_type_level}', ha='center', va='bottom', fontsize=font_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    
    plt.show()

plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

# Bar plot for median distances per cell type per unique region faceted by donor.
def plot_median_distance_barplot_by_donor(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 1})
    font_size = 10
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # Convert region to categorical with ordered sequence
    df_all_edges_with_cell_type_level['Unique Region'] = pd.Categorical(df_all_edges_with_cell_type_level['Unique Region'], categories=regions, ordered=True)

    # Create faceted plot
    plt.figure(figsize=(20, 15))

    g = sns.FacetGrid(df_all_edges_with_cell_type_level, col='Donor', col_wrap=3, height=4, aspect=1.2)
    g.map_dataframe(sns.barplot, x='Unique Region', y='Distance', hue=cell_type_level,
                estimator=np.median, palette='Spectral', errorbar=None,
                order=regions)  # Specify the order for x-axis

    
    g.figure.suptitle(f'Median Distances per {cell_type_level} by Donor in `{dataset_dir}`', fontsize=font_size)

    # Customize each facet
    for ax in g.axes:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        ax.set_xlabel('', fontsize=font_size)
        ax.set_ylabel('Median Distance (μm)', fontsize=font_size)

    # Use fig.text for precise label positioning
    g.figure.text(0.5, -0.02, 'Unique Region', ha='center', va='bottom', fontsize=font_size)
    
    plt.legend(title=f'{cell_type_level}', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=font_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_median_distance_barplot_by_donor_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_median_distance_barplot_by_donor_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)

    plt.show()

plot_median_distance_barplot_by_donor(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

<Figure size 2000x1500 with 0 Axes>

Level Two Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level Two Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[(df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level) & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])]

df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level Two Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	beta cell	Body	29.846050	26.362284	5.186521	171.124545
1	beta cell	Head	26.123536	22.600111	4.854894	129.169230
2	beta cell	Neck	30.119941	26.340273	5.458938	196.232133
3	beta cell	Tail	26.932770	23.086793	4.838388	196.558617
4	gland epithelium cell	Body	18.637023	16.194135	4.310452	199.981049
5	gland epithelium cell	Head	17.119328	15.395129	3.900000	197.600860
6	gland epithelium cell	Neck	17.379948	15.100331	4.622770	199.862203
7	gland epithelium cell	Tail	18.298722	16.007811	4.386342	199.910605
8	unknown cell	Body	18.036458	13.952061	4.031129	199.990725
9	unknown cell	Head	16.176917	13.418271	3.721559	199.841938
10	unknown cell	Neck	16.725727	13.083195	4.001250	199.960921
11	unknown cell	Tail	17.291476	13.764447	4.252058	199.950819

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in Body:
     Level Two Cell Type  mean_distance
0              beta cell      29.846050
1  gland epithelium cell      18.637023
2           unknown cell      18.036458

Bottom 5 cell types in Body:
     Level Two Cell Type  mean_distance
0              beta cell      29.846050
1  gland epithelium cell      18.637023
2           unknown cell      18.036458

Top 5 cell types in Tail:
     Level Two Cell Type  mean_distance
0              beta cell      26.932770
1  gland epithelium cell      18.298722
2           unknown cell      17.291476

Bottom 5 cell types in Tail:
     Level Two Cell Type  mean_distance
0              beta cell      26.932770
1  gland epithelium cell      18.298722
2           unknown cell      17.291476

Top 5 cell types in Head:
     Level Two Cell Type  mean_distance
0              beta cell      26.123536
1  gland epithelium cell      17.119328
2           unknown cell      16.176917

Bottom 5 cell types in Head:
     Level Two Cell Type  mean_distance
0              beta cell      26.123536
1  gland epithelium cell      17.119328
2           unknown cell      16.176917

Top 5 cell types in Neck:
     Level Two Cell Type  mean_distance
0              beta cell      30.119941
1  gland epithelium cell      17.379948
2           unknown cell      16.725727

Bottom 5 cell types in Neck:
     Level Two Cell Type  mean_distance
0              beta cell      30.119941
1  gland epithelium cell      17.379948
2           unknown cell      16.725727

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in Body:
     Level Two Cell Type  median_distance
0              beta cell        26.362284
1  gland epithelium cell        16.194135
2           unknown cell        13.952061

Bottom 5 cell types in Body:
     Level Two Cell Type  median_distance
0              beta cell        26.362284
1  gland epithelium cell        16.194135
2           unknown cell        13.952061

Top 5 cell types in Tail:
     Level Two Cell Type  median_distance
0              beta cell        23.086793
1  gland epithelium cell        16.007811
2           unknown cell        13.764447

Bottom 5 cell types in Tail:
     Level Two Cell Type  median_distance
0              beta cell        23.086793
1  gland epithelium cell        16.007811
2           unknown cell        13.764447

Top 5 cell types in Head:
     Level Two Cell Type  median_distance
0              beta cell        22.600111
1  gland epithelium cell        15.395129
2           unknown cell        13.418271

Bottom 5 cell types in Head:
     Level Two Cell Type  median_distance
0              beta cell        22.600111
1  gland epithelium cell        15.395129
2           unknown cell        13.418271

Top 5 cell types in Neck:
     Level Two Cell Type  median_distance
0              beta cell        26.340273
1  gland epithelium cell        15.100331
2           unknown cell        13.083195

Bottom 5 cell types in Neck:
     Level Two Cell Type  median_distance
0              beta cell        26.340273
1  gland epithelium cell        15.100331
2           unknown cell        13.083195

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Body           18.32  13.52    73.8
Head           16.53   9.06    54.8
Neck           17.07  12.05    70.6
Tail           17.71  11.43    64.5

Cell Type Variability Analysis (sorted by CV):
                        mean    std  CV (%)
Level Two Cell Type                        
unknown cell           17.07  11.92    69.8
beta cell              28.16  16.82    59.7
gland epithelium cell  17.88  10.41    58.2

plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # Or, density_norm="count" or "area" based on preference.

plot_median_distance_barplot_by_donor(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

<Figure size 2000x1500 with 0 Axes>

Level Three Cell Type Analysis

# Get mean, median, minimum, maximum distance per cell type in all unique regions.
cell_type_level = 'Level Three Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[(df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level) & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])]

df_distance_stats_cell_type_level = df_all_edges_with_cell_type_level.groupby([cell_type_level, 'Unique Region']).agg(
    mean_distance=('Distance', 'mean'),
    median_distance=('Distance', 'median'),
    min_distance=('Distance', 'min'),
    max_distance=('Distance', 'max')
).reset_index()
df_distance_stats_cell_type_level

	Level Three Cell Type	Unique Region	mean_distance	median_distance	min_distance	max_distance
0	beta cell:pancreatic	Body	29.846050	26.362284	5.186521	171.124545
1	beta cell:pancreatic	Head	26.123536	22.600111	4.854894	129.169230
2	beta cell:pancreatic	Neck	30.119941	26.340273	5.458938	196.232133
3	beta cell:pancreatic	Tail	26.932770	23.086793	4.838388	196.558617
4	epithelial cell:ductal	Body	18.637023	16.194135	4.310452	199.981049
5	epithelial cell:ductal	Head	17.119328	15.395129	3.900000	197.600860
6	epithelial cell:ductal	Neck	17.379948	15.100331	4.622770	199.862203
7	epithelial cell:ductal	Tail	18.298722	16.007811	4.386342	199.910605
8	unknown cell	Body	18.036458	13.952061	4.031129	199.990725
9	unknown cell	Head	16.176917	13.418271	3.721559	199.841938
10	unknown cell	Neck	16.725727	13.083195	4.001250	199.960921
11	unknown cell	Tail	17.291476	13.764447	4.252058	199.950819

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in Body:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      29.846050
1  epithelial cell:ductal       18.637023
2             unknown cell      18.036458

Bottom 5 cell types in Body:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      29.846050
1  epithelial cell:ductal       18.637023
2             unknown cell      18.036458

Top 5 cell types in Tail:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      26.932770
1  epithelial cell:ductal       18.298722
2             unknown cell      17.291476

Bottom 5 cell types in Tail:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      26.932770
1  epithelial cell:ductal       18.298722
2             unknown cell      17.291476

Top 5 cell types in Head:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      26.123536
1  epithelial cell:ductal       17.119328
2             unknown cell      16.176917

Bottom 5 cell types in Head:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      26.123536
1  epithelial cell:ductal       17.119328
2             unknown cell      16.176917

Top 5 cell types in Neck:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      30.119941
1  epithelial cell:ductal       17.379948
2             unknown cell      16.725727

Bottom 5 cell types in Neck:
     Level Three Cell Type  mean_distance
0     beta cell:pancreatic      30.119941
1  epithelial cell:ductal       17.379948
2             unknown cell      16.725727

# Get top and bottom cell types for each unique region in the dataset.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:")
    print(top_bottom[0])
    print(f"\nBottom 5 cell types in {region}:")
    print(top_bottom[1])


Top 5 cell types in Body:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        26.362284
1  epithelial cell:ductal         16.194135
2             unknown cell        13.952061

Bottom 5 cell types in Body:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        26.362284
1  epithelial cell:ductal         16.194135
2             unknown cell        13.952061

Top 5 cell types in Tail:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        23.086793
1  epithelial cell:ductal         16.007811
2             unknown cell        13.764447

Bottom 5 cell types in Tail:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        23.086793
1  epithelial cell:ductal         16.007811
2             unknown cell        13.764447

Top 5 cell types in Head:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        22.600111
1  epithelial cell:ductal         15.395129
2             unknown cell        13.418271

Bottom 5 cell types in Head:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        22.600111
1  epithelial cell:ductal         15.395129
2             unknown cell        13.418271

Top 5 cell types in Neck:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        26.340273
1  epithelial cell:ductal         15.100331
2             unknown cell        13.083195

Bottom 5 cell types in Neck:
     Level Three Cell Type  median_distance
0     beta cell:pancreatic        26.340273
1  epithelial cell:ductal         15.100331
2             unknown cell        13.083195

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)


Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Body           18.32  13.52    73.8
Head           16.53   9.06    54.8
Neck           17.07  12.05    70.6
Tail           17.71  11.43    64.5

Cell Type Variability Analysis (sorted by CV):
                          mean    std  CV (%)
Level Three Cell Type                        
unknown cell             17.07  11.92    69.8
beta cell:pancreatic     28.16  16.82    59.7
epithelial cell:ductal   17.88  10.41    58.2

plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # Or, density_norm="count" or "area" based on preference.

plot_median_distance_barplot_by_donor(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

<Figure size 2000x1500 with 0 Axes>