Distance Analysis: spleen-codex-ufl

Analyze and visualize cell-to-nearest-endothelial-cell distance distributions for the spleen-codex-ufl dataset.
import numpy as np
import pandas as pd
import os
import json
import requests
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import plotly.express as px

from _cde_compute_edges_from_nodes import *

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Silence ALL Python warnings for the rest of the session. NOTE(review): this also hides
# deprecation warnings from pandas/seaborn; consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")
# Root data directory. NOTE(review): absolute, user-specific path; adjust when running elsewhere.
basepath = "/u/yashjain/hra-cell-distance-analysis/data"
# Name of the dataset analysed here; also used in figure titles and output file names.
dataset_dir = "spleen-codex-ufl"
# Directory (relative to basepath) holding the "<dataset>-nodes.csv" files.
data_filedir = os.path.join("data-processed-nodes-with-harmonized-cell-types", dataset_dir)
# Directory (relative to basepath) holding the "...-edges.csv" files.
output_edge_dir = os.path.join("data-processed-edges", dataset_dir)
# Directory (relative to basepath) where generated figures are written.
figures_output_dir = "generated-figures"
# Function to load your data
def load_data(path, edges=False):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or file-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    edges : bool, default False
        When true, the file is treated as header-less and its columns are
        named ``cell_id, x1, y1, z1, x2, y2, z2``. Otherwise the first row
        of the file is used as the header.

    Returns
    -------
    pandas.DataFrame
    """
    if not edges:
        return pd.read_csv(path)

    # Raw edge files carry no header row, so the column names are supplied here.
    edge_columns = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
    return pd.read_csv(path, header=None, names=edge_columns)
# Function to read all files ending with "-nodes.csv" in the `data_filedir` directory into a single DataFrame. 
# Another additional column `Dataset` is added to identify the dataset name which comes from the filename before the `-nodes.csv` suffix.

# Additionally, function reads all files ending with "-edges.csv" in the `output_edge_dir` directory into a single DataFrame. 
# Three additional columns are added "Dataset", "Anchor Cell Type", and "Anchor Cell Type Level" to identify the dataset name, anchor cell type, and anchor cell type level respectively which come from the filename before the `.csv` suffix.
# The three additional columns are created by splitting the filename on the `-` character, and extracting the relevant parts.
# On splitting, the first part is the dataset name, second part is the anchor cell type level, and third part is the anchor cell type, and last part is the `edges` suffix.
# When reading files, check if the file has the correct format (i.e., ends with `-edges.csv`).

# Additionally, the function merges the edges DataFrame with the nodes DataFrame to get the cell type information for the cell identified by each edge's `cell_id`.
# This is done by reading the corresponding nodes file from the `data_filedir` directory for each edges file, and merging it with the edges DataFrame on the `cell_id` column.
# The merged DataFrame contains the edges with additional columns for the cell type information.

# The function returns three DataFrames:
# 1. `merged_nodes`: DataFrame containing all nodes with an additional column `Dataset`.
# 2. `merged_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, and `Anchor Cell Type Level`.
# 3. `merged_nodes_for_all_edges`: DataFrame containing all edges with additional columns `Dataset`, `Anchor Cell Type`, `Anchor Cell Type Level`, and the cell type information for cells.
def read_all_edge_datasets(basepath, data_filedir, output_edge_dir):
    """Load every edge file of a dataset collection together with its node files.

    Edge files are read from ``<basepath>/<output_edge_dir>`` and must be named
    ``<dataset>-<anchor cell type level>-<anchor cell type>-edges.csv``. For
    each dataset the node file ``<basepath>/<data_filedir>/<dataset>-nodes.csv``
    is read as well. Files that do not end in ``-edges.csv`` are ignored.

    Parameters
    ----------
    basepath : str
        Root directory that contains both sub-directories.
    data_filedir : str
        Sub-directory (relative to ``basepath``) with the ``-nodes.csv`` files.
    output_edge_dir : str
        Sub-directory (relative to ``basepath``) with the ``-edges.csv`` files.

    Returns
    -------
    merged_nodes : pandas.DataFrame
        All node rows, one copy per dataset, with an added ``Dataset`` column.
    merged_edges : pandas.DataFrame
        All edge rows with added ``Dataset``, ``Anchor Cell Type`` and
        ``Anchor Cell Type Level`` columns; ``distance`` is renamed to
        ``Distance``.
    merged_nodes_for_all_edges : pandas.DataFrame
        ``merged_edges`` plus the three ``Level ... Cell Type`` columns of the
        node whose row position equals the edge's ``cell_id``.

    Raises
    ------
    ValueError
        If an edge file name does not split into exactly three ``-`` separated
        parts, or if the edge directory contains no ``-edges.csv`` file.
    """
    edge_suffix = "-edges.csv"
    cell_type_columns = ['Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']
    edges_root = os.path.join(basepath, output_edge_dir)
    nodes_root = os.path.join(basepath, data_filedir)

    # One node table per dataset. Several edge files share the same dataset, so
    # caching avoids re-reading the file and, more importantly, avoids adding
    # the same node rows to merged_nodes once per edge file.
    nodes_by_dataset = {}
    all_edges_files = []
    all_nodes_edges_files = []

    # Sorted so the row order does not depend on the file system's listing order.
    for file in sorted(os.listdir(edges_root)):
        if not file.endswith(edge_suffix):
            continue

        parts = file[:-len(edge_suffix)].split('-')
        if len(parts) != 3:
            raise ValueError(
                f"Unexpected edge file name '{file}': expected "
                "'<dataset>-<anchor cell type level>-<anchor cell type>-edges.csv'."
            )
        dataset_name, anchor_cell_type_level, anchor_cell_type = parts

        # Edge files written by the pipeline carry a header row.
        edges_df = pd.read_csv(os.path.join(edges_root, file))
        edges_df['Dataset'] = dataset_name
        edges_df['Anchor Cell Type'] = anchor_cell_type
        edges_df['Anchor Cell Type Level'] = anchor_cell_type_level
        edges_df.rename(columns={"distance": "Distance"}, inplace=True)
        all_edges_files.append(edges_df)

        nodes_df = nodes_by_dataset.get(dataset_name)
        if nodes_df is None:
            nodes_df = pd.read_csv(os.path.join(nodes_root, f"{dataset_name}-nodes.csv"))
            nodes_df['Dataset'] = dataset_name
            nodes_by_dataset[dataset_name] = nodes_df

        # `cell_id` in the edge table is the zero-based row position of the cell
        # in the node file, which is exactly the default index of nodes_df.
        edges_nodes_df = pd.merge(
            edges_df,
            nodes_df[cell_type_columns],
            how='left',
            left_on='cell_id',
            right_index=True,
        )
        all_nodes_edges_files.append(edges_nodes_df)

    if not all_edges_files:
        raise ValueError(f"No '*{edge_suffix}' files found in '{edges_root}'.")

    merged_edges = pd.concat(all_edges_files, ignore_index=True)
    merged_nodes = pd.concat(list(nodes_by_dataset.values()), ignore_index=True)
    merged_nodes_for_all_edges = pd.concat(all_nodes_edges_files, ignore_index=True)

    return merged_nodes, merged_edges, merged_nodes_for_all_edges
def create_directory(directory):
    """Create *directory* (including missing parents) unless it already exists.

    A short status message is printed in either case. Nothing is returned.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

Get initial statistics and identify endothelial cell categories for dataset.

# Load the node table, the edge table, and the edge table joined with cell-type columns
# for every edge file found under basepath/output_edge_dir, then preview the node table.
df_all_nodes, df_all_edges, df_all_edges_with_cell_types = read_all_edge_datasets(basepath, data_filedir, output_edge_dir)
df_all_nodes.head(5)
x y Original Cell Type Level Three Cell Type Level Three CL Label Level Three CL ID CL_Match/3 Level Two Cell Type Level Two CL Label Level Two CL ID CL_Match/2 Level One Cell Type Level One CL Label Level One CL ID CL_Match/1 Dataset
0 0.0 3403.933 Macrophages macrophage macrophage CL:0000235 skos:exactMatch macrophage macrophage CL:0000235 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch FSLD
1 0.0 3356.808 Sinusoidal cells endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell endothelial cell CL:0000115 skos:exactMatch FSLD
2 0.0 3313.830 Sinusoidal cells endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell endothelial cell CL:0000115 skos:exactMatch FSLD
3 0.0 3304.405 indistinct unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch FSLD
4 0.0 3281.031 indistinct unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch FSLD
# Print the number of unique cell types at each annotation level (Original, Level Three,
# Level Two, Level One). Counts are taken over all loaded datasets combined, not per dataset.
print("Total number of unique cell types per cell type annotation level:")
unique_cell_types = {
    level_name: df_all_nodes[level_name].nunique()
    for level_name in (
        'Original Cell Type',
        'Level Three Cell Type',
        'Level Two Cell Type',
        'Level One Cell Type',
    )
}
for cell_type, count in unique_cell_types.items():
    print(f"{cell_type}: {count}")
Total number of unique cell types per cell type annnotation level:
Original Cell Type: 12
Level Three Cell Type: 12
Level Two Cell Type: 9
Level One Cell Type: 3
# For each annotation level, collect the distinct cell type names that contain the
# word "endothelial" (case-insensitive; missing values never match). The result maps
# the annotation level to a list of matching names.
endothelial_cell_types = {
    column: df_all_nodes.loc[
        df_all_nodes[column].str.contains("endothelial", case=False, na=False),
        column,
    ].unique().tolist()
    for column in (
        'Original Cell Type',
        'Level Three Cell Type',
        'Level Two Cell Type',
        'Level One Cell Type',
    )
}

print("\nEndothelial cell types per cell type annotation level:")
for level, cell_types in endothelial_cell_types.items():
    print(f"\n{level}:")
    for cell in cell_types:
        print(f"  - {cell}")

Endothelial cell types per cell type annotation level:

Original Cell Type:
  - blood endothelial

Level Three Cell Type:
  - endothelial cell of sinusoid
  - endothelial cell of lymphatic vessel
  - endothelial cell of vascular tree

Level Two Cell Type:
  - endothelial cell of sinusoid
  - endothelial cell of lymphatic vessel
  - endothelial cell of vascular tree

Level One Cell Type:
  - endothelial cell
# Anchor cell type (the kind of endothelial cell distances are measured to) for each
# hierarchical annotation level, chosen from the categories listed in the previous cell.
# The distance analysis at all three levels is limited to the anchor given here.
anchor_cell_type_dict = {
    # Sinusoid chosen because it has far more instances in this dataset than the
    # other endothelial cell types.
    'Level Three Cell Type': 'endothelial cell of sinusoid',
    'Level Two Cell Type': 'endothelial cell of sinusoid',
    'Level One Cell Type': 'endothelial cell',
}

# Annotation levels analysed, in order. "Original Cell Type" is skipped because it is
# not a hierarchical level.
type_field_list = list(anchor_cell_type_dict)

Process datasets to add region information to Nodes files.

df_all_nodes.head()
x y Original Cell Type Level Three Cell Type Level Three CL Label Level Three CL ID CL_Match/3 Level Two Cell Type Level Two CL Label Level Two CL ID CL_Match/2 Level One Cell Type Level One CL Label Level One CL ID CL_Match/1 Dataset
0 0.0 3403.933 Macrophages macrophage macrophage CL:0000235 skos:exactMatch macrophage macrophage CL:0000235 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch FSLD
1 0.0 3356.808 Sinusoidal cells endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell endothelial cell CL:0000115 skos:exactMatch FSLD
2 0.0 3313.830 Sinusoidal cells endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell of sinusoid endothelial cell of sinusoid CL:0002262 skos:exactMatch endothelial cell endothelial cell CL:0000115 skos:exactMatch FSLD
3 0.0 3304.405 indistinct unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch FSLD
4 0.0 3281.031 indistinct unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch unknown cell cell:unknown CL:0000000 skos:narrowMatch FSLD
# Add a "Unique Region" column to df_all_nodes. For this dataset every value of "Dataset"
# is used as its own region, so the column is a direct copy of "Dataset".
df_all_nodes['Unique Region'] = df_all_nodes['Dataset']
# Disabled alternatives that would translate dataset names through a `region_map` lookup
# (no such mapping is defined in the visible part of this notebook):
# df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].map(region_map)
# df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].str.split('-').str[1].map(region_map)

# Check that the new column was created correctly.
df_all_nodes[['Dataset', 'Unique Region']].head(5)
Dataset Unique Region
0 FSLD FSLD
1 FSLD FSLD
2 FSLD FSLD
3 FSLD FSLD
4 FSLD FSLD
# List the distinct regions found in the node table.
node_regions = df_all_nodes['Unique Region'].unique()
print("\nUnique Regions in the data:")
print(node_regions)

# Report how many distinct regions there are.
print(f"Total number of unique regions: {df_all_nodes['Unique Region'].nunique()}")

# For each region, report how many distinct datasets contribute to it.
print("\nNumber of unique datasets per unique region:")
for region in node_regions:
    in_region = df_all_nodes['Unique Region'] == region
    num_datasets = df_all_nodes.loc[in_region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")

Unique Regions in the data:
['FSLD' 'KSFB' 'NGPL' 'PBVN' 'PKHL' 'XXCD']
Total number of unique regions: 6

Number of unique datasets per unique region:
FSLD: 1
KSFB: 1
NGPL: 1
PBVN: 1
PKHL: 1
XXCD: 1

Process datasets to add region information to Edges files.

df_all_edges.head(5)
cell_id x1 y1 z1 x2 y2 z2 Distance Dataset Anchor Cell Type Anchor Cell Type Level
0 0 0.000 3403.933 0 9.425 3409.211 0 10.802218 FSLD endothelial cell of sinusoid Level Three Cell Type
1 24 1.131 3416.374 0 9.425 3409.211 0 10.958969 FSLD endothelial cell of sinusoid Level Three Cell Type
2 43 12.818 3415.620 0 9.425 3409.211 0 7.251740 FSLD endothelial cell of sinusoid Level Three Cell Type
3 95 21.866 3407.703 0 9.425 3409.211 0 12.532061 FSLD endothelial cell of sinusoid Level Three Cell Type
4 115 22.997 3415.997 0 9.425 3409.211 0 15.173957 FSLD endothelial cell of sinusoid Level Three Cell Type
# Add a "Unique Region" column to the edge table in the same way as for the node table:
# each value of "Dataset" is used as its own region, so the column is a copy of "Dataset".
df_all_edges['Unique Region'] = df_all_edges['Dataset']
# Disabled alternative using a `region_map` lookup (not defined in the visible part of this notebook):
# df_all_edges['Unique Region'] = df_all_edges['Dataset'].map(region_map)


# Check that the new column was created correctly.
df_all_edges[['Dataset', 'Unique Region']].head(5)
Dataset Unique Region
0 FSLD FSLD
1 FSLD FSLD
2 FSLD FSLD
3 FSLD FSLD
4 FSLD FSLD
# List the distinct regions found in the edge table.
edge_regions = df_all_edges['Unique Region'].unique()
print("\nUnique Regions in the data:")
print(edge_regions)

# Report how many distinct regions there are.
print(f"Total number of unique regions: {df_all_edges['Unique Region'].nunique()}")

# For each region, report how many distinct datasets contribute to it.
print("\nNumber of unique datasets per unique region:")
for region in edge_regions:
    in_region = df_all_edges['Unique Region'] == region
    num_datasets = df_all_edges.loc[in_region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")

Unique Regions in the data:
['FSLD' 'KSFB' 'NGPL' 'PBVN' 'PKHL' 'XXCD']
Total number of unique regions: 6

Number of unique datasets per unique region:
FSLD: 1
KSFB: 1
NGPL: 1
PBVN: 1
PKHL: 1
XXCD: 1
# Add the same "Unique Region" column (a copy of "Dataset") to the edge table that carries
# the cell-type columns.
df_all_edges_with_cell_types['Unique Region'] = df_all_edges_with_cell_types['Dataset']
# Disabled alternative using a `region_map` lookup (not defined in the visible part of this notebook):
# df_all_edges_with_cell_types['Unique Region'] = df_all_edges_with_cell_types['Dataset'].map(region_map)

# Check that the new column was created correctly.
df_all_edges_with_cell_types[['Dataset', 'Unique Region']].head(5)
Dataset Unique Region
0 FSLD FSLD
1 FSLD FSLD
2 FSLD FSLD
3 FSLD FSLD
4 FSLD FSLD
df_all_nodes.head(1)
x y Original Cell Type Level Three Cell Type Level Three CL Label Level Three CL ID CL_Match/3 Level Two Cell Type Level Two CL Label Level Two CL ID CL_Match/2 Level One Cell Type Level One CL Label Level One CL ID CL_Match/1 Dataset Unique Region
0 0.0 3403.933 Macrophages macrophage macrophage CL:0000235 skos:exactMatch macrophage macrophage CL:0000235 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch FSLD FSLD
df_all_edges.head(1)
cell_id x1 y1 z1 x2 y2 z2 Distance Dataset Anchor Cell Type Anchor Cell Type Level Unique Region
0 0 0.0 3403.933 0 9.425 3409.211 0 10.802218 FSLD endothelial cell of sinusoid Level Three Cell Type FSLD
df_all_edges_with_cell_types.head(1)
cell_id x1 y1 z1 x2 y2 z2 Distance Dataset Anchor Cell Type Anchor Cell Type Level Level Three Cell Type Level Two Cell Type Level One Cell Type Unique Region
0 0 0.0 3403.933 0 9.425 3409.211 0 10.802218 FSLD endothelial cell of sinusoid Level Three Cell Type macrophage macrophage immune cell FSLD

Node Analysis

# Plot number of cells per cell type in the same plot. Color by cell type and unique region. Output figure saved in existing `figures_output_dir`.
def plot_cells_per_celltype(df, type_field, output_dir):
    """Draw and save a bar chart of cell counts per cell type, split by region.

    Parameters
    ----------
    df : pandas.DataFrame
        Node table; must contain ``type_field`` and ``'Unique Region'`` columns.
    type_field : str
        Name of the cell type column shown on the x-axis.
    output_dir : str
        Directory that receives the PNG and SVG files. It is created if missing.

    The figure is written as
    ``<dataset_dir>_cells_per_celltype_<type_field>.png`` and ``.svg``
    (``dataset_dir`` is the module-level dataset name), then shown and closed.
    """
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    plt.figure(figsize=(20, 8))
    sns.countplot(data=df, x=type_field, palette='Spectral', hue='Unique Region')

    plt.title(f'Number of Cells per {type_field} in `{dataset_dir}`')
    plt.xlabel(type_field)
    plt.ylabel('Number of Cells')
    plt.xticks(rotation=90)
    # For numbers on y-axis, do not use scientific notation.
    plt.ticklabel_format(style='plain', axis='y')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Unique Region', bbox_to_anchor=(0.85, 1), loc='upper left')
    plt.tight_layout()

    # Save only after all styling has been applied; otherwise the files on disk
    # would miss the legend placement, axis labels, tick format and grid.
    os.makedirs(output_dir, exist_ok=True)
    figure_stem = os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}')
    for extension in ('png', 'svg'):
        plt.savefig(f'{figure_stem}.{extension}', dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)

    # Show the plot
    plt.show()
    plt.close()
# Produce one cell-count chart per hierarchical annotation level.
for type_field in type_field_list:
    plot_cells_per_celltype(df_all_nodes, type_field, os.path.join(basepath, figures_output_dir))

Distance Analysis

# Summarize distances (mean, median, min, max) for every combination of region,
# anchor cell type, and anchor cell type level.
distance_aggregations = {
    'mean_distance': ('Distance', 'mean'),
    'median_distance': ('Distance', 'median'),
    'min_distance': ('Distance', 'min'),
    'max_distance': ('Distance', 'max'),
}
df_distance_stats = (
    df_all_edges_with_cell_types
    .groupby(['Unique Region', 'Anchor Cell Type', 'Anchor Cell Type Level'])
    .agg(**distance_aggregations)
    .reset_index()
)
# Display the full distance statistics table.
df_distance_stats
Unique Region Anchor Cell Type Anchor Cell Type Level mean_distance median_distance min_distance max_distance
0 FSLD endothelial cell Level One Cell Type 11.933050 10.185979 0.377000 120.663560
1 FSLD endothelial cell of lymphatic vessel Level Three Cell Type 69.140742 62.332820 0.377000 199.993437
2 FSLD endothelial cell of lymphatic vessel Level Two Cell Type 69.140742 62.332820 0.377000 199.993437
3 FSLD endothelial cell of sinusoid Level Three Cell Type 17.758929 12.486613 0.377000 174.062170
4 FSLD endothelial cell of sinusoid Level Two Cell Type 17.758929 12.486613 0.377000 174.062170
5 FSLD endothelial cell of vascular tree Level Three Cell Type 31.444898 28.859590 0.377000 134.193466
6 FSLD endothelial cell of vascular tree Level Two Cell Type 31.444898 28.859590 0.377000 134.193466
7 KSFB endothelial cell Level One Cell Type 18.516381 12.999668 0.377000 159.194472
8 KSFB endothelial cell of lymphatic vessel Level Three Cell Type 100.106671 98.720745 0.377000 199.996635
9 KSFB endothelial cell of lymphatic vessel Level Two Cell Type 100.106671 98.720745 0.377000 199.996635
10 KSFB endothelial cell of sinusoid Level Three Cell Type 32.301586 17.541644 0.377000 199.992015
11 KSFB endothelial cell of sinusoid Level Two Cell Type 32.301586 17.541644 0.377000 199.992015
12 KSFB endothelial cell of vascular tree Level Three Cell Type 34.072529 30.051417 0.377000 162.403006
13 KSFB endothelial cell of vascular tree Level Two Cell Type 34.072529 30.051417 0.377000 162.403006
14 NGPL endothelial cell Level One Cell Type 19.649715 12.756869 0.377000 199.990594
15 NGPL endothelial cell of lymphatic vessel Level Three Cell Type 105.868068 107.070655 0.377000 199.996635
16 NGPL endothelial cell of lymphatic vessel Level Two Cell Type 105.868068 107.070655 0.377000 199.996635
17 NGPL endothelial cell of sinusoid Level Three Cell Type 32.165822 17.358384 0.377000 199.946882
18 NGPL endothelial cell of sinusoid Level Two Cell Type 32.165822 17.358384 0.377000 199.946882
19 NGPL endothelial cell of vascular tree Level Three Cell Type 36.527379 31.724052 0.377000 199.990594
20 NGPL endothelial cell of vascular tree Level Two Cell Type 36.527379 31.724052 0.377000 199.990594
21 PBVN endothelial cell Level One Cell Type 16.689185 12.823543 0.377000 181.684667
22 PBVN endothelial cell of lymphatic vessel Level Three Cell Type 90.564477 86.386076 0.533159 199.999832
23 PBVN endothelial cell of lymphatic vessel Level Two Cell Type 90.564477 86.386076 0.533159 199.999832
24 PBVN endothelial cell of sinusoid Level Three Cell Type 24.695179 16.694759 0.377000 199.993437
25 PBVN endothelial cell of sinusoid Level Two Cell Type 24.695179 16.694759 0.377000 199.993437
26 PBVN endothelial cell of vascular tree Level Three Cell Type 33.754132 30.265846 0.377000 191.104792
27 PBVN endothelial cell of vascular tree Level Two Cell Type 33.754132 30.265846 0.377000 191.104792
28 PKHL endothelial cell Level One Cell Type 22.572654 14.562159 0.377000 165.169461
29 PKHL endothelial cell of lymphatic vessel Level Three Cell Type 113.542690 116.746498 0.842998 199.996635
30 PKHL endothelial cell of lymphatic vessel Level Two Cell Type 113.542690 116.746498 0.842998 199.996635
31 PKHL endothelial cell of sinusoid Level Three Cell Type 32.273382 17.602307 0.377000 199.996635
32 PKHL endothelial cell of sinusoid Level Two Cell Type 32.273382 17.602307 0.377000 199.996635
33 PKHL endothelial cell of vascular tree Level Three Cell Type 41.424710 36.574829 0.377000 177.757399
34 PKHL endothelial cell of vascular tree Level Two Cell Type 41.424710 36.574829 0.377000 177.757399
35 XXCD endothelial cell Level One Cell Type 16.957493 12.582993 0.377000 182.868318
36 XXCD endothelial cell of lymphatic vessel Level Three Cell Type 107.389387 108.560945 0.377000 199.999832
37 XXCD endothelial cell of lymphatic vessel Level Two Cell Type 107.389387 108.560945 0.377000 199.999832
38 XXCD endothelial cell of sinusoid Level Three Cell Type 24.938923 15.197356 0.377000 199.913826
39 XXCD endothelial cell of sinusoid Level Two Cell Type 24.938923 15.197356 0.377000 199.913826
40 XXCD endothelial cell of vascular tree Level Three Cell Type 38.484744 34.769948 0.377000 182.868318
41 XXCD endothelial cell of vascular tree Level Two Cell Type 38.484744 34.769948 0.377000 182.868318

Level One Cell Type Analysis

# Restrict the edges to the chosen annotation level and its configured anchor cell type,
# then summarize distances (mean, median, min, max) per cell type and region.
cell_type_level = 'Level One Cell Type'
matches_level = df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level
matches_anchor = df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level]
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[matches_level & matches_anchor]

level_distance_aggregations = {
    'mean_distance': ('Distance', 'mean'),
    'median_distance': ('Distance', 'median'),
    'min_distance': ('Distance', 'min'),
    'max_distance': ('Distance', 'max'),
}
df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])
    .agg(**level_distance_aggregations)
    .reset_index()
)
df_distance_stats_cell_type_level
Level One Cell Type Unique Region mean_distance median_distance min_distance max_distance
0 immune cell FSLD 12.325330 10.515530 0.377 120.663560
1 immune cell KSFB 19.574997 13.498495 0.377 159.194472
2 immune cell NGPL 21.022792 13.514280 0.377 199.990594
3 immune cell PBVN 17.059760 12.983258 0.377 181.684667
4 immune cell PKHL 22.544364 14.330960 0.377 165.169461
5 immune cell XXCD 17.306820 12.644964 0.377 182.868318
6 unknown cell FSLD 9.336397 7.997378 0.377 83.688056
7 unknown cell KSFB 12.696620 10.633133 0.377 150.979439
8 unknown cell NGPL 10.879824 9.311213 0.377 97.890139
9 unknown cell PBVN 14.605778 11.783889 0.377 145.222827
10 unknown cell PKHL 22.720231 16.105447 0.377 146.043105
11 unknown cell XXCD 14.710918 12.308920 0.377 161.352917
# Get top five and bottom five cell types with respect to mean distance in each unique region separately.
def get_top_bottom_cell_types_by_mean(df, cell_type_level, unique_region, top_n=5):
    """Rank the cell types of one region by mean distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with ``'Unique Region'``, ``'Distance'`` and the
        ``cell_type_level`` column.
    cell_type_level : str
        Column whose values are ranked.
    unique_region : str
        Region to restrict the ranking to.
    top_n : int, default 5
        Number of rows taken from each end of the ranking.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The ``top_n`` rows with the largest mean distance and the ``top_n``
        rows with the smallest, both ordered from largest to smallest. With
        fewer than ``2 * top_n`` cell types the two frames overlap.
    """
    region_rows = df.loc[df['Unique Region'] == unique_region]

    # One row per cell type holding its mean distance.
    mean_by_type = region_rows.groupby(cell_type_level, as_index=False).agg(
        mean_distance=('Distance', 'mean')
    )

    ranked = mean_by_type.sort_values(by='mean_distance', ascending=False)
    return ranked.head(top_n), ranked.tail(top_n)

# For every region, print the cell types with the largest and the smallest mean distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    top_cell_types, bottom_cell_types = top_bottom
    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)

Top 5 cell types in FSLD:
  Level One Cell Type  mean_distance
0         immune cell      12.325330
1        unknown cell       9.336397

Bottom 5 cell types in FSLD:
  Level One Cell Type  mean_distance
0         immune cell      12.325330
1        unknown cell       9.336397

Top 5 cell types in KSFB:
  Level One Cell Type  mean_distance
0         immune cell      19.574997
1        unknown cell      12.696620

Bottom 5 cell types in KSFB:
  Level One Cell Type  mean_distance
0         immune cell      19.574997
1        unknown cell      12.696620

Top 5 cell types in NGPL:
  Level One Cell Type  mean_distance
0         immune cell      21.022792
1        unknown cell      10.879824

Bottom 5 cell types in NGPL:
  Level One Cell Type  mean_distance
0         immune cell      21.022792
1        unknown cell      10.879824

Top 5 cell types in PBVN:
  Level One Cell Type  mean_distance
0         immune cell      17.059760
1        unknown cell      14.605778

Bottom 5 cell types in PBVN:
  Level One Cell Type  mean_distance
0         immune cell      17.059760
1        unknown cell      14.605778

Top 5 cell types in PKHL:
  Level One Cell Type  mean_distance
1        unknown cell      22.720231
0         immune cell      22.544364

Bottom 5 cell types in PKHL:
  Level One Cell Type  mean_distance
1        unknown cell      22.720231
0         immune cell      22.544364

Top 5 cell types in XXCD:
  Level One Cell Type  mean_distance
0         immune cell      17.306820
1        unknown cell      14.710918

Bottom 5 cell types in XXCD:
  Level One Cell Type  mean_distance
0         immune cell      17.306820
1        unknown cell      14.710918
# Get top five and bottom five cell types with respect to median distance in each unique region separately.
def get_top_bottom_cell_types_by_median(df, cell_type_level, unique_region, top_n=5):
    """Rank the cell types of one region by median distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge table with ``'Unique Region'``, ``'Distance'`` and the
        ``cell_type_level`` column.
    cell_type_level : str
        Column whose values are ranked.
    unique_region : str
        Region to restrict the ranking to.
    top_n : int, default 5
        Number of rows taken from each end of the ranking.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The ``top_n`` rows with the largest median distance and the ``top_n``
        rows with the smallest, both ordered from largest to smallest. With
        fewer than ``2 * top_n`` cell types the two frames overlap.
    """
    region_rows = df.loc[df['Unique Region'] == unique_region]

    # One row per cell type holding its median distance.
    median_by_type = region_rows.groupby(cell_type_level, as_index=False).agg(
        median_distance=('Distance', 'median')
    )

    ranked = median_by_type.sort_values(by='median_distance', ascending=False)
    return ranked.head(top_n), ranked.tail(top_n)

# For every region, print the cell types with the largest and the smallest median distance.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    top_cell_types, bottom_cell_types = top_bottom
    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)

Top 5 cell types in FSLD:
  Level One Cell Type  median_distance
0         immune cell        10.515530
1        unknown cell         7.997378

Bottom 5 cell types in FSLD:
  Level One Cell Type  median_distance
0         immune cell        10.515530
1        unknown cell         7.997378

Top 5 cell types in KSFB:
  Level One Cell Type  median_distance
0         immune cell        13.498495
1        unknown cell        10.633133

Bottom 5 cell types in KSFB:
  Level One Cell Type  median_distance
0         immune cell        13.498495
1        unknown cell        10.633133

Top 5 cell types in NGPL:
  Level One Cell Type  median_distance
0         immune cell        13.514280
1        unknown cell         9.311213

Bottom 5 cell types in NGPL:
  Level One Cell Type  median_distance
0         immune cell        13.514280
1        unknown cell         9.311213

Top 5 cell types in PBVN:
  Level One Cell Type  median_distance
0         immune cell        12.983258
1        unknown cell        11.783889

Bottom 5 cell types in PBVN:
  Level One Cell Type  median_distance
0         immune cell        12.983258
1        unknown cell        11.783889

Top 5 cell types in PKHL:
  Level One Cell Type  median_distance
1        unknown cell        16.105447
0         immune cell        14.330960

Bottom 5 cell types in PKHL:
  Level One Cell Type  median_distance
1        unknown cell        16.105447
0         immune cell        14.330960

Top 5 cell types in XXCD:
  Level One Cell Type  median_distance
0         immune cell        12.644964
1        unknown cell        12.308920

Bottom 5 cell types in XXCD:
  Level One Cell Type  median_distance
0         immune cell        12.644964
1        unknown cell        12.308920
# Calculate regional variability
def _distance_variability(df, group_column):
    """Return mean, std and CV (%) of ``Distance`` for each value of *group_column*."""
    stats = df.groupby(group_column)['Distance'].agg([
        ('mean', 'mean'),
        ('std', 'std')
    ])
    # Compute the CV from the unrounded statistics; deriving it from the rounded
    # mean/std would add avoidable rounding error to the reported percentage.
    cv_percent = stats['std'] / stats['mean'] * 100

    stats = stats.round(2)
    stats['CV (%)'] = cv_percent.round(1)
    return stats


def calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level):
    """Print distance variability per region and per cell type.

    Parameters
    ----------
    df_all_edges_with_cell_type_level : pandas.DataFrame
        Edge table with ``'Unique Region'``, ``'Distance'`` and the
        ``cell_type_level`` column.
    cell_type_level : str
        Name of the cell type column used for the second table.

    Two tables are printed, each with the mean and standard deviation
    (rounded to 2 decimals) and the coefficient of variation in percent
    (rounded to 1 decimal): one grouped by region, one grouped by cell type
    and sorted by CV in descending order. Nothing is returned.
    """
    regional_variability = _distance_variability(df_all_edges_with_cell_type_level, 'Unique Region')

    print("\nRegional Variability Analysis:")
    print("Mean: Average distance in each region")
    print("Std: Standard deviation of distances")
    print("CV: Coefficient of Variation (std/mean * 100%)")
    print(regional_variability)

    # Same statistics, grouped by cell type instead of region.
    cell_type_variability = _distance_variability(df_all_edges_with_cell_type_level, cell_type_level)

    print("\nCell Type Variability Analysis (sorted by CV):")
    print(cell_type_variability.sort_values('CV (%)', ascending=False))

calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)

Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
FSLD           11.93   7.39    61.9
KSFB           18.52  18.52   100.0
NGPL           19.65  21.17   107.7
PBVN           16.69  14.62    87.6
PKHL           22.57  22.69   100.5
XXCD           16.96  16.01    94.4

Cell Type Variability Analysis (sorted by CV):
                      mean    std  CV (%)
Level One Cell Type                      
immune cell          18.10  18.36   101.4
unknown cell         14.16  12.15    85.8
# Standard region order for the plots: every distinct "Unique Region" value, in order of
# first appearance in the edge table.
regions = pd.unique(df_all_edges_with_cell_types['Unique Region']).tolist()
print("\nRegions in the data:")
print(regions)

Regions in the data:
['FSLD', 'KSFB', 'NGPL', 'PBVN', 'PKHL', 'XXCD']
# Generate Violin Plot
def plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm='area'):
    """Plot one violin of `Distance` per cell type and save it as PNG and SVG.

    Args:
        df_all_edges_with_cell_type_level: DataFrame holding a `Distance` column
            and the column named by `cell_type_level`.
        cell_type_level: Column used for the x-axis categories; also embedded in
            the title, the x-axis label and the output filenames.
        output_dir: Directory the two image files are written to.
        density_norm: Forwarded unchanged to `sns.violinplot`.

    The files are named
    `{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.png` / `.svg`,
    where `dataset_dir` is the module-level dataset name.
    """
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(10, 6))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # `palette` needs an explicit `hue`; mapping hue to the x variable with the
    # legend disabled colours each violin individually, matching
    # plot_violin_plots_all_regions.
    sns.violinplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y="Distance",
        hue=cell_type_level,
        legend=False,
        density_norm=density_norm,
        common_norm=True,
        cut=0,
        inner="box",
        split=False,
        palette='Spectral',
        alpha=.9,
    )

    # Kept from the original: these update seaborn's global settings after the
    # violins are drawn, so they apply to whatever is created from here on.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper")

    font_size = 10
    # No plt.legend() call: the plot has no labelled artists, so a legend would
    # carry no entries.

    plt.title(f'Violin Plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=font_size)

    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)

    # Uniform tick label size; x labels are rotated so long cell type names fit.
    plt.xticks(rotation=90, fontsize=font_size)
    plt.yticks(fontsize=font_size)

    plt.tight_layout()

    # Save the same figure as a raster (PNG) and a vector (SVG) file.
    for extension in ('png', 'svg'):
        plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.{extension}'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.show()

# Draw and save the per-cell-type violin plot for the current cell type level
# into the shared figures directory.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Plot box plots of `Distance` per cell type, split by region, and save them.

    Args:
        df_all_edges_with_cell_type_level: DataFrame with `Distance`,
            `Unique Region` and the column named by `cell_type_level`.
            It is not modified.
        cell_type_level: Column used for the x-axis categories; also embedded in
            the title, the x-axis label and the output filenames.
        output_dir: Directory the PNG and SVG files are written to.

    Regions are ordered as in the module-level `regions` list, limited to the
    regions that actually occur in the given data.
    """
    plt.figure(figsize=(16, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # Compute the set of present regions once instead of once per list element.
    regions_present = set(df_all_edges_with_cell_type_level['Unique Region'].unique())
    available_regions = [r for r in regions if r in regions_present]

    # Build the ordered categorical on a small private frame so the caller's
    # DataFrame (a filtered slice of the full edge table) keeps its own column.
    plot_df = df_all_edges_with_cell_type_level[[cell_type_level, 'Distance']].copy()
    plot_df['Unique Region'] = pd.Categorical(
        df_all_edges_with_cell_type_level['Unique Region'],
        categories=available_regions,
        ordered=True
    )

    # Make box plot.
    sns.boxplot(data=plot_df, x=cell_type_level, y='Distance', hue='Unique Region', showfliers=False, palette='Spectral') # viridis or Spectral palette for better color distinction
    font_size = 10
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)
    plt.title(f'Distribution of distances by {cell_type_level} and region', fontsize=font_size)
    plt.xlabel(f'{cell_type_level}', fontsize=font_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=font_size)
    # Anchor the legend's upper-left corner at the axes' upper-right corner,
    # i.e. outside the plotting area.
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()

    # Save the same figure as a raster (PNG) and a vector (SVG) file.
    for extension in ('png', 'svg'):
        plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.{extension}'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.show()

# Draw and save the region-split box plots for the current cell type level.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distances by cell type and region.
def plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Render and save a cell type x region heatmap of median distances.

    Args:
        df_all_edges_with_cell_type_level: DataFrame with `Distance`,
            `Unique Region` and the column named by `cell_type_level`.
        cell_type_level: Column whose values become the heatmap rows; also used
            in the title, the y-axis label and the output filenames.
        output_dir: Directory the PNG and SVG files are written to.
    """
    font_size = 10
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path

    # Rows are cell types, columns are regions, cells hold the median distance.
    median_by_region = pd.pivot_table(
        df_all_edges_with_cell_type_level,
        values='Distance',
        index=cell_type_level,
        columns='Unique Region',
        aggfunc='median',
    )

    plt.figure(figsize=(15, 10))
    sns.heatmap(median_by_region, annot=True, fmt='.1f', cmap='Spectral')

    plt.title(f'Heatmap of median distances by {cell_type_level}', fontsize=12)
    plt.xlabel('Unique Region', fontsize=font_size)
    plt.ylabel(f'{cell_type_level}', fontsize=font_size)
    plt.xticks(rotation=90, ha='right', fontsize=font_size)
    plt.yticks(fontsize=font_size)

    plt.tight_layout()

    # Write the figure once per output format.
    file_stem = os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}')
    for suffix in ('.png', '.svg'):
        plt.savefig(file_stem + suffix, dpi=300, bbox_inches='tight', pad_inches=0.5)
    plt.show()

# Draw and save the median-distance heatmap for the current cell type level.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Generate Violin Plot per unique region.
def plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    """Plot one row of per-cell-type violins for every region and save the figure.

    Args:
        df_all_edges_with_cell_type_level: DataFrame with `Distance`,
            `Unique Region` and the column named by `cell_type_level`.
        cell_type_level: Column used for the x-axis categories; also embedded in
            the figure title, the shared x label and the output filenames.
        output_dir: Directory the PNG and SVG files are written to.
        density_norm: Forwarded unchanged to `sns.violinplot`.

    One subplot is created per entry of the module-level `regions` list, stacked
    vertically. Cell types keep the same order and colour in every subplot, and
    only the bottom subplot shows x tick labels.
    """
    n_regions = len(regions)
    if n_regions == 0:
        # Nothing to draw; a figure needs at least one row.
        print("No regions available; skipping violin plots.")
        return

    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 1})
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    font_size = 10

    # One row per region. The height scales with the row count and equals the
    # previous fixed size (10 x 20) when there are six regions.
    # squeeze=False keeps `axs` two-dimensional even for a single region.
    fig, axs = plt.subplots(n_regions, 1, figsize=(10, 20 * n_regions / 6), squeeze=False)
    axs = axs[:, 0]
    fig.suptitle(f'Distance distribution per {cell_type_level} in `{dataset_dir}` (density normalization = {density_norm})', fontsize=font_size, y=1)

    # Keep the sequence of Cell Types consistent across plots.
    cell_types = sorted(df_all_edges_with_cell_type_level[cell_type_level].unique())

    # Give every cell type a fixed colour so it looks the same in each subplot.
    color_palette = sns.color_palette("Spectral", n_colors=len(cell_types))
    class_color_dict = dict(zip(cell_types, color_palette))

    for i, region in enumerate(regions):
        data_reg = df_all_edges_with_cell_type_level[df_all_edges_with_cell_type_level['Unique Region'] == region]
        sns.violinplot(data=data_reg, x=cell_type_level, y="Distance", density_norm=density_norm, common_norm=True, cut=0, inner="box", split=False, palette=class_color_dict, alpha=.9, ax=axs[i], hue=cell_type_level, legend=False, order=cell_types, fill=True)
        axs[i].set_title(region, fontsize=font_size)
        axs[i].set_xlabel('', fontsize=font_size)
        axs[i].set_ylabel('Distance (\u03bcm)', fontsize=font_size)
        # only show xtick labels for the last subplot
        if i < n_regions - 1:
            axs[i].set_xticklabels([])
        else:
            axs[i].set_xticklabels(cell_types, fontsize=font_size, rotation=90, ha='right')
        axs[i].tick_params(axis='both', labelsize=font_size)

    # Shared x-axis label, placed just below the bottom edge of the figure.
    fig.text(0.5, -0.02, f'{cell_type_level}', ha='center', va='bottom', fontsize=font_size)

    plt.tight_layout()

    # Save the same figure as a raster (PNG) and a vector (SVG) file.
    for extension in ('png', 'svg'):
        plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.{extension}'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)

    plt.show()

# Draw and save the stacked per-region violin plots for the current cell type level.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm can be set to e.g. "count" or "area"; "count" is used here.

Level Two Cell Type Analysis

# Keep only the edges computed for this level and its anchor cell type, then
# summarise `Distance` (mean, median, min, max) per cell type and region.
cell_type_level = 'Level Two Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types.loc[
    df_all_edges_with_cell_types['Anchor Cell Type Level'].eq(cell_type_level)
    & df_all_edges_with_cell_types['Anchor Cell Type'].eq(anchor_cell_type_dict[cell_type_level])
]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])['Distance']
    .agg(
        mean_distance='mean',
        median_distance='median',
        min_distance='min',
        max_distance='max',
    )
    .reset_index()
)
df_distance_stats_cell_type_level
Level Two Cell Type Unique Region mean_distance median_distance min_distance max_distance
0 b cell FSLD 28.793129 18.193912 0.377000 174.062170
1 b cell KSFB 66.936676 47.283075 0.533159 199.992015
2 b cell NGPL 62.721335 50.108392 0.377000 199.946882
3 b cell PBVN 43.296183 27.544230 0.377000 199.993437
4 b cell PKHL 74.037735 63.134837 0.533159 199.996635
5 b cell XXCD 60.047765 41.497409 0.533159 199.913826
6 endothelial cell of lymphatic vessel FSLD 29.642101 22.399004 1.685995 170.821358
7 endothelial cell of lymphatic vessel KSFB 61.887568 50.581903 3.243077 199.493211
8 endothelial cell of lymphatic vessel NGPL 43.458134 39.208000 2.871146 198.593137
9 endothelial cell of lymphatic vessel PBVN 32.893007 27.726806 3.039471 136.266075
10 endothelial cell of lymphatic vessel PKHL 56.864136 48.143203 7.952824 183.699222
11 endothelial cell of lymphatic vessel XXCD 36.088648 32.225229 2.639000 129.051896
12 endothelial cell of vascular tree FSLD 21.913759 20.119229 0.377000 171.695254
13 endothelial cell of vascular tree KSFB 29.505573 23.603934 1.131000 198.672919
14 endothelial cell of vascular tree NGPL 26.593129 22.220624 0.533159 196.830469
15 endothelial cell of vascular tree PBVN 23.699591 21.791122 0.533159 132.553972
16 endothelial cell of vascular tree PKHL 25.871987 23.120276 2.665793 183.957844
17 endothelial cell of vascular tree XXCD 25.129076 22.500300 2.871146 174.160943
18 macrophage FSLD 12.964001 11.638253 0.533159 156.988703
19 macrophage KSFB 19.576427 16.003639 0.377000 185.984841
20 macrophage NGPL 19.867353 15.475379 0.377000 198.830958
21 macrophage PBVN 16.594531 14.813750 0.377000 124.547586
22 macrophage PKHL 17.784159 15.084712 0.377000 158.327882
23 macrophage XXCD 16.401596 14.395280 0.377000 158.600096
24 myeloid cell FSLD 12.219064 10.122992 0.377000 128.742602
25 myeloid cell KSFB 19.470228 14.640032 0.377000 195.567455
26 myeloid cell NGPL 18.322417 13.660724 0.377000 189.631000
27 myeloid cell PBVN 14.973737 12.823543 0.377000 113.148371
28 myeloid cell PKHL 17.663756 13.994776 0.377000 157.639204
29 myeloid cell XXCD 15.418976 12.795804 0.377000 185.362891
30 neutrophil FSLD 13.274293 10.616416 0.377000 140.427313
31 neutrophil KSFB 17.949726 13.151844 0.377000 191.740362
32 neutrophil NGPL 17.447778 12.069889 0.377000 187.014041
33 neutrophil PBVN 16.360062 13.328963 0.377000 136.539595
34 neutrophil PKHL 15.311728 12.492303 0.377000 151.875329
35 neutrophil XXCD 14.615880 12.140337 0.377000 194.779158
36 t cell FSLD 19.170687 13.723007 0.533159 173.112390
37 t cell KSFB 37.426450 20.902262 0.377000 199.818891
38 t cell NGPL 36.643007 21.725801 0.377000 199.150585
39 t cell PBVN 26.348933 17.818984 0.533159 199.797551
40 t cell PKHL 31.271041 17.670807 0.533159 199.732451
41 t cell XXCD 19.165004 13.707463 0.377000 199.459011
42 unknown cell FSLD 11.225060 8.328202 0.377000 160.251610
43 unknown cell KSFB 20.793916 12.210378 0.377000 197.669193
44 unknown cell NGPL 14.866453 9.960223 0.377000 194.954206
45 unknown cell PBVN 20.411115 14.025211 0.377000 175.661774
46 unknown cell PKHL 32.481525 19.752066 0.377000 199.245838
47 unknown cell XXCD 20.205180 14.483867 0.377000 199.347818
# For each region at this level, print the two tables returned by
# get_top_bottom_cell_types_by_mean: first the "top" one, then the "bottom" one.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")

Top 5 cell types in FSLD:
                    Level Two Cell Type  mean_distance
1  endothelial cell of lymphatic vessel      29.642101
0                                b cell      28.793129
2     endothelial cell of vascular tree      21.913759
6                                t cell      19.170687
5                            neutrophil      13.274293

Bottom 5 cell types in FSLD:
  Level Two Cell Type  mean_distance
6              t cell      19.170687
5          neutrophil      13.274293
3          macrophage      12.964001
4        myeloid cell      12.219064
7        unknown cell      11.225060

Top 5 cell types in KSFB:
                    Level Two Cell Type  mean_distance
0                                b cell      66.936676
1  endothelial cell of lymphatic vessel      61.887568
6                                t cell      37.426450
2     endothelial cell of vascular tree      29.505573
7                          unknown cell      20.793916

Bottom 5 cell types in KSFB:
                 Level Two Cell Type  mean_distance
2  endothelial cell of vascular tree      29.505573
7                       unknown cell      20.793916
3                         macrophage      19.576427
4                       myeloid cell      19.470228
5                         neutrophil      17.949726

Top 5 cell types in NGPL:
                    Level Two Cell Type  mean_distance
0                                b cell      62.721335
1  endothelial cell of lymphatic vessel      43.458134
6                                t cell      36.643007
2     endothelial cell of vascular tree      26.593129
3                            macrophage      19.867353

Bottom 5 cell types in NGPL:
                 Level Two Cell Type  mean_distance
2  endothelial cell of vascular tree      26.593129
3                         macrophage      19.867353
4                       myeloid cell      18.322417
5                         neutrophil      17.447778
7                       unknown cell      14.866453

Top 5 cell types in PBVN:
                    Level Two Cell Type  mean_distance
0                                b cell      43.296183
1  endothelial cell of lymphatic vessel      32.893007
6                                t cell      26.348933
2     endothelial cell of vascular tree      23.699591
7                          unknown cell      20.411115

Bottom 5 cell types in PBVN:
                 Level Two Cell Type  mean_distance
2  endothelial cell of vascular tree      23.699591
7                       unknown cell      20.411115
3                         macrophage      16.594531
5                         neutrophil      16.360062
4                       myeloid cell      14.973737

Top 5 cell types in PKHL:
                    Level Two Cell Type  mean_distance
0                                b cell      74.037735
1  endothelial cell of lymphatic vessel      56.864136
7                          unknown cell      32.481525
6                                t cell      31.271041
2     endothelial cell of vascular tree      25.871987

Bottom 5 cell types in PKHL:
                 Level Two Cell Type  mean_distance
6                             t cell      31.271041
2  endothelial cell of vascular tree      25.871987
3                         macrophage      17.784159
4                       myeloid cell      17.663756
5                         neutrophil      15.311728

Top 5 cell types in XXCD:
                    Level Two Cell Type  mean_distance
0                                b cell      60.047765
1  endothelial cell of lymphatic vessel      36.088648
2     endothelial cell of vascular tree      25.129076
7                          unknown cell      20.205180
6                                t cell      19.165004

Bottom 5 cell types in XXCD:
  Level Two Cell Type  mean_distance
7        unknown cell      20.205180
6              t cell      19.165004
3          macrophage      16.401596
4        myeloid cell      15.418976
5          neutrophil      14.615880
# For each region at this level, print the two tables returned by
# get_top_bottom_cell_types_by_median: first the "top" one, then the "bottom" one.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")

Top 5 cell types in FSLD:
                    Level Two Cell Type  median_distance
1  endothelial cell of lymphatic vessel        22.399004
2     endothelial cell of vascular tree        20.119229
0                                b cell        18.193912
6                                t cell        13.723007
3                            macrophage        11.638253

Bottom 5 cell types in FSLD:
  Level Two Cell Type  median_distance
6              t cell        13.723007
3          macrophage        11.638253
5          neutrophil        10.616416
4        myeloid cell        10.122992
7        unknown cell         8.328202

Top 5 cell types in KSFB:
                    Level Two Cell Type  median_distance
1  endothelial cell of lymphatic vessel        50.581903
0                                b cell        47.283075
2     endothelial cell of vascular tree        23.603934
6                                t cell        20.902262
3                            macrophage        16.003639

Bottom 5 cell types in KSFB:
  Level Two Cell Type  median_distance
6              t cell        20.902262
3          macrophage        16.003639
4        myeloid cell        14.640032
5          neutrophil        13.151844
7        unknown cell        12.210378

Top 5 cell types in NGPL:
                    Level Two Cell Type  median_distance
0                                b cell        50.108392
1  endothelial cell of lymphatic vessel        39.208000
2     endothelial cell of vascular tree        22.220624
6                                t cell        21.725801
3                            macrophage        15.475379

Bottom 5 cell types in NGPL:
  Level Two Cell Type  median_distance
6              t cell        21.725801
3          macrophage        15.475379
4        myeloid cell        13.660724
5          neutrophil        12.069889
7        unknown cell         9.960223

Top 5 cell types in PBVN:
                    Level Two Cell Type  median_distance
1  endothelial cell of lymphatic vessel        27.726806
0                                b cell        27.544230
2     endothelial cell of vascular tree        21.791122
6                                t cell        17.818984
3                            macrophage        14.813750

Bottom 5 cell types in PBVN:
  Level Two Cell Type  median_distance
6              t cell        17.818984
3          macrophage        14.813750
7        unknown cell        14.025211
5          neutrophil        13.328963
4        myeloid cell        12.823543

Top 5 cell types in PKHL:
                    Level Two Cell Type  median_distance
0                                b cell        63.134837
1  endothelial cell of lymphatic vessel        48.143203
2     endothelial cell of vascular tree        23.120276
7                          unknown cell        19.752066
6                                t cell        17.670807

Bottom 5 cell types in PKHL:
  Level Two Cell Type  median_distance
7        unknown cell        19.752066
6              t cell        17.670807
3          macrophage        15.084712
4        myeloid cell        13.994776
5          neutrophil        12.492303

Top 5 cell types in XXCD:
                    Level Two Cell Type  median_distance
0                                b cell        41.497409
1  endothelial cell of lymphatic vessel        32.225229
2     endothelial cell of vascular tree        22.500300
7                          unknown cell        14.483867
3                            macrophage        14.395280

Bottom 5 cell types in XXCD:
  Level Two Cell Type  median_distance
7        unknown cell        14.483867
3          macrophage        14.395280
6              t cell        13.707463
4        myeloid cell        12.795804
5          neutrophil        12.140337
# Print mean, standard deviation and coefficient of variation of `Distance`,
# grouped first by region and then by cell type, for the current cell type level.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)

Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
FSLD           17.76  17.69    99.6
KSFB           32.30  37.95   117.5
NGPL           32.17  35.55   110.5
PBVN           24.70  24.92   100.9
PKHL           32.27  36.56   113.3
XXCD           24.94  29.75   119.3

Cell Type Variability Analysis (sorted by CV):
                                       mean    std  CV (%)
Level Two Cell Type                                       
unknown cell                          19.98  22.82   114.2
t cell                                27.79  30.32   109.1
b cell                                54.90  48.74    88.8
neutrophil                            15.81  13.10    82.9
myeloid cell                          15.79  12.86    81.4
endothelial cell of lymphatic vessel  42.11  30.14    71.6
macrophage                            16.86  11.93    70.8
endothelial cell of vascular tree     25.07  15.68    62.5
# Re-run the four figure routines defined above for the current cell type level;
# each one saves a PNG and an SVG into the shared figures directory.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm can be set to e.g. "count" or "area"; "count" is used here.

Level Three Cell Type Analysis

# Keep only the edges computed for this level and its anchor cell type, then
# summarise `Distance` (mean, median, min, max) per cell type and region.
cell_type_level = 'Level Three Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types.loc[
    df_all_edges_with_cell_types['Anchor Cell Type Level'].eq(cell_type_level)
    & df_all_edges_with_cell_types['Anchor Cell Type'].eq(anchor_cell_type_dict[cell_type_level])
]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])['Distance']
    .agg(
        mean_distance='mean',
        median_distance='median',
        min_distance='min',
        max_distance='max',
    )
    .reset_index()
)
df_distance_stats_cell_type_level
Level Three Cell Type Unique Region mean_distance median_distance min_distance max_distance
0 b cell FSLD 15.017328 12.756869 0.377000 150.453233
1 b cell KSFB 21.320057 16.424398 0.533159 191.699959
2 b cell NGPL 21.343961 15.945808 0.377000 197.501949
3 b cell PBVN 18.469944 15.530387 0.377000 167.593356
4 b cell PKHL 18.087155 14.218947 0.533159 156.409572
5 b cell XXCD 16.399071 13.592928 0.533159 147.880623
6 b cell:follicular FSLD 44.741041 33.688278 0.533159 174.062170
7 b cell:follicular KSFB 98.262190 94.780079 3.016000 199.992015
8 b cell:follicular NGPL 86.597350 82.378598 0.533159 199.946882
9 b cell:follicular PBVN 62.807603 52.209394 3.221089 199.993437
10 b cell:follicular PKHL 83.720368 75.842618 2.744601 199.996635
11 b cell:follicular XXCD 76.025086 66.886430 2.871146 199.913826
12 cell:proliferating FSLD 18.250368 13.070542 2.639000 160.251610
13 cell:proliferating KSFB 16.776125 12.308920 0.377000 194.953112
14 cell:proliferating NGPL 16.035974 11.196329 0.377000 194.954206
15 cell:proliferating PBVN 22.825014 15.856425 0.377000 175.661774
16 cell:proliferating PKHL 22.113843 16.118679 0.377000 195.479862
17 cell:proliferating XXCD 17.848101 14.508379 0.533159 199.347818
18 endothelial cell of lymphatic vessel FSLD 29.642101 22.399004 1.685995 170.821358
19 endothelial cell of lymphatic vessel KSFB 61.887568 50.581903 3.243077 199.493211
20 endothelial cell of lymphatic vessel NGPL 43.458134 39.208000 2.871146 198.593137
21 endothelial cell of lymphatic vessel PBVN 32.893007 27.726806 3.039471 136.266075
22 endothelial cell of lymphatic vessel PKHL 56.864136 48.143203 7.952824 183.699222
23 endothelial cell of lymphatic vessel XXCD 36.088648 32.225229 2.639000 129.051896
24 endothelial cell of vascular tree FSLD 21.913759 20.119229 0.377000 171.695254
25 endothelial cell of vascular tree KSFB 29.505573 23.603934 1.131000 198.672919
26 endothelial cell of vascular tree NGPL 26.593129 22.220624 0.533159 196.830469
27 endothelial cell of vascular tree PBVN 23.699591 21.791122 0.533159 132.553972
28 endothelial cell of vascular tree PKHL 25.871987 23.120276 2.665793 183.957844
29 endothelial cell of vascular tree XXCD 25.129076 22.500300 2.871146 174.160943
30 macrophage FSLD 12.964001 11.638253 0.533159 156.988703
31 macrophage KSFB 19.576427 16.003639 0.377000 185.984841
32 macrophage NGPL 19.867353 15.475379 0.377000 198.830958
33 macrophage PBVN 16.594531 14.813750 0.377000 124.547586
34 macrophage PKHL 17.784159 15.084712 0.377000 158.327882
35 macrophage XXCD 16.401596 14.395280 0.377000 158.600096
36 myeloid cell FSLD 12.219064 10.122992 0.377000 128.742602
37 myeloid cell KSFB 19.470228 14.640032 0.377000 195.567455
38 myeloid cell NGPL 18.322417 13.660724 0.377000 189.631000
39 myeloid cell PBVN 14.973737 12.823543 0.377000 113.148371
40 myeloid cell PKHL 17.663756 13.994776 0.377000 157.639204
41 myeloid cell XXCD 15.418976 12.795804 0.377000 185.362891
42 neutrophil/monocyte FSLD 13.274293 10.616416 0.377000 140.427313
43 neutrophil/monocyte KSFB 17.949726 13.151844 0.377000 191.740362
44 neutrophil/monocyte NGPL 17.447778 12.069889 0.377000 187.014041
45 neutrophil/monocyte PBVN 16.360062 13.328963 0.377000 136.539595
46 neutrophil/monocyte PKHL 15.311728 12.492303 0.377000 151.875329
47 neutrophil/monocyte XXCD 14.615880 12.140337 0.377000 194.779158
48 t cell:cd4+ alpha-beta memory FSLD 30.214247 19.784420 0.842998 173.112390
49 t cell:cd4+ alpha-beta memory KSFB 59.324035 41.748388 2.293201 199.818891
50 t cell:cd4+ alpha-beta memory NGPL 56.807808 47.174738 2.744601 199.150585
51 t cell:cd4+ alpha-beta memory PBVN 41.516702 27.747302 2.639000 199.797551
52 t cell:cd4+ alpha-beta memory PKHL 56.975496 42.997011 1.922330 199.732451
53 t cell:cd4+ alpha-beta memory XXCD 35.364842 20.820506 1.922330 199.459011
54 t cell:cd8+ alpha-beta memory FSLD 13.932302 12.169569 0.533159 117.974499
55 t cell:cd8+ alpha-beta memory KSFB 18.510546 15.197356 0.377000 183.973682
56 t cell:cd8+ alpha-beta memory NGPL 18.357679 14.552395 0.377000 195.575450
57 t cell:cd8+ alpha-beta memory PBVN 16.785163 14.871205 0.533159 127.954155
58 t cell:cd8+ alpha-beta memory PKHL 17.167939 14.171389 0.533159 169.717009
59 t cell:cd8+ alpha-beta memory XXCD 14.440700 12.549061 0.377000 155.671330
60 unknown cell FSLD 9.064110 7.511672 0.377000 156.522210
61 unknown cell KSFB 25.388738 12.022695 0.377000 197.669193
62 unknown cell NGPL 13.683767 8.744450 0.377000 164.182093
63 unknown cell PBVN 19.382652 13.086842 0.377000 160.660840
64 unknown cell PKHL 39.160130 25.699678 0.377000 199.245838
65 unknown cell XXCD 23.403814 14.417472 0.377000 199.051002
# For each region at this level, print the two tables returned by
# get_top_bottom_cell_types_by_mean: first the "top" one, then the "bottom" one.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")

Top 5 cell types in FSLD:
                  Level Three Cell Type  mean_distance
1                     b cell:follicular      44.741041
8         t cell:cd4+ alpha-beta memory      30.214247
3  endothelial cell of lymphatic vessel      29.642101
4     endothelial cell of vascular tree      21.913759
2                    cell:proliferating      18.250368

Bottom 5 cell types in FSLD:
            Level Three Cell Type  mean_distance
9   t cell:cd8+ alpha-beta memory      13.932302
7             neutrophil/monocyte      13.274293
5                      macrophage      12.964001
6                    myeloid cell      12.219064
10                   unknown cell       9.064110

Top 5 cell types in KSFB:
                   Level Three Cell Type  mean_distance
1                      b cell:follicular      98.262190
3   endothelial cell of lymphatic vessel      61.887568
8          t cell:cd4+ alpha-beta memory      59.324035
4      endothelial cell of vascular tree      29.505573
10                          unknown cell      25.388738

Bottom 5 cell types in KSFB:
           Level Three Cell Type  mean_distance
5                     macrophage      19.576427
6                   myeloid cell      19.470228
9  t cell:cd8+ alpha-beta memory      18.510546
7            neutrophil/monocyte      17.949726
2             cell:proliferating      16.776125

Top 5 cell types in NGPL:
                  Level Three Cell Type  mean_distance
1                     b cell:follicular      86.597350
8         t cell:cd4+ alpha-beta memory      56.807808
3  endothelial cell of lymphatic vessel      43.458134
4     endothelial cell of vascular tree      26.593129
0                                b cell      21.343961

Bottom 5 cell types in NGPL:
            Level Three Cell Type  mean_distance
9   t cell:cd8+ alpha-beta memory      18.357679
6                    myeloid cell      18.322417
7             neutrophil/monocyte      17.447778
2              cell:proliferating      16.035974
10                   unknown cell      13.683767

Top 5 cell types in PBVN:
                  Level Three Cell Type  mean_distance
1                     b cell:follicular      62.807603
8         t cell:cd4+ alpha-beta memory      41.516702
3  endothelial cell of lymphatic vessel      32.893007
4     endothelial cell of vascular tree      23.699591
2                    cell:proliferating      22.825014

Bottom 5 cell types in PBVN:
           Level Three Cell Type  mean_distance
0                         b cell      18.469944
9  t cell:cd8+ alpha-beta memory      16.785163
5                     macrophage      16.594531
7            neutrophil/monocyte      16.360062
6                   myeloid cell      14.973737

Top 5 cell types in PKHL:
                   Level Three Cell Type  mean_distance
1                      b cell:follicular      83.720368
8          t cell:cd4+ alpha-beta memory      56.975496
3   endothelial cell of lymphatic vessel      56.864136
10                          unknown cell      39.160130
4      endothelial cell of vascular tree      25.871987

Bottom 5 cell types in PKHL:
           Level Three Cell Type  mean_distance
0                         b cell      18.087155
5                     macrophage      17.784159
6                   myeloid cell      17.663756
9  t cell:cd8+ alpha-beta memory      17.167939
7            neutrophil/monocyte      15.311728

Top 5 cell types in XXCD:
                   Level Three Cell Type  mean_distance
1                      b cell:follicular      76.025086
3   endothelial cell of lymphatic vessel      36.088648
8          t cell:cd4+ alpha-beta memory      35.364842
4      endothelial cell of vascular tree      25.129076
10                          unknown cell      23.403814

Bottom 5 cell types in XXCD:
           Level Three Cell Type  mean_distance
5                     macrophage      16.401596
0                         b cell      16.399071
6                   myeloid cell      15.418976
7            neutrophil/monocyte      14.615880
9  t cell:cd8+ alpha-beta memory      14.440700
# Report, region by region, the top and bottom cell types as ranked by
# median distance (the mean-based ranking is a separate helper).
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    # Index 0 holds the "top" table and index 1 the "bottom" table.
    top_bottom = get_top_bottom_cell_types_by_median(
        df_all_edges_with_cell_type_level,
        cell_type_level,
        region,
    )
    # sep="\n" puts each table on the line after its header, exactly as
    # two consecutive print() calls would.
    print(f"\nTop 5 cell types in {region}:", top_bottom[0], sep="\n")
    print(f"\nBottom 5 cell types in {region}:", top_bottom[1], sep="\n")

Top 5 cell types in FSLD:
                  Level Three Cell Type  median_distance
1                     b cell:follicular        33.688278
3  endothelial cell of lymphatic vessel        22.399004
4     endothelial cell of vascular tree        20.119229
8         t cell:cd4+ alpha-beta memory        19.784420
2                    cell:proliferating        13.070542

Bottom 5 cell types in FSLD:
            Level Three Cell Type  median_distance
9   t cell:cd8+ alpha-beta memory        12.169569
5                      macrophage        11.638253
7             neutrophil/monocyte        10.616416
6                    myeloid cell        10.122992
10                   unknown cell         7.511672

Top 5 cell types in KSFB:
                  Level Three Cell Type  median_distance
1                     b cell:follicular        94.780079
3  endothelial cell of lymphatic vessel        50.581903
8         t cell:cd4+ alpha-beta memory        41.748388
4     endothelial cell of vascular tree        23.603934
0                                b cell        16.424398

Bottom 5 cell types in KSFB:
            Level Three Cell Type  median_distance
9   t cell:cd8+ alpha-beta memory        15.197356
6                    myeloid cell        14.640032
7             neutrophil/monocyte        13.151844
2              cell:proliferating        12.308920
10                   unknown cell        12.022695

Top 5 cell types in NGPL:
                  Level Three Cell Type  median_distance
1                     b cell:follicular        82.378598
8         t cell:cd4+ alpha-beta memory        47.174738
3  endothelial cell of lymphatic vessel        39.208000
4     endothelial cell of vascular tree        22.220624
0                                b cell        15.945808

Bottom 5 cell types in NGPL:
            Level Three Cell Type  median_distance
9   t cell:cd8+ alpha-beta memory        14.552395
6                    myeloid cell        13.660724
7             neutrophil/monocyte        12.069889
2              cell:proliferating        11.196329
10                   unknown cell         8.744450

Top 5 cell types in PBVN:
                  Level Three Cell Type  median_distance
1                     b cell:follicular        52.209394
8         t cell:cd4+ alpha-beta memory        27.747302
3  endothelial cell of lymphatic vessel        27.726806
4     endothelial cell of vascular tree        21.791122
2                    cell:proliferating        15.856425

Bottom 5 cell types in PBVN:
            Level Three Cell Type  median_distance
9   t cell:cd8+ alpha-beta memory        14.871205
5                      macrophage        14.813750
7             neutrophil/monocyte        13.328963
10                   unknown cell        13.086842
6                    myeloid cell        12.823543

Top 5 cell types in PKHL:
                   Level Three Cell Type  median_distance
1                      b cell:follicular        75.842618
3   endothelial cell of lymphatic vessel        48.143203
8          t cell:cd4+ alpha-beta memory        42.997011
10                          unknown cell        25.699678
4      endothelial cell of vascular tree        23.120276

Bottom 5 cell types in PKHL:
           Level Three Cell Type  median_distance
5                     macrophage        15.084712
0                         b cell        14.218947
9  t cell:cd8+ alpha-beta memory        14.171389
6                   myeloid cell        13.994776
7            neutrophil/monocyte        12.492303

Top 5 cell types in XXCD:
                  Level Three Cell Type  median_distance
1                     b cell:follicular        66.886430
3  endothelial cell of lymphatic vessel        32.225229
4     endothelial cell of vascular tree        22.500300
8         t cell:cd4+ alpha-beta memory        20.820506
2                    cell:proliferating        14.508379

Bottom 5 cell types in XXCD:
           Level Three Cell Type  median_distance
5                     macrophage        14.395280
0                         b cell        13.592928
6                   myeloid cell        12.795804
9  t cell:cd8+ alpha-beta memory        12.549061
7            neutrophil/monocyte        12.140337
# Print the variability summaries (mean, std, and coefficient of variation)
# of the distances, grouped per unique region and per cell type at the
# selected cell type level.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)

Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
FSLD           17.76  17.69    99.6
KSFB           32.30  37.95   117.5
NGPL           32.17  35.55   110.5
PBVN           24.70  24.92   100.9
PKHL           32.27  36.56   113.3
XXCD           24.94  29.75   119.3

Cell Type Variability Analysis (sorted by CV):
                                       mean    std  CV (%)
Level Three Cell Type                                     
unknown cell                          21.03  25.66   122.0
cell:proliferating                    18.51  18.05    97.5
t cell:cd4+ alpha-beta memory         47.75  40.36    84.5
neutrophil/monocyte                   15.81  13.10    82.9
myeloid cell                          15.79  12.86    81.4
b cell                                18.27  14.68    80.4
t cell:cd8+ alpha-beta memory         16.14  11.70    72.5
endothelial cell of lymphatic vessel  42.11  30.14    71.6
macrophage                            16.86  11.93    70.8
b cell:follicular                     76.64  48.88    63.8
endothelial cell of vascular tree     25.07  15.68    62.5
# Distance-distribution figures. Every helper receives the same edge table,
# the selected cell type level, and the shared figures directory
# (basepath joined with figures_output_dir).
plot_violin_cells_per_celltype(
    df_all_edges_with_cell_type_level,
    cell_type_level,
    os.path.join(basepath, figures_output_dir),
    density_norm='area',
)

plot_distance_distribution_boxplots_by_region(
    df_all_edges_with_cell_type_level,
    cell_type_level,
    os.path.join(basepath, figures_output_dir),
)

plot_distance_distribution_heatmap(
    df_all_edges_with_cell_type_level,
    cell_type_level,
    os.path.join(basepath, figures_output_dir),
)

# density_norm may be set to "count" or "area", depending on preference.
plot_violin_plots_all_regions(
    df_all_edges_with_cell_type_level,
    cell_type_level,
    os.path.join(basepath, figures_output_dir),
    density_norm="count",
)