Distance Analysis: lung-codex-urmc

Analyze and visualize cell-to-nearest-endothelial-cell distance distributions for the lung-codex-urmc dataset.
import numpy as np
import pandas as pd
import os
import json
import requests
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import plotly.express as px

from _cde_compute_edges_from_nodes import *

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Suppress all Python warnings for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings — confirm that is intended.
import warnings
warnings.filterwarnings("ignore")
# Root directory that the data and figure directories below are joined onto.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
basepath = "/u/yashjain/hra-cell-distance-analysis/data"
# Dataset collection analyzed in this notebook; also used in figure titles and file names.
dataset_dir = "lung-codex-urmc"
# Directory (relative to basepath) holding the "<dataset>-nodes.csv" files.
data_filedir = os.path.join("data-processed-nodes-with-harmonized-cell-types", dataset_dir)
# Directory (relative to basepath) holding the "...-edges.csv" files.
output_edge_dir = os.path.join("data-processed-edges", dataset_dir)
# Directory (relative to basepath) where generated figures are written.
figures_output_dir = "generated-figures"
# Read one CSV file (nodes or edges) into a DataFrame.
def load_data(path, edges=False):
    """Load a CSV file with pandas.

    Args:
        path: Location of the CSV file.
        edges: When true, the file is read as header-less and its columns are
            named ``cell_id, x1, y1, z1, x2, y2, z2``. When false, the file's
            own header row supplies the column names.

    Returns:
        The file contents as a ``pandas.DataFrame``.
    """
    if not edges:
        return pd.read_csv(path)

    edge_columns = ['cell_id', 'x1', 'y1', 'z1', 'x2', 'y2', 'z2']
    return pd.read_csv(path, header=None, names=edge_columns)
def read_all_edge_datasets(basepath, data_filedir, output_edge_dir):
    """Load all edges files of a dataset collection together with their nodes.

    Edges files are discovered in ``<basepath>/<output_edge_dir>``. Only files
    ending in ``-edges.csv`` are read, and each must be named
    ``<dataset>-<anchor cell type level>-<anchor cell type>-edges.csv``. The
    name is split from the right, so the dataset name itself may contain any
    number of hyphens (e.g. ``D265-LLL-7A7-12``).

    For every dataset that has at least one edges file, the matching
    ``<basepath>/<data_filedir>/<dataset>-nodes.csv`` is read exactly once, so
    each node appears once in ``merged_nodes`` no matter how many edges files
    exist for its dataset. The ``cell_id`` column of an edges file is matched
    against the 0-based row position in that nodes file to attach the three
    harmonized cell-type columns to each edge.

    Files are processed in sorted filename order so the row order of the
    returned tables is reproducible across file systems.

    Args:
        basepath: Root directory that the two directories below are joined onto.
        data_filedir: Directory (relative to ``basepath``) with ``*-nodes.csv`` files.
        output_edge_dir: Directory (relative to ``basepath``) with ``*-edges.csv`` files.

    Returns:
        A tuple ``(merged_nodes, merged_edges, merged_nodes_for_all_edges)``:

        1. ``merged_nodes``: all nodes, with an added ``Dataset`` column.
        2. ``merged_edges``: all edges, with added ``Dataset``,
           ``Anchor Cell Type`` and ``Anchor Cell Type Level`` columns and the
           ``distance`` column renamed to ``Distance``.
        3. ``merged_nodes_for_all_edges``: ``merged_edges`` plus the
           ``Level Three/Two/One Cell Type`` of the cell given by ``cell_id``.

    Raises:
        ValueError: If an edges filename does not follow the expected pattern,
            or if the directory contains no ``-edges.csv`` file.
    """
    edge_suffix = "-edges.csv"
    cell_type_columns = ['Level Three Cell Type', 'Level Two Cell Type', 'Level One Cell Type']
    edges_root = os.path.join(basepath, output_edge_dir)

    nodes_by_dataset = {}  # dataset name -> nodes DataFrame (loaded once per dataset)
    all_edges_files = []
    all_nodes_edges_files = []

    for file in sorted(os.listdir(edges_root)):
        if not file.endswith(edge_suffix):
            continue

        # Split from the right: the last two fields are fixed, the dataset name is the rest.
        name_parts = file[:-len(edge_suffix)].rsplit('-', 2)
        if len(name_parts) != 3:
            raise ValueError(
                f"Unexpected edges filename '{file}': expected "
                f"'<dataset>-<anchor cell type level>-<anchor cell type>{edge_suffix}'."
            )
        dataset_name, anchor_cell_type_level, anchor_cell_type = name_parts

        edges_df = load_data(os.path.join(edges_root, file), edges=False)
        edges_df['Dataset'] = dataset_name
        edges_df['Anchor Cell Type'] = anchor_cell_type
        edges_df['Anchor Cell Type Level'] = anchor_cell_type_level
        edges_df.rename(columns={"distance": "Distance"}, inplace=True)
        all_edges_files.append(edges_df)

        # Read each dataset's nodes only once; several edges files share the same nodes file.
        if dataset_name not in nodes_by_dataset:
            nodes_file_path = os.path.join(basepath, data_filedir, f"{dataset_name}-nodes.csv")
            nodes_df = load_data(nodes_file_path)
            nodes_df['Dataset'] = dataset_name
            nodes_by_dataset[dataset_name] = nodes_df
        nodes_df = nodes_by_dataset[dataset_name]

        # cell_id refers to the 0-based row position in the nodes file, so join
        # the edges' cell_id against a positional index of the cell-type columns.
        cell_types = nodes_df[cell_type_columns].reset_index(drop=True)
        edges_nodes_df = pd.merge(edges_df, cell_types, how='left', left_on='cell_id', right_index=True)
        all_nodes_edges_files.append(edges_nodes_df)

    if not all_edges_files:
        raise ValueError(f"No '*{edge_suffix}' files found in '{edges_root}'.")

    merged_edges = pd.concat(all_edges_files, ignore_index=True)
    merged_nodes = pd.concat(list(nodes_by_dataset.values()), ignore_index=True)
    merged_nodes_for_all_edges = pd.concat(all_nodes_edges_files, ignore_index=True)

    return merged_nodes, merged_edges, merged_nodes_for_all_edges
def create_directory(directory):
    """Ensure ``directory`` exists and report on stdout what was done.

    Args:
        directory: Path to create; missing parent directories are created too.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

Get initial statistics and identify endothelial cell categories for dataset.

# Load the node table, the edge table, and the edge table annotated with cell types for the whole collection.
df_all_nodes, df_all_edges, df_all_edges_with_cell_types = read_all_edge_datasets(basepath, data_filedir, output_edge_dir)
# Preview the first rows of the node table.
df_all_nodes.head(5)
x y Original Cell Type Level Three Cell Type Level Three CL Label Level Three CL ID CL_Match/3 Level Two Cell Type Level Two CL Label Level Two CL ID CL_Match/2 Level One Cell Type Level One CL Label Level One CL ID CL_Match/1 Dataset
0 55.000000 4633.074074 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
1 794.777778 3643.197531 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
2 848.313953 3606.447674 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
3 1046.877061 2890.736132 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
4 2406.666667 1298.307692 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
# Count the distinct cell types at every annotation level. Counts are taken over
# all loaded datasets together (not per dataset), one count per cell type column.
print("Total number of unique cell types per cell type annnotation level:")
unique_cell_types = {
    column: df_all_nodes[column].nunique()
    for column in (
        'Original Cell Type',
        'Level Three Cell Type',
        'Level Two Cell Type',
        'Level One Cell Type',
    )
}
for cell_type, count in unique_cell_types.items():
    print(f"{cell_type}: {count}")
Total number of unique cell types per cell type annnotation level:
Original Cell Type: 54
Level Three Cell Type: 20
Level Two Cell Type: 17
Level One Cell Type: 7
# For every cell type column, collect the distinct values whose name contains
# "endothelial" (case-insensitive; missing values never match).
# Result: {column name: list of matching cell types}.
endothelial_cell_types = {
    level: df_all_nodes.loc[
        df_all_nodes[level].str.contains("endothelial", case=False, na=False),
        level,
    ].unique().tolist()
    for level in (
        'Original Cell Type',
        'Level Three Cell Type',
        'Level Two Cell Type',
        'Level One Cell Type',
    )
}

print("\nEndothelial cell types per cell type annotation level:")
for level, cell_types in endothelial_cell_types.items():
    print(f"\n{level}:")
    for cell in cell_types:
        print(f"  - {cell}")

Endothelial cell types per cell type annotation level:

Original Cell Type:

Level Three Cell Type:
  - endothelial cell
  - endothelial cell of lymphatic vessel
  - endothelial cell of capillary

Level Two Cell Type:
  - endothelial cell
  - endothelial cell of lymphatic vessel
  - endothelial cell of capillary

Level One Cell Type:
  - endothelial cell
# Anchor cell type (type of endothelial cell) used at each hierarchical level.
# Pick each value from the endothelial categories listed in the previous cell;
# the distance analysis at every level is limited to the anchor given here.
anchor_cell_type_dict = {
    'Level Three Cell Type': 'endothelial cell',
    'Level Two Cell Type': 'endothelial cell',
    'Level One Cell Type': 'endothelial cell',
}

# Hierarchical cell type columns to analyze, in the same order as the keys above.
# "Original Cell Type" is skipped because it is not a hierarchical level.
type_field_list = list(anchor_cell_type_dict)

Process datasets to add region information to Nodes files.

# Standard order in which the conditions appear in plots.
regions = ['Normal', 'Disease']

# Condition (Normal/Disease) of each lung dataset, keyed by the full dataset name.
region_map = {
    'D115-RLL-10A3-40': 'Disease',
    'D265-LLL-7A7-12': 'Normal',
}
# Preview the node table before the region column is added.
df_all_nodes.head()
x y Original Cell Type Level Three Cell Type Level Three CL Label Level Three CL ID CL_Match/3 Level Two Cell Type Level Two CL Label Level Two CL ID CL_Match/2 Level One Cell Type Level One CL Label Level One CL ID CL_Match/1 Dataset
0 55.000000 4633.074074 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
1 794.777778 3643.197531 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
2 848.313953 3606.447674 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
3 1046.877061 2890.736132 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
4 2406.666667 1298.307692 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12
# Add a "Unique Region" column to df_all_nodes by looking up each row's full "Dataset" name in region_map.
# Dataset names that are not keys of region_map end up as NaN in the new column.
df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].map(region_map)
# df_all_nodes['Unique Region'] = df_all_nodes['Dataset'].str.split('-').str[1].map(region_map)

# Check that the new column was created correctly.
df_all_nodes[['Dataset', 'Unique Region']].head(5)
Dataset Unique Region
0 D265-LLL-7A7-12 Normal
1 D265-LLL-7A7-12 Normal
2 D265-LLL-7A7-12 Normal
3 D265-LLL-7A7-12 Normal
4 D265-LLL-7A7-12 Normal
# Summarize the region labels assigned to the node table.
node_regions = df_all_nodes['Unique Region']

# Distinct regions present in the data.
print("\nUnique Regions in the data:")
print(node_regions.unique())

# How many distinct regions there are.
print(f"Total number of unique regions: {node_regions.nunique()}")

# How many distinct datasets belong to each region.
print("\nNumber of unique datasets per unique region:")
for region in node_regions.unique():
    num_datasets = df_all_nodes.loc[node_regions == region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")

Unique Regions in the data:
['Normal' 'Disease']
Total number of unique regions: 2

Number of unique datasets per unique region:
Normal: 1
Disease: 1

Process datasets to add region information to Edges files.

# Preview the first rows of the edge table before the region column is added.
df_all_edges.head(5)
cell_id x1 y1 z1 x2 y2 z2 Distance Dataset Anchor Cell Type Anchor Cell Type Level
0 9600 54.891304 14218.369565 0 164.375000 14329.000000 0 155.646306 D265-LLL-7A7-12 endothelial cell Level Three Cell Type
1 9604 148.519685 14344.433071 0 164.375000 14329.000000 0 22.126244 D265-LLL-7A7-12 endothelial cell Level Three Cell Type
2 9603 145.042553 13454.680851 0 150.838235 13477.191176 0 23.244455 D265-LLL-7A7-12 endothelial cell Level Three Cell Type
3 9605 151.107143 13463.553571 0 150.838235 13477.191176 0 13.640256 D265-LLL-7A7-12 endothelial cell Level Three Cell Type
4 9607 160.127168 13502.421965 0 150.838235 13477.191176 0 26.886371 D265-LLL-7A7-12 endothelial cell Level Three Cell Type
# Add a "Unique Region" column to the edge table by looking up each row's "Dataset" name in region_map,
# the same way it was done for the node table.
df_all_edges['Unique Region'] = df_all_edges['Dataset'].map(region_map)

# Check that the new column was created correctly.
df_all_edges[['Dataset', 'Unique Region']].head(5)
Dataset Unique Region
0 D265-LLL-7A7-12 Normal
1 D265-LLL-7A7-12 Normal
2 D265-LLL-7A7-12 Normal
3 D265-LLL-7A7-12 Normal
4 D265-LLL-7A7-12 Normal
# Summarize the region labels assigned to the edge table.
edge_regions = df_all_edges['Unique Region']

# Distinct regions present in the data.
print("\nUnique Regions in the data:")
print(edge_regions.unique())

# How many distinct regions there are.
print(f"Total number of unique regions: {edge_regions.nunique()}")

# How many distinct datasets belong to each region.
print("\nNumber of unique datasets per unique region:")
for region in edge_regions.unique():
    num_datasets = df_all_edges.loc[edge_regions == region, 'Dataset'].nunique()
    print(f"{region}: {num_datasets}")

Unique Regions in the data:
['Normal' 'Disease']
Total number of unique regions: 2

Number of unique datasets per unique region:
Normal: 1
Disease: 1
# Add the same "Unique Region" column to the edge table that carries the cell type annotations.
df_all_edges_with_cell_types['Unique Region'] = df_all_edges_with_cell_types['Dataset'].map(region_map)

# Check that the new column was created correctly.
df_all_edges_with_cell_types[['Dataset', 'Unique Region']].head(5)
Dataset Unique Region
0 D265-LLL-7A7-12 Normal
1 D265-LLL-7A7-12 Normal
2 D265-LLL-7A7-12 Normal
3 D265-LLL-7A7-12 Normal
4 D265-LLL-7A7-12 Normal
# Show one row of the node table to check its final set of columns.
df_all_nodes.head(1)
x y Original Cell Type Level Three Cell Type Level Three CL Label Level Three CL ID CL_Match/3 Level Two Cell Type Level Two CL Label Level Two CL ID CL_Match/2 Level One Cell Type Level One CL Label Level One CL ID CL_Match/1 Dataset Unique Region
0 55.0 4633.074074 CD8 + T cell_1 t cell:cd8+ lung resident memory CD8-positive, alpha-beta ... CL:4033039 skos:exactMatch t cell T cell CL:0000084 skos:exactMatch immune cell leukocyte CL:0000738 skos:exactMatch D265-LLL-7A7-12 Normal
# Show one row of the edge table to check its final set of columns.
df_all_edges.head(1)
cell_id x1 y1 z1 x2 y2 z2 Distance Dataset Anchor Cell Type Anchor Cell Type Level Unique Region
0 9600 54.891304 14218.369565 0 164.375 14329.0 0 155.646306 D265-LLL-7A7-12 endothelial cell Level Three Cell Type Normal
# Show one row of the cell-type-annotated edge table to check its final set of columns.
df_all_edges_with_cell_types.head(1)
cell_id x1 y1 z1 x2 y2 z2 Distance Dataset Anchor Cell Type Anchor Cell Type Level Level Three Cell Type Level Two Cell Type Level One Cell Type Unique Region
0 9600 54.891304 14218.369565 0 164.375 14329.0 0 155.646306 D265-LLL-7A7-12 endothelial cell Level Three Cell Type t cell:cd8+ t cell immune cell Normal

Node Analysis

# Plot the number of cells per cell type, with one bar per region, and save the figure.
def plot_cells_per_celltype(df, type_field, output_dir):
    """Plot, save, and show a bar chart of cell counts per cell type and region.

    The figure is written to ``output_dir`` as both PNG and SVG, named
    ``<dataset_dir>_cells_per_celltype_<type_field>.<ext>``. Both files are
    written only after all styling (axis labels, legend, grid, tick format) has
    been applied, so the saved files match the figure that is shown.

    Args:
        df: Node table containing the ``type_field`` column and a
            ``Unique Region`` column.
        type_field: Name of the cell type column to count on the x-axis.
        output_dir: Directory the figure files are written to; it must already exist.
    """
    plt.rcParams["svg.fonttype"] = 'none'  # store SVG text as text, not as paths
    plt.figure(figsize=(20, 8))

    sns.countplot(data=df, x=type_field, palette='Spectral', hue='Unique Region')

    plt.title(f'Number of Cells per {type_field} in `{dataset_dir}`')
    plt.xlabel(type_field)
    plt.ylabel('Number of Cells')
    plt.xticks(rotation=90)
    # Plain numbers on the y-axis instead of scientific notation.
    plt.ticklabel_format(style='plain', axis='y')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Unique Region', bbox_to_anchor=(0.85, 1), loc='upper left')
    plt.tight_layout()

    # Save after styling is complete; saving earlier would omit the legend
    # title, axis labels and grid from the files.
    file_stem = os.path.join(output_dir, f'{dataset_dir}_cells_per_celltype_{type_field}')
    for extension in ('.png', '.svg'):
        plt.savefig(file_stem + extension, dpi=300, bbox_inches='tight', pad_inches=0.5)

    plt.show()
    plt.close()
# Draw and save the cell-count bar chart once per hierarchical cell type level.
for type_field in type_field_list:
    plot_cells_per_celltype(df_all_nodes, type_field, os.path.join(basepath, figures_output_dir))

Distance Analysis

# Mean, median, minimum, and maximum distance for every combination of
# region, anchor cell type, and anchor cell type level.
df_distance_stats = (
    df_all_edges_with_cell_types
    .groupby(['Unique Region', 'Anchor Cell Type', 'Anchor Cell Type Level'])['Distance']
    .agg(
        mean_distance='mean',
        median_distance='median',
        min_distance='min',
        max_distance='max',
    )
    .reset_index()
)
# Display the full distance statistics table.
df_distance_stats
Unique Region Anchor Cell Type Anchor Cell Type Level mean_distance median_distance min_distance max_distance
0 Disease endothelial cell Level One Cell Type 42.723724 30.471857 4.826959 199.996537
1 Disease endothelial cell Level Three Cell Type 46.507558 34.814730 4.317932 199.976085
2 Disease endothelial cell Level Two Cell Type 46.507558 34.814730 4.317932 199.976085
3 Disease endothelial cell of capillary Level Three Cell Type 62.225686 49.163320 4.317932 199.998891
4 Disease endothelial cell of capillary Level Two Cell Type 62.225686 49.163320 4.317932 199.998891
5 Normal endothelial cell Level One Cell Type 38.755288 31.499763 4.467862 199.906751
6 Normal endothelial cell Level Three Cell Type 45.655140 35.412244 4.467862 199.967962
7 Normal endothelial cell Level Two Cell Type 45.655140 35.412244 4.467862 199.967962
8 Normal endothelial cell of lymphatic vessel Level Three Cell Type 100.380353 98.949395 5.158630 199.998081
9 Normal endothelial cell of lymphatic vessel Level Two Cell Type 100.380353 98.949395 5.158630 199.998081

Level One Cell Type Analysis

# Restrict the edges to this level's analysis: rows computed at the chosen
# anchor cell type level for the anchor cell type configured for that level.
cell_type_level = 'Level One Cell Type'
is_selected_level = df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level
is_selected_anchor = df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level]
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[is_selected_level & is_selected_anchor]

# Mean, median, minimum, and maximum distance per cell type and region.
df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])['Distance']
    .agg(
        mean_distance='mean',
        median_distance='median',
        min_distance='min',
        max_distance='max',
    )
    .reset_index()
)
df_distance_stats_cell_type_level
Level One Cell Type Unique Region mean_distance median_distance min_distance max_distance
0 epithelial cell Disease 32.455094 25.588249 4.826959 199.648377
1 epithelial cell Normal 35.403572 28.856798 4.647661 199.778494
2 hematopoietic precursor cell Normal 34.836844 27.595297 6.647334 183.441998
3 immune cell Disease 52.244046 38.866237 5.341610 199.996537
4 immune cell Normal 39.351442 31.840836 5.332036 199.829270
5 mesenchymal cell Disease 46.334527 33.301188 5.314126 199.976085
6 mesenchymal cell Normal 49.582849 43.837154 6.818182 199.215219
7 mixed immune/epithelial cell population Disease 29.528454 23.648768 6.334930 199.968661
8 unknown cell Disease 21.070750 15.768757 5.697526 190.229178
9 unknown cell Normal 37.909030 30.412302 4.467862 199.906751
# Rank the cell types of one region by mean distance and return both ends of the ranking.
def get_top_bottom_cell_types_by_mean(df, cell_type_level, unique_region, top_n=5):
    """Return the cell types with the largest and smallest mean distance in a region.

    Args:
        df: Edge table with ``Unique Region``, ``Distance`` and the
            ``cell_type_level`` column.
        cell_type_level: Name of the cell type column to group by.
        unique_region: Value of ``Unique Region`` to keep.
        top_n: Number of rows taken from each end of the ranking.

    Returns:
        ``(top_cell_types, bottom_cell_types)``: two DataFrames with the columns
        ``cell_type_level`` and ``mean_distance``, both ordered from largest to
        smallest mean. When the region has fewer than ``2 * top_n`` cell types
        the two frames share rows.
    """
    in_region = df['Unique Region'] == unique_region

    ranking = (
        df[in_region]
        .groupby(cell_type_level)['Distance']
        .agg(mean_distance='mean')
        .reset_index()
        .sort_values(by='mean_distance', ascending=False)
    )

    return ranking.head(top_n), ranking.tail(top_n)

# Print the mean-distance ranking (largest and smallest ends) for every region in the data.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(df_all_edges_with_cell_type_level, cell_type_level, region)
    top_rows, bottom_rows = top_bottom
    print(f"\nTop 5 cell types in {region}:")
    print(top_rows)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_rows)

Top 5 cell types in Normal:
            Level One Cell Type  mean_distance
3              mesenchymal cell      49.582849
2                   immune cell      39.351442
4                  unknown cell      37.909030
0               epithelial cell      35.403572
1  hematopoietic precursor cell      34.836844

Bottom 5 cell types in Normal:
            Level One Cell Type  mean_distance
3              mesenchymal cell      49.582849
2                   immune cell      39.351442
4                  unknown cell      37.909030
0               epithelial cell      35.403572
1  hematopoietic precursor cell      34.836844

Top 5 cell types in Disease:
                       Level One Cell Type  mean_distance
1                              immune cell      52.244046
2                         mesenchymal cell      46.334527
0                          epithelial cell      32.455094
3  mixed immune/epithelial cell population      29.528454
4                             unknown cell      21.070750

Bottom 5 cell types in Disease:
                       Level One Cell Type  mean_distance
1                              immune cell      52.244046
2                         mesenchymal cell      46.334527
0                          epithelial cell      32.455094
3  mixed immune/epithelial cell population      29.528454
4                             unknown cell      21.070750
# Rank the cell types of one region by median distance and return both ends of the ranking.
def get_top_bottom_cell_types_by_median(df, cell_type_level, unique_region, top_n=5):
    """Return the cell types with the largest and smallest median distance in a region.

    Args:
        df: Edge table with ``Unique Region``, ``Distance`` and the
            ``cell_type_level`` column.
        cell_type_level: Name of the cell type column to group by.
        unique_region: Value of ``Unique Region`` to keep.
        top_n: Number of rows taken from each end of the ranking.

    Returns:
        ``(top_cell_types, bottom_cell_types)``: two DataFrames with the columns
        ``cell_type_level`` and ``median_distance``, both ordered from largest
        to smallest median. When the region has fewer than ``2 * top_n`` cell
        types the two frames share rows.
    """
    in_region = df['Unique Region'] == unique_region

    ranking = (
        df[in_region]
        .groupby(cell_type_level)['Distance']
        .agg(median_distance='median')
        .reset_index()
        .sort_values(by='median_distance', ascending=False)
    )

    return ranking.head(top_n), ranking.tail(top_n)

# Print the median-distance ranking (largest and smallest ends) for every region in the data.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(df_all_edges_with_cell_type_level, cell_type_level, region)
    top_rows, bottom_rows = top_bottom
    print(f"\nTop 5 cell types in {region}:")
    print(top_rows)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_rows)

Top 5 cell types in Normal:
            Level One Cell Type  median_distance
3              mesenchymal cell        43.837154
2                   immune cell        31.840836
4                  unknown cell        30.412302
0               epithelial cell        28.856798
1  hematopoietic precursor cell        27.595297

Bottom 5 cell types in Normal:
            Level One Cell Type  median_distance
3              mesenchymal cell        43.837154
2                   immune cell        31.840836
4                  unknown cell        30.412302
0               epithelial cell        28.856798
1  hematopoietic precursor cell        27.595297

Top 5 cell types in Disease:
                       Level One Cell Type  median_distance
1                              immune cell        38.866237
2                         mesenchymal cell        33.301188
0                          epithelial cell        25.588249
3  mixed immune/epithelial cell population        23.648768
4                             unknown cell        15.768757

Bottom 5 cell types in Disease:
                       Level One Cell Type  median_distance
1                              immune cell        38.866237
2                         mesenchymal cell        33.301188
0                          epithelial cell        25.588249
3  mixed immune/epithelial cell population        23.648768
4                             unknown cell        15.768757
# Summarize how much the distances vary within each region and within each cell type.
def _distance_variability(df, group_column):
    """Return mean, std and CV (%) of ``Distance`` for each value of ``group_column``."""
    stats = df.groupby(group_column)['Distance'].agg(mean='mean', std='std')
    # Compute the CV from the unrounded statistics; rounding mean/std first
    # would distort the ratio, especially for small distances.
    coefficient_of_variation = (stats['std'] / stats['mean'] * 100).round(1)
    table = stats.round(2)
    table['CV (%)'] = coefficient_of_variation
    return table


def calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level):
    """Print distance variability per region and per cell type.

    Two tables are printed, each with the mean, the standard deviation and the
    coefficient of variation (``std / mean * 100``) of ``Distance``: one grouped
    by ``Unique Region`` and one grouped by ``cell_type_level`` (sorted by CV,
    largest first). Mean and std are shown rounded to 2 decimals and CV to 1
    decimal; the CV is derived from the unrounded mean and std.

    Args:
        df_all_edges_with_cell_type_level: Edge table with ``Distance``,
            ``Unique Region`` and the ``cell_type_level`` column.
        cell_type_level: Name of the cell type column used for the second table.

    Returns:
        None. The results are only printed.
    """
    regional_variability = _distance_variability(df_all_edges_with_cell_type_level, 'Unique Region')

    print("\nRegional Variability Analysis:")
    print("Mean: Average distance in each region")
    print("Std: Standard deviation of distances")
    print("CV: Coefficient of Variation (std/mean * 100%)")
    print(regional_variability)

    cell_type_variability = _distance_variability(df_all_edges_with_cell_type_level, cell_type_level)

    print("\nCell Type Variability Analysis (sorted by CV):")
    print(cell_type_variability.sort_values('CV (%)', ascending=False))

# Print the variability tables for the edges selected at the current cell type level.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)

Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Disease        42.72  34.40    80.5
Normal         38.76  26.44    68.2

Cell Type Variability Analysis (sorted by CV):
                                          mean    std  CV (%)
Level One Cell Type                                          
unknown cell                             34.17  25.86    75.7
immune cell                              46.17  34.71    75.2
mesenchymal cell                         47.92  33.60    70.1
hematopoietic precursor cell             34.84  24.25    69.6
epithelial cell                          33.93  23.11    68.1
mixed immune/epithelial cell population  29.53  19.66    66.6
# Violin plot of the distance distribution of every cell type (all regions pooled).
def plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm='area'):
    """Draw, save, and show one violin of ``Distance`` per cell type.

    The figure is written to ``output_dir`` as PNG and SVG, named
    ``<dataset_dir>_violin_cells_per_celltype_<cell_type_level>.<ext>``.

    Args:
        df_all_edges_with_cell_type_level: Edge table with a ``Distance``
            column and the ``cell_type_level`` column.
        cell_type_level: Column plotted on the x-axis; also used in the title
            and in the output file names.
        output_dir: Directory the figure files are written to.
        density_norm: Passed through to ``sns.violinplot``.
    """
    label_size = 10
    save_options = {'dpi': 300, 'bbox_inches': 'tight', 'pad_inches': 0.5}

    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(10, 6))
    plt.rcParams["svg.fonttype"] = 'none'  # keep SVG text as text, not as paths

    sns.violinplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y="Distance",
        density_norm=density_norm,
        common_norm=True,
        cut=0,
        inner="box",
        split=False,
        palette='Spectral',
        alpha=.9,
    )

    # Theme and context are switched after the violins are drawn; the text
    # calls below all pass an explicit font size.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper")

    plt.legend(fontsize=label_size)
    plt.title(f'Violin Plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=label_size)
    plt.xlabel(f'{cell_type_level}', fontsize=label_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=label_size)

    # Same font size for all tick labels; x labels are rotated to stay readable.
    plt.xticks(fontsize=label_size, rotation=90)
    plt.yticks(fontsize=label_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.png'), **save_options)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_cells_per_celltype_{cell_type_level}.svg'), **save_options)
    plt.show()

# Draw the per-cell-type violin plot for the current level and save it under the figures directory.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Box plots of the distance distribution per cell type, with one box per region.
def plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Draw, save, and show box plots of ``Distance`` per cell type and region.

    The figure is written to ``output_dir`` as PNG and SVG, named
    ``<dataset_dir>_distance_distribution_boxplots_by_region_<cell_type_level>.<ext>``.
    Outliers are not drawn.

    Side effect: the ``Unique Region`` column of the DataFrame passed in is
    replaced in place by an ordered categorical whose categories follow the
    module-level ``regions`` list, restricted to the regions present in the data.

    Args:
        df_all_edges_with_cell_type_level: Edge table with ``Distance``,
            ``Unique Region`` and the ``cell_type_level`` column.
        cell_type_level: Column plotted on the x-axis; also used in the title
            and in the output file names.
        output_dir: Directory the figure files are written to.
    """
    label_size = 10
    save_options = {'dpi': 300, 'bbox_inches': 'tight', 'pad_inches': 0.5}

    plt.figure(figsize=(16, 8))
    plt.rcParams["svg.fonttype"] = 'none'  # keep SVG text as text, not as paths

    # Order the regions as in `regions`, keeping only those that occur in the data.
    region_values = df_all_edges_with_cell_type_level['Unique Region']
    regions_in_data = region_values.unique()
    available_regions = [name for name in regions if name in regions_in_data]
    df_all_edges_with_cell_type_level['Unique Region'] = pd.Categorical(
        region_values,
        categories=available_regions,
        ordered=True,
    )

    sns.boxplot(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y='Distance',
        hue='Unique Region',
        showfliers=False,
        palette='Spectral',
    )

    plt.xticks(rotation=90, ha='right', fontsize=label_size)
    plt.yticks(fontsize=label_size)
    plt.title(f'Distribution of distances by {cell_type_level} and region', fontsize=label_size)
    plt.xlabel(f'{cell_type_level}', fontsize=label_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=label_size)
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.png'), **save_options)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_distance_distribution_boxplots_by_region_{cell_type_level}.svg'), **save_options)
    plt.show()

# Draw the per-region box plots for the current level and save them under the figures directory.
# Note: this call also turns the 'Unique Region' column of the passed DataFrame into an ordered categorical.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Boxplots of distribution of distances by cell type and region.
def plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, output_dir):
    """Plot an annotated heatmap of median 'Distance' per cell type and region, and save it.

    Args:
        df_all_edges_with_cell_type_level: edge table holding the
            `cell_type_level` column plus 'Distance' and 'Unique Region'.
        cell_type_level: name of the column used for the heatmap rows.
        output_dir: directory that receives the PNG and SVG files.

    Reads the module-level name `dataset_dir` for the output file names.
    """
    tick_size = 10
    save_options = dict(dpi=300, bbox_inches='tight', pad_inches=0.5)

    # Rows: cell types; columns: regions; cell values: median distance.
    median_table = df_all_edges_with_cell_type_level.pivot_table(
        values='Distance',
        index=cell_type_level,
        columns='Unique Region',
        aggfunc='median',
    )

    plt.figure(figsize=(15, 10))
    plt.rcParams["svg.fonttype"] = 'none'  # keep SVG text as text instead of paths
    sns.heatmap(median_table, annot=True, fmt='.1f', cmap='Spectral')

    plt.title(f'Heatmap of median distances by {cell_type_level}', fontsize=12)
    plt.xlabel('Unique Region', fontsize=tick_size)
    plt.ylabel(f'{cell_type_level}', fontsize=tick_size)
    plt.xticks(rotation=90, ha='right', fontsize=tick_size)
    plt.yticks(fontsize=tick_size)

    plt.tight_layout()

    png_path = os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.png')
    svg_path = os.path.join(output_dir, f'{dataset_dir}_distance_distribution_heatmap_{cell_type_level}.svg')
    plt.savefig(png_path, **save_options)
    plt.savefig(svg_path, **save_options)
    plt.show()

# Render and save the median-distance heatmap for the current `cell_type_level`.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Generate violin plots per unique region of the dataset, one subplot per region.
def plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    """Draw one violin subplot per region (stacked vertically) and save PNG/SVG.

    Each subplot shows the 'Distance' distribution per cell type for the rows
    whose 'Unique Region' equals that region. Cell types share one order and
    one colour mapping across all subplots.

    Args:
        df_all_edges_with_cell_type_level: edge table holding the
            `cell_type_level` column plus 'Distance' and 'Unique Region'.
        cell_type_level: name of the column shown on the x axis.
        output_dir: directory that receives the PNG and SVG files.
        density_norm: forwarded to `sns.violinplot` (the call sites use
            "area" or "count").

    Reads the module-level names `regions` and `dataset_dir`.
    """
    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 1})
    plt.rcParams["svg.fonttype"] = 'none'  # to store text as text, not as path
    font_size = 10

    # Size the grid from `regions` instead of hard-coding two rows: a fixed
    # 2x1 grid raises IndexError with more than two regions and leaves an empty
    # panel with one. Two regions still give the original (10, 15) figure.
    n_rows = max(len(regions), 1)
    fig, axs = plt.subplots(n_rows, 1, figsize=(10, 7.5 * n_rows), squeeze=False)
    axs = axs.ravel()  # squeeze=False always returns a 2-D array; flatten to index by row
    fig.suptitle(f'Distance distribution per {cell_type_level} in `{dataset_dir}` (density normalization = {density_norm})', fontsize=font_size, y=1)

    # Keep the sequence of Cell Types consistent across plots.
    cell_types = sorted(df_all_edges_with_cell_type_level[cell_type_level].unique())

    # One colour per cell type, reused in every subplot.
    color_palette = sns.color_palette("Spectral", n_colors=len(cell_types))
    class_color_dict = dict(zip(cell_types, color_palette))

    last_index = len(regions) - 1
    for i, region in enumerate(regions):
        ax = axs[i]
        data_reg = df_all_edges_with_cell_type_level[df_all_edges_with_cell_type_level['Unique Region'] == region]
        sns.violinplot(data=data_reg, x=cell_type_level, y="Distance", density_norm=density_norm, common_norm=True, cut=0, inner="box", split=False, palette=class_color_dict, alpha=.9, ax=ax, hue=cell_type_level, legend=False, order=cell_types, fill=True)
        ax.set_title(region, fontsize=font_size)
        ax.set_xlabel('', fontsize=font_size)
        ax.set_ylabel('Distance (\u03bcm)', fontsize=font_size)
        # Only the bottom subplot carries the cell-type tick labels.
        if i < last_index:
            ax.set_xticklabels([])
        else:
            ax.set_xticklabels(cell_types, fontsize=font_size, rotation=90, ha='right')
        ax.tick_params(axis='both', labelsize=font_size)

    # Shared x-axis label placed in figure coordinates, just below the plots.
    fig.text(0.5, -0.02, f'{cell_type_level}', ha='center', va='bottom', fontsize=font_size)

    plt.tight_layout()

    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.png'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)
    plt.savefig(os.path.join(output_dir, f'{dataset_dir}_violin_plots_all_regions_{cell_type_level}.svg'), dpi=300,
                    bbox_inches='tight',
                    pad_inches=0.5)

    plt.show()

# Render and save the per-region violin subplots for the current `cell_type_level`.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

# Generate Split Violin Plot
def plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, output_dir, density_norm="area"):
    """Draw a split violin plot of 'Distance' per cell type, split by region, and save it.

    Args:
        df_all_edges_with_cell_type_level: edge table holding the
            `cell_type_level` column plus 'Distance' and 'Unique Region'.
        cell_type_level: name of the column shown on the x axis.
        output_dir: directory that receives the PNG and SVG files.
        density_norm: forwarded to `sns.violinplot` (the call sites use
            "area" or "count").

    Note:
        Reads the module-level names `regions` and `dataset_dir`. The seaborn
        theme/context calls are not undone, so the "whitegrid" theme with the
        "paper" context stays active after this function returns.
    """
    label_size = 10
    save_options = dict(dpi=300, bbox_inches='tight', pad_inches=0.5)

    sns.set_style("whitegrid")
    sns.set_context("notebook", rc={"grid.linewidth": 2})
    plt.figure(figsize=(15, 10))
    plt.rcParams["svg.fonttype"] = 'none'  # keep SVG text as text instead of paths

    violin_options = dict(
        data=df_all_edges_with_cell_type_level,
        x=cell_type_level,
        y="Distance",
        hue="Unique Region",
        density_norm=density_norm,
        common_norm=True,
        cut=0,
        inner="box",
        split=True,
        palette='Spectral',
        alpha=.9,
        hue_order=regions,
    )
    sns.violinplot(**violin_options)

    # Switched after drawing the violins and before the legend and labels are
    # added, exactly as in the original ordering.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper")

    plt.legend(fontsize=label_size, loc='upper right', bbox_to_anchor=(1.15, 1))
    plt.title(f'Split violin plot of distances by {cell_type_level} (Density Normalization: {density_norm})', fontsize=label_size)
    plt.xlabel('Cell Type', fontsize=label_size)
    plt.ylabel('Distance (\u03bcm)', fontsize=label_size)

    # Same font size on both axes; x tick labels are rotated to vertical.
    plt.xticks(fontsize=label_size, rotation=90)
    plt.yticks(fontsize=label_size)

    plt.tight_layout()

    png_path = os.path.join(output_dir, f'{dataset_dir}_plot_violin_cells_per_celltype_split_by_condition_{cell_type_level}.png')
    svg_path = os.path.join(output_dir, f'{dataset_dir}_plot_violin_cells_per_celltype_split_by_condition_{cell_type_level}.svg')
    plt.savefig(png_path, **save_options)
    plt.savefig(svg_path, **save_options)
    plt.show()

# Render and save the split violin plot for the current `cell_type_level`.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

Level Two Cell Type Analysis

# Level Two: keep only the edges computed for this level's anchor cell type,
# then summarise 'Distance' (mean, median, min, max) per cell type and unique region.
cell_type_level = 'Level Two Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[
    (df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level)
    & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])
]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])
    .agg(
        mean_distance=pd.NamedAgg(column='Distance', aggfunc='mean'),
        median_distance=pd.NamedAgg(column='Distance', aggfunc='median'),
        min_distance=pd.NamedAgg(column='Distance', aggfunc='min'),
        max_distance=pd.NamedAgg(column='Distance', aggfunc='max'),
    )
    .reset_index()
)
df_distance_stats_cell_type_level
Level Two Cell Type Unique Region mean_distance median_distance min_distance max_distance
0 b cell Disease 37.449808 25.025741 8.194906 191.217795
1 b cell Normal 36.288294 26.858117 8.676844 197.378220
2 endothelial cell of capillary Disease 28.936591 22.843498 4.317932 197.145756
3 endothelial cell of lymphatic vessel Normal 90.450213 82.516556 6.820439 199.965441
4 epithelial cell Disease 47.290109 39.116006 7.217011 199.626286
5 immune cell Disease 199.429368 199.429368 199.429368 199.429368
6 leukocyte Normal 35.224795 27.520283 6.271830 198.055232
7 macrophage Disease 72.456390 62.749724 6.409083 199.948392
8 macrophage Normal 43.693437 35.902059 5.764018 199.800005
9 mast cell Normal 56.037289 45.889204 6.549645 199.829270
10 megakaryocyte Normal 35.709374 28.120303 6.647334 183.441998
11 mixed t cell/epithelial cell population Disease 35.456705 28.416673 7.589362 199.630836
12 muscle cell Disease 50.141764 37.800183 5.314126 199.976085
13 muscle cell Normal 74.768813 67.226262 7.202741 199.967962
14 neutrophil Normal 34.791835 27.134180 5.332036 199.638840
15 t cell Disease 49.913913 37.638729 5.341610 199.924082
16 t cell Normal 52.112910 41.077107 5.873783 199.929988
17 type 1 pneumocyte Normal 36.197262 26.538157 5.164310 199.815662
18 type 2 pneumocyte Disease 35.069124 29.533140 7.236066 199.648377
19 type 2 pneumocyte Normal 40.193411 31.636089 4.647661 199.955656
20 unknown cell Disease 27.554498 20.212155 5.697526 196.945710
21 unknown cell Normal 43.607941 34.399009 4.467862 199.906751
# Print the top and bottom cell types by mean distance for every unique region.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(
        df_all_edges_with_cell_type_level,
        cell_type_level,
        region,
    )
    # Index 0 holds the "top" table, index 1 the "bottom" table.
    top_cell_types = top_bottom[0]
    bottom_cell_types = top_bottom[1]

    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)

Top 5 cell types in Normal:
                    Level Two Cell Type  mean_distance
1  endothelial cell of lymphatic vessel      90.450213
6                           muscle cell      74.768813
4                             mast cell      56.037289
8                                t cell      52.112910
3                            macrophage      43.693437

Bottom 5 cell types in Normal:
  Level Two Cell Type  mean_distance
0              b cell      36.288294
9   type 1 pneumocyte      36.197262
5       megakaryocyte      35.709374
2           leukocyte      35.224795
7          neutrophil      34.791835

Top 5 cell types in Disease:
  Level Two Cell Type  mean_distance
3         immune cell     199.429368
4          macrophage      72.456390
6         muscle cell      50.141764
7              t cell      49.913913
2     epithelial cell      47.290109

Bottom 5 cell types in Disease:
                       Level Two Cell Type  mean_distance
0                                   b cell      37.449808
5  mixed t cell/epithelial cell population      35.456705
8                        type 2 pneumocyte      35.069124
1            endothelial cell of capillary      28.936591
9                             unknown cell      27.554498
# Print the top and bottom cell types by median distance for every unique region.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(
        df_all_edges_with_cell_type_level,
        cell_type_level,
        region,
    )
    # Index 0 holds the "top" table, index 1 the "bottom" table.
    top_cell_types = top_bottom[0]
    bottom_cell_types = top_bottom[1]

    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)

Top 5 cell types in Normal:
                    Level Two Cell Type  median_distance
1  endothelial cell of lymphatic vessel        82.516556
6                           muscle cell        67.226262
4                             mast cell        45.889204
8                                t cell        41.077107
3                            macrophage        35.902059

Bottom 5 cell types in Normal:
  Level Two Cell Type  median_distance
5       megakaryocyte        28.120303
2           leukocyte        27.520283
7          neutrophil        27.134180
0              b cell        26.858117
9   type 1 pneumocyte        26.538157

Top 5 cell types in Disease:
  Level Two Cell Type  median_distance
3         immune cell       199.429368
4          macrophage        62.749724
2     epithelial cell        39.116006
6         muscle cell        37.800183
7              t cell        37.638729

Bottom 5 cell types in Disease:
                       Level Two Cell Type  median_distance
8                        type 2 pneumocyte        29.533140
5  mixed t cell/epithelial cell population        28.416673
0                                   b cell        25.025741
1            endothelial cell of capillary        22.843498
9                             unknown cell        20.212155
# Print mean, standard deviation and coefficient of variation of the distances,
# per region and per cell type, for the current `cell_type_level`.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)

Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Disease        46.51  35.32    75.9
Normal         45.66  34.14    74.8

Cell Type Variability Analysis (sorted by CV):
                                           mean    std  CV (%)
Level Two Cell Type                                           
type 1 pneumocyte                         36.20  28.99    80.1
b cell                                    36.89  29.26    79.3
unknown cell                              40.03  30.98    77.4
t cell                                    50.39  37.60    74.6
neutrophil                                34.79  24.90    71.6
leukocyte                                 35.22  25.04    71.1
megakaryocyte                             35.71  25.36    71.0
macrophage                                55.66  39.15    70.3
type 2 pneumocyte                         37.89  25.70    67.8
mixed t cell/epithelial cell population   35.46  23.98    67.6
endothelial cell of capillary             28.94  19.51    67.4
muscle cell                               61.52  41.15    66.9
mast cell                                 56.04  36.88    65.8
epithelial cell                           47.29  30.10    63.6
endothelial cell of lymphatic vessel      90.45  48.80    54.0
immune cell                              199.43    NaN     NaN
# Produce every figure for the current `cell_type_level`; each helper writes
# its output into the figures directory under `basepath`.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots per cell type and region. This helper converts 'Unique Region' to an
# ordered categorical on the frame in place, so the calls below see that column as categorical.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distances per cell type and region.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Violin plots, one subplot per region.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

# Split violin plot with regions as the hue.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

Level Three Cell Type Analysis

# Level Three: keep only the edges computed for this level's anchor cell type,
# then summarise 'Distance' (mean, median, min, max) per cell type and unique region.
cell_type_level = 'Level Three Cell Type'
df_all_edges_with_cell_type_level = df_all_edges_with_cell_types[
    (df_all_edges_with_cell_types['Anchor Cell Type Level'] == cell_type_level)
    & (df_all_edges_with_cell_types['Anchor Cell Type'] == anchor_cell_type_dict[cell_type_level])
]

df_distance_stats_cell_type_level = (
    df_all_edges_with_cell_type_level
    .groupby([cell_type_level, 'Unique Region'])
    .agg(
        mean_distance=pd.NamedAgg(column='Distance', aggfunc='mean'),
        median_distance=pd.NamedAgg(column='Distance', aggfunc='median'),
        min_distance=pd.NamedAgg(column='Distance', aggfunc='min'),
        max_distance=pd.NamedAgg(column='Distance', aggfunc='max'),
    )
    .reset_index()
)
df_distance_stats_cell_type_level
Level Three Cell Type Unique Region mean_distance median_distance min_distance max_distance
0 b cell Disease 37.449808 25.025741 8.194906 191.217795
1 b cell Normal 36.288294 26.858117 8.676844 197.378220
2 endothelial cell of capillary Disease 28.936591 22.843498 4.317932 197.145756
3 endothelial cell of lymphatic vessel Normal 90.450213 82.516556 6.820439 199.965441
4 epithelial cell Disease 47.290109 39.116006 7.217011 199.626286
5 immune cell Disease 199.429368 199.429368 199.429368 199.429368
6 leukocyte Normal 35.224795 27.520283 6.271830 198.055232
7 macrophage Disease 72.456390 62.749724 6.409083 199.948392
8 macrophage Normal 43.683982 35.893711 5.764018 199.800005
9 macrophage:interstitial Normal 47.581049 39.836315 9.818172 172.605327
10 mast cell Normal 56.037289 45.889204 6.549645 199.829270
11 megakaryocyte Normal 35.709374 28.120303 6.647334 183.441998
12 mixed t cell/epithelial cell population Disease 35.456705 28.416673 7.589362 199.630836
13 muscle cell:smooth Disease 50.141764 37.800183 5.314126 199.976085
14 muscle cell:smooth Normal 74.768813 67.226262 7.202741 199.967962
15 neutrophil Normal 34.791835 27.134180 5.332036 199.638840
16 pneumocyte:type 1 Normal 36.197262 26.538157 5.164310 199.815662
17 pneumocyte:type 2 Disease 35.069124 29.533140 7.236066 199.648377
18 pneumocyte:type 2 Normal 40.193411 31.636089 4.647661 199.955656
19 t cell Disease 52.833976 43.817389 7.516588 199.586367
20 t cell:cd4+ alpha-beta Disease 39.855009 30.664763 5.341610 199.765323
21 t cell:cd4+ alpha-beta Normal 61.190621 50.726455 6.368590 199.929988
22 t cell:cd8+ Disease 72.166680 63.643480 7.237513 199.924082
23 t cell:cd8+ Normal 45.077855 35.291540 5.873783 199.742345
24 unknown cell Disease 27.554498 20.212155 5.697526 196.945710
25 unknown cell Normal 43.607941 34.399009 4.467862 199.906751
# Print the top and bottom cell types by mean distance for every unique region.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_mean(
        df_all_edges_with_cell_type_level,
        cell_type_level,
        region,
    )
    # Index 0 holds the "top" table, index 1 the "bottom" table.
    top_cell_types = top_bottom[0]
    bottom_cell_types = top_bottom[1]

    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)

Top 5 cell types in Normal:
                   Level Three Cell Type  mean_distance
1   endothelial cell of lymphatic vessel      90.450213
7                     muscle cell:smooth      74.768813
11                t cell:cd4+ alpha-beta      61.190621
5                              mast cell      56.037289
4                macrophage:interstitial      47.581049

Bottom 5 cell types in Normal:
  Level Three Cell Type  mean_distance
0                b cell      36.288294
9     pneumocyte:type 1      36.197262
6         megakaryocyte      35.709374
2             leukocyte      35.224795
8            neutrophil      34.791835

Top 5 cell types in Disease:
   Level Three Cell Type  mean_distance
3            immune cell     199.429368
4             macrophage      72.456390
10           t cell:cd8+      72.166680
8                 t cell      52.833976
6     muscle cell:smooth      50.141764

Bottom 5 cell types in Disease:
                      Level Three Cell Type  mean_distance
0                                    b cell      37.449808
5   mixed t cell/epithelial cell population      35.456705
7                        pneumocyte:type 2       35.069124
1             endothelial cell of capillary      28.936591
11                             unknown cell      27.554498
# Print the top and bottom cell types by median distance for every unique region.
unique_regions = df_all_edges_with_cell_type_level['Unique Region'].unique()
for region in unique_regions:
    top_bottom = get_top_bottom_cell_types_by_median(
        df_all_edges_with_cell_type_level,
        cell_type_level,
        region,
    )
    # Index 0 holds the "top" table, index 1 the "bottom" table.
    top_cell_types = top_bottom[0]
    bottom_cell_types = top_bottom[1]

    print(f"\nTop 5 cell types in {region}:")
    print(top_cell_types)
    print(f"\nBottom 5 cell types in {region}:")
    print(bottom_cell_types)

Top 5 cell types in Normal:
                   Level Three Cell Type  median_distance
1   endothelial cell of lymphatic vessel        82.516556
7                     muscle cell:smooth        67.226262
11                t cell:cd4+ alpha-beta        50.726455
5                              mast cell        45.889204
4                macrophage:interstitial        39.836315

Bottom 5 cell types in Normal:
  Level Three Cell Type  median_distance
6         megakaryocyte        28.120303
2             leukocyte        27.520283
8            neutrophil        27.134180
0                b cell        26.858117
9     pneumocyte:type 1        26.538157

Top 5 cell types in Disease:
   Level Three Cell Type  median_distance
3            immune cell       199.429368
10           t cell:cd8+        63.643480
4             macrophage        62.749724
8                 t cell        43.817389
2        epithelial cell        39.116006

Bottom 5 cell types in Disease:
                      Level Three Cell Type  median_distance
7                        pneumocyte:type 2         29.533140
5   mixed t cell/epithelial cell population        28.416673
0                                    b cell        25.025741
1             endothelial cell of capillary        22.843498
11                             unknown cell        20.212155
# Print mean, standard deviation and coefficient of variation of the distances,
# per region and per cell type, for the current `cell_type_level`.
calculate_regional_variability(df_all_edges_with_cell_type_level, cell_type_level)

Regional Variability Analysis:
Mean: Average distance in each region
Std: Standard deviation of distances
CV: Coefficient of Variation (std/mean * 100%)
                mean    std  CV (%)
Unique Region                      
Disease        46.51  35.32    75.9
Normal         45.66  34.14    74.8

Cell Type Variability Analysis (sorted by CV):
                                           mean    std  CV (%)
Level Three Cell Type                                         
pneumocyte:type 1                         36.20  28.99    80.1
b cell                                    36.89  29.26    79.3
unknown cell                              40.03  30.98    77.4
t cell:cd4+ alpha-beta                    43.13  32.55    75.5
neutrophil                                34.79  24.90    71.6
leukocyte                                 35.22  25.04    71.1
megakaryocyte                             35.71  25.36    71.0
macrophage                                55.67  39.15    70.3
macrophage:interstitial                   47.58  32.59    68.5
t cell:cd8+                               62.80  42.56    67.8
pneumocyte:type 2                         37.89  25.70    67.8
mixed t cell/epithelial cell population   35.46  23.98    67.6
endothelial cell of capillary             28.94  19.51    67.4
muscle cell:smooth                        61.52  41.15    66.9
mast cell                                 56.04  36.88    65.8
t cell                                    52.83  34.08    64.5
epithelial cell                           47.29  30.10    63.6
endothelial cell of lymphatic vessel      90.45  48.80    54.0
immune cell                              199.43    NaN     NaN
# Produce every figure for the current `cell_type_level`; each helper writes
# its output into the figures directory under `basepath`.
plot_violin_cells_per_celltype(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm='area')

# Boxplots per cell type and region. This helper converts 'Unique Region' to an
# ordered categorical on the frame in place, so the calls below see that column as categorical.
plot_distance_distribution_boxplots_by_region(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Heatmap of median distances per cell type and region.
plot_distance_distribution_heatmap(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir))

# Violin plots, one subplot per region.
plot_violin_plots_all_regions(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.

# Split violin plot with regions as the hue.
plot_violin_cells_per_celltype_split_by_condition(df_all_edges_with_cell_type_level, cell_type_level, os.path.join(basepath, figures_output_dir), density_norm="count") # density_norm="count" or "area" can be used based on preference.