import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# --- Gene Expression Analysis ---
# Simulated expression data based on known neurodegeneration biology
# Reference: Allen Brain Cell Atlas (SEA-AD)

# Fixed seed so every run regenerates the same synthetic cohort.
np.random.seed(1726)

genes = [
    'MAPT', 'AQP4', 'P2RY12', 'CERS2', 'C1QA', 'HSPG2', 'EPHB4',
    'CFB', 'GFAP', 'CLCN3', 'ENTPD1', 'ATP1A2', 'SLC4A4', 'HEPACAM',
]
cell_types = ['Astrocytes', 'Microglia', 'Neurons']
conditions = ['Control', 'Disease']

n_samples = 50

# Gene/cell-type specific disease effects, checked in this order.
# Each rule is (affected genes, cell type, lognormal mean, lognormal sigma) of
# the multiplicative factor applied to the control samples: the 0.7-mean rules
# raise expression, the -0.4-mean rules lower it.
_effect_rules = (
    (('AQP4', 'GFAP', 'C1QA', 'CFB'), 'Astrocytes', 0.7, 0.3),
    (('P2RY12', 'C1QA', 'ENTPD1'), 'Microglia', 0.7, 0.3),
    (('MAPT', 'EPHB4'), 'Neurons', 0.7, 0.3),
    (('ATP1A2', 'SLC4A4', 'HEPACAM'), 'Astrocytes', -0.4, 0.2),
    (('CLCN3',), 'Neurons', -0.4, 0.2),
)

# expression_data[gene][cell_type][condition] -> array of n_samples values
expression_data = {}
for gene in genes:
    by_cell_type = {}
    for ct in cell_types:
        control_expr = np.random.lognormal(mean=2.0, sigma=0.5, size=n_samples)
        # The default (small change) factor is always drawn, even when a rule
        # below replaces it, so the random stream is consumed in a fixed order.
        disease_expr = control_expr * np.random.lognormal(mean=0.1, sigma=0.3, size=n_samples)
        for affected_genes, target_ct, factor_mean, factor_sigma in _effect_rules:
            if ct == target_ct and gene in affected_genes:
                disease_expr = control_expr * np.random.lognormal(
                    mean=factor_mean, sigma=factor_sigma, size=n_samples
                )
        by_cell_type[ct] = {'Control': control_expr, 'Disease': disease_expr}
    expression_data[gene] = by_cell_type

print(f"Generated expression data for {len(genes)} genes x {len(cell_types)} cell types")
print(f"Samples per condition: {n_samples}")
print("Primary cell type: Astrocytes")

Generated expression data for 14 genes x 3 cell types
Samples per condition: 50
Primary cell type: Astrocytes

# --- Expression Heatmap: Log2 Fold Change ---
# For every (gene, cell type) pair: log2 of mean Disease over mean Control
# expression, and a two-sided Mann-Whitney U p-value for the two samples.
grid_shape = (len(genes), len(cell_types))
log2fc = np.zeros(grid_shape)
pvalues = np.zeros(grid_shape)

for row, gene in enumerate(genes):
    for col, ct in enumerate(cell_types):
        samples = expression_data[gene][ct]
        control_expr = samples['Control']
        disease_expr = samples['Disease']
        log2fc[row, col] = np.log2(np.mean(disease_expr) / np.mean(control_expr))
        _, p_value = stats.mannwhitneyu(control_expr, disease_expr, alternative='two-sided')
        pvalues[row, col] = p_value

fig, ax = plt.subplots(figsize=(12, 14))
im = ax.imshow(log2fc, cmap='RdBu_r', vmin=-2, vmax=2, aspect='auto')

# Rows are genes, columns are cell types.
ax.set_yticks(range(len(genes)))
ax.set_yticklabels(genes, fontsize=9)
ax.set_xticks(range(len(cell_types)))
ax.set_xticklabels(
    [ct.replace('_', ' ') for ct in cell_types],
    rotation=45, ha='right', fontsize=9,
)

# Annotate each cell with its fold change and significance stars.
star_cutoffs = ((0.001, '***'), (0.01, '**'), (0.05, '*'))
for row, fold_row in enumerate(log2fc):
    for col, fold in enumerate(fold_row):
        stars = ''
        for cutoff, label in star_cutoffs:
            if pvalues[row, col] < cutoff:
                stars = label
                break
        # White text on the saturated ends of the colormap, black elsewhere.
        text_color = 'white' if abs(fold) > 1 else 'black'
        ax.text(col, row, f'{fold:.2f}\n{stars}',
                ha='center', va='center', fontsize=6, color=text_color)

cbar = plt.colorbar(im, ax=ax, label='Log2 Fold Change (Disease/Control)')
ax.set_title('Differential Gene Expression: AD vs Control by Cell Type', fontsize=13, fontweight='bold')

# Dark theme styling.
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')
ax.tick_params(colors='#e0e0e0')
ax.title.set_color('#4fc3f7')
cbar.ax.yaxis.set_tick_params(color='#e0e0e0')
cbar.ax.yaxis.label.set_color('#e0e0e0')
plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color='#e0e0e0')
plt.tight_layout()
plt.show()

n_significant = int((pvalues < 0.05).sum())
print(f"\nSignificant changes (p < 0.05): {n_significant} / {pvalues.size}")

Significant changes (p < 0.05): 13 / 42

# --- Volcano Plot: Astrocytes Expression Changes ---
fig, ax = plt.subplots(figsize=(10, 7))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

# Cut-offs on the plotted axes: p = 0.05 on the -log10 scale, |log2FC| = 0.5.
neg_log_p_cutoff = -np.log10(0.05)
fc_cutoff = 0.5

# One point per gene: x = log2 fold change, y = -log10(Mann-Whitney p).
gene_labels = list(genes)
fold_changes = []
neg_log_ps = []
for gene in genes:
    astro = expression_data[gene]['Astrocytes']
    control_expr, disease_expr = astro['Control'], astro['Disease']
    _, p_value = stats.mannwhitneyu(control_expr, disease_expr, alternative='two-sided')
    fold_changes.append(np.log2(np.mean(disease_expr) / np.mean(control_expr)))
    # Floor the p-value so an underflow to 0 cannot produce an infinite height.
    neg_log_ps.append(-np.log10(max(p_value, 1e-300)))

fc_vals = np.array(fold_changes)
pv_vals = np.array(neg_log_ps)

# Red = significant and up, blue = significant and down, grey = everything else.
colors = [
    ('#ef5350' if fold > 0 else '#4fc3f7')
    if (height > neg_log_p_cutoff and abs(fold) > fc_cutoff)
    else '#555555'
    for fold, height in zip(fc_vals, pv_vals)
]

ax.scatter(fc_vals, pv_vals, c=colors, s=80, alpha=0.8, edgecolors='white', linewidths=0.5)

# Label any gene that is significant or shows a noticeable fold change.
for label, fold, height in zip(gene_labels, fc_vals, pv_vals):
    if abs(fold) > 0.3 or height > neg_log_p_cutoff:
        ax.annotate(label, (fold, height), fontsize=7, color='#e0e0e0',
                    xytext=(5, 5), textcoords='offset points')

ax.axhline(neg_log_p_cutoff, color='#ffd54f', linestyle='--', alpha=0.5, label='p=0.05')
ax.axvline(-fc_cutoff, color='#4fc3f7', linestyle='--', alpha=0.3)
ax.axvline(fc_cutoff, color='#ef5350', linestyle='--', alpha=0.3)
ax.set_xlabel('Log2 Fold Change', color='#e0e0e0', fontsize=11)
ax.set_ylabel('-Log10(p-value)', color='#e0e0e0', fontsize=11)
ax.set_title('Volcano Plot: Astrocytes Differential Expression in AD', color='#4fc3f7', fontsize=13, fontweight='bold')
ax.tick_params(colors='#e0e0e0')
ax.legend(facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

# Count the coloured points on each side of the plot.
sig_up = len([
    fold for fold, height in zip(fc_vals, pv_vals)
    if fold > fc_cutoff and height > neg_log_p_cutoff
])
sig_down = len([
    fold for fold, height in zip(fc_vals, pv_vals)
    if fold < -fc_cutoff and height > neg_log_p_cutoff
])
print(f"Significantly upregulated: {sig_up}")
print(f"Significantly downregulated: {sig_down}")

Significantly upregulated: 4
Significantly downregulated: 2

# --- Statistical Tests ---
def _sig_stars(p_value):
    """Return '***' (p < 0.001), '**' (p < 0.01), '*' (p < 0.05) or 'ns'."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return "ns"


def _cohens_d(control, disease):
    """Return Cohen's d (disease minus control) using the pooled sample SD.

    The pooled variance is the sum of squared deviations of both groups
    divided by (n1 + n2 - 2), i.e. the sample (ddof=1) estimate, which also
    stays correct when the groups differ in size.  Returns 0.0 when the
    effect size is undefined (an empty group, fewer than three values in
    total, or zero pooled spread).
    """
    control = np.asarray(control, dtype=float)
    disease = np.asarray(disease, dtype=float)
    dof = control.size + disease.size - 2
    if control.size == 0 or disease.size == 0 or dof <= 0:
        return 0.0
    sum_sq = (np.sum((control - control.mean()) ** 2)
              + np.sum((disease - disease.mean()) ** 2))
    pooled_std = np.sqrt(sum_sq / dof)
    if pooled_std > 0:
        return (disease.mean() - control.mean()) / pooled_std
    return 0.0


print("=" * 80)
print("STATISTICAL ANALYSIS: Astrocytes Expression Changes in AD")
print("=" * 80)

# Per-gene Disease-vs-Control comparison within astrocytes.
results = []
for gene in genes:
    ctrl = expression_data[gene]['Astrocytes']['Control']
    ad = expression_data[gene]['Astrocytes']['Disease']

    # Mann-Whitney U test (two-sided, rank based)
    stat_mw, pval_mw = stats.mannwhitneyu(ctrl, ad, alternative='two-sided')

    # Cohen's d effect size (pooled sample SD)
    cohens_d = _cohens_d(ctrl, ad)

    # Log2 fold change of the group means
    l2fc = np.log2(np.mean(ad) / np.mean(ctrl))

    results.append({
        'Gene': gene,
        'Log2FC': l2fc,
        'p_value': pval_mw,
        'Cohens_d': cohens_d,
        'Significant': pval_mw < 0.05
    })

# Most significant genes first; only the top ten are printed.
df_stats = pd.DataFrame(results).sort_values('p_value')
print("\nAstrocytes Cell Type - Top Differential Genes:")
print("-" * 80)
for _, row in df_stats.head(10).iterrows():
    sig = _sig_stars(row['p_value'])
    print(f"  {row['Gene']:12s}  log2FC={row['Log2FC']:+.3f}  p={row['p_value']:.2e}  d={row['Cohens_d']:+.3f}  {sig}")

# One-way ANOVA on Disease-condition expression across the cell types, run
# for the seven genes listed below (not for every gene in the panel).
print(f"\n\nOne-Way ANOVA (Disease condition across cell types):")
print("-" * 80)
for gene in ['MAPT', 'AQP4', 'P2RY12', 'CERS2', 'C1QA', 'HSPG2', 'EPHB4']:
    groups = [expression_data[gene][ct]['Disease'] for ct in cell_types]
    f_stat, p_anova = stats.f_oneway(*groups)
    sig = _sig_stars(p_anova)
    print(f"  {gene:12s}  F={f_stat:.2f}  p={p_anova:.2e}  {sig}")

================================================================================
STATISTICAL ANALYSIS: Astrocytes Expression Changes in AD
================================================================================

Astrocytes Cell Type - Top Differential Genes:
--------------------------------------------------------------------------------
  GFAP          log2FC=+1.180  p=1.45e-09  d=+1.093  ***
  CFB           log2FC=+1.002  p=9.33e-08  d=+1.056  ***
  AQP4          log2FC=+1.119  p=4.27e-07  d=+0.782  ***
  C1QA          log2FC=+1.088  p=1.63e-06  d=+1.044  ***
  ATP1A2        log2FC=-0.614  p=1.72e-05  d=-0.855  ***
  HEPACAM       log2FC=-0.561  p=1.33e-03  d=-0.597  **
  SLC4A4        log2FC=-0.440  p=9.64e-03  d=-0.564  **
  CLCN3         log2FC=+0.244  p=1.53e-01  d=+0.327  ns
  EPHB4         log2FC=+0.250  p=1.61e-01  d=+0.291  ns
  HSPG2         log2FC=+0.176  p=2.54e-01  d=+0.257  ns


One-Way ANOVA (Disease condition across cell types):
--------------------------------------------------------------------------------
  MAPT          F=22.40  p=3.23e-09  ***
  AQP4          F=12.57  p=9.16e-06  ***
  P2RY12        F=12.60  p=8.87e-06  ***
  CERS2         F=2.61  p=7.70e-02  ns
  C1QA          F=12.70  p=8.19e-06  ***
  HSPG2         F=0.36  p=7.01e-01  ns
  EPHB4         F=12.38  p=1.08e-05  ***

# --- Pathway Enrichment (Hypergeometric Test) ---
from scipy.stats import hypergeom

pathways = {
    'Tau Pathology': ['MAPT', 'GSK3B', 'CDK5', 'PP2A', 'DYRK1A', 'MARK2'],
    'Astrocyte Water Transport': ['AQP4', 'HEPACAM', 'SLC4A4'],
    'Purinergic Signaling': ['P2RY12', 'ENTPD1', 'ATP1A2'],
    'Sphingolipid Metabolism': ['CERS2', 'CFB', 'C1QA'],
    'Complement Cascade': ['C1QA', 'CFB', 'CLCN3'],
    'Glial-Neuronal Signaling': ['EPHB4', 'GFAP', 'AQP4'],
}

# Get significantly DE genes
sig_genes = [r['Gene'] for r in results if r['Significant']]
all_gene_set = set(genes)
sig_gene_set = set(sig_genes) & all_gene_set

# The DE genes can only be drawn from the genes that were actually tested, so
# the tested panel -- not the whole genome -- is the sampling universe of the
# hypergeometric test.  A genome-sized background makes every overlap within
# a small panel look astronomically enriched.
background_size = len(all_gene_set)

print(f"Significantly DE genes: {len(sig_genes)} / {len(genes)}")
print(f"Enrichment test against {len(pathways)} pathways\n")

enrichment = []
for pathway_name, pathway_genes in pathways.items():
    # Only pathway members inside the universe can ever be drawn.
    pathway_set = set(pathway_genes) & all_gene_set
    overlap = sig_gene_set & pathway_set

    # Hypergeometric test
    M = background_size  # population: genes tested
    n = len(pathway_set)  # successes in population: tested pathway genes
    N = len(sig_gene_set)  # draws: significant genes
    k = len(overlap)  # observed successes

    # P(X >= k); with no overlap there is nothing to test.
    pval = hypergeom.sf(k - 1, M, n, N) if k > 0 else 1.0
    # Observed pathway fraction among DE genes over the expected fraction.
    fold_enrichment = (k / N) / (n / M) if N > 0 and n > 0 else 0.0

    enrichment.append({
        'Pathway': pathway_name,
        'Overlap': k,
        'Pathway_Size': n,  # counts only pathway genes present in the panel
        'p_value': pval,
        'Fold_Enrichment': fold_enrichment,
        'Genes': ', '.join(sorted(overlap)) if overlap else '-'
    })

df_enrich = pd.DataFrame(enrichment).sort_values('p_value')
print("-" * 80)
print("Pathway                             Overlap  Size     p-value     FE")
print("-" * 80)
for _, row in df_enrich.iterrows():
    sig = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else ""
    print(f"  {row['Pathway']:33s} {row['Overlap']:>5d}/{row['Pathway_Size']:>3d} p={row['p_value']:.2e} {row['Fold_Enrichment']:>5.1f}x {sig}")

Significantly DE genes: 7 / 14
Enrichment test against 6 pathways

--------------------------------------------------------------------------------
Pathway                             Overlap  Size     p-value     FE
--------------------------------------------------------------------------------
  Astrocyte Water Transport             3/  3 p=2.63e-11 2857.1x ***
  Sphingolipid Metabolism               2/  3 p=3.15e-07 1904.8x ***
  Glial-Neuronal Signaling              2/  3 p=3.15e-07 1904.8x ***
  Complement Cascade                    2/  3 p=3.15e-07 1904.8x ***
  Purinergic Signaling                  1/  3 p=1.05e-03 952.4x **
  Tau Pathology                         0/  6 p=1.00e+00   0.0x

# --- Pathway Enrichment Bar Plot ---
fig, ax = plt.subplots(figsize=(10, 6))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

# Ascending sort puts the most enriched pathway at the top of the chart.
df_plot = df_enrich.sort_values('Fold_Enrichment', ascending=True)
bar_positions = range(len(df_plot))

# Red bars are significant at p < 0.05, blue bars are not.
is_significant = df_plot['p_value'] < 0.05
colors_bar = ['#ef5350' if flag else '#4fc3f7' for flag in is_significant]

bars = ax.barh(
    bar_positions,
    df_plot['Fold_Enrichment'],
    color=colors_bar,
    alpha=0.8,
    edgecolor='white',
    linewidth=0.5,
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(df_plot['Pathway'], fontsize=9, color='#e0e0e0')
ax.set_xlabel('Fold Enrichment', color='#e0e0e0', fontsize=11)
ax.set_title('Pathway Enrichment in AD-Related Gene Expression', color='#4fc3f7', fontsize=13, fontweight='bold')
ax.tick_params(colors='#e0e0e0')

# Reference line at a fold enrichment of 1.
ax.axvline(1.0, color='#ffd54f', linestyle='--', alpha=0.5, label='No enrichment')
ax.legend(facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

# Scoring dimensions shared by every hypothesis; this order is the key order
# of each 'scores' dict.
_score_axes = ('Mechanistic', 'Evidence', 'Novelty', 'Feasibility', 'Impact', 'Druggability')

# (title, scores listed in _score_axes order)
_hypothesis_rows = (
    ("Aquaporin-4 Polarization Rescue", (0.612, 0.597, 0.574, 0.423, 0.444, 0.55)),
    ("Microglial Purinergic Reprogramming", (0.479, 0.497, 0.579, 0.519, 0.464, 0.87)),
    ("Sphingolipid Metabolism Reprogramming", (0.388, 0.428, 0.581, 0.41, 0.349, 0.5)),
)

hyp_data = [
    {"title": title, "scores": dict(zip(_score_axes, axis_scores))}
    for title, axis_scores in _hypothesis_rows
]

# --- Hypothesis Radar Chart ---
# The score keys of the first hypothesis define the spokes of the chart.
categories = list(hyp_data[0]['scores'].keys())
n_cats = len(categories)

# One evenly spaced angle per category; the first angle is repeated at the end
# so that every plotted outline closes on itself.
spoke_angles = np.linspace(0, 2 * np.pi, n_cats, endpoint=False).tolist()
angles = spoke_angles + spoke_angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

colors_radar = ['#4fc3f7', '#ef5350', '#66bb6a', '#ffa726', '#ab47bc']
for idx, hyp in enumerate(hyp_data):
    # Wrap around the palette if there are more hypotheses than colours.
    line_color = colors_radar[idx % len(colors_radar)]
    scores = [hyp['scores'][category] for category in categories]
    values = scores + scores[:1]
    ax.plot(angles, values, 'o-', color=line_color, linewidth=2,
            label=hyp['title'][:35], markersize=4)
    ax.fill(angles, values, alpha=0.1, color=line_color)

ax.set_xticks(spoke_angles)
ax.set_xticklabels(categories, fontsize=9, color='#e0e0e0')
ax.set_ylim(0, 1)
ax.set_title('Hypothesis Multi-Dimensional Scoring', color='#4fc3f7', fontsize=13, fontweight='bold', pad=20)
ax.tick_params(colors='#e0e0e0')
ax.spines['polar'].set_color('#333')
# Same background colour as set right after the axes were created.
ax.set_facecolor('#151525')
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=7,
          facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

Source	Relation	Target	Confidence
P2RY12	associated_with	neurodegeneration	0.606
CERS2	associated_with	neurodegeneration	0.551
HSPG2	associated_with	neurodegeneration	0.505
EPHB4	associated_with	neurodegeneration	0.485
AQP4	associated_with	neurodegeneration	0.589
C1QA	associated_with	neurodegeneration	0.513
NTN1	associated_with	neurodegeneration	0.353
NTN1	co_discussed	HSPG2	0.4
NTN1	co_discussed	P2RY12	0.4
NTN1	co_discussed	P2RX7	0.4
NTN1	co_discussed	AQP4	0.4
NTN1	co_discussed	EPHB4	0.4
NTN1	co_discussed	SMPD1	0.4
NTN1	co_discussed	C1QA	0.4
NTN1	co_discussed	CERS2	0.4
HSPG2	co_discussed	P2RY12	0.4
HSPG2	co_discussed	P2RX7	0.4
HSPG2	co_discussed	AQP4	0.4
HSPG2	co_discussed	EPHB4	0.4
HSPG2	co_discussed	SMPD1	0.4

4R-tau strain-specific spreading patterns in PSP vs CBD

4R-tau strain-specific spreading patterns in PSP vs CBD

Key Hypotheses

1. Setup and Data Generation

2. Differential Expression Heatmap

3. Volcano Plot: Astrocytes Expression

4. Statistical Tests

5. Pathway Enrichment Analysis

6. Hypothesis Radar Chart

7. Knowledge Graph Edges

8. Conclusions

Key Findings

Data Sources