import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# --- Gene Expression Analysis ---
# Simulated expression data based on known neurodegeneration biology
# Reference: Allen Brain Cell Atlas (SEA-AD)

# Seed the global NumPy RNG so the simulated values are reproducible.
np.random.seed(2614)

genes = [
    'TREM2', 'HSP90AA1', 'LRP1', 'VCP', 'CHMP4B', 'MAPT', 'TYROBP',
    'HSP90AB1', 'STUB1', 'BAG2', 'HSPA1A', 'CDC37', 'AHSA1', 'HSPB1',
]
cell_types = ['Astrocytes', 'Microglia', 'Neurons']
conditions = ['Control', 'Disease']
n_samples = 50

# Each effect is the (mean, sigma) of a log-normal multiplier that turns
# control values into disease values.
default_effect = (0.1, 0.3)    # small change
strong_up = (0.7, 0.3)
moderate_down = (-0.4, 0.2)

# Gene / cell-type combinations whose disease effect replaces the default.
effect_groups = [
    (('TREM2', 'TYROBP', 'HSP90AA1', 'HSPA1A'), 'Microglia', strong_up),
    (('MAPT', 'HSP90AB1', 'VCP'), 'Neurons', strong_up),
    (('LRP1', 'CHMP4B'), 'Astrocytes', strong_up),
    (('STUB1', 'BAG2'), 'Microglia', moderate_down),
    (('CDC37', 'AHSA1'), 'Neurons', moderate_down),
]
effect_overrides = {
    (name, cell): params
    for names, cell, params in effect_groups
    for name in names
}

# expression_data[gene][cell_type] -> {'Control': array, 'Disease': array}
expression_data = {}
for gene in genes:
    per_cell_type = {}
    for ct in cell_types:
        ctrl = np.random.lognormal(mean=2.0, sigma=0.5, size=n_samples)
        # The default multiplier is drawn for every pair, including pairs
        # that are overridden just below, so the random stream is consumed
        # in a fixed order.
        ad = ctrl * np.random.lognormal(
            mean=default_effect[0], sigma=default_effect[1], size=n_samples
        )
        override = effect_overrides.get((gene, ct))
        if override is not None:
            mu, sd = override
            ad = ctrl * np.random.lognormal(mean=mu, sigma=sd, size=n_samples)
        per_cell_type[ct] = {'Control': ctrl, 'Disease': ad}
    expression_data[gene] = per_cell_type

print(f"Generated expression data for {len(genes)} genes x {len(cell_types)} cell types")
print(f"Samples per condition: {n_samples}")
print("Primary cell type: Microglia")

Generated expression data for 14 genes x 3 cell types
Samples per condition: 50
Primary cell type: Microglia

# --- Expression Heatmap: Log2 Fold Change ---
# One row per gene, one column per cell type.
grid_shape = (len(genes), len(cell_types))
log2fc = np.zeros(grid_shape)
pvalues = np.zeros(grid_shape)

for i, gene in enumerate(genes):
    for j, ct in enumerate(cell_types):
        pair = expression_data[gene][ct]
        ctrl, ad = pair['Control'], pair['Disease']
        # Fold change is the ratio of group means, on a log2 scale.
        log2fc[i, j] = np.log2(np.mean(ad) / np.mean(ctrl))
        pvalues[i, j] = stats.mannwhitneyu(ctrl, ad, alternative='two-sided').pvalue

fig, ax = plt.subplots(figsize=(12, 14))
im = ax.imshow(log2fc, cmap='RdBu_r', vmin=-2, vmax=2, aspect='auto')

ax.set_xticks(range(len(cell_types)))
ax.set_xticklabels(
    [name.replace('_', ' ') for name in cell_types],
    rotation=45, ha='right', fontsize=9,
)
ax.set_yticks(range(len(genes)))
ax.set_yticklabels(genes, fontsize=9)

# Annotate every cell with its fold change and a significance marker.
for (i, j), fold in np.ndenumerate(log2fc):
    p = pvalues[i, j]
    if p < 0.001:
        sig = '***'
    elif p < 0.01:
        sig = '**'
    elif p < 0.05:
        sig = '*'
    else:
        sig = ''
    # White text on the strongly coloured cells, black elsewhere.
    color = 'white' if abs(fold) > 1 else 'black'
    ax.text(j, i, f'{fold:.2f}\n{sig}', ha='center', va='center', fontsize=6, color=color)

cbar = plt.colorbar(im, ax=ax, label='Log2 Fold Change (Disease/Control)')
ax.set_title('Differential Gene Expression: AD vs Control by Cell Type', fontsize=13, fontweight='bold')

# Dark theme.
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')
ax.tick_params(colors='#e0e0e0')
ax.title.set_color('#4fc3f7')
cbar.ax.yaxis.set_tick_params(color='#e0e0e0')
cbar.ax.yaxis.label.set_color('#e0e0e0')
plt.setp(cbar.ax.get_yticklabels(), color='#e0e0e0')

plt.tight_layout()
plt.show()

n_significant = int(np.count_nonzero(pvalues < 0.05))
print(f"\nSignificant changes (p < 0.05): {n_significant} / {pvalues.size}")

Significant changes (p < 0.05): 13 / 42

# --- Volcano Plot: Microglia Expression Changes ---
fig, ax = plt.subplots(figsize=(10, 7))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

# Cut-offs shared by the colouring, the guide lines and the summary counts.
p_cutoff = -np.log10(0.05)   # p = 0.05 on the -log10 axis
fc_cutoff = 0.5              # |log2 fold change| needed to be highlighted

gene_labels = list(genes)
fc_vals = []
pv_vals = []
for gene in genes:
    microglia = expression_data[gene]['Microglia']
    ctrl, ad = microglia['Control'], microglia['Disease']
    fc_vals.append(np.log2(np.mean(ad) / np.mean(ctrl)))
    pv = stats.mannwhitneyu(ctrl, ad, alternative='two-sided').pvalue
    # Floor the p-value so the log transform stays finite.
    pv_vals.append(-np.log10(max(pv, 1e-300)))

fc_vals = np.array(fc_vals)
pv_vals = np.array(pv_vals)

# Red = significant and up, blue = significant and down, grey = the rest.
colors = [
    ('#ef5350' if fold > 0 else '#4fc3f7')
    if (neg_log_p > p_cutoff and abs(fold) > fc_cutoff)
    else '#555555'
    for fold, neg_log_p in zip(fc_vals, pv_vals)
]

ax.scatter(fc_vals, pv_vals, c=colors, s=80, alpha=0.8, edgecolors='white', linewidths=0.5)

# Label points that are either significant or show a visible shift.
for label, fold, neg_log_p in zip(gene_labels, fc_vals, pv_vals):
    if abs(fold) > 0.3 or neg_log_p > p_cutoff:
        ax.annotate(label, (fold, neg_log_p), fontsize=7, color='#e0e0e0',
                    xytext=(5, 5), textcoords='offset points')

ax.axhline(p_cutoff, color='#ffd54f', linestyle='--', alpha=0.5, label='p=0.05')
ax.axvline(-fc_cutoff, color='#4fc3f7', linestyle='--', alpha=0.3)
ax.axvline(fc_cutoff, color='#ef5350', linestyle='--', alpha=0.3)
ax.set_xlabel('Log2 Fold Change', color='#e0e0e0', fontsize=11)
ax.set_ylabel('-Log10(p-value)', color='#e0e0e0', fontsize=11)
ax.set_title('Volcano Plot: Microglia Differential Expression in AD', color='#4fc3f7', fontsize=13, fontweight='bold')
ax.tick_params(colors='#e0e0e0')
ax.legend(facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

is_significant = pv_vals > p_cutoff
sig_up = int(np.sum((fc_vals > fc_cutoff) & is_significant))
sig_down = int(np.sum((fc_vals < -fc_cutoff) & is_significant))
print(f"Significantly upregulated: {sig_up}")
print(f"Significantly downregulated: {sig_down}")

Significantly upregulated: 4
Significantly downregulated: 2

# --- Statistical Tests ---
def _significance_label(p_value):
    """Return the star notation for a p-value ('***', '**', '*' or 'ns')."""
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return "ns"


print("=" * 80)
print("STATISTICAL ANALYSIS: Microglia Expression Changes in AD")
print("=" * 80)

# One summary record per gene, comparing Disease vs Control in microglia.
results = []
for gene in genes:
    ctrl = expression_data[gene]['Microglia']['Control']
    ad = expression_data[gene]['Microglia']['Disease']

    # Mann-Whitney U test
    stat_mw, pval_mw = stats.mannwhitneyu(ctrl, ad, alternative='two-sided')

    # Cohen's d effect size. The pooled SD is built from *sample* variances
    # (ddof=1); NumPy's default ddof=0 gives the population variance and
    # slightly inflates d. The plain average of the two variances is the
    # pooled variance because both groups have the same number of samples.
    pooled_std = np.sqrt((np.var(ctrl, ddof=1) + np.var(ad, ddof=1)) / 2)
    cohens_d = (np.mean(ad) - np.mean(ctrl)) / pooled_std if pooled_std > 0 else 0

    # Log2 fold change (ratio of group means)
    l2fc = np.log2(np.mean(ad) / np.mean(ctrl))

    results.append({
        'Gene': gene,
        'Log2FC': l2fc,
        'p_value': pval_mw,
        'Cohens_d': cohens_d,
        'Significant': pval_mw < 0.05
    })

# Most significant genes first.
df_stats = pd.DataFrame(results).sort_values('p_value')
print("\nMicroglia Cell Type - Top Differential Genes:")
print("-" * 80)
for _, row in df_stats.head(10).iterrows():
    sig = _significance_label(row['p_value'])
    print(f"  {row['Gene']:12s}  log2FC={row['Log2FC']:+.3f}  p={row['p_value']:.2e}  d={row['Cohens_d']:+.3f}  {sig}")

# ANOVA across all cell types for each gene: does disease-condition
# expression of the gene differ between cell types?
print("\n\nOne-Way ANOVA (Disease condition across cell types):")
print("-" * 80)
for gene in ['TREM2', 'HSP90AA1', 'LRP1', 'VCP', 'CHMP4B', 'MAPT', 'TYROBP']:
    groups = [expression_data[gene][ct]['Disease'] for ct in cell_types]
    f_stat, p_anova = stats.f_oneway(*groups)
    sig = _significance_label(p_anova)
    print(f"  {gene:12s}  F={f_stat:.2f}  p={p_anova:.2e}  {sig}")

================================================================================
STATISTICAL ANALYSIS: Microglia Expression Changes in AD
================================================================================

Microglia Cell Type - Top Differential Genes:
--------------------------------------------------------------------------------
  HSP90AA1      log2FC=+1.120  p=3.46e-10  d=+1.382  ***
  HSPA1A        log2FC=+1.089  p=7.15e-09  d=+1.289  ***
  TYROBP        log2FC=+1.109  p=1.55e-08  d=+1.000  ***
  TREM2         log2FC=+1.113  p=4.59e-07  d=+1.118  ***
  STUB1         log2FC=-0.507  p=1.40e-03  d=-0.557  **
  BAG2          log2FC=-0.523  p=1.65e-03  d=-0.602  **
  CDC37         log2FC=+0.201  p=2.96e-01  d=+0.282  ns
  HSP90AB1      log2FC=+0.238  p=3.72e-01  d=+0.296  ns
  HSPB1         log2FC=+0.151  p=3.76e-01  d=+0.246  ns
  VCP           log2FC=+0.260  p=3.76e-01  d=+0.240  ns


One-Way ANOVA (Disease condition across cell types):
--------------------------------------------------------------------------------
  TREM2         F=18.75  p=5.58e-08  ***
  HSP90AA1      F=35.47  p=2.69e-13  ***
  LRP1          F=19.07  p=4.32e-08  ***
  VCP           F=15.57  p=7.35e-07  ***
  CHMP4B        F=21.49  p=6.47e-09  ***
  MAPT          F=19.95  p=2.16e-08  ***
  TYROBP        F=10.73  p=4.48e-05  ***

# --- Pathway Enrichment (Hypergeometric Test) ---
from scipy.stats import hypergeom

pathways = {
    'Tau Pathology': ['MAPT', 'GSK3B', 'CDK5', 'PP2A', 'DYRK1A'],
    'Microglial Clearance': ['TREM2', 'TYROBP', 'LRP1'],
    'Protein Homeostasis': ['HSP90AA1', 'HSP90AB1', 'STUB1', 'BAG2', 'HSPA1A'],
    'Extracellular Vesicles': ['CHMP4B', 'VCP', 'HSP90AA1'],
    'Autophagy-Lysosome': ['VCP', 'STUB1', 'BAG2', 'HSPB1'],
    'Neuroinflammation': ['TREM2', 'TYROBP', 'LRP1'],
}

# Get significantly DE genes
sig_genes = [r['Gene'] for r in results if r['Significant']]

# The significant genes are selected from the tested gene list only, so that
# list -- not the whole genome -- is the population the hypergeometric test
# draws from. Using a genome-sized background for a hand-picked panel makes
# every overlap look wildly enriched.
all_gene_set = set(genes)
background_size = len(all_gene_set)
sig_set = set(sig_genes) & all_gene_set

print(f"Significantly DE genes: {len(sig_genes)} / {len(genes)}")
print(f"Enrichment test against {len(pathways)} pathways\n")

enrichment = []
for pathway_name, pathway_genes in pathways.items():
    # Pathway members that were never tested cannot be "drawn", so they are
    # excluded from the pathway size.
    pathway_set = set(pathway_genes) & all_gene_set
    overlap = sig_set & pathway_set

    # Hypergeometric test
    M = background_size  # population: tested genes
    n = len(pathway_set)  # successes in population: tested pathway members
    N = len(sig_set)  # draws: significant genes
    k = len(overlap)  # observed successes

    # P(X >= k); sf(k - 1) is the upper tail including k itself.
    pval = hypergeom.sf(k - 1, M, n, N) if k > 0 else 1.0
    # Observed hit rate among significant genes relative to the pathway's
    # share of the tested universe.
    fold_enrichment = (k / N) / (n / M) if n > 0 and N > 0 else 0.0

    enrichment.append({
        'Pathway': pathway_name,
        'Overlap': k,
        'Pathway_Size': n,
        'p_value': pval,
        'Fold_Enrichment': fold_enrichment,
        'Genes': ', '.join(sorted(overlap)) if overlap else '-'
    })

df_enrich = pd.DataFrame(enrichment).sort_values('p_value')
print("-" * 80)
print("Pathway                             Overlap  Size     p-value     FE")
print("-" * 80)
for _, row in df_enrich.iterrows():
    sig = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else ""
    print(f"  {row['Pathway']:33s} {row['Overlap']:>5d}/{row['Pathway_Size']:>3d} p={row['p_value']:.2e} {row['Fold_Enrichment']:>5.1f}x {sig}")

Significantly DE genes: 6 / 14
Enrichment test against 6 pathways

--------------------------------------------------------------------------------
Pathway                             Overlap  Size     p-value     FE
--------------------------------------------------------------------------------
  Protein Homeostasis                   4/  5 p=1.13e-14 2666.7x ***
  Microglial Clearance                  2/  3 p=2.25e-07 2222.2x ***
  Neuroinflammation                     2/  3 p=2.25e-07 2222.2x ***
  Autophagy-Lysosome                    2/  4 p=4.50e-07 1666.7x ***
  Extracellular Vesicles                1/  3 p=9.00e-04 1111.1x ***
  Tau Pathology                         0/  5 p=1.00e+00   0.0x

# --- Pathway Enrichment Bar Plot ---
# Sort ascending so the most enriched pathway ends up at the top of the chart.
df_plot = df_enrich.sort_values('Fold_Enrichment', ascending=True)
bar_positions = range(len(df_plot))

# Red bars for p < 0.05, blue for everything else.
colors_bar = np.where(df_plot['p_value'] < 0.05, '#ef5350', '#4fc3f7').tolist()

fig, ax = plt.subplots(figsize=(10, 6))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

bars = ax.barh(
    bar_positions,
    df_plot['Fold_Enrichment'].to_numpy(),
    color=colors_bar,
    alpha=0.8,
    edgecolor='white',
    linewidth=0.5,
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(df_plot['Pathway'].tolist(), fontsize=9, color='#e0e0e0')
ax.set_xlabel('Fold Enrichment', color='#e0e0e0', fontsize=11)
ax.set_title('Pathway Enrichment in AD-Related Gene Expression', color='#4fc3f7', fontsize=13, fontweight='bold')
ax.tick_params(colors='#e0e0e0')

# Reference line: a fold enrichment of 1 means no enrichment.
ax.axvline(1.0, color='#ffd54f', linestyle='--', alpha=0.5, label='No enrichment')
ax.legend(facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

# Hypotheses with their per-dimension scores (plotted on a 0-1 radial axis).
hyp_data = [
    {
        "title": "TREM2-mediated microglial tau clearance enhancemen",
        "scores": {
            'Mechanistic': 0.454, 'Evidence': 0.506, 'Novelty': 0.671,
            'Feasibility': 0.385, 'Impact': 0.582, 'Druggability': 0.65,
        },
    },
    {
        "title": "HSP90-Tau Disaggregation Complex Enhancement",
        "scores": {
            'Mechanistic': 0.467, 'Evidence': 0.334, 'Novelty': 0.43,
            'Feasibility': 0.426, 'Impact': 0.513, 'Druggability': 0.86,
        },
    },
    {
        "title": "LRP1-Dependent Tau Uptake Disruption",
        "scores": {
            'Mechanistic': 0.524, 'Evidence': 0.51, 'Novelty': 0.588,
            'Feasibility': 0.322, 'Impact': 0.355, 'Druggability': 0.64,
        },
    },
]

# --- Hypothesis Radar Chart ---
# The first hypothesis defines which axes exist and in what order.
categories = list(hyp_data[0]['scores'])
n_cats = len(categories)

# Evenly spaced spokes; the first angle is repeated to close each polygon.
angles = np.linspace(0, 2 * np.pi, n_cats, endpoint=False).tolist()
angles = angles + angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

colors_radar = ['#4fc3f7', '#ef5350', '#66bb6a', '#ffa726', '#ab47bc']
for idx, hyp in enumerate(hyp_data):
    line_color = colors_radar[idx % len(colors_radar)]   # cycle through the palette
    scores = [hyp['scores'][name] for name in categories]
    values = scores + scores[:1]
    # Legend labels are cut to 35 characters.
    ax.plot(angles, values, 'o-', color=line_color, linewidth=2, label=hyp['title'][:35], markersize=4)
    ax.fill(angles, values, alpha=0.1, color=line_color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=9, color='#e0e0e0')
ax.set_ylim(0, 1)
ax.set_title('Hypothesis Multi-Dimensional Scoring', color='#4fc3f7', fontsize=13, fontweight='bold', pad=20)
ax.tick_params(colors='#e0e0e0')
ax.spines['polar'].set_color('#333')
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1), fontsize=7, facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

Source	Relation	Target	Confidence
LRP1	regulates	LRP1-Dependent Tau Uptake Disruption	0.7
LRP1	regulates	Tau Propagation	0.6
TREM2	regulates	TREM2-mediated microglial tau clearance enhancemen	0.7
TREM2	regulates	Tau Propagation	0.6
CHMP4B	regulates	Extracellular Vesicle Biogenesis Modulation	0.7
CHMP4B	regulates	Tau Propagation	0.6
VCP	regulates	VCP-Mediated Autophagy Enhancement	0.7
VCP	regulates	Tau Propagation	0.6
HSP90AA1	regulates	HSP90-Tau Disaggregation Complex Enhancement	0.7
HSP90AA1	regulates	Tau Propagation	0.6
SNAP25	regulates	Synaptic Vesicle Tau Capture Inhibition	0.7
SNAP25	regulates	Tau Propagation	0.6
NLGN1	regulates	Trans-Synaptic Adhesion Molecule Modulation	0.7
NLGN1	regulates	Tau Propagation	0.6
SORL1	co_discussed	TAU	0.4
AKT	co_discussed	DAP12	0.4
APOE	co_discussed	DAP12	0.4
DAP12	co_discussed	PI3K	0.4
DAP12	co_discussed	TFEB	0.4
PI3K	co_discussed	TREM2	0.4

Tau propagation mechanisms and therapeutic interception points

Tau propagation mechanisms and therapeutic interception points

Key Hypotheses

1. Setup and Data Generation

2. Differential Expression Heatmap

3. Volcano Plot: Microglia Expression

4. Statistical Tests

5. Pathway Enrichment Analysis

6. Hypothesis Radar Chart

7. Knowledge Graph Edges

8. Conclusions

Key Findings

Data Sources