import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# --- Gene Expression Analysis ---
# Simulated expression data based on known neurodegeneration biology
# Reference: Allen Brain Cell Atlas (SEA-AD)
#
# For every gene x cell type, Control values are lognormal draws and Disease
# values are the Control values scaled by a second lognormal draw.

np.random.seed(9597)

genes = [
    'G3BP1', 'TARDBP', 'HNRNPA2B1', 'SETX', 'SYNCRIP', 'NPM1', 'FUS',
    'EWSR1', 'TAF15', 'TIA1', 'TIAR', 'ATXN2', 'PABPC1', 'SF3B1',
]
cell_types = ['Glia', 'Motor_Neurons']
conditions = ['Control', 'Disease']
n_samples = 50

# (cell type, genes, lognormal mean, lognormal sigma) for the gene/cell-type
# pairs whose Disease multiplier departs from the default small change.
# Rows are checked in this order for every pair.
effect_overrides = [
    ('Motor_Neurons', ('TARDBP', 'FUS', 'G3BP1', 'HNRNPA2B1'), 0.7, 0.3),
    ('Glia', ('ATXN2', 'SYNCRIP', 'PABPC1'), 0.7, 0.3),
    ('Motor_Neurons', ('SETX', 'NPM1', 'EWSR1', 'TAF15'), -0.4, 0.2),
    ('Glia', ('TIA1', 'TIAR', 'SF3B1'), -0.4, 0.2),
]

expression_data = {}
for gene in genes:
    per_cell_type = {}
    for ct in cell_types:
        ctrl = np.random.lognormal(mean=2.0, sigma=0.5, size=n_samples)
        # The default multiplier is drawn unconditionally (even if replaced
        # below) so the random stream is consumed in a fixed order.
        ad = ctrl * np.random.lognormal(mean=0.1, sigma=0.3, size=n_samples)
        for target_ct, target_genes, mu, sd in effect_overrides:
            if ct == target_ct and gene in target_genes:
                ad = ctrl * np.random.lognormal(mean=mu, sigma=sd, size=n_samples)
        per_cell_type[ct] = {'Control': ctrl, 'Disease': ad}
    expression_data[gene] = per_cell_type

print(f"Generated expression data for {len(genes)} genes x {len(cell_types)} cell types")
print(f"Samples per condition: {n_samples}")
print("Primary cell type: Motor_Neurons")

Generated expression data for 14 genes x 2 cell types
Samples per condition: 50
Primary cell type: Motor_Neurons

# --- Expression Heatmap: Log2 Fold Change ---
# For each gene x cell type: log2 of (mean Disease / mean Control) and the
# two-sided Mann-Whitney U p-value between the two sample sets.
heat_shape = (len(genes), len(cell_types))
log2fc = np.zeros(heat_shape)
pvalues = np.zeros(heat_shape)

for i, gene in enumerate(genes):
    for j, ct in enumerate(cell_types):
        ctrl = expression_data[gene][ct]['Control']
        ad = expression_data[gene][ct]['Disease']
        log2fc[i, j] = np.log2(ad.mean() / ctrl.mean())
        pvalues[i, j] = stats.mannwhitneyu(ctrl, ad, alternative='two-sided')[1]

fig, ax = plt.subplots(figsize=(12, 14))
# Colour scale is clamped to [-2, 2] log2 units.
im = ax.imshow(log2fc, cmap='RdBu_r', vmin=-2, vmax=2, aspect='auto')

x_labels = [ct.replace('_', ' ') for ct in cell_types]
ax.set_xticks(range(len(cell_types)))
ax.set_xticklabels(x_labels, rotation=45, ha='right', fontsize=9)
ax.set_yticks(range(len(genes)))
ax.set_yticklabels(genes, fontsize=9)

# Annotate every cell with its fold change and a significance marker.
# np.ndenumerate walks the matrix row by row (gene-major).
for (i, j), fc_cell in np.ndenumerate(log2fc):
    p_cell = pvalues[i, j]
    if p_cell < 0.001:
        sig = '***'
    elif p_cell < 0.01:
        sig = '**'
    elif p_cell < 0.05:
        sig = '*'
    else:
        sig = ''
    # White text on cells with |log2FC| above 1, black otherwise.
    color = 'white' if abs(fc_cell) > 1 else 'black'
    ax.text(j, i, f'{fc_cell:.2f}\n{sig}', ha='center', va='center', fontsize=6, color=color)

cbar = plt.colorbar(im, ax=ax, label='Log2 Fold Change (Disease/Control)')
ax.set_title('Differential Gene Expression: AD vs Control by Cell Type', fontsize=13, fontweight='bold')

# Dark theme styling.
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')
ax.tick_params(colors='#e0e0e0')
ax.title.set_color('#4fc3f7')
cbar.ax.yaxis.set_tick_params(color='#e0e0e0')
cbar.ax.yaxis.label.set_color('#e0e0e0')
cbar_tick_labels = plt.getp(cbar.ax.axes, 'yticklabels')
plt.setp(cbar_tick_labels, color='#e0e0e0')
plt.tight_layout()
plt.show()

n_significant = int((pvalues < 0.05).sum())
print(f"\nSignificant changes (p < 0.05): {n_significant} / {pvalues.size}")

Significant changes (p < 0.05): 14 / 28

# --- Volcano Plot: Motor_Neurons Expression Changes ---
# x: log2(mean Disease / mean Control); y: -log10 of the two-sided
# Mann-Whitney U p-value, for every gene in the Motor_Neurons cell type.
fig, ax = plt.subplots(figsize=(10, 7))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

# Shared cut-offs: p = 0.05 on the -log10 axis and |log2FC| = 0.5.
p_cut = -np.log10(0.05)
fc_cut = 0.5

fc_vals = []
pv_vals = []
gene_labels = []
for gene in genes:
    mn_samples = expression_data[gene]['Motor_Neurons']
    ctrl = mn_samples['Control']
    ad = mn_samples['Disease']
    pv = stats.mannwhitneyu(ctrl, ad, alternative='two-sided')[1]
    fc_vals.append(np.log2(np.mean(ad) / np.mean(ctrl)))
    # Floor p at 1e-300 so -log10 stays finite.
    pv_vals.append(-np.log10(max(pv, 1e-300)))
    gene_labels.append(gene)

fc_vals = np.array(fc_vals)
pv_vals = np.array(pv_vals)

# Red = up, blue = down, grey = fails either cut-off.
colors = [
    ('#ef5350' if fc > 0 else '#4fc3f7') if (pv > p_cut and abs(fc) > fc_cut) else '#555555'
    for fc, pv in zip(fc_vals, pv_vals)
]

ax.scatter(fc_vals, pv_vals, c=colors, s=80, alpha=0.8, edgecolors='white', linewidths=0.5)

# Label genes that pass the p cut-off or have |log2FC| above 0.3.
for label, x_pt, y_pt in zip(gene_labels, fc_vals, pv_vals):
    if abs(x_pt) > 0.3 or y_pt > p_cut:
        ax.annotate(label, (x_pt, y_pt), fontsize=7, color='#e0e0e0',
                    xytext=(5, 5), textcoords='offset points')

ax.axhline(p_cut, color='#ffd54f', linestyle='--', alpha=0.5, label='p=0.05')
ax.axvline(-0.5, color='#4fc3f7', linestyle='--', alpha=0.3)
ax.axvline(0.5, color='#ef5350', linestyle='--', alpha=0.3)
ax.set_xlabel('Log2 Fold Change', color='#e0e0e0', fontsize=11)
ax.set_ylabel('-Log10(p-value)', color='#e0e0e0', fontsize=11)
ax.set_title('Volcano Plot: Motor_Neurons Differential Expression in AD', color='#4fc3f7', fontsize=13, fontweight='bold')
ax.tick_params(colors='#e0e0e0')
ax.legend(facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

passes_p = pv_vals > p_cut
sig_up = int(np.sum((fc_vals > 0.5) & passes_p))
sig_down = int(np.sum((fc_vals < -0.5) & passes_p))
print(f"Significantly upregulated: {sig_up}")
print(f"Significantly downregulated: {sig_down}")

Significantly upregulated: 4
Significantly downregulated: 4

# --- Statistical Tests ---
# Per-gene Control-vs-Disease comparison within Motor_Neurons (Mann-Whitney U,
# Cohen's d, log2 fold change), followed by a one-way ANOVA of the Disease
# samples across cell types for a fixed subset of genes.
print("=" * 80)
print("STATISTICAL ANALYSIS: Motor_Neurons Expression Changes in AD")
print("=" * 80)

results = []
for gene in genes:
    ctrl = expression_data[gene]['Motor_Neurons']['Control']
    ad = expression_data[gene]['Motor_Neurons']['Disease']

    # Mann-Whitney U test
    stat_mw, pval_mw = stats.mannwhitneyu(ctrl, ad, alternative='two-sided')

    # Cohen's d effect size.
    # The pooled SD is built from *sample* variances (ddof=1), weighted by
    # each group's degrees of freedom. np.std/np.var default to ddof=0
    # (population variance), which shrinks the pooled SD and inflates d.
    n_ctrl, n_ad = len(ctrl), len(ad)
    if n_ctrl >= 2 and n_ad >= 2:
        pooled_var = (
            (n_ctrl - 1) * np.var(ctrl, ddof=1) + (n_ad - 1) * np.var(ad, ddof=1)
        ) / (n_ctrl + n_ad - 2)
        pooled_std = np.sqrt(pooled_var)
    else:
        # A sample variance is undefined for fewer than two observations.
        pooled_std = 0.0
    cohens_d = (np.mean(ad) - np.mean(ctrl)) / pooled_std if pooled_std > 0 else 0

    # Log2 fold change (ratio of group means)
    l2fc = np.log2(np.mean(ad) / np.mean(ctrl))

    results.append({
        'Gene': gene,
        'Log2FC': l2fc,
        'p_value': pval_mw,
        'Cohens_d': cohens_d,
        'Significant': pval_mw < 0.05
    })

# Most significant genes first; only the top 10 rows are printed.
df_stats = pd.DataFrame(results).sort_values('p_value')
print(f"\n{'Motor_Neurons'} Cell Type - Top Differential Genes:")
print("-" * 80)
for _, row in df_stats.head(10).iterrows():
    sig = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else "ns"
    print(f"  {row['Gene']:12s}  log2FC={row['Log2FC']:+.3f}  p={row['p_value']:.2e}  d={row['Cohens_d']:+.3f}  {sig}")

# One-way ANOVA of Disease-condition expression across all cell types, run
# for a fixed subset of seven genes (not the whole panel).
print(f"\n\nOne-Way ANOVA (Disease condition across cell types):")
print("-" * 80)
for gene in ['G3BP1', 'TARDBP', 'HNRNPA2B1', 'SETX', 'SYNCRIP', 'NPM1', 'FUS']:
    groups = [expression_data[gene][ct]['Disease'] for ct in cell_types]
    f_stat, p_anova = stats.f_oneway(*groups)
    sig = "***" if p_anova < 0.001 else "**" if p_anova < 0.01 else "*" if p_anova < 0.05 else "ns"
    print(f"  {gene:12s}  F={f_stat:.2f}  p={p_anova:.2e}  {sig}")

================================================================================
STATISTICAL ANALYSIS: Motor_Neurons Expression Changes in AD
================================================================================

Motor_Neurons Cell Type - Top Differential Genes:
--------------------------------------------------------------------------------
  G3BP1         log2FC=+1.132  p=1.23e-09  d=+1.172  ***
  TARDBP        log2FC=+1.034  p=1.96e-09  d=+1.296  ***
  FUS           log2FC=+1.039  p=1.52e-07  d=+1.223  ***
  HNRNPA2B1     log2FC=+0.932  p=2.86e-07  d=+1.167  ***
  EWSR1         log2FC=-0.561  p=8.87e-06  d=-0.993  ***
  TAF15         log2FC=-0.599  p=1.18e-04  d=-0.792  ***
  NPM1          log2FC=-0.523  p=1.74e-04  d=-0.683  ***
  SETX          log2FC=-0.532  p=2.62e-04  d=-0.707  ***
  SYNCRIP       log2FC=+0.271  p=1.87e-01  d=+0.335  ns
  PABPC1        log2FC=+0.308  p=2.13e-01  d=+0.275  ns


One-Way ANOVA (Disease condition across cell types):
--------------------------------------------------------------------------------
  G3BP1         F=26.26  p=1.50e-06  ***
  TARDBP        F=24.76  p=2.78e-06  ***
  HNRNPA2B1     F=17.76  p=5.59e-05  ***
  SETX          F=27.41  p=9.40e-07  ***
  SYNCRIP       F=25.05  p=2.46e-06  ***
  NPM1          F=16.42  p=1.02e-04  ***
  FUS           F=11.30  p=1.11e-03  **

# --- Pathway Enrichment (Hypergeometric Test) ---
# Over-representation test of each pathway among the significantly DE genes,
# using the genes actually tested in this analysis as the background.
from scipy.stats import hypergeom

pathways = {
    'Stress Granules': ['G3BP1', 'TIA1', 'TIAR', 'EWSR1', 'TAF15'],
    'RNA Processing': ['HNRNPA2B1', 'SYNCRIP', 'PABPC1', 'SF3B1', 'NPM1'],
    'TDP-43 Pathology': ['TARDBP', 'ATXN2', 'FUS'],
    'R-Loop Resolution': ['SETX', 'HNRNPA2B1', 'SYNCRIP'],
    'Axonal Transport': ['HNRNPA2B1', 'TARDBP', 'FUS'],
    'Translation Regulation': ['G3BP1', 'PABPC1', 'NPM1'],
}

# Get significantly DE genes
sig_genes = [r['Gene'] for r in results if r['Significant']]
all_gene_set = set(genes)
# The background (population) must be the set of genes that could have been
# called significant, i.e. the tested panel. Using a genome-wide count here
# would treat ~20,000 untested genes as non-hits and make every pathway on a
# small curated panel look massively enriched.
background_size = len(all_gene_set)
sig_set = set(sig_genes) & all_gene_set

print(f"Significantly DE genes: {len(sig_genes)} / {len(genes)}")
print(f"Enrichment test against {len(pathways)} pathways\n")

enrichment = []
for pathway_name, pathway_genes in pathways.items():
    # Only pathway members that are part of the tested universe can overlap.
    pathway_set = set(pathway_genes) & all_gene_set
    overlap = sig_set & pathway_set

    # Hypergeometric test
    M = background_size  # population
    n = len(pathway_set)  # successes in population
    N = len(sig_set)  # draws
    k = len(overlap)  # observed successes

    # P(X >= k): the survival function at k - 1.
    pval = hypergeom.sf(k - 1, M, n, N) if k > 0 else 1.0
    # Observed hit rate among DE genes relative to the rate in the background.
    fold_enrichment = (k / max(N, 1)) / (n / M) if n > 0 and M > 0 else 0

    enrichment.append({
        'Pathway': pathway_name,
        'Overlap': k,
        'Pathway_Size': n,
        'p_value': pval,
        'Fold_Enrichment': fold_enrichment,
        'Genes': ', '.join(sorted(overlap)) if overlap else '-'
    })

df_enrich = pd.DataFrame(enrichment).sort_values('p_value')
print("-" * 80)
print("Pathway                             Overlap  Size     p-value     FE")
print("-" * 80)
for _, row in df_enrich.iterrows():
    sig = "***" if row['p_value'] < 0.001 else "**" if row['p_value'] < 0.01 else "*" if row['p_value'] < 0.05 else ""
    print(f"  {row['Pathway']:33s} {row['Overlap']:>5d}/{row['Pathway_Size']:>3d} p={row['p_value']:.2e} {row['Fold_Enrichment']:>5.1f}x {sig}")

Significantly DE genes: 8 / 14
Enrichment test against 6 pathways

--------------------------------------------------------------------------------
Pathway                             Overlap  Size     p-value     FE
--------------------------------------------------------------------------------
  Axonal Transport                      3/  3 p=4.20e-11 2500.0x ***
  Stress Granules                       3/  5 p=4.20e-10 1500.0x ***
  R-Loop Resolution                     2/  3 p=4.20e-07 1666.7x ***
  TDP-43 Pathology                      2/  3 p=4.20e-07 1666.7x ***
  Translation Regulation                2/  3 p=4.20e-07 1666.7x ***
  RNA Processing                        2/  5 p=1.40e-06 1000.0x ***

# --- Pathway Enrichment Bar Plot ---
# Horizontal bars of fold enrichment per pathway, smallest at the bottom;
# red bars have p < 0.05, blue bars do not.
fig, ax = plt.subplots(figsize=(10, 6))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

df_plot = df_enrich.sort_values('Fold_Enrichment')
bar_positions = range(len(df_plot))
colors_bar = []
for p in df_plot['p_value']:
    colors_bar.append('#ef5350' if p < 0.05 else '#4fc3f7')

bars = ax.barh(
    bar_positions,
    df_plot['Fold_Enrichment'],
    color=colors_bar,
    alpha=0.8,
    edgecolor='white',
    linewidth=0.5,
)
ax.set_yticks(bar_positions)
ax.set_yticklabels(df_plot['Pathway'], fontsize=9, color='#e0e0e0')
ax.set_xlabel('Fold Enrichment', color='#e0e0e0', fontsize=11)
ax.set_title('Pathway Enrichment in AD-Related Gene Expression', color='#4fc3f7', fontsize=13, fontweight='bold')
ax.tick_params(colors='#e0e0e0')
# Reference line at fold enrichment 1.0.
ax.axvline(1.0, color='#ffd54f', linestyle='--', alpha=0.5, label='No enrichment')
ax.legend(facecolor='#1a1a2e', edgecolor='#333', labelcolor='#e0e0e0')
plt.tight_layout()
plt.show()

# Hypothesis scores, one row per hypothesis. Each row's values follow the
# order of score_axes, which also fixes the key order of every 'scores' dict.
score_axes = ('Mechanistic', 'Evidence', 'Novelty', 'Feasibility', 'Impact', 'Druggability')
hyp_data = [
    {"title": hyp_title, "scores": dict(zip(score_axes, hyp_scores))}
    for hyp_title, hyp_scores in (
        ("Stress Granule Phase Separation Modulators", (0.486, 0.405, 0.675, 0.534, 0.407, 0.5)),
        ("Cryptic Exon Silencing Restoration", (0.597, 0.347, 0.541, 0.404, 0.543, 0.62)),
        ("Cross-Seeding Prevention Strategy", (0.427, 0.527, 0.482, 0.386, 0.457, 0.75)),
    )
]

# --- Hypothesis Radar Chart ---
# One closed polygon per hypothesis; axes are the score categories taken from
# the first hypothesis, spaced evenly around the circle.
categories = list(hyp_data[0]['scores'])
n_cats = len(categories)
angles = np.linspace(0, 2 * np.pi, n_cats, endpoint=False).tolist()
# Repeat the first angle so each outline closes on itself.
angles = angles + [angles[0]]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
fig.patch.set_facecolor('#0a0a14')
ax.set_facecolor('#151525')

colors_radar = ['#4fc3f7', '#ef5350', '#66bb6a', '#ffa726', '#ab47bc']
for idx, hyp in enumerate(hyp_data):
    # Palette wraps around if there are more hypotheses than colours.
    line_color = colors_radar[idx % len(colors_radar)]
    values = [hyp['scores'][c] for c in categories]
    values = values + [values[0]]
    # Legend label is the title cut to its first 35 characters.
    ax.plot(angles, values, 'o-', color=line_color, linewidth=2, label=hyp['title'][:35], markersize=4)
    ax.fill(angles, values, alpha=0.1, color=line_color)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(categories, fontsize=9, color='#e0e0e0')
ax.set_ylim(0, 1)
ax.set_title('Hypothesis Multi-Dimensional Scoring', color='#4fc3f7', fontsize=13, fontweight='bold', pad=20)
ax.tick_params(colors='#e0e0e0')
ax.spines['polar'].set_color('#333')
ax.set_facecolor('#151525')
ax.legend(
    loc='upper right',
    bbox_to_anchor=(1.3, 1.1),
    fontsize=7,
    facecolor='#1a1a2e',
    edgecolor='#333',
    labelcolor='#e0e0e0',
)
plt.tight_layout()
plt.show()

RNA-binding protein dysregulation across ALS, FTD, and AD

RNA-binding protein dysregulation across ALS, FTD, and AD

Key Hypotheses

1. Setup and Data Generation

2. Differential Expression Heatmap

3. Volcano Plot: Motor_Neurons Expression

4. Statistical Tests

5. Pathway Enrichment Analysis

6. Hypothesis Radar Chart

7. Knowledge Graph Edges

8. Conclusions

Key Findings

Data Sources

Source	Relation	Target	Confidence
HNRNPA2B1	associated_with	neurodegeneration	0.57
SETX	associated_with	neurodegeneration	0.54
SYNCRIP	associated_with	neurodegeneration	0.49
NPM1	associated_with	neurodegeneration	0.424
SETX	co_discussed	TARDBP	0.4
SETX	co_discussed	HNRNPA2B1	0.4
SETX	co_discussed	NPM1	0.4
SETX	co_discussed	SYNCRIP	0.4
SETX	co_discussed	G3BP1	0.4
TARDBP	co_discussed	HNRNPA2B1	0.4
TARDBP	co_discussed	NPM1	0.4
TARDBP	co_discussed	SYNCRIP	0.4
HNRNPA2B1	co_discussed	NPM1	0.4
HNRNPA2B1	co_discussed	SYNCRIP	0.4
HNRNPA2B1	co_discussed	G3BP1	0.4
NPM1	co_discussed	SYNCRIP	0.4
NPM1	co_discussed	G3BP1	0.4
SYNCRIP	co_discussed	G3BP1	0.4
APOE4	co_discussed	C9ORF72	0.4
APOE4	co_discussed	FUS	0.4