import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Figure defaults for every plot in this notebook: 110 dpi on a white canvas.
matplotlib.rcParams.update({
    'figure.dpi': 110,
    'figure.facecolor': 'white',
})

# Sub-directory of the forge cache that holds this analysis' payloads.
CACHE_SUB = 'seaad'

# The working directory is treated as the repository root and is put first on
# sys.path so that modules living in it can be imported.
REPO = Path('.').resolve()
sys.path.insert(0, str(REPO))

# Directory the `load` helper reads its JSON files from.
CACHE = REPO.joinpath('data', 'forge_cache', CACHE_SUB)

def load(name):
    """Return the parsed JSON payload cached as ``<CACHE>/<name>.json``.

    Args:
        name: File stem of the cache entry (without the ``.json`` suffix).

    Returns:
        The decoded JSON value, or an empty dict when the file does not
        exist, so callers can treat "not cached" as falsy.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    p = CACHE / f'{name}.json'
    if not p.exists():
        return {}
    # Decode explicitly as UTF-8 instead of relying on the locale default;
    # the cached payloads can contain non-ASCII text.
    return json.loads(p.read_text(encoding='utf-8'))

# Forge tool provenance: per (skill_id, status) call count and mean duration
# over the last 30 days, read from the `tool_calls` table of the SQLite file
# at `db_path`. Any failure is reported as a one-line message so the rest of
# the notebook can still run.
db_path = Path('/home/ubuntu/scidex/scidex.db')
try:
    db = sqlite3.connect(str(db_path))
    try:
        prov = pd.read_sql_query('''
        SELECT skill_id, status, COUNT(*) AS n_calls,
               ROUND(AVG(duration_ms),0) AS mean_ms
        FROM tool_calls
        WHERE created_at >= date('now','-30 days')
        GROUP BY skill_id, status
        ORDER BY n_calls DESC
    ''', db)
    finally:
        # Release the connection even when the query itself raises.
        db.close()
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    # Print explicitly: a bare expression nested inside `try` produces no
    # output, so the table would otherwise be silently dropped.
    print(prov[['tool','status','n_calls','mean_ms']].head(20).to_string(index=False))
except Exception as e:
    print(f'Provenance unavailable: {e}')

181 tool-call aggregates (last 30 days):

# Target gene annotations: one row per gene, built from the `mygene_<gene>`
# and `hpa_<gene>` cache entries. Every missing or empty field is shown as
# '—' so the table uses a single placeholder throughout.
def _brief(value, limit=55, max_items=2):
    """Render a cached field as a short string, or '—' when it is empty.

    Lists are reduced to their first `max_items` entries joined by ', ';
    anything else is stringified. The result is cut to `limit` characters.
    """
    if isinstance(value, list):
        text = ', '.join(str(item) for item in value[:max_items])
    else:
        text = str(value or '')
    return text[:limit] or '—'


ann_rows = []
for g in ['GFAP', 'PDGFRA', 'PDGFRB']:
    # `load` yields {} for a missing cache file, so .get() is safe and an
    # uncached gene simply renders as a row of '—'.
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    ann_rows.append({
        'gene': g,
        'name': _brief(mg.get('name')),
        'protein_class': _brief(hpa.get('protein_class')),
        'disease_involvement': _brief(hpa.get('disease_involvement')),
    })
pd.DataFrame(ann_rows)

# GO Biological Process enrichment table: first 10 cached Enrichr terms with
# formatted p-values, rounded odds ratios and the number of hit genes.
go_bp = load('enrichr_GO_Biological_Process')
if isinstance(go_bp, list) and go_bp:
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']]
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count the genes before the list is flattened to a string below.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    # Print explicitly: a bare expression nested inside the `if` produces no
    # output, so the table would otherwise never be shown.
    print(go_df[['term','n_hits','p_value','odds_ratio','genes']].to_string(index=False))
else:
    print('No GO:BP enrichment data')

# Visualize top GO BP enrichment
go_bp = load('enrichr_GO_Biological_Process')
if isinstance(go_bp, list) and go_bp:
    # Reverse so the first cached term ends up at the top of the chart.
    top = go_bp[:8][::-1]
    terms = [t['term'][:45] for t in top]
    # Floor the p-value so log10 never sees 0.
    neglogp = [-np.log10(max(t['p_value'], 1e-300)) for t in top]
    # Place bars at numeric positions and label them afterwards. Passing the
    # truncated strings as y-values would merge any two terms that share the
    # same first 45 characters into a single bar.
    ypos = range(len(top))
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(ypos, neglogp, color='#4fc3f7')
    ax.set_yticks(ypos); ax.set_yticklabels(terms)
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout(); plt.show()
else:
    print('No GO:BP data to plot')

# KEGG pathway enrichment table: first 10 cached terms with the gene list
# flattened to a string, formatted p-values and rounded odds ratios.
kegg = load('enrichr_KEGG_Pathways')
if isinstance(kegg, list) and kegg:
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']]
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    # Print explicitly: a bare expression nested inside the `if` produces no
    # output, so the table would otherwise never be shown.
    print(kegg_df.to_string(index=False))
else:
    print('No KEGG enrichment data')

No KEGG enrichment data

# STRING interaction table: cached edges ordered by descending score, limited
# to the 20 strongest and to whichever of the known columns are present.
ppi = load('string_network')
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi).sort_values('score', ascending=False)
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    # Print explicitly: a bare expression nested inside the `if` produces no
    # output, so only the edge count above would otherwise appear.
    print(ppi_df[display_cols].head(20).to_string(index=False))
else:
    print('No STRING edges returned')

11 STRING edges

# Network figure
# Proteins are placed evenly on a unit circle (alphabetical order) and every
# cached edge is drawn as a grey line whose opacity and width grow with its
# score.
ppi = load('string_network')
if not (isinstance(ppi, list) and ppi):
    print('No STRING data to visualize')
else:
    import math
    nodes = sorted({p for e in ppi for p in (e['protein1'], e['protein2'])})
    n = len(nodes)
    pos = {}
    for i, label in enumerate(nodes):
        pos[label] = (math.cos(2*math.pi*i/n), math.sin(2*math.pi*i/n))

    fig, ax = plt.subplots(figsize=(7, 7))

    # Edges first, so the node markers (higher zorder) sit on top of them.
    for edge in ppi:
        x1, y1 = pos[edge['protein1']]
        x2, y2 = pos[edge['protein2']]
        weight = edge['score']
        ax.plot([x1, x2], [y1, y2], color='#888',
                alpha=0.3+0.5*weight, linewidth=0.5+2*weight)

    for label, (x, y) in pos.items():
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(label, (x, y), ha='center', va='center',
                    fontsize=8, fontweight='bold', zorder=4)

    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()

# Reactome footprint: per gene, how many pathways are cached and the name of
# the first one (cut to 70 characters). Genes without a cached list count as
# zero pathways and show '—'.
pw_rows = []
for gene in ('GFAP', 'PDGFRA', 'PDGFRB'):
    hits = load(f'reactome_{gene}')
    if not isinstance(hits, list):
        hits = []
    first_name = hits[0]['name'] if hits else '—'
    pw_rows.append({
        'gene': gene,
        'n_pathways': len(hits),
        'top_pathway': first_name[:70],
    })
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)

# Allen ISH summary: per gene, the number of cached regions plus the
# structure name and expression energy of the first region in the list.
ish_rows = []
for gene in ('GFAP', 'PDGFRA', 'PDGFRB'):
    ish = load(f'allen_ish_{gene}')
    # Anything that is not a dict, or has no/empty 'regions', means no regions.
    regions = (ish.get('regions') or []) if isinstance(ish, dict) else []
    if regions:
        first = regions[0]
        top_region = first.get('structure', '')[:45]
        top_energy = round(first.get('expression_energy', 0), 2)
    else:
        top_region = '—'
        top_energy = None
    ish_rows.append({
        'gene': gene,
        'n_ish_regions': len(regions),
        'top_region': top_region,
        'top_energy': top_energy,
    })
pd.DataFrame(ish_rows)

# Hypothesis ranking: horizontal bars of (truncated title, composite score),
# colour-banded by score.
hyp_data = [
    ('Heterogeneous astrocyte activation states differentiall', 0.64),
    ('Layer V excitatory neurons show selectively enhanced vu', 0.629),
    ('Microglial TREM2 downregulation impairs damage-associat', 0.628),
    ('Vascular mural cell degeneration precedes and exacerbat', 0.625),
    ('OPC differentiation blockade contributes to white matte', 0.599),
]

# barh draws upwards from y=0, so reverse to put the first entry on top.
ranked = hyp_data[::-1]
titles = [title for title, _ in ranked]
scores = [score for _, score in ranked]

# Red for >= 0.6, orange for >= 0.5, green below that.
colors = []
for score in scores:
    if score >= 0.6:
        colors.append('#ef5350')
    elif score >= 0.5:
        colors.append('#ffa726')
    else:
        colors.append('#66bb6a')

fig, ax = plt.subplots(figsize=(10, max(8, len(titles)*0.4)))
ypos = range(len(titles))
ax.barh(ypos, scores, color=colors)
ax.set_yticks(ypos)
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title("SEA-AD Single-Cell Analysis: Cell-Type Vulnerability in Alzheimer's Disease")
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Score-dimension heatmap: one row per hypothesis (`labels`), one column per
# scoring dimension (`dims`), coloured on a fixed 0-1 red-yellow-green scale.
labels = [
    'Heterogeneous astrocyte activation state',
    'Layer V excitatory neurons show selectiv',
    'Microglial TREM2 downregulation impairs ',
    'Vascular mural cell degeneration precede',
    'OPC differentiation blockade contributes',
]
matrix = np.array([
    [0.75, 0.7, 0.82, 0, 0, 0, 0, 0, 0],
    [0.75, 0.7, 0.82, 0, 0, 0, 0, 0, 0],
    [0.75, 0.7, 0.82, 0, 0, 0, 0, 0, 0],
    [0.75, 0.7, 0.82, 0, 0, 0, 0, 0, 0],
    [0.75, 0.7, 0.82, 0, 0, 0, 0, 0, 0],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]
if not matrix.size:
    print('No score data available')
else:
    # 'mechanistic_plausibility_score' -> 'Mechanistic Plausibility'
    pretty_dims = [d.replace('_score','').replace('_',' ').title() for d in dims]
    fig, ax = plt.subplots(figsize=(10, 5))
    heat = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(pretty_dims, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(heat, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()

# PubMed evidence for one hypothesis: cached papers reduced to the known
# columns, long text fields shortened, newest first. Falls back to the raw
# first five records when none of the known columns exist, and to a one-row
# note when nothing is cached.
hid = 'h_seaad_002'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Work on an independent copy so the column assignments below do not
        # write into a slice of the original frame.
        lit = lit[cols].copy()
        # `cols` may be non-empty without containing every column, so each
        # column-specific step is guarded individually.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df

# PubMed evidence for one hypothesis: cached papers reduced to the known
# columns, long text fields shortened, newest first. Falls back to the raw
# first five records when none of the known columns exist, and to a one-row
# note when nothing is cached.
hid = 'h_seaad_003'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Work on an independent copy so the column assignments below do not
        # write into a slice of the original frame.
        lit = lit[cols].copy()
        # `cols` may be non-empty without containing every column, so each
        # column-specific step is guarded individually.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df

# PubMed evidence for one hypothesis: cached papers reduced to the known
# columns, long text fields shortened, newest first. Falls back to the raw
# first five records when none of the known columns exist, and to a one-row
# note when nothing is cached.
hid = 'h_seaad_001'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Work on an independent copy so the column assignments below do not
        # write into a slice of the original frame.
        lit = lit[cols].copy()
        # `cols` may be non-empty without containing every column, so each
        # column-specific step is guarded individually.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df

# PubMed evidence for one hypothesis: cached papers reduced to the known
# columns, long text fields shortened, newest first. Falls back to the raw
# first five records when none of the known columns exist, and to a one-row
# note when nothing is cached.
hid = 'h_seaad_005'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Work on an independent copy so the column assignments below do not
        # write into a slice of the original frame.
        lit = lit[cols].copy()
        # `cols` may be non-empty without containing every column, so each
        # column-specific step is guarded individually.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df

# PubMed evidence for one hypothesis: cached papers reduced to the known
# columns, long text fields shortened, newest first. Falls back to the raw
# first five records when none of the known columns exist, and to a one-row
# note when nothing is cached.
hid = 'h_seaad_004'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # Work on an independent copy so the column assignments below do not
        # write into a slice of the original frame.
        lit = lit[cols].copy()
        # `cols` may be non-empty without containing every column, so each
        # column-specific step is guarded individually.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df

# Knowledge-graph edges embedded in the notebook; shown as a table of at most
# 25 rows.
edge_data = [
    {'source': 'DAP12', 'relation': 'involved_in', 'target': 'UPR', 'strength': 0.6},
    {'source': 'MAPK', 'relation': 'co_discussed', 'target': 'TYROBP', 'strength': 0.4},
    {'source': 'DAP12', 'relation': 'co_discussed', 'target': 'STAT3', 'strength': 0.4},
    {'source': 'C3', 'relation': 'co_discussed', 'target': 'TREM2', 'strength': 0.4},
    {'source': 'C4', 'relation': 'co_discussed', 'target': 'GFAP', 'strength': 0.4},
    {'source': 'APOE4', 'relation': 'co_discussed', 'target': 'GFAP', 'strength': 0.4},
]
if edge_data:
    # Print explicitly: a bare expression nested inside the `if` produces no
    # output, so the table would otherwise never be shown.
    print(pd.DataFrame(edge_data).head(25).to_string(index=False))
else:
    print('No KG edge data available')

	gene	name	protein_class	disease_involvement
0	GFAP	glial fibrillary acidic protein	Candidate cardiovascular disease genes, Diseas...	Disease variant, Leukodystrophy
1	PDGFRA	platelet derived growth factor receptor alpha	Cancer-related genes, CD markers	Cancer-related genes, Disease variant
2	PDGFRB	platelet derived growth factor receptor beta	Cancer-related genes, CD markers	Cancer-related genes, Disease variant

	year	journal	title	pmid
2	2026	Nat Neurosci	Microglia modulate Aβ-dependent astrocyte reac...	41198899
4	2025	Alzheimers Dement	Early microglial and astrocyte reactivity in p...	40747577
0	2024	Alzheimers Dement	Astrocyte biomarkers GFAP and YKL-40 mediate e...	37690071
1	2024	Brain	Blood GFAP reflects astrocyte reactivity to Al...	38634672
3	2024	Mol Neurodegener	Astrocytic autophagy plasticity modulates Aβ c...	39044253

	year	journal	title	pmid
0	2024	Neuroscience	Alzheimer's Disease-associated Region-specific...	38552733
1	2020	J Neurochem	Amyloid-beta(1-42) induced glutamatergic recep...	32491248

	year	journal	title	pmid
2	2024	Nat Neurosci	Identification of senescent, TREM2-expressing ...	38637622
1	2022	Cell	TREM2 drives microglia response to amyloid-β v...	36306735
3	2020	Nat Med	Human and mouse single-nucleus transcriptomics...	31932797
4	2020	Nat Commun	Gene expression and functional deficits underl...	33097708
0	2017	Cell	A Unique Microglia Type Associated with Restri...	28602351

	year	journal	title	pmid
4	2024	Alzheimers Res Ther	Assessing blood-brain barrier dysfunction and ...	39085945
0	2020	Acta Neuropathol	Identification of early pericyte loss and vasc...	32043162
1	2020	Nature	APOE4 leads to blood-brain barrier dysfunction...	32376954
2	2019	Nat Med	Blood-brain barrier breakdown is an early biom...	30643288
3	2018	J Cereb Blood Flow Metab	Differing associations between Aβ accumulation...	28151041

SEA-AD Single-Cell Analysis: Cell-Type Vulnerability in Alzheimer's Disease — Analysis Notebook

SEA-AD Single-Cell Analysis: Cell-Type Vulnerability in Alzheimer's Disease¶

Research question¶

Approach¶

Debate Summary¶

1. Forge tool provenance¶

2. Target gene annotations¶

3. GO Biological Process enrichment (Enrichr)¶

4. KEGG pathway enrichment¶

5. STRING protein interaction network¶

6. Reactome pathway footprint¶

7. Allen Brain Atlas ISH regional expression¶

8. Hypothesis ranking (5 hypotheses)¶

9. Score dimension heatmap (top 5)¶

10. PubMed evidence per hypothesis¶

Hypothesis 1: Heterogeneous astrocyte activation states differentially impact neuronal survival across AD progression¶

Heterogeneous astrocyte activation states differentially impact neuronal survival across AD progression¶

Overview¶

Hypothesis 2: Layer V excitatory neurons show selectively enhanced vulnerability through dysregulated calcium signaling¶

Layer V excitatory neurons show selectively enhanced vulnerability through dysregulated calcium signaling¶

Overview¶

Hypothesis 3: Microglial TREM2 downregulation impairs damage-associated response in late-stage Alzheimer's disease¶

Microglial TREM2 downregulation impairs damage-associated response in late-stage Alzheimer's disease¶

Overview¶

Hypothesis 4: Vascular mural cell degeneration precedes and exacerbates parenchymal pathology¶

Vascular mural cell degeneration precedes and exacerbates parenchymal pathology¶

Overview¶

Hypothesis 5: OPC differentiation blockade contributes to white matter degeneration in early-stage AD¶

OPC differentiation blockade contributes to white matter degeneration in early-stage AD¶

Overview¶

11. Knowledge graph edges (6 total)¶

12. Caveats¶

	gene	n_pathways	top_pathway
1	PDGFRA	8	PIP3 activates AKT signaling
2	PDGFRB	6	PIP3 activates AKT signaling
0	GFAP	2	Nuclear signaling by ERBB4