import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Figure defaults for every plot created below: 110 DPI, white figure background.
matplotlib.rcParams['figure.dpi'] = 110
matplotlib.rcParams['figure.facecolor'] = 'white'

# Absolute form of the current working directory, prepended to sys.path.
REPO = Path('.').resolve()
sys.path[:0] = [str(REPO)]

# Directory holding the cached JSON files read by this analysis.
CACHE_SUB = 'seaad'
CACHE = REPO.joinpath('data', 'forge_cache', CACHE_SUB)

def load(name):
    """Return the parsed JSON stored at ``CACHE/<name>.json``.

    Args:
        name: Base name of the cache file, without the ``.json`` suffix.

    Returns:
        The decoded JSON value (whatever type the file holds), or an empty
        dict when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    p = CACHE / f'{name}.json'
    try:
        # Read as UTF-8 explicitly instead of relying on the locale's
        # default encoding, which differs between machines.
        text = p.read_text(encoding='utf-8')
    except (FileNotFoundError, NotADirectoryError):
        return {}
    return json.loads(text)

# Aggregate the last 30 days of rows in the ``tool_calls`` table, one row per
# (skill_id, status) pair, and print the 20 most frequent aggregates.
# Any failure (missing database, missing table, ...) is reported, not raised.
db_path = Path('/home/ubuntu/scidex/scidex.db')
try:
    db = sqlite3.connect(str(db_path))
    try:
        prov = pd.read_sql_query('''
        SELECT skill_id, status, COUNT(*) AS n_calls,
               ROUND(AVG(duration_ms),0) AS mean_ms
        FROM tool_calls
        WHERE created_at >= date('now','-30 days')
        GROUP BY skill_id, status
        ORDER BY n_calls DESC
    ''', db)
    finally:
        # Release the connection even when the query itself fails.
        db.close()
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    # A bare expression nested inside ``try`` is never rendered (neither by a
    # script nor as a notebook cell result), so print the table explicitly.
    print(prov[['tool','status','n_calls','mean_ms']].head(20).to_string(index=False))
except Exception as e:
    print(f'Provenance unavailable: {e}')

181 tool-call aggregates (last 30 days):

def _annotation_text(value, placeholder='—', max_items=2, width=55):
    """Render one annotation field as short display text.

    A list is reduced to its first ``max_items`` entries joined by ", ";
    any other truthy value is stringified. The text is cut to ``width``
    characters and ``placeholder`` is returned when nothing is left.
    """
    if isinstance(value, list):
        text = ', '.join(str(item) for item in value[:max_items])
    elif value:
        text = str(value)
    else:
        text = ''
    return text[:width] or placeholder


# One row per target gene, combining the cached ``mygene_<gene>`` and
# ``hpa_<gene>`` payloads. Every missing field is shown as '—'.
# NOTE(review): 'AND' and 'COMPLEX' look like ordinary words picked up as gene
# symbols (the captured output shows no annotations for them) — confirm the list.
ann_rows = []
for g in ['AND', 'APOE', 'COMPLEX']:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    # Treat anything that is not a JSON object as "no data" rather than
    # failing on ``.get``.
    if not isinstance(mg, dict):
        mg = {}
    if not isinstance(hpa, dict):
        hpa = {}
    ann_rows.append({
        'gene': g,
        'name': _annotation_text(mg.get('name')),
        'protein_class': _annotation_text(hpa.get('protein_class')),
        'disease_involvement': _annotation_text(hpa.get('disease_involvement')),
    })
pd.DataFrame(ann_rows)

# Table of the first ten cached Enrichr GO Biological Process terms.
go_bp = load('enrichr_GO_Biological_Process')
if isinstance(go_bp, list) and go_bp:
    # Copy the column subset so the assignments below act on an independent
    # frame (avoids pandas' chained-assignment warning).
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']].copy()
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before the gene lists are flattened to strings.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    # A bare expression nested inside ``if`` is never rendered; print it.
    print(go_df[['term','n_hits','p_value','odds_ratio','genes']].to_string(index=False))
else:
    print('No GO:BP enrichment data')

# Horizontal bar chart of -log10(p) for the first eight cached GO:BP terms.
go_bp = load('enrichr_GO_Biological_Process')
if not (isinstance(go_bp, list) and go_bp):
    print('No GO:BP data to plot')
else:
    top = go_bp[:8]
    # Both series are built in reverse order of ``top``.
    terms = [entry['term'][:45] for entry in reversed(top)]
    # p-values are floored at 1e-300 before taking the logarithm.
    neglogp = [-np.log10(max(entry['p_value'], 1e-300)) for entry in reversed(top)]
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()

# Table of the first ten cached Enrichr KEGG pathway terms.
kegg = load('enrichr_KEGG_Pathways')
if isinstance(kegg, list) and kegg:
    # Copy the column subset so the assignments below act on an independent
    # frame (avoids pandas' chained-assignment warning).
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']].copy()
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    # A bare expression nested inside ``if`` is never rendered; print it.
    print(kegg_df.to_string(index=False))
else:
    print('No KEGG enrichment data')

No KEGG enrichment data

# Table of the cached STRING edges, strongest first, limited to 20 rows.
ppi = load('string_network')
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi)
    # Only sort when the payload carries a score column, in line with the
    # optional-column handling of ``display_cols`` below.
    if 'score' in ppi_df.columns:
        ppi_df = ppi_df.sort_values('score', ascending=False)
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    # A bare expression nested inside ``if`` is never rendered; print it.
    print(ppi_df[display_cols].head(20).to_string(index=False))
else:
    print('No STRING edges returned')

11 STRING edges

# Network figure: proteins placed evenly on a unit circle, one line per edge.
ppi = load('string_network')
if not (isinstance(ppi, list) and ppi):
    print('No STRING data to visualize')
else:
    import math
    nodes = sorted({edge[end] for edge in ppi for end in ('protein1', 'protein2')})
    n = len(nodes)
    pos = {}
    for i, node in enumerate(nodes):
        pos[node] = (math.cos(2*math.pi*i/n), math.sin(2*math.pi*i/n))
    fig, ax = plt.subplots(figsize=(7, 7))
    for edge in ppi:
        x1, y1 = pos[edge['protein1']]
        x2, y2 = pos[edge['protein2']]
        # Opacity and line width both grow with the edge score.
        ax.plot([x1, x2], [y1, y2], color='#888',
                alpha=0.3+0.5*edge['score'],
                linewidth=0.5+2*edge['score'])
    for label, (x, y) in pos.items():
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(label, (x, y), ha='center', va='center',
                    fontsize=8, fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()

# Per-gene Reactome summary: number of cached pathways and the first pathway name.
# NOTE(review): 'AND' and 'COMPLEX' look like ordinary words rather than gene
# symbols — confirm the gene list.
pw_rows = []
for g in ['AND', 'APOE', 'COMPLEX']:
    pws = load(f'reactome_{g}')
    if not isinstance(pws, list):
        # Anything other than a list counts as "no pathways".
        pw_rows.append({'gene': g, 'n_pathways': 0, 'top_pathway': '—'})
        continue
    first_name = pws[0]['name'] if pws else '—'
    pw_rows.append({
        'gene': g,
        'n_pathways': len(pws),
        'top_pathway': first_name[:70],
    })
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)

# Per-gene summary of the cached ``allen_ish_<gene>`` payloads: region count
# plus the structure name and expression energy of the first listed region.
ish_rows = []
for g in ['AND', 'APOE', 'COMPLEX']:
    ish = load(f'allen_ish_{g}')
    # Only dict payloads can carry a 'regions' list; a missing or empty
    # value becomes [].
    if isinstance(ish, dict):
        regions = ish.get('regions') or []
    else:
        regions = []
    if regions:
        first_region = regions[0]
        region_name = first_region.get('structure', '')
        region_energy = round(first_region.get('expression_energy', 0), 2)
    else:
        region_name = '—'
        region_energy = None
    ish_rows.append({
        'gene': g,
        'n_ish_regions': len(regions),
        'top_region': region_name[:45],
        'top_energy': region_energy,
    })
pd.DataFrame(ish_rows)

# (title, composite score) pairs, in the order they are ranked.
hyp_data = [
    ('Microglial TREM2-Complement Axis Modulation', 0.726),
    ('Oligodendrocyte DNA Repair Enhancement Therapy', 0.665),
    ('Astrocyte Metabolic Reprogramming via APOE4 Correction', 0.636),
    ('BMP4 Pathway Inhibition for Oligodendrocyte Myelination', 0.62),
    ('Cross-Cell Type Synaptic Rescue via Tripartite Synapse ', 0.62),
    ('Neuronal Integrated Stress Response Modulation', 0.618),
    ('Spatial Transcriptome-Guided Precision Cell Therapy', 0.578),
]


def _score_color(score):
    """Map a composite score to its bar colour (>= 0.6, >= 0.5, otherwise)."""
    if score >= 0.6:
        return '#ef5350'
    if score >= 0.5:
        return '#ffa726'
    return '#66bb6a'


# Both series are built in reverse order of ``hyp_data``.
titles = [title for title, _ in reversed(hyp_data)]
scores = [score for _, score in reversed(hyp_data)]
fig, ax = plt.subplots(figsize=(10, max(8, len(titles)*0.4)))
colors = [_score_color(s) for s in scores]
ax.barh(range(len(titles)), scores, color=colors)
ax.set_yticks(range(len(titles)))
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title("Cell type vulnerability in Alzheimer's Disease (SEA-AD data - v2)")
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Row labels (one per hypothesis) for the heatmap below.
labels = [
    'Microglial TREM2-Complement Axis Modulat',
    'Oligodendrocyte DNA Repair Enhancement T',
    'Astrocyte Metabolic Reprogramming via AP',
    'BMP4 Pathway Inhibition for Oligodendroc',
    'Cross-Cell Type Synaptic Rescue via Trip',
    'Neuronal Integrated Stress Response Modu',
    'Spatial Transcriptome-Guided Precision C',
]
# One row per label, one column per entry of ``dims``.
# NOTE(review): the fifth column (clinical_relevance_score) is 0 in every
# row — possibly missing data rather than a real score; confirm upstream.
matrix = np.array([
    [0.75, 0.9, 0.85, 0.85, 0, 0.85, 0.8, 0.95, 0.75],
    [0.8, 0.7, 0.65, 0.6, 0, 0.6, 0.65, 0.75, 0.45],
    [0.95, 0.25, 0.85, 0.75, 0, 0.65, 0.4, 0.3, 0.3],
    [0.85, 0.6, 0.6, 0.65, 0, 0.45, 0.55, 0.8, 0.5],
    [0.9, 0.4, 0.8, 0.75, 0, 0.7, 0.6, 0.45, 0.55],
    [0.75, 0.55, 0.65, 0.7, 0, 0.55, 0.6, 0.65, 0.5],
    [0.95, 0.2, 0.7, 0.6, 0, 0.6, 0.3, 0.25, 0.35],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]
if not matrix.size:
    print('No score data available')
else:
    fig, ax = plt.subplots(figsize=(10, 5))
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    # Tick text: drop the '_score' suffix, underscores to spaces, title case.
    tick_text = []
    for d in dims:
        tick_text.append(d.replace('_score','').replace('_',' ').title())
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(tick_text, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()

def _pubmed_table(papers):
    """Build the display table for one hypothesis' cached PubMed payload.

    Args:
        papers: Value returned by ``load('pubmed_<hid>')``.

    Returns:
        A DataFrame holding whichever of the year/journal/title/pmid columns
        are present (title cut to 80 characters, journal to 30, newest year
        first when a year column exists); the first five raw records when
        none of those columns exist; or a single-row 'note' frame when the
        payload is not a non-empty list.
    """
    if not (isinstance(papers, list) and papers):
        return pd.DataFrame([{'note':'no PubMed results'}])
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if not cols:
        return pd.DataFrame(papers[:5])
    # Copy the column subset so the assignments below act on an independent
    # frame (avoids pandas' chained-assignment warning).
    lit = lit[cols].copy()
    # Every column is optional, so guard each one before touching it.
    for col, width in (('title', 80), ('journal', 30)):
        if col in lit.columns:
            lit[col] = lit[col].str[:width]
    if 'year' in lit.columns:
        lit = lit.sort_values('year', ascending=False)
    return lit


# One table per hypothesis id, printed in the original cell order.
for hid in ['h-3616325a', 'h-fa7ac9cb', 'h-d8f2bbc9', 'h-e064f134',
            'h-019c56c1', 'h-5137be61', 'h-0bdc3803']:
    papers = load(f'pubmed_{hid}')
    display_df = _pubmed_table(papers)
    print(f'PubMed evidence for {hid}:')
    print(display_df.to_string(index=False))

# Knowledge-graph edges shown in the table below, one dict per edge.
edge_data = [
    {'source': 'TREM2 dysregulation', 'relation': 'causes (disease-associate', 'target': 'microglial dysfunction', 'strength': 0.8},
    {'source': 'h-3616325a', 'relation': 'targets', 'target': 'TREM2', 'strength': 0.8},
    {'source': 'h-3616325a', 'relation': 'targets', 'target': 'C3', 'strength': 0.8},
    {'source': 'h-3616325a', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.8},
    {'source': 'complement activation', 'relation': 'causes (excessive complem', 'target': 'synapse elimination', 'strength': 0.75},
    {'source': 'DNA damage', 'relation': 'causes (DNA damage in oli', 'target': 'oligodendrocyte degeneration', 'strength': 0.75},
    {'source': 'TREM2 enhancement', 'relation': 'causes (enhancing TREM2 e', 'target': 'tau pathology reduction', 'strength': 0.75},
    {'source': 'APOE4', 'relation': 'causes (APOE4 disrupts li', 'target': 'astrocyte dysfunction', 'strength': 0.72},
    {'source': 'oligodendrocyte degeneration', 'relation': 'causes (oligodendrocyte d', 'target': 'myelin breakdown', 'strength': 0.7},
    {'source': 'APOE4', 'relation': 'causes (APOE4 mediates my', 'target': 'myelin breakdown', 'strength': 0.7},
    {'source': 'PARP1 activation', 'relation': 'causes (PARP1 activation ', 'target': 'DNA repair enhancement', 'strength': 0.7},
    {'source': 'tripartite synapse dysfunction', 'relation': 'causes (coordinated dysfu', 'target': 'synaptic failure', 'strength': 0.7},
    {'source': 'h-d8f2bbc9', 'relation': 'targets', 'target': 'APOE', 'strength': 0.7},
    {'source': 'h-d8f2bbc9', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.7},
    {'source': 'integrated stress response dys', 'relation': 'causes (dysregulated ISR ', 'target': 'protein synthesis shutdown', 'strength': 0.68},
    {'source': 'chronic hypoperfusion', 'relation': 'causes (chronic hypoperfu', 'target': 'BMP4 release', 'strength': 0.65},
    {'source': 'BMP4', 'relation': 'causes (pericyte-derived ', 'target': 'white matter damage', 'strength': 0.65},
    {'source': 'h-019c56c1', 'relation': 'targets', 'target': 'SYN1', 'strength': 0.65},
    {'source': 'h-019c56c1', 'relation': 'targets', 'target': 'SLC1A2', 'strength': 0.65},
    {'source': 'h-019c56c1', 'relation': 'targets', 'target': 'CX3CR1', 'strength': 0.65},
    {'source': 'h-019c56c1', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.65},
    {'source': 'h-5137be61', 'relation': 'targets', 'target': 'EIF2AK3 (PERK)', 'strength': 0.6},
    {'source': 'h-5137be61', 'relation': 'targets', 'target': 'EIF2B complex', 'strength': 0.6},
    {'source': 'h-5137be61', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.6},
    {'source': 'h-fa7ac9cb', 'relation': 'targets', 'target': 'PARP1', 'strength': 0.55},
]
if edge_data:
    # A bare expression nested inside ``if`` is never rendered; print the
    # (at most 25-row) table explicitly.
    print(pd.DataFrame(edge_data).head(25).to_string(index=False))
else:
    print('No KG edge data available')

Cell type vulnerability in Alzheimer's Disease (SEA-AD data - v2) — Analysis Notebook

Cell type vulnerability in Alzheimer's Disease (SEA-AD data - v2)

Research question

Approach

Debate Summary

1. Forge tool provenance

2. Target gene annotations

3. GO Biological Process enrichment (Enrichr)

4. KEGG pathway enrichment

5. STRING protein interaction network

6. Reactome pathway footprint

7. Allen Brain Atlas ISH regional expression

8. Hypothesis ranking (7 hypotheses)

9. Score dimension heatmap (top 10)

10. PubMed evidence per hypothesis

Hypothesis 1: Microglial TREM2-Complement Axis Modulation

Molecular Mechanism

Hypothesis 2: Oligodendrocyte DNA Repair Enhancement Therapy

Molecular Mechanism and Rationale

Hypothesis 3: Astrocyte Metabolic Reprogramming via APOE4 Correction

Molecular Mechanism and Rationale

Hypothesis 4: BMP4 Pathway Inhibition for Oligodendrocyte Myelination Support

Molecular Mechanism and Rationale

Hypothesis 5: Cross-Cell Type Synaptic Rescue via Tripartite Synapse Restoration

Molecular Mechanism and Rationale

Hypothesis 6: Neuronal Integrated Stress Response Modulation

Molecular Mechanism and Rationale

Hypothesis 7: Spatial Transcriptome-Guided Precision Cell Therapy

Molecular Mechanism and Rationale

11. Knowledge graph edges (75 total)

12. Caveats

	gene	name	protein_class	disease_involvement
0	AND	—	—	—
1	APOE	apolipoprotein E	Cancer-related genes, Candidate cardiovascular...	Alzheimer disease, Amyloidosis
2	COMPLEX	—	—	—