import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.dpi'] = 110
matplotlib.rcParams['figure.facecolor'] = 'white'

# Locate the worktree root: the nearest directory, starting at the current
# working directory and walking upward, that contains an AGENTS.md marker.
# When no directory on that path has the marker, use the working directory.
_cwd = Path('.').resolve()
REPO = next(
    (_dir for _dir in (_cwd, *_cwd.parents) if (_dir / 'AGENTS.md').exists()),
    _cwd,
)

# Default directory from which cached JSON payloads are read.
CACHE = REPO.joinpath('data', 'forge_cache', 'seaad')

# The 11 genes of the AD vulnerability set (taken from
# forge/seaad_analysis.py). The per-gene cells below iterate in this order.
TARGET_GENES = [
    "TREM2", "GFAP", "SLC17A7", "PDGFRA",
    "PDGFRB", "APOE", "MAPT", "APP",
    "PSEN1", "TYROBP", "CLU",
]

def load(name, _cache=CACHE, default=None):
    """Return the parsed contents of the cached JSON file ``<name>.json``.

    Args:
        name: Cache entry name, without the ``.json`` suffix.
        _cache: Directory to read from; defaults to the module-level CACHE.
        default: Value returned when the file does not exist. ``None``
            (the default) keeps the original behaviour of returning a new
            empty dict; callers that expect a list can pass ``default=[]``.

    Returns:
        The decoded JSON value, or the fallback when the file is missing.

    Raises:
        json.JSONDecodeError: If the file exists but does not hold valid JSON.
    """
    _path = _cache / f'{name}.json'
    if not _path.exists():
        return {} if default is None else default
    # Read as UTF-8 explicitly: read_text() would otherwise use the locale's
    # preferred encoding, which mis-decodes non-ASCII JSON on some platforms.
    return json.loads(_path.read_text(encoding='utf-8'))

# Forge provenance: tool calls this session invoked
# Best-effort step: any failure (database missing, table missing, ...) is
# reported and skipped so the rest of the analysis still runs.
from contextlib import closing

# NOTE(review): date('now','-1 day') yields yesterday's *date*, so the window
# starts at the beginning of the previous day rather than exactly 24 hours
# ago. Left unchanged because the stored created_at format is not visible here.
_PROVENANCE_SQL = """
        SELECT skill_id, status, COUNT(*) AS n_calls,
               ROUND(AVG(duration_ms),0) AS mean_ms,
               MIN(created_at) AS first_call
        FROM tool_calls
        WHERE created_at >= date('now','-1 day')
        GROUP BY skill_id, status
        ORDER BY n_calls DESC
    """

try:
    _db_path = REPO / 'scidex.db'
    if not _db_path.exists():
        # sqlite3.connect() would silently create an empty database file.
        raise FileNotFoundError(f'{_db_path} does not exist')
    # closing() releases the connection even when the query raises.
    with closing(sqlite3.connect(str(_db_path))) as db:
        prov = pd.read_sql_query(_PROVENANCE_SQL, db)
    prov = prov.rename(columns={'skill_id': 'tool'})
    prov['tool'] = prov['tool'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates from the last 24h of Forge provenance:')
    # A bare `prov.head(20)` inside a try block is never rendered by the
    # notebook, so print the table explicitly.
    print(prov.head(20).to_string(index=False))
except Exception as e:
    print(f"Provenance query skipped: {e}")
    print("(tool_calls table may not exist in this worktree)")

Provenance query skipped: Execution failed on sql '
        SELECT skill_id, status, COUNT(*) AS n_calls,
               ROUND(AVG(duration_ms),0) AS mean_ms,
               MIN(created_at) AS first_call
        FROM tool_calls
        WHERE created_at >= date('now','-1 day')
        GROUP BY skill_id, status
        ORDER BY n_calls DESC
    ': no such table: tool_calls
(tool_calls table may not exist in this worktree)

def _brief(value, limit=55, max_items=2):
    """Render one annotation field as a short display string.

    Lists (or tuples) are reduced to their first ``max_items`` entries joined
    with ", "; any other value is converted with ``str`` (missing or empty
    values become ""). The result is truncated to ``limit`` characters.
    """
    if isinstance(value, (list, tuple)):
        text = ', '.join(str(v) for v in value[:max_items])
    else:
        text = str(value or '')
    return text[:limit]


anno = load('mygene_TREM2')  # probe one
ann_rows = []
# Iterate the shared TARGET_GENES constant rather than a second hard-coded
# copy of the same gene list, so the two cannot drift apart.
for g in TARGET_GENES:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    ann_rows.append({
        'gene': g,
        'name': _brief(mg.get('name')),
        # protein_class and disease_involvement go through the same helper,
        # so a plain-string protein_class is no longer sliced as a list.
        'protein_class': _brief(hpa.get('protein_class')),
        'disease_involvement': _brief(hpa.get('disease_involvement')),
        'ensembl_id': hpa.get('ensembl_id') or '',
    })
anno_df = pd.DataFrame(ann_rows)
anno_df

# Top-10 GO Biological Process enrichment terms as a display table.
# load() returns {} when the cache file is missing; `or []` turns that into
# an empty list so the slice below does not raise.
go_bp = (load('enrichr_GO_Biological_Process') or [])[:10]
# Passing `columns` selects these fields and also yields a well-formed
# (empty) frame when there are no records.
go_df = pd.DataFrame(go_bp, columns=['term', 'p_value', 'odds_ratio', 'genes'])
if not go_df.empty:
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before the gene list is flattened into a string.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
# reindex (rather than [[...]]) so the empty case, which has no n_hits
# column, still renders instead of raising KeyError.
go_df.reindex(columns=['term', 'n_hits', 'p_value', 'odds_ratio', 'genes'])

# Visualize top GO BP enrichment (−log10 p-value bar chart)
import numpy as np
# load() returns {} when the cache file is missing; `or []` keeps the slice
# valid, and the plot is skipped when there is nothing to draw.
go_bp = (load('enrichr_GO_Biological_Process') or [])[:8]
if go_bp:
    # barh draws the first entry at the bottom, so reverse the records to put
    # the first cached term at the top (presumably the most significant one;
    # the cache order is not established here).
    terms = [t['term'][:45] for t in reversed(go_bp)]
    # Clamp to the smallest positive float so a reported p-value of 0 gives a
    # large finite bar instead of an infinite one.
    _tiny = np.finfo(float).tiny
    neglogp = [-np.log10(max(t['p_value'], _tiny)) for t in reversed(go_bp)]
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment for SEA-AD vulnerability gene set (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
else:
    print('No GO Biological Process enrichment results cached; skipping plot')

# Top-10 CellMarker cell-type enrichment terms as a display table.
# load() returns {} when the cache file is missing; `or []` turns that into
# an empty list so the slice below does not raise.
cm = (load('enrichr_CellMarker_Cell_Types') or [])[:10]
# Passing `columns` selects these fields and also yields a well-formed
# (empty) frame when there are no records.
cm_df = pd.DataFrame(cm, columns=['term', 'p_value', 'odds_ratio', 'genes'])
if not cm_df.empty:
    cm_df['genes'] = cm_df['genes'].apply(lambda g: ', '.join(g))
    cm_df['p_value'] = cm_df['p_value'].apply(lambda p: f'{p:.2e}')
    cm_df['odds_ratio'] = cm_df['odds_ratio'].round(1)
cm_df

# STRING interaction edges, strongest first, as a display table.
ppi = load('string_network')
ppi_df = pd.DataFrame(ppi)
if ppi_df.empty:
    print('No STRING edges returned (API may be rate-limited)')
    ppi_display = pd.DataFrame()
else:
    ppi_df = ppi_df.sort_values('score', ascending=False)
    # Show only the columns that are present in this payload.
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    n_proteins = len(set(ppi_df.protein1) | set(ppi_df.protein2))
    print(f'{len(ppi_df)} STRING edges among {n_proteins} proteins')
    ppi_display = ppi_df[display_cols].head(20)
# The table must be the last top-level expression to be rendered; a bare
# expression nested inside the if/else above is silently discarded.
ppi_display

11 STRING edges among 9 proteins

# Draw the STRING network with plain matplotlib on a circular layout
# (no networkx dependency).
ppi = load('string_network')
if ppi:
    import math

    # Every protein that appears at either end of an edge, in sorted order.
    nodes = sorted({prot for edge in ppi for prot in (edge['protein1'], edge['protein2'])})
    n = len(nodes)

    # Space the nodes evenly around the unit circle.
    pos = {}
    for idx, label in enumerate(nodes):
        angle = 2*math.pi*idx/n
        pos[label] = (math.cos(angle), math.sin(angle))

    fig, ax = plt.subplots(figsize=(7, 7))

    # Edges: opacity and line width both grow with the edge score.
    for edge in ppi:
        (x1, y1), (x2, y2) = pos[edge['protein1']], pos[edge['protein2']]
        score = edge['score']
        ax.plot([x1, x2], [y1, y2], color='#888',
                alpha=0.3+0.5*score, linewidth=0.5+2*score)

    # Nodes are drawn above the edges, with the label centred on each marker.
    for name, (x, y) in pos.items():
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(name, (x, y), ha='center', va='center',
                    fontsize=9, fontweight='bold', zorder=4)

    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING physical PPI network ({len(ppi)} edges, score ≥ 0.4)')
    plt.tight_layout()
    plt.show()

# Reactome pathway footprint: pathway count and first-listed pathway per gene.
pw_rows = []
# Iterate the shared TARGET_GENES constant rather than a second hard-coded
# copy of the same gene list, so the two cannot drift apart.
for g in TARGET_GENES:
    # load() returns {} for a missing cache file; normalise to an empty list.
    pws = load(f'reactome_{g}') or []
    # .get() tolerates a pathway record without a 'name' key.
    top_name = pws[0].get('name') if pws else None
    pw_rows.append({'gene': g, 'n_pathways': len(pws),
                    'top_pathway': (top_name or '—')[:70]})
pw_df = pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)
pw_df

from collections import Counter
# The Allen cell-type payload is not gene-filtered at the API level, so the
# TREM2 cache entry is the same as for any other gene.
ac = load('allen_celltypes_TREM2')
ct = pd.DataFrame(ac.get('cell_types', []))
# Show at most the first 15 rows; fall back to an empty frame when there are none.
ct_display = pd.DataFrame() if ct.empty else ct.head(15)
ct_display

# Allen ISH summary: one row per target gene, describing the first listed region.
ish_rows = []
for g in TARGET_GENES:
    ish = load(f'allen_ish_{g}')
    regions = ish.get('regions') or []
    if regions:
        first_region = regions[0]
        region_name = first_region.get('structure', '')
        energy = round(first_region.get('expression_energy', 0), 2)
    else:
        # No regions cached for this gene: placeholder name, no energy value.
        region_name = '—'
        energy = None
    ish_rows.append({
        'gene': g,
        'n_ish_regions': len(regions),
        'top_region': region_name[:45],
        'top_energy': energy,
        'note': (ish.get('note') or '')[:60],
    })
ish_df = pd.DataFrame(ish_rows)
ish_df

def _pubmed_table(hid):
    """Build the literature table for one hypothesis id.

    Reads the cached ``pubmed_<hid>`` entry via ``load``. When it holds
    papers, returns a frame with the columns year, journal, title and pmid,
    with title truncated to 80 characters and journal to 30, sorted by year
    descending. When it is empty or missing, returns a one-row frame carrying
    an explanatory note.
    """
    papers = load(f'pubmed_{hid}')
    if not papers:
        return pd.DataFrame([{'note':'no PubMed results for this hypothesis query'}])
    # .copy() makes the column selection an independent frame before it is edited.
    lit = pd.DataFrame(papers)[['year','journal','title','pmid']].copy()
    lit['title'] = lit['title'].str[:80]
    lit['journal'] = lit['journal'].str[:30]
    return lit.sort_values('year', ascending=False)


# One section per hypothesis; each ends with the table as a bare expression
# so a notebook renders it.
hid = 'h-seaad-5b3cb8ea'
display_df = _pubmed_table(hid)
display_df

hid = 'h-seaad-51323624'
display_df = _pubmed_table(hid)
display_df

hid = 'h-seaad-7f15df4c'
display_df = _pubmed_table(hid)
display_df

hid = 'h-seaad-fa5ea82d'
display_df = _pubmed_table(hid)
display_df

hid = 'h-seaad-56fa6428'
display_df = _pubmed_table(hid)
display_df

from pathlib import Path
# Mechanistic differential-expression synthesis: print the bundle's highlight
# sentences and summarise the per-gene evidence in a table.
bundle_path = REPO / 'data/analysis_outputs/analysis-SEAAD-20260402/mechanistic_de/bundle.json'
if bundle_path.exists():
    # Decode as UTF-8 explicitly; the bundle text contains non-ASCII
    # characters and read_text() would otherwise use the locale encoding.
    mech_bundle = json.loads(bundle_path.read_text(encoding='utf-8'))
    print("Mechanistic highlights:")
    for item in mech_bundle.get('mechanistic_highlights') or []:
        print(f"- {item}")
    # `or` fallbacks treat a key that is present but null like a missing key.
    mech_df = pd.DataFrame([
        {
            'gene': gene,
            'dx_hits': len((payload.get('differential_expression') or {}).get('experiments') or []),
            'top_pathway': ((payload.get('reactome_pathways') or [{}])[0].get('name', '')),
            'top_paper': ((payload.get('literature') or [{}])[0].get('title', '')),
        }
        for gene, payload in (mech_bundle.get('per_gene') or {}).items()
    ])
else:
    print(f"Missing mechanistic evidence bundle: {bundle_path}")
    mech_df = pd.DataFrame()
# The table must be the last top-level expression to be rendered; a bare
# expression nested inside the if branch is silently discarded.
mech_df

Mechanistic highlights:
- C1QA: 12 human differential-expression experiments matched the disease filter; top hit E-GEOD-67333 (Transcriptomics profiling of Alzheimer's disease reveal novel molecular targets).
- TREM2: 12 human differential-expression experiments matched the disease filter; top hit E-GEOD-67333 (Transcriptomics profiling of Alzheimer's disease reveal novel molecular targets).
- SLC17A7: 12 human differential-expression experiments matched the disease filter; top hit E-GEOD-67333 (Transcriptomics profiling of Alzheimer's disease reveal novel molecular targets).
- APOE: 12 human differential-expression experiments matched the disease filter; top hit E-GEOD-67333 (Transcriptomics profiling of Alzheimer's disease reveal novel molecular targets).
- GFAP: 12 human differential-expression experiments matched the disease filter; top hit E-GEOD-67333 (Transcriptomics profiling of Alzheimer's disease reveal novel molecular targets).
- STRING physical interactions support a shared mechanism network: APOE–TREM2 (0.99).
- GO_Biological_Process enrichment is led by 'Positive Regulation Of Amyloid-Beta Clearance (GO:1900223)' with p=7.496663581039455e-07, linking the gene set to a coherent pathway-level mechanism.
- Reactome enrichment is led by 'Nuclear Signaling By ERBB4 R-HSA-1251985' with p=2.4725579551199923e-05, linking the gene set to a coherent pathway-level mechanism.
- KEGG enrichment is led by 'Nicotine addiction' with p=0.009960969495176103, linking the gene set to a coherent pathway-level mechanism.

	gene	name	protein_class	disease_involvement	ensembl_id
0	TREM2	triggering receptor expressed on myeloid cells 2	Disease related genes, Human disease related g...	Alzheimer disease, Amyloidosis	ENSG00000095970
1	GFAP	glial fibrillary acidic protein	Candidate cardiovascular disease genes, Diseas...	Disease variant, Leukodystrophy	ENSG00000131095
2	SLC17A7	solute carrier family 17 member 7	Metabolic proteins, Predicted membrane proteins		ENSG00000104888
3	PDGFRA	platelet derived growth factor receptor alpha	Cancer-related genes, CD markers	Cancer-related genes, Disease variant	ENSG00000134853
4	PDGFRB	platelet derived growth factor receptor beta	Cancer-related genes, CD markers	Cancer-related genes, Disease variant	ENSG00000113721
5	APOE	apolipoprotein E	Cancer-related genes, Candidate cardiovascular...	Alzheimer disease, Amyloidosis	ENSG00000130203
6	MAPT	microtubule associated protein tau
7	APP	amyloid beta precursor protein	Disease related genes, FDA approved drug targets	Alzheimer disease, Amyloidosis	ENSG00000142192
8	PSEN1	presenilin 1	Cancer-related genes, Disease related genes	Alzheimer disease, Amyloidosis	ENSG00000080815
9	TYROBP	transmembrane immune signaling adaptor TYROBP	Disease related genes, Human disease related g...		ENSG00000011600
10	CLU	clusterin	Cancer-related genes, Candidate cardiovascular...	Cancer-related genes	ENSG00000120885

	term	n_hits	p_value	odds_ratio	genes
0	Microglial Cell Activation (GO:0001774)	5	2.68e-13	1109.7	APP, TYROBP, TREM2, MAPT, CLU
1	Astrocyte Activation (GO:0048143)	4	2.44e-11	1427.2	APP, TREM2, MAPT, PSEN1
2	Astrocyte Development (GO:0014002)	4	6.74e-11	1037.8	APP, TREM2, MAPT, PSEN1
3	Regulation Of Amyloid Fibril Formation (GO:190...	4	6.74e-11	1037.8	APP, TREM2, PSEN1, CLU
4	Positive Regulation Of Supramolecular Fiber Or...	5	1.04e-09	182.2	APP, MAPT, APOE, PSEN1, CLU
5	Macrophage Activation (GO:0042116)	4	2.57e-09	367.9	APP, TREM2, MAPT, CLU
6	Negative Regulation Of Long-Term Synaptic Pote...	3	6.92e-09	1498.8	APP, TYROBP, APOE
7	Memory (GO:0007613)	4	2.07e-08	211.0	TREM2, MAPT, APOE, PSEN1
8	Positive Regulation Of ERK1 And ERK2 Cascade (...	5	2.40e-08	94.9	PDGFRB, APP, PDGFRA, TREM2, APOE
9	Regulation Of Amyloid-Beta Clearance (GO:1900221)	3	5.61e-08	624.3	TREM2, APOE, CLU

	term	p_value	odds_ratio	genes
0	Glial cell:Undefined	5.61e-08	624.3	PDGFRB, PDGFRA, GFAP
1	T Helper cell:Undefined	1.62e-06	182.5	PDGFRB, PDGFRA, APOE
2	Pericyte:Muscle	2.75e-06	1480.4	PDGFRB, PDGFRA
3	Fibroblast:Lung	9.88e-06	634.3	PDGFRB, PDGFRA
4	Megakaryocyte Erythroid cell:Undefined	1.42e-05	85.8	PDGFRB, PDGFRA, APOE
5	T cell:Undefined	1.67e-05	81.1	PDGFRB, PDGFRA, APOE
6	Cardiac Progenitor cell:Embryonic Stem Cell	1.94e-05	76.9	PDGFRB, APP, PDGFRA
7	Cardiomyocyte:Embryonic Stem Cell	1.94e-05	76.9	PDGFRB, APP, PDGFRA
8	Periosteum-derived Progenitor cell:Periosteum	2.00e-05	76.1	PDGFRB, PDGFRA, TREM2
9	Hepatoblast:Liver	2.06e-05	75.3	PDGFRB, PDGFRA, APOE

	brain_region	dendrite_type	layer	specimen_count
0	"middle temporal gyrus"	spiny	L3	47
1	"middle temporal gyrus"	aspiny	L3	8
2	"middle temporal gyrus"	spiny	L5	7
3	"frontal lobe"	spiny	L3	7
4	"middle temporal gyrus"	spiny	L4	6
5	"temporal lobe"	spiny	L3	6
6	"middle temporal gyrus"	aspiny	L4	3
7	"inferior frontal gyrus"	spiny	L3	2
8	"inferior frontal gyrus"	aspiny	L3	2
9	"middle temporal gyrus"	spiny	L2	2
10	"temporal lobe"	spiny	L2	2
11	"temporal lobe"	spiny	L5	2
12	"inferior temporal gyrus"	spiny	L5	1
13	"frontal lobe"	spiny	L6	1
14	"middle temporal gyrus"	aspiny	L5	1

Gap	Task
Bulk SEA-AD h5ad download + local cache	`19c06875`
Per-cell DE from SEA-AD in the debate loop	`70b96f50`
ABC Atlas + MERFISH spatial queries	`f9ba4c33`
Forge data-validation layer	`4bd2f9de`

SEA-AD Cell-Type Vulnerability Analysis

SEA-AD Cell-Type Vulnerability Analysis

Research question

Approach

1. Forge tool chain

2. Target gene annotations (MyGene.info + Human Protein Atlas)

3. GO Biological Process enrichment (Enrichr)

4. Cell-type enrichment (Enrichr CellMarker)

5. STRING physical protein interaction network

6. Reactome pathway footprint per gene

7. Allen Brain Cell Atlas — cell-type specimen metadata

8. Allen Brain Atlas ISH regional expression

9. Evidence bound to analysis hypotheses

Hypothesis 1: Complement C1QA Spatial Gradient in Cortical Layers

Molecular Mechanism of C1QA-Mediated Synaptic Elimination

Hypothesis 2: Cell-Type Specific TREM2 Upregulation in DAM Microglia

TREM2 Molecular Biology and Signaling

Hypothesis 3: Excitatory Neuron Vulnerability via SLC17A7 Downregulation

Molecular Function of SLC17A7/VGLUT1

Hypothesis 4: APOE Isoform Expression Across Glial Subtypes

Hypothesis 5: GFAP-Positive Reactive Astrocyte Subtype Delineation

GFAP Biology and the Astrocyte Reactivity Spectrum

10. Mechanistic differential-expression synthesis

11. Caveats & what's still aggregated

	gene	n_pathways	top_pathway
8	PSEN1	8	Nuclear signaling by ERBB4
5	APOE	8	Nuclear signaling by ERBB4
3	PDGFRA	8	PIP3 activates AKT signaling
7	APP	8	Platelet degranulation
9	TYROBP	6	Immunoregulatory interactions between a Lympho...
4	PDGFRB	6	PIP3 activates AKT signaling
0	TREM2	4	Immunoregulatory interactions between a Lympho...
10	CLU	4	Platelet degranulation
6	MAPT	3	Caspase-mediated cleavage of cytoskeletal prot...
2	SLC17A7	2	Glutamate Neurotransmitter Release Cycle
1	GFAP	2	Nuclear signaling by ERBB4

	gene	top_region	top_energy	note
0	TREM2	—	—	No ISH data available; check portal for microa...
1	GFAP	—	—	No ISH data available; check portal for microa...
2	SLC17A7	—	—	No ISH data available; check portal for microa...
3	PDGFRA	—	—	No ISH data available; check portal for microa...
4	PDGFRB	—	—	No ISH data available; check portal for microa...
5	APOE	—	—	No ISH data available; check portal for microa...
6	MAPT	—	—	No ISH data available; check portal for microa...
7	APP	—	—	No ISH data available; check portal for microa...
8	PSEN1	—	—	No ISH data available; check portal for microa...
9	TYROBP	—	—	No ISH data available; check portal for microa...
10	CLU	—	—	No ISH data available; check portal for microa...