SEA-AD Gene Expression Profiling — Allen Brain Cell Atlas¶
Notebook ID: nb-analysis-SEAAD-20260402 · Analysis: analysis-SEAAD-20260402 · Generated: 2026-04-17
Research question¶
What are the cell-type specific expression patterns of key neurodegeneration genes in the Seattle Alzheimer's Disease Brain Cell Atlas?
Approach¶
This notebook is generated programmatically from real Forge tool calls and SciDEX debate data. Code cells load cached evidence bundles from `data/forge_cache/seaad/*.json` and query live data from `scidex.db`. To refresh, re-run `python3 scripts/regenerate_notebooks.py --analysis analysis-SEAAD-20260402 --force`.
5 hypotheses were generated and debated. The knowledge graph has 101 edges.
Debate Summary¶
Quality score: 0.68 · Rounds: 3 · Personas: Theorist, Synthesizer
1. Forge tool provenance¶
import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
matplotlib.rcParams['figure.dpi'] = 110
matplotlib.rcParams['figure.facecolor'] = 'white'
# Resolve the working directory once; the cache lives underneath it.
REPO = Path('.').resolve()
# Put the working directory first on the import path.
sys.path.insert(0, str(REPO))
CACHE_SUB = 'seaad'
CACHE = REPO / 'data' / 'forge_cache' / CACHE_SUB


def load(name):
    """Return the parsed JSON stored at ``CACHE / f'{name}.json'``.

    Args:
        name: File stem of the cached bundle (without the ``.json`` suffix).

    Returns:
        The decoded JSON value, or ``{}`` when the file is missing or cannot
        be decoded. Callers in this notebook treat ``{}`` as "no data".
    """
    p = CACHE / f'{name}.json'
    try:
        # JSON is UTF-8; do not depend on the platform's locale encoding.
        return json.loads(p.read_text(encoding='utf-8'))
    except (FileNotFoundError, NotADirectoryError):
        return {}
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError both derive from
        # ValueError. Report the bad file and degrade to "no data" so one
        # corrupt bundle does not abort every downstream cell.
        print(f'Ignoring unreadable cache file {p.name}: {exc}')
        return {}
db_path = Path('/home/ubuntu/scidex/scidex.db')


def tool_call_provenance(path):
    """Aggregate the last 30 days of ``tool_calls`` rows by skill and status.

    Args:
        path: Location of the SQLite database to query.

    Returns:
        A DataFrame with the query columns ``skill_id``, ``status``,
        ``n_calls`` and ``mean_ms``, ordered by ``n_calls`` descending, plus a
        ``tool`` column holding ``skill_id`` without its leading ``tool_``.

    Raises:
        Whatever ``sqlite3`` / ``pandas`` raise when the database cannot be
        opened or queried; the connection is closed in either case.
    """
    conn = sqlite3.connect(str(path))
    try:
        frame = pd.read_sql_query('''
            SELECT skill_id, status, COUNT(*) AS n_calls,
                   ROUND(AVG(duration_ms),0) AS mean_ms
            FROM tool_calls
            WHERE created_at >= date('now','-30 days')
            GROUP BY skill_id, status
            ORDER BY n_calls DESC
        ''', conn)
    finally:
        # Close even when the query fails so the handle is never leaked.
        conn.close()
    # Anchor the pattern: strip only the leading prefix, not every 'tool_'.
    frame['tool'] = frame['skill_id'].str.replace(r'^tool_', '', regex=True)
    return frame


prov_table = None  # stays None when provenance is unavailable
try:
    prov = tool_call_provenance(db_path)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    prov_table = prov[['tool','status','n_calls','mean_ms']].head(20)
except Exception as e:
    # Best effort: the notebook must still run without the live database.
    print(f'Provenance unavailable: {e}')
# Jupyter only renders a bare expression that is the last top-level statement
# of a cell; inside the ``try`` the table was silently discarded.
prov_table
181 tool-call aggregates (last 30 days):
2. Target gene annotations¶
def _brief(value, limit=55):
    """Render an annotation field as short display text.

    Lists are reduced to their first two items joined by ``', '``; any other
    value is stringified. The text is cut to ``limit`` characters and an empty
    result becomes ``'—'``, so every missing field shares one placeholder.
    """
    if isinstance(value, list):
        text = ', '.join(str(item) for item in value[:2])
    else:
        text = str(value or '')
    return text[:limit] or '—'


ann_rows = []
for g in ['APOE', 'GFAP']:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    # Only dict payloads support the ``.get`` lookups below; treat any other
    # shape as "no annotation".
    mg = mg if isinstance(mg, dict) else {}
    hpa = hpa if isinstance(hpa, dict) else {}
    ann_rows.append({
        'gene': g,
        'name': _brief(mg.get('name')),
        'protein_class': _brief(hpa.get('protein_class')),
        'disease_involvement': _brief(hpa.get('disease_involvement')),
    })
pd.DataFrame(ann_rows)
| | gene | name | protein_class | disease_involvement |
|---|---|---|---|---|
| 0 | APOE | apolipoprotein E | Cancer-related genes, Candidate cardiovascular... | Alzheimer disease, Amyloidosis |
| 1 | GFAP | glial fibrillary acidic protein | Candidate cardiovascular disease genes, Diseas... | Disease variant, Leukodystrophy |
3. GO Biological Process enrichment (Enrichr)¶
# Tabulate the ten leading GO Biological Process terms from the cached bundle.
go_bp = load('enrichr_GO_Biological_Process')
go_table = None  # stays None when there is nothing to show
if isinstance(go_bp, list) and go_bp:
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']]
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before the gene collection is flattened into one string.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    go_table = go_df[['term','n_hits','p_value','odds_ratio','genes']]
else:
    print('No GO:BP enrichment data')
# Jupyter only renders a bare expression that is the last top-level statement
# of a cell; nested inside the ``if`` the table was never displayed.
go_table
# Visualize top GO BP enrichment
go_bp = load('enrichr_GO_Biological_Process')
if isinstance(go_bp, list) and go_bp:
    # Reverse so the first entry of the list ends up as the top bar.
    top = go_bp[:8][::-1]
    terms = [t['term'][:45] for t in top]
    # Floor p-values at 1e-300 so log10 never sees zero.
    neglogp = [-np.log10(max(t['p_value'], 1e-300)) for t in top]
    # Use numeric bar positions: with string positions, two terms that share
    # the same 45-character prefix would be drawn on top of each other.
    ypos = list(range(len(top)))
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(ypos, neglogp, color='#4fc3f7')
    ax.set_yticks(ypos)
    ax.set_yticklabels(terms)
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout(); plt.show()
else:
    print('No GO:BP data to plot')
4. KEGG pathway enrichment¶
# Tabulate the ten leading KEGG pathway terms from the cached bundle.
kegg = load('enrichr_KEGG_Pathways')
kegg_table = None  # stays None when there is nothing to show
if isinstance(kegg, list) and kegg:
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']]
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    kegg_table = kegg_df
else:
    print('No KEGG enrichment data')
# Jupyter only renders a bare expression that is the last top-level statement
# of a cell; nested inside the ``if`` the table was never displayed.
kegg_table
No KEGG enrichment data
5. STRING protein interaction network¶
# List the highest-scoring STRING edges from the cached network.
ppi = load('string_network')
ppi_table = None  # stays None when there is nothing to show
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi)
    # Guard the sort the same way the display columns are guarded below.
    if 'score' in ppi_df.columns:
        ppi_df = ppi_df.sort_values('score', ascending=False)
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned')
# Jupyter only renders a bare expression that is the last top-level statement
# of a cell; nested inside the ``if`` the table was never displayed.
ppi_table
11 STRING edges
# Network figure
ppi = load('string_network')
if not (isinstance(ppi, list) and ppi):
    print('No STRING data to visualize')
else:
    import math

    # Collect every protein named on either end of an edge, in sorted order.
    proteins = sorted({name for edge in ppi for name in (edge['protein1'], edge['protein2'])})
    count = len(proteins)

    # Place the proteins at equal angular steps around the unit circle.
    pos = {}
    for idx, protein in enumerate(proteins):
        pos[protein] = (math.cos(2*math.pi*idx/count), math.sin(2*math.pi*idx/count))

    fig, ax = plt.subplots(figsize=(7, 7))

    # Edges: opacity and width both grow with the edge score.
    for edge in ppi:
        xa, ya = pos[edge['protein1']]
        xb, yb = pos[edge['protein2']]
        ax.plot(
            [xa, xb],
            [ya, yb],
            color='#888',
            alpha=0.3+0.5*edge['score'],
            linewidth=0.5+2*edge['score'],
        )

    # Nodes: one labelled marker per protein, drawn above the edges.
    for label, (cx, cy) in pos.items():
        ax.scatter([cx], [cy], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(label, (cx, cy), ha='center', va='center', fontsize=8, fontweight='bold', zorder=4)

    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()
6. Reactome pathway footprint¶
# One summary row per gene: the number of cached Reactome pathways and the
# name of the first one.
pw_rows = []
for gene in ['APOE', 'GFAP']:
    pathways = load(f'reactome_{gene}')
    if not isinstance(pathways, list):
        # Anything other than a list counts as "no pathways".
        pw_rows.append({'gene': gene, 'n_pathways': 0, 'top_pathway': '—'})
        continue
    if pathways:
        first_name = pathways[0]['name']
    else:
        first_name = '—'
    pw_rows.append({
        'gene': gene,
        'n_pathways': len(pathways),
        'top_pathway': first_name[:70],
    })
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)
| | gene | n_pathways | top_pathway |
|---|---|---|---|
| 0 | APOE | 8 | Nuclear signaling by ERBB4 |
| 1 | GFAP | 2 | Nuclear signaling by ERBB4 |
7. Allen Brain Atlas ISH regional expression¶
# One summary row per gene from the cached Allen ISH payload: the region count
# plus the structure name and expression energy of the first region.
ish_rows = []
for gene in ['APOE', 'GFAP']:
    payload = load(f'allen_ish_{gene}')
    if isinstance(payload, dict):
        regions = payload.get('regions') or []
    else:
        regions = []

    row = {'gene': gene, 'n_ish_regions': len(regions)}
    if regions:
        first = regions[0]
        row['top_region'] = first.get('structure', '')[:45]
        row['top_energy'] = round(first.get('expression_energy', 0), 2)
    else:
        row['top_region'] = '—'[:45]
        row['top_energy'] = None
    ish_rows.append(row)
pd.DataFrame(ish_rows)
| | gene | n_ish_regions | top_region | top_energy |
|---|---|---|---|---|
| 0 | APOE | 0 | — | — |
| 1 | GFAP | 0 | — | — |
8. Hypothesis ranking (5 hypotheses)¶
# (title, composite score) pairs, as emitted by the notebook generator.
hyp_data = [
    ('Cell-Type Specific TREM2 Upregulation in DAM Microglia', 0.761),
    ('GFAP-Positive Reactive Astrocyte Subtype Delineation', 0.754),
    ('APOE Isoform Expression Across Glial Subtypes', 0.743),
    ('Complement C1QA Spatial Gradient in Cortical Layers', 0.678),
    ('Excitatory Neuron Vulnerability via SLC17A7 Downregulat', 0.675),
]

# Reverse once so the first entry is drawn as the top bar of the chart.
titles = []
scores = []
for title, score in reversed(hyp_data):
    titles.append(title)
    scores.append(score)

# Bar colour by score band: >= 0.6, >= 0.5, and everything below.
colors = []
for s in scores:
    if s >= 0.6:
        colors.append('#ef5350')
    elif s >= 0.5:
        colors.append('#ffa726')
    else:
        colors.append('#66bb6a')

bar_positions = range(len(titles))
fig, ax = plt.subplots(figsize=(10, max(8, len(titles)*0.4)))
ax.barh(bar_positions, scores, color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title('SEA-AD Gene Expression Profiling — Allen Brain Cell Atlas')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
9. Score dimension heatmap (top 10)¶
# Row labels (one per hypothesis) for the heatmap below.
labels = [
    'Cell-Type Specific TREM2 Upregulation in',
    'GFAP-Positive Reactive Astrocyte Subtype',
    'APOE Isoform Expression Across Glial Sub',
    'Complement C1QA Spatial Gradient in Cort',
    'Excitatory Neuron Vulnerability via SLC1',
]
# One row per label, one column per entry of ``dims``.
matrix = np.array([
    [0.65, 0.7, 0.75, 0.8, 0.408, 0.8, 0.7, 0.75, 0.6],
    [0.6, 0.65, 0.7, 0.7, 0.192, 0.75, 0.65, 0.6, 0.55],
    [0.6, 0.55, 0.6, 0.6, 0.215, 0.6, 0.5, 0.5, 0.45],
    [0.7, 0.55, 0.6, 0.65, 0.381, 0.6, 0.55, 0.5, 0.5],
    [0.7, 0.6, 0.65, 0.65, 0.483, 0.7, 0.6, 0.55, 0.55],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]
if matrix.size == 0:
    print('No score data available')
else:
    # Column captions: drop the '_score' suffix and title-case the remainder.
    captions = []
    for d in dims:
        captions.append(d.replace('_score','').replace('_',' ').title())

    fig, ax = plt.subplots(figsize=(10, 5))
    # Fixed 0..1 colour range so colours are comparable between runs.
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(captions, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()
10. PubMed evidence per hypothesis¶
Hypothesis 1: Cell-Type Specific TREM2 Upregulation in DAM Microglia¶
Target genes: TREM2 · Composite score: 0.761
TREM2 (Triggering Receptor Expressed on Myeloid Cells 2) shows marked upregulation in disease-associated microglia (DAM) within the SEA-AD Brain Cell Atlas. Analysis of middle temporal gyrus single-nucleus RNA-seq data reveals TREM2 expression is enriched in a specific microglial subpopulation that undergoes dramatic transcriptional reprogramming in Alzheimer's disease. TREM2 expression levels correlate with Braak stage progression, establishing it as both a central mediator of the microglial di…
# Cached PubMed hits for this hypothesis, newest first when a year is present.
hid = 'h-seaad-51323624'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # ``cols`` only guarantees that at least one of the four columns is
        # present, so check each column before touching it.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 2: GFAP-Positive Reactive Astrocyte Subtype Delineation¶
Target genes: GFAP · Composite score: 0.754
GFAP (Glial Fibrillary Acidic Protein) upregulation in the SEA-AD dataset marks reactive astrocyte populations in the middle temporal gyrus with a log2 fold change of +2.8 — the highest differential expression among all profiled genes. This dramatic increase reflects astrocyte reactivity that is both a blood-based biomarker of AD pathology and a central therapeutic target, with the SEA-AD single-cell data enabling unprecedented resolution of reactive astrocyte heterogeneity.
GFAP Biology and¶
# Cached PubMed hits for this hypothesis, newest first when a year is present.
hid = 'h-seaad-56fa6428'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # ``cols`` only guarantees that at least one of the four columns is
        # present, so check each column before touching it.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 3: APOE Isoform Expression Across Glial Subtypes¶
Target genes: APOE · Composite score: 0.743
APOE (Apolipoprotein E) shows significant upregulation (log2FC = +1.8) in the SEA-AD dataset, with expression patterns varying dramatically across astrocyte and microglial subtypes in the middle temporal gyrus. The APOE4 allele is the strongest genetic risk factor for late-onset Alzheimer's disease, carried by approximately 25% of the population and present in over 60% of AD patients. The SEA-AD single-cell data enables dissecting APOE isoform-specific effects at unprecedented cellular resolutio…
# Cached PubMed hits for this hypothesis, newest first when a year is present.
hid = 'h-seaad-fa5ea82d'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # ``cols`` only guarantees that at least one of the four columns is
        # present, so check each column before touching it.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 4: Complement C1QA Spatial Gradient in Cortical Layers¶
Target genes: C1QA · Composite score: 0.678
C1QA, the initiating protein of the classical complement cascade, shows upregulation in the SEA-AD dataset with a layer-specific spatial gradient across cortical neurons in the middle temporal gyrus. This finding connects complement-mediated synaptic tagging to the selective vulnerability of specific cortical layers in Alzheimer's disease, revealing a previously underappreciated spatial dimension to complement-driven neurodegeneration.
Molecular Mechanism of C1QA-Mediated Synaptic Eliminatio¶
# Cached PubMed hits for this hypothesis, newest first when a year is present.
hid = 'h-seaad-5b3cb8ea'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # ``cols`` only guarantees that at least one of the four columns is
        # present, so check each column before touching it.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 5: Excitatory Neuron Vulnerability via SLC17A7 Downregulation¶
Target genes: SLC17A7 · Composite score: 0.675
SLC17A7 (also known as VGLUT1, vesicular glutamate transporter 1) shows significant downregulation (log2FC = -1.7) in the SEA-AD dataset, specifically in layer 3 and layer 5 excitatory neurons of the middle temporal gyrus. This reduction in the primary vesicular glutamate transporter marks early excitatory neuron vulnerability in Alzheimer's disease and points to synaptic transmission failure as a proximal cause of cognitive decline.
Molecular Function of SLC17A7/VGLUT1¶
VGLUT1 is a transmem…
# Cached PubMed hits for this hypothesis, newest first when a year is present.
hid = 'h-seaad-7f15df4c'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # ``cols`` only guarantees that at least one of the four columns is
        # present, so check each column before touching it.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
11. Knowledge graph edges (101 total)¶
# Knowledge-graph edges embedded by the notebook generator, one dict per edge.
edge_data = [
    {'source': 'TREM2', 'relation': 'participates_in', 'target': 'Microglial Activation / DAM Si', 'strength': 0.73},
    {'source': 'TREM2', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'TREM2', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'TREM2', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'APOE', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'APOE', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'APOE', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'LRP1', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'LRP1', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'LRP1', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'BDNF', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'BDNF', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'BDNF', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'SNCA', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'SNCA', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'SNCA', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'MAPT', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'MAPT', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'MAPT', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'APP', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'APP', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'APP', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'PARP1', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
    {'source': 'PARP1', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_aspiny', 'strength': 0.7},
    {'source': 'PARP1', 'relation': 'expressed_in', 'target': '"middle temporal gyrus"_spiny_', 'strength': 0.7},
]
edge_table = None  # stays None when there are no edges to show
if edge_data:
    edge_table = pd.DataFrame(edge_data).head(25)
else:
    print('No KG edge data available')
# Jupyter only renders a bare expression that is the last top-level statement
# of a cell; nested inside the ``if`` the table was never displayed.
edge_table
12. Caveats¶
This notebook uses real Forge tool calls cached from live APIs, but:
- Enrichment is against curated gene-set libraries, not genome-wide screens
- STRING/Reactome/HPA/MyGene reflect curated knowledge
- PubMed literature is search-relevance ranked, not systematic review
The cached evidence bundle is the minimum viable real-data analysis for this topic.