APOE4 structural biology and therapeutic targeting strategies¶
Notebook ID: nb-sda-2026-04-01-gap-010 · Analysis: sda-2026-04-01-gap-010 · Generated: 2026-04-10
Research question¶
APOE4 differs from APOE3 by a single substitution, C112R, which causes a domain interaction that alters lipid binding and amyloid clearance.
Approach¶
This notebook is generated programmatically from real Forge tool calls and SciDEX debate data. Code cells load cached evidence bundles from data/forge_cache/seaad/*.json and query live data from scidex.db. Re-run python3 scripts/regenerate_notebooks.py --analysis sda-2026-04-01-gap-010 --force to refresh.
7 hypotheses were generated and debated. The knowledge graph has 98 edges.
Debate Summary¶
Quality score: 0.5 · Rounds: 4 · Personas: Theorist, Skeptic, Domain_Expert, Synthesizer
1. Forge tool provenance¶
import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Figure defaults shared by every plot cell below.
matplotlib.rcParams.update({
    'figure.dpi': 110,
    'figure.facecolor': 'white',
})
# Repository root is taken from the current working directory and made
# importable; cached evidence bundles are read from
# data/forge_cache/<CACHE_SUB>/ underneath it.
REPO = Path('.').resolve()
sys.path.insert(0, str(REPO))
CACHE_SUB = 'seaad'
CACHE = REPO / 'data' / 'forge_cache' / CACHE_SUB


def load(name):
    """Return the parsed JSON bundle stored at ``CACHE / f'{name}.json'``.

    Args:
        name: Bundle file name without the ``.json`` suffix.

    Returns:
        The decoded JSON value, or an empty dict when the file does not
        exist, so callers can branch on type/truthiness.
    """
    p = CACHE / f'{name}.json'
    if p.exists():
        # Read explicitly as UTF-8 rather than the platform's locale
        # encoding, which may not be UTF-8 on every machine.
        return json.loads(p.read_text(encoding='utf-8'))
    return {}
# Summarise the last 30 days of tool calls per (skill_id, status).
db_path = Path('/home/ubuntu/scidex/scidex.db')
# Holds the table to render; stays None when provenance cannot be read.
prov_table = None
try:
    db = sqlite3.connect(str(db_path))
    try:
        prov = pd.read_sql_query('''
            SELECT skill_id, status, COUNT(*) AS n_calls,
                   ROUND(AVG(duration_ms),0) AS mean_ms
            FROM tool_calls
            WHERE created_at >= date('now','-30 days')
            GROUP BY skill_id, status
            ORDER BY n_calls DESC
        ''', db)
    finally:
        # Close the connection even when the query itself fails.
        db.close()
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    prov_table = prov[['tool','status','n_calls','mean_ms']].head(20)
except Exception as e:
    # Best-effort cell: the notebook must still run without the live DB.
    print(f'Provenance unavailable: {e}')
# Jupyter renders only a cell's final top-level expression, so the table is
# surfaced here rather than inside the try block (None renders nothing).
prov_table
77 tool-call aggregates (last 30 days):
2. Target gene annotations¶
def _hpa_field(value, fallback):
    """Format an HPA field: first two list items joined, else str(); max 55 chars."""
    if isinstance(value, list):
        return ', '.join(value[:2])[:55]
    return str(value or fallback)[:55]


# One summary row per target gene, built from the MyGene and HPA bundles.
ann_rows = []
for gene in ['APOE']:
    mygene_rec = load(f'mygene_{gene}')
    hpa_rec = load(f'hpa_{gene}')
    if mygene_rec or hpa_rec:
        ann_rows.append({
            'gene': gene,
            'name': (mygene_rec.get('name') or '')[:55],
            'protein_class': _hpa_field(hpa_rec.get('protein_class'), '—'),
            'disease_involvement': _hpa_field(hpa_rec.get('disease_involvement'), ''),
        })
    else:
        # Neither bundle is cached: emit a placeholder row.
        ann_rows.append({
            'gene': gene,
            'name': '—',
            'protein_class': '—',
            'disease_involvement': '—',
        })
pd.DataFrame(ann_rows)
| | gene | name | protein_class | disease_involvement |
|---|---|---|---|---|
| 0 | APOE | apolipoprotein E | Cancer-related genes, Candidate cardiovascular... | Alzheimer disease, Amyloidosis |
3. GO Biological Process enrichment (Enrichr)¶
# Tabulate the first ten cached GO Biological Process enrichment terms.
go_bp = load('enrichr_GO_Biological_Process')
# Holds the table to render; stays None when there is no enrichment data.
go_table = None
if isinstance(go_bp, list) and go_bp:
    # .copy() gives an independent frame, so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning.
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']].copy()
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before 'genes' is flattened into a comma-separated string.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    go_table = go_df[['term','n_hits','p_value','odds_ratio','genes']]
else:
    print('No GO:BP enrichment data')
# Jupyter renders only a cell's final top-level expression; an expression
# nested inside the `if` above would be evaluated and silently discarded.
go_table
# Horizontal bar chart of -log10(p) for the first eight cached GO:BP terms.
go_bp = load('enrichr_GO_Biological_Process')
if not (isinstance(go_bp, list) and go_bp):
    print('No GO:BP data to plot')
else:
    top = go_bp[:8]
    # barh draws the first entry at the bottom, so reverse to keep the first
    # cached term at the top of the chart.
    terms = [entry['term'][:45] for entry in reversed(top)]
    # Clamp p-values at 1e-300 so log10 never receives zero.
    neglogp = [-np.log10(max(entry['p_value'], 1e-300)) for entry in reversed(top)]
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
4. KEGG pathway enrichment¶
# Tabulate the first ten cached KEGG pathway enrichment terms.
kegg = load('enrichr_KEGG_Pathways')
# Holds the table to render; stays None when there is no enrichment data.
kegg_table = None
if isinstance(kegg, list) and kegg:
    # .copy() gives an independent frame, so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning.
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']].copy()
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    kegg_table = kegg_df
else:
    print('No KEGG enrichment data')
# Jupyter renders only a cell's final top-level expression; an expression
# nested inside the `if` above would be evaluated and silently discarded.
kegg_table
No KEGG enrichment data
5. STRING protein interaction network¶
# Tabulate the cached STRING interaction edges, highest score first.
ppi = load('string_network')
# Holds the table to render; stays None when no edges are cached.
ppi_table = None
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi)
    # Sort only when the column exists, matching the guarded column
    # selection below instead of raising KeyError.
    if 'score' in ppi_df.columns:
        ppi_df = ppi_df.sort_values('score', ascending=False)
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned')
# Jupyter renders only a cell's final top-level expression; an expression
# nested inside the `if` above would be evaluated and silently discarded.
ppi_table
11 STRING edges
# Network figure: proteins spaced evenly on a unit circle, edges styled by score.
ppi = load('string_network')
if isinstance(ppi, list) and ppi:
    import math
    nodes = sorted({p for e in ppi for p in (e['protein1'], e['protein2'])})
    n = len(nodes)
    pos = {}
    for idx, protein in enumerate(nodes):
        pos[protein] = (math.cos(2*math.pi*idx/n), math.sin(2*math.pi*idx/n))
    fig, ax = plt.subplots(figsize=(7, 7))
    for edge in ppi:
        x1, y1 = pos[edge['protein1']]
        x2, y2 = pos[edge['protein2']]
        # Higher-scoring edges are drawn more opaque and thicker.
        ax.plot([x1, x2], [y1, y2], color='#888',
                alpha=0.3+0.5*edge['score'],
                linewidth=0.5+2*edge['score'])
    for name, (x, y) in pos.items():
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(name, (x, y), ha='center', va='center',
                    fontsize=8, fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()
else:
    print('No STRING data to visualize')
6. Reactome pathway footprint¶
# One row per gene: number of cached Reactome pathways and the first one's name.
pw_rows = []
for gene in ['APOE']:
    pathways = load(f'reactome_{gene}')
    if not isinstance(pathways, list):
        # Missing bundle (load() returned a dict): report zero pathways.
        pw_rows.append({'gene': gene, 'n_pathways': 0, 'top_pathway': '—'})
        continue
    first_name = pathways[0]['name'] if pathways else '—'
    pw_rows.append({
        'gene': gene,
        'n_pathways': len(pathways),
        'top_pathway': first_name[:70],
    })
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)
| | gene | n_pathways | top_pathway |
|---|---|---|---|
| 0 | APOE | 8 | Nuclear signaling by ERBB4 |
7. Allen Brain Atlas ISH regional expression¶
# One row per gene: number of cached ISH regions plus the first region's details.
ish_rows = []
for gene in ['APOE']:
    ish = load(f'allen_ish_{gene}')
    if isinstance(ish, dict):
        regions = ish.get('regions') or []
    else:
        regions = []
    if regions:
        first = regions[0]
        top_region = first.get('structure', '')[:45]
        top_energy = round(first.get('expression_energy', 0), 2)
    else:
        top_region = '—'
        top_energy = None
    ish_rows.append({
        'gene': gene,
        'n_ish_regions': len(regions),
        'top_region': top_region,
        'top_energy': top_energy,
    })
pd.DataFrame(ish_rows)
| | gene | n_ish_regions | top_region | top_energy |
|---|---|---|---|---|
| 0 | APOE | 0 | — | — |
8. Hypothesis ranking (7 hypotheses)¶
# (title, composite score) pairs, listed from highest to lowest score.
hyp_data = [
    ('Interfacial Lipid Mimetics to Disrupt Domain Interactio', 0.575),
    ('Targeted APOE4-to-APOE3 Base Editing Therapy', 0.569),
    ('APOE4 Allosteric Rescue via Small Molecule Chaperones', 0.557),
    ('Pharmacological Enhancement of APOE4 Glycosylation', 0.544),
    ('Selective APOE4 Degradation via Proteolysis Targeting C', 0.534),
    ('Chaperone-Mediated APOE4 Refolding Enhancement', 0.525),
    ('Competitive APOE4 Domain Stabilization Peptides', 0.513),
]


def _bar_color(score):
    """Pick the bar colour for a composite score (thresholds at 0.6 and 0.5)."""
    if score >= 0.6:
        return '#ef5350'
    if score >= 0.5:
        return '#ffa726'
    return '#66bb6a'


# barh draws the first entry at the bottom, so reverse to put the first
# listed hypothesis at the top.
ranked = hyp_data[::-1]
titles = [title for title, _ in ranked]
scores = [score for _, score in ranked]
fig, ax = plt.subplots(figsize=(10, max(8, len(titles)*0.4)))
colors = [_bar_color(s) for s in scores]
ax.barh(range(len(titles)), scores, color=colors)
ax.set_yticks(range(len(titles)))
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title('APOE4 structural biology and therapeutic targeting strategies')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
9. Score dimension heatmap (all 7 hypotheses)¶
# Row labels: hypothesis titles cut to 40 characters, one per matrix row.
labels = [
    'Interfacial Lipid Mimetics to Disrupt Do',
    'Targeted APOE4-to-APOE3 Base Editing The',
    'APOE4 Allosteric Rescue via Small Molecu',
    'Pharmacological Enhancement of APOE4 Gly',
    'Selective APOE4 Degradation via Proteoly',
    'Chaperone-Mediated APOE4 Refolding Enhan',
    'Competitive APOE4 Domain Stabilization P',
]
# One row per hypothesis, one column per entry of `dims`.
matrix = np.array([
    [0.7, 0.2, 0.5, 0.3, 0.725, 0.3, 0.3, 0.4, 0.5],
    [0.9, 0.2, 0.9, 0.6, 0.725, 0.5, 0.4, 0.4, 0.1],
    [0.9, 0.3, 0.8, 0.5, 0.725, 0.3, 0.4, 0.4, 0.5],
    [0.8, 0.3, 0.4, 0.2, 0.436, 0.2, 0.3, 0.6, 0.3],
    [0.9, 0.2, 0.7, 0.4, 0.725, 0.4, 0.3, 0.6, 0.2],
    [0.6, 0.8, 0.7, 0.7, 0.436, 0.7, 0.8, 0.8, 0.6],
    [0.8, 0.2, 0.6, 0.4, 0.725, 0.4, 0.3, 0.3, 0.4],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]
if not matrix.size:
    print('No score data available')
else:
    # Turn 'clinical_relevance_score' into 'Clinical Relevance' for the axis.
    dim_names = [d.replace('_score','').replace('_',' ').title() for d in dims]
    fig, ax = plt.subplots(figsize=(10, 5))
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(dim_names, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()
10. PubMed evidence per hypothesis¶
Hypothesis 1: Interfacial Lipid Mimetics to Disrupt Domain Interaction¶
Target genes: APOE · Composite score: 0.575
Molecular Mechanism and Rationale
The apolipoprotein E4 (APOE4) isoform represents the most significant genetic risk factor for late-onset Alzheimer's disease, present in approximately 65% of AD patients despite occurring in only 25% of the general population. The molecular basis for APOE4's pathogenicity lies in its unique structural conformation, specifically the aberrant interdomain interaction between its N-terminal (NT) domain and C-terminal (CT) domain. Unlike the protective APOE2 and
# Show the cached PubMed hits for this hypothesis, or a placeholder row.
hid = 'h-99b4e2d2'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() gives an independent frame, so the edits below do not
        # trigger pandas' SettingWithCopyWarning.
        lit = lit[cols].copy()
        # Every optional column is guarded: `cols` only guarantees that at
        # least one of the four exists, not which one.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns are present: show the raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 2: Targeted APOE4-to-APOE3 Base Editing Therapy¶
Target genes: APOE · Composite score: 0.569
Molecular Mechanism and Rationale
The apolipoprotein E4 (APOE4) variant represents the most significant genetic risk factor for late-onset Alzheimer's disease (AD), conferring a 3-fold and 12-fold increased risk for heterozygous and homozygous carriers, respectively. The molecular basis of APOE4 pathogenicity stems from a single nucleotide polymorphism at position 334 (C334T), which results in a cysteine-to-arginine substitution at amino acid position 112 (Cys112Arg). This seemingly minor c
# Show the cached PubMed hits for this hypothesis, or a placeholder row.
hid = 'h-a20e0cbb'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() gives an independent frame, so the edits below do not
        # trigger pandas' SettingWithCopyWarning.
        lit = lit[cols].copy()
        # Every optional column is guarded: `cols` only guarantees that at
        # least one of the four exists, not which one.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns are present: show the raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 3: APOE4 Allosteric Rescue via Small Molecule Chaperones¶
Target genes: APOE · Composite score: 0.557
Molecular Mechanism and Rationale¶
The apolipoprotein E4 (APOE4) isoform represents the strongest genetic risk factor for late-onset Alzheimer's disease, carried by approximately 25% of the population and conferring a 3-15 fold increased risk compared to the protective APOE3 variant. The fundamental pathological difference between APOE4 and APOE3 lies in a single amino acid substitution at position 112 (Cys→Arg), which triggers a conformational cascade affecting the entire protein archite
# Show the cached PubMed hits for this hypothesis, or a placeholder row.
hid = 'h-44195347'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() gives an independent frame, so the edits below do not
        # trigger pandas' SettingWithCopyWarning.
        lit = lit[cols].copy()
        # Every optional column is guarded: `cols` only guarantees that at
        # least one of the four exists, not which one.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns are present: show the raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 4: Pharmacological Enhancement of APOE4 Glycosylation¶
Target genes: ST6GAL1, FUT8 · Composite score: 0.544
Molecular Mechanism and Rationale
The apolipoprotein E4 (APOE4) variant represents the strongest genetic risk factor for late-onset Alzheimer's disease, affecting approximately 25% of the population and increasing disease risk by 3-12 fold compared to the protective APOE3 isoform. The fundamental pathogenic mechanism underlying APOE4's deleterious effects stems from a critical amino acid substitution at position 112, where arginine replaces cysteine (C112R), disrupting the protein's tertiar
# Show the cached PubMed hits for this hypothesis, or a placeholder row.
hid = 'h-9a108e26'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() gives an independent frame, so the edits below do not
        # trigger pandas' SettingWithCopyWarning.
        lit = lit[cols].copy()
        # Every optional column is guarded: `cols` only guarantees that at
        # least one of the four exists, not which one.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns are present: show the raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 5: Selective APOE4 Degradation via Proteolysis Targeting Chimeras (PROTACs)¶
Target genes: APOE · Composite score: 0.534
Molecular Mechanism and Rationale
The apolipoprotein E gene (APOE) exists in three major isoforms—APOE2, APOE3, and APOE4—differing by single amino acid substitutions that profoundly impact protein structure and function. The APOE4 variant, present in approximately 25% of the population and 65% of Alzheimer's disease patients, represents the strongest genetic risk factor for late-onset Alzheimer's disease, increasing risk by 3-fold in heterozygotes and 12-fold in homozygotes. The structural
# Show the cached PubMed hits for this hypothesis, or a placeholder row.
hid = 'h-11795af0'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() gives an independent frame, so the edits below do not
        # trigger pandas' SettingWithCopyWarning.
        lit = lit[cols].copy()
        # Every optional column is guarded: `cols` only guarantees that at
        # least one of the four exists, not which one.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns are present: show the raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 6: Chaperone-Mediated APOE4 Refolding Enhancement¶
Target genes: HSPA1A, HSP90AA1, DNAJB1, FKBP5 · Composite score: 0.525
Molecular Mechanism and Rationale
The apolipoprotein E4 (APOE4) isoform represents the strongest genetic risk factor for late-onset Alzheimer's disease, carried by approximately 25% of the population and conferring a 3-4 fold increased risk compared to the protective APOE3 variant. The fundamental pathogenic mechanism underlying APOE4's deleterious effects stems from a critical structural vulnerability: an aberrant domain interaction between the N-terminal (residues 1-165) and C-terminal (r
# Show the cached PubMed hits for this hypothesis, or a placeholder row.
hid = 'h-637a53c9'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() gives an independent frame, so the edits below do not
        # trigger pandas' SettingWithCopyWarning.
        lit = lit[cols].copy()
        # Every optional column is guarded: `cols` only guarantees that at
        # least one of the four exists, not which one.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns are present: show the raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 7: Competitive APOE4 Domain Stabilization Peptides¶
Target genes: APOE · Composite score: 0.513
Molecular Mechanism and Rationale
The apolipoprotein E epsilon 4 (APOE4) allele represents the strongest genetic risk factor for late-onset Alzheimer's disease, carried by approximately 25% of the population and increasing AD risk by 3-fold in heterozygotes and 12-fold in homozygotes. The molecular basis for APOE4's pathogenicity lies in its unique structural instability compared to the protective APOE3 variant. APOE4 contains a critical amino acid substitution at position 112 (Arg112 inste
# Show the cached PubMed hits for this hypothesis, or a placeholder row.
hid = 'h-d0a564e8'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() gives an independent frame, so the edits below do not
        # trigger pandas' SettingWithCopyWarning.
        lit = lit[cols].copy()
        # Every optional column is guarded: `cols` only guarantees that at
        # least one of the four exists, not which one.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns are present: show the raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
11. Knowledge graph edges (98 total)¶
# Knowledge-graph edges embedded in the notebook (25 rows).
edge_data = [
    {'source': 'APOE', 'relation': 'regulates', 'target': 'lipid_metabolism', 'strength': 1.0},
    {'source': 'APOE4', 'relation': 'causes', 'target': 'domain_interaction', 'strength': 0.8},
    {'source': 'HSPA1A', 'relation': 'regulates', 'target': 'protein_folding', 'strength': 0.8},
    {'source': 'HSP90AA1', 'relation': 'enhances', 'target': 'protein_stability', 'strength': 0.8},
    {'source': 'FKBP5', 'relation': 'co_chaperone', 'target': 'HSP90', 'strength': 0.8},
    {'source': 'APOE4_domain_interaction', 'relation': 'impairs', 'target': 'amyloid_clearance', 'strength': 0.8},
    {'source': 'molecular_chaperones', 'relation': 'maintains', 'target': 'proteostasis', 'strength': 0.8},
    {'source': 'C334T_mutation', 'relation': 'determines', 'target': 'APOE4_phenotype', 'strength': 0.8},
    {'source': 'HSPA1A', 'relation': 'interacts_with', 'target': 'HSP90AA1', 'strength': 0.61},
    {'source': 'HSPA1A', 'relation': 'interacts_with', 'target': 'DNAJB1', 'strength': 0.61},
    {'source': 'HSPA1A', 'relation': 'interacts_with', 'target': 'FKBP5', 'strength': 0.61},
    {'source': 'HSP90AA1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.61},
    {'source': 'HSP90AA1', 'relation': 'interacts_with', 'target': 'HSPA1A', 'strength': 0.61},
    {'source': 'HSP90AA1', 'relation': 'interacts_with', 'target': 'DNAJB1', 'strength': 0.61},
    {'source': 'HSP90AA1', 'relation': 'interacts_with', 'target': 'FKBP5', 'strength': 0.61},
    {'source': 'DNAJB1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.61},
    {'source': 'DNAJB1', 'relation': 'interacts_with', 'target': 'HSPA1A', 'strength': 0.61},
    {'source': 'DNAJB1', 'relation': 'interacts_with', 'target': 'HSP90AA1', 'strength': 0.61},
    {'source': 'DNAJB1', 'relation': 'interacts_with', 'target': 'FKBP5', 'strength': 0.61},
    {'source': 'FKBP5', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.61},
    {'source': 'FKBP5', 'relation': 'interacts_with', 'target': 'HSPA1A', 'strength': 0.61},
    {'source': 'FKBP5', 'relation': 'interacts_with', 'target': 'HSP90AA1', 'strength': 0.61},
    {'source': 'FKBP5', 'relation': 'interacts_with', 'target': 'DNAJB1', 'strength': 0.61},
    {'source': 'h-637a53c9', 'relation': 'targets', 'target': 'FKBP5', 'strength': 0.6},
    {'source': 'h-637a53c9', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.6},
]
# Holds the table to render; stays None when there are no edges.
kg_table = None
if edge_data:
    kg_table = pd.DataFrame(edge_data).head(25)
else:
    print('No KG edge data available')
# Jupyter renders only a cell's final top-level expression; an expression
# nested inside the `if` above would be evaluated and silently discarded.
kg_table
12. Caveats¶
This notebook uses real Forge tool calls cached from live APIs, but:
- Enrichment is against curated gene-set libraries, not genome-wide screens
- STRING/Reactome/HPA/MyGene reflect curated knowledge
- PubMed literature is search-relevance ranked, not systematic review
The cached evidence bundle is the minimum viable real-data analysis for this topic.