Protein aggregation cross-seeding across neurodegenerative diseases¶
Notebook ID: nb-sda-2026-04-01-gap-9137255b · Analysis: sda-2026-04-01-gap-9137255b · Generated: 2026-04-17
Research question¶
How does protein aggregation cross-seeding occur across neurodegenerative diseases?
Approach¶
This notebook is generated programmatically from real Forge tool calls and SciDEX debate data. Code cells load cached evidence bundles from data/forge_cache/seaad/*.json and query live data from scidex.db. Re-run python3 scripts/regenerate_notebooks.py --analysis sda-2026-04-01-gap-9137255b --force to refresh.
7 hypotheses were generated and debated. The knowledge graph has 44 edges.
Debate Summary¶
Quality score: 0.53 · Rounds: 4 · Personas: Theorist, Skeptic, Domain_Expert, Synthesizer
1. Forge tool provenance¶
import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Notebook-wide plotting defaults: 110 DPI figures on a white background.
matplotlib.rcParams.update({
    'figure.dpi': 110,
    'figure.facecolor': 'white',
})

# Resolve the working directory once and put it first on the import path.
REPO = Path('.').resolve()
sys.path.insert(0, str(REPO))

# Directory holding the cached JSON evidence bundles for this analysis.
CACHE_SUB = 'seaad'
CACHE = REPO.joinpath('data', 'forge_cache', CACHE_SUB)
def load(name):
    """Return the parsed JSON cache entry called *name*, or ``{}`` if absent.

    The entry is read from ``CACHE / f'{name}.json'``. A missing file yields
    an empty dict, so callers should type-check the result before using it.

    Raises:
        json.JSONDecodeError: if the file exists but does not contain valid JSON.
    """
    p = CACHE / f'{name}.json'
    if p.exists():
        # Decode explicitly as UTF-8 rather than relying on the platform's
        # locale default, which can mis-decode non-ASCII cache content.
        return json.loads(p.read_text(encoding='utf-8'))
    return {}
# Summarise Forge tool calls from the last 30 days, grouped by skill and status.
db_path = Path('/home/ubuntu/scidex/scidex.db')

# Holds the table to render; stays None when provenance cannot be loaded.
prov_view = None
try:
    db = sqlite3.connect(str(db_path))
    try:
        prov = pd.read_sql_query('''
            SELECT skill_id, status, COUNT(*) AS n_calls,
                   ROUND(AVG(duration_ms),0) AS mean_ms
            FROM tool_calls
            WHERE created_at >= date('now','-30 days')
            GROUP BY skill_id, status
            ORDER BY n_calls DESC
        ''', db)
    finally:
        # Release the connection even when the query itself fails.
        db.close()
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    prov_view = prov[['tool', 'status', 'n_calls', 'mean_ms']].head(20)
except Exception as e:
    # Best-effort cell: report the problem and let the notebook continue.
    print(f'Provenance unavailable: {e}')

# Jupyter only auto-displays a cell's last top-level expression; an expression
# nested inside the `try` block is evaluated and silently discarded.
prov_view
181 tool-call aggregates (last 30 days):
2. Target gene annotations¶
def _short_field(value, limit=55, max_items=2):
    """Render an annotation value as a short display string ('—' when empty).

    Lists are reduced to their first *max_items* entries joined by ', ';
    anything else is stringified. The result is truncated to *limit* characters.
    """
    if isinstance(value, list):
        text = ', '.join(str(v) for v in value[:max_items])
    else:
        text = str(value or '')
    # Use one placeholder everywhere so empty cells look the same as the
    # "no cached data" row below.
    return text[:limit] or '—'


# One summary row per target gene, built from the cached MyGene and HPA entries.
ann_rows = []
for g in ['TARDBP']:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    if not mg and not hpa:
        # Neither source is cached for this gene: emit a placeholder row.
        ann_rows.append({'gene': g, 'name': '—', 'protein_class': '—',
                         'disease_involvement': '—'})
        continue
    ann_rows.append({
        'gene': g,
        'name': _short_field(mg.get('name')),
        'protein_class': _short_field(hpa.get('protein_class')),
        'disease_involvement': _short_field(hpa.get('disease_involvement')),
    })
pd.DataFrame(ann_rows)
| gene | name | protein_class | disease_involvement | |
|---|---|---|---|---|
| 0 | TARDBP | — | — | — |
3. GO Biological Process enrichment (Enrichr)¶
# Table of the top 10 GO Biological Process enrichment hits.
go_bp = load('enrichr_GO_Biological_Process')

# Holds the table to render; stays None when there is nothing to show.
go_table = None
if isinstance(go_bp, list) and go_bp:
    # Copy the column subset so the edits below act on an independent frame.
    go_df = pd.DataFrame(go_bp[:10])[['term', 'p_value', 'odds_ratio', 'genes']].copy()
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before the gene list is flattened into a string.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    go_table = go_df[['term', 'n_hits', 'p_value', 'odds_ratio', 'genes']]
else:
    print('No GO:BP enrichment data')

# Jupyter only auto-displays a cell's last top-level expression; an expression
# nested inside the `if` branch is evaluated and silently discarded.
go_table
# Horizontal bar chart of the eight strongest GO:BP terms.
go_bp = load('enrichr_GO_Biological_Process')
if not (isinstance(go_bp, list) and go_bp):
    print('No GO:BP data to plot')
else:
    top = go_bp[:8]
    # Reverse the order so the first hit ends up at the top of the chart.
    terms = [hit['term'][:45] for hit in reversed(top)]
    # Floor the p-value at 1e-300 so log10 never receives zero.
    neglogp = [-np.log10(max(hit['p_value'], 1e-300)) for hit in reversed(top)]
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
4. KEGG pathway enrichment¶
# Table of the top 10 KEGG pathway enrichment hits.
kegg = load('enrichr_KEGG_Pathways')

# Holds the table to render; stays None when there is nothing to show.
kegg_table = None
if isinstance(kegg, list) and kegg:
    # Copy the column subset so the edits below act on an independent frame.
    kegg_df = pd.DataFrame(kegg[:10])[['term', 'p_value', 'odds_ratio', 'genes']].copy()
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    kegg_table = kegg_df
else:
    print('No KEGG enrichment data')

# Jupyter only auto-displays a cell's last top-level expression; an expression
# nested inside the `if` branch is evaluated and silently discarded.
kegg_table
No KEGG enrichment data
5. STRING protein interaction network¶
# Table of the 20 highest-scoring STRING interaction edges.
ppi = load('string_network')

# Holds the table to render; stays None when there is nothing to show.
ppi_table = None
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi).sort_values('score', ascending=False)
    # Show only the expected columns that are present in the cached records.
    display_cols = [c for c in ['protein1', 'protein2', 'score', 'escore', 'tscore']
                    if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned')

# Jupyter only auto-displays a cell's last top-level expression; an expression
# nested inside the `if` branch is evaluated and silently discarded.
ppi_table
11 STRING edges
# Network figure: proteins placed evenly on a unit circle, one line per edge.
ppi = load('string_network')
if isinstance(ppi, list) and ppi:
    import math
    nodes = sorted({p for e in ppi for p in (e['protein1'], e['protein2'])})
    n = len(nodes)
    pos = {n_: (math.cos(2 * math.pi * i / n), math.sin(2 * math.pi * i / n))
           for i, n_ in enumerate(nodes)}
    fig, ax = plt.subplots(figsize=(7, 7))
    for e in ppi:
        x1, y1 = pos[e['protein1']]
        x2, y2 = pos[e['protein2']]
        # Matplotlib rejects alpha values outside [0, 1]; clamp so that an
        # out-of-range score cannot abort the whole figure.
        edge_alpha = min(1.0, max(0.0, 0.3 + 0.5 * e['score']))
        ax.plot([x1, x2], [y1, y2], color='#888', alpha=edge_alpha,
                linewidth=0.5 + 2 * e['score'])
    for name, (x, y) in pos.items():
        # Nodes and labels are drawn above the edges (higher zorder).
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(name, (x, y), ha='center', va='center', fontsize=8,
                    fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()
else:
    print('No STRING data to visualize')
6. Reactome pathway footprint¶
# One row per gene: number of cached Reactome pathways and the first pathway name.
pw_rows = []
for g in ['TARDBP']:
    pws = load(f'reactome_{g}')
    if not isinstance(pws, list):
        # Anything other than a list is treated as "no pathways".
        pw_rows.append({'gene': g, 'n_pathways': 0, 'top_pathway': '—'})
        continue
    first_name = pws[0]['name'] if pws else '—'
    pw_rows.append({
        'gene': g,
        'n_pathways': len(pws),
        'top_pathway': first_name[:70],
    })
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)
| gene | n_pathways | top_pathway | |
|---|---|---|---|
| 0 | TARDBP | 0 | — |
7. Allen Brain Atlas ISH regional expression¶
# One row per gene: number of cached ISH regions plus the first region's
# structure name and expression energy.
ish_rows = []
for g in ['TARDBP']:
    ish = load(f'allen_ish_{g}')
    if isinstance(ish, dict):
        regions = ish.get('regions') or []
    else:
        regions = []
    n_regions = len(regions)
    if regions:
        lead = regions[0]
        top_region = lead.get('structure', '')[:45]
        top_energy = round(lead.get('expression_energy', 0), 2)
    else:
        top_region = '—'[:45]
        top_energy = None
    ish_rows.append({
        'gene': g,
        'n_ish_regions': n_regions,
        'top_region': top_region,
        'top_energy': top_energy,
    })
pd.DataFrame(ish_rows)
| gene | n_ish_regions | top_region | top_energy | |
|---|---|---|---|---|
| 0 | TARDBP | 0 | — | — |
8. Hypothesis ranking (7 hypotheses)¶
# (title, composite score) pairs for the ranked hypotheses.
hyp_data = [
    ('Transglutaminase-2 Cross-Linking Inhibition Strategy', 0.701),
    ('TREM2-Mediated Selective Aggregate Clearance Pathway', 0.699),
    ('Liquid-Liquid Phase Separation Modifier Therapy', 0.656),
    ('Prohibitin-2 Mitochondrial Cross-Seeding Hub Disruption', 0.65),
    ('Glycosaminoglycan Template Disruption Approach', 0.649),
    ('RNA-Binding Competition Therapy for TDP-43 Cross-Seedin', 0.642),
    ('HSP70 Co-chaperone DNAJB6 Universal Cross-Seeding Inhib', 0.632),
]

# Reverse the order so the first entry ends up at the top of the chart.
titles = [title for title, _ in reversed(hyp_data)]
scores = [score for _, score in reversed(hyp_data)]


def _bar_color(score):
    """Map a composite score to its bar colour band."""
    if score >= 0.6:
        return '#ef5350'
    if score >= 0.5:
        return '#ffa726'
    return '#66bb6a'


fig, ax = plt.subplots(figsize=(10, max(8, len(titles) * 0.4)))
colors = [_bar_color(s) for s in scores]
y_positions = range(len(titles))
ax.barh(y_positions, scores, color=colors)
ax.set_yticks(y_positions)
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title('Protein aggregation cross-seeding across neurodegenerative diseases')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
9. Score dimension heatmap (7 hypotheses)¶
# Row labels: one per hypothesis.
labels = [
    'Transglutaminase-2 Cross-Linking Inhibit',
    'TREM2-Mediated Selective Aggregate Clear',
    'Liquid-Liquid Phase Separation Modifier ',
    'Prohibitin-2 Mitochondrial Cross-Seeding',
    'Glycosaminoglycan Template Disruption Ap',
    'RNA-Binding Competition Therapy for TDP-',
    'HSP70 Co-chaperone DNAJB6 Universal Cros',
]
# Scores: one row per label, one column per entry in `dims`.
matrix = np.array([
    [0.7, 0.8, 0.7, 0.75, 0.13, 0.7, 0.75, 0.85, 0.65],
    [0.85, 0.55, 0.7, 0.6, 0.263, 0.6, 0.45, 0.65, 0.5],
    [0.9, 0.4, 0.65, 0.6, 0.09, 0.45, 0.4, 0.3, 0.35],
    [0.75, 0.3, 0.5, 0.55, 0.13, 0.4, 0.35, 0.25, 0.3],
    [0.75, 0.6, 0.65, 0.7, 0.247, 0.65, 0.7, 0.7, 0.55],
    [0.85, 0.25, 0.45, 0.5, 0.238, 0.35, 0.3, 0.3, 0.25],
    [0.8, 0.45, 0.6, 0.65, 0.385, 0.5, 0.55, 0.35, 0.4],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]

if not matrix.size:
    print('No score data available')
else:
    fig, ax = plt.subplots(figsize=(10, 5))
    # Fixed 0-1 colour range on the red-yellow-green map.
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    # Turn e.g. 'safety_profile_score' into 'Safety Profile' for the axis.
    pretty_dims = [d.replace('_score', '').replace('_', ' ').title() for d in dims]
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(pretty_dims, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()
10. PubMed evidence per hypothesis¶
Hypothesis 1: Transglutaminase-2 Cross-Linking Inhibition Strategy¶
Target genes: TGM2 · Composite score: 0.701
The Transglutaminase-2 (TG2) Cross-Linking Inhibition Strategy targets the enzymatic mechanism responsible for creating covalent isopeptide bonds between aggregation-prone proteins in neurodegenerative diseases. TG2 catalyzes the formation of Nε-(γ-glutamyl)lysine cross-links between glutamine and lysine residues, creating irreversible protein-protein bonds that stabilize pathological aggregates of tau, α-synuclein, and huntingtin. By selectively inhibiting TG2's cross-linking activity at diseas
# PubMed evidence table for this hypothesis; falls back to a one-row note
# when the cache holds no results.
hid = 'h-d4f71a6b'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year', 'journal', 'title', 'pmid'] if c in lit.columns]
    if cols:
        # Copy the column subset so the edits below act on an independent
        # frame rather than on a slice of the full one.
        lit = lit[cols].copy()
        # `cols` may hold any subset of the four expected fields, so guard
        # every column-specific step, not just `journal`.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note': 'no PubMed results'}])
display_df
| note | |
|---|---|
| 0 | no PubMed results |
Hypothesis 2: TREM2-Mediated Selective Aggregate Clearance Pathway¶
Target genes: TREM2 · Composite score: 0.699
Background and Rationale
Protein aggregation represents a central pathological hallmark across multiple neurodegenerative diseases, including Alzheimer's disease (AD), Parkinson's disease (PD), and frontotemporal dementia (FTD). While traditional therapeutic approaches have focused on preventing aggregate formation or broadly enhancing clearance mechanisms, recent insights into the heterogeneous nature of pathological protein assemblies have revealed new opportunities for precision interven
# PubMed evidence table for this hypothesis; falls back to a one-row note
# when the cache holds no results.
hid = 'h-3460f820'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year', 'journal', 'title', 'pmid'] if c in lit.columns]
    if cols:
        # Copy the column subset so the edits below act on an independent
        # frame rather than on a slice of the full one.
        lit = lit[cols].copy()
        # `cols` may hold any subset of the four expected fields, so guard
        # every column-specific step, not just `journal`.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note': 'no PubMed results'}])
display_df
| note | |
|---|---|
| 0 | no PubMed results |
Hypothesis 3: Liquid-Liquid Phase Separation Modifier Therapy¶
Target genes: G3BP1 · Composite score: 0.656
Liquid-Liquid Phase Separation (LLPS) Modifier Therapy targets the biophysical process by which intrinsically disordered proteins demix from the cytoplasm to form membraneless organelles — biomolecular condensates — that serve as hotspots for pathological protein co-aggregation and cross-seeding in neurodegenerative diseases. By modulating the material properties, composition, and dynamics of these condensates with small molecules, this approach aims to prevent the aberrant mixing of tau, α-synu
# PubMed evidence table for this hypothesis; falls back to a one-row note
# when the cache holds no results.
hid = 'h-27bc0569'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year', 'journal', 'title', 'pmid'] if c in lit.columns]
    if cols:
        # Copy the column subset so the edits below act on an independent
        # frame rather than on a slice of the full one.
        lit = lit[cols].copy()
        # `cols` may hold any subset of the four expected fields, so guard
        # every column-specific step, not just `journal`.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note': 'no PubMed results'}])
display_df
| note | |
|---|---|
| 0 | no PubMed results |
Hypothesis 4: Prohibitin-2 Mitochondrial Cross-Seeding Hub Disruption¶
Target genes: PHB2 · Composite score: 0.65
Prohibitin-2 (PHB2) Mitochondrial Cross-Seeding Hub Disruption proposes that PHB2, a mitochondrial inner membrane scaffolding protein, serves as a convergent platform where multiple neurodegenerative disease proteins (tau, α-synuclein, TDP-43) interact on the mitochondrial surface, undergo conformational templating, and initiate cross-seeding. Selective PHB2 modulators could disrupt this hub while preserving PHB2's essential roles in mitochondrial cristae organization and PINK1-Parkin mitophagy
# PubMed evidence table for this hypothesis; falls back to a one-row note
# when the cache holds no results.
hid = 'h-8bd89d90'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year', 'journal', 'title', 'pmid'] if c in lit.columns]
    if cols:
        # Copy the column subset so the edits below act on an independent
        # frame rather than on a slice of the full one.
        lit = lit[cols].copy()
        # `cols` may hold any subset of the four expected fields, so guard
        # every column-specific step, not just `journal`.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note': 'no PubMed results'}])
display_df
| note | |
|---|---|
| 0 | no PubMed results |
Hypothesis 5: Glycosaminoglycan Template Disruption Approach¶
Target genes: HSPG2 · Composite score: 0.649
Heparan sulfate and other glycosaminoglycans serve as nucleation templates that facilitate cross-seeding by concentrating different amyloidogenic proteins and stabilizing cross-β structures. Specific glycosaminoglycan lyases or competitive inhibitors could disrupt this templating mechanism while preserving normal GAG functions through targeted delivery.
Glycosaminoglycans as Molecular Scaffolds for Protein Aggregation¶
Glycosaminoglycans (GAGs) are long, unbranched polysaccharide chains cons
# PubMed evidence table for this hypothesis; falls back to a one-row note
# when the cache holds no results.
hid = 'h-54b9e0f5'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year', 'journal', 'title', 'pmid'] if c in lit.columns]
    if cols:
        # Copy the column subset so the edits below act on an independent
        # frame rather than on a slice of the full one.
        lit = lit[cols].copy()
        # `cols` may hold any subset of the four expected fields, so guard
        # every column-specific step, not just `journal`.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note': 'no PubMed results'}])
display_df
| note | |
|---|---|
| 0 | no PubMed results |
Hypothesis 6: RNA-Binding Competition Therapy for TDP-43 Cross-Seeding¶
Target genes: TARDBP · Composite score: 0.642
Background and Rationale
TAR DNA-binding protein 43 (TDP-43), encoded by the TARDBP gene, is a heterogeneous nuclear ribonucleoprotein (hnRNP) that plays critical roles in RNA processing, including transcription, splicing, transport, and stability regulation. Under pathological conditions, TDP-43 undergoes cytoplasmic mislocalization, hyperphosphorylation, ubiquitination, and aggregation, forming characteristic inclusions observed in amyotrophic lateral sclerosis (ALS), frontotemporal demen
# PubMed evidence table for this hypothesis; falls back to a one-row note
# when the cache holds no results.
hid = 'h-7693c291'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year', 'journal', 'title', 'pmid'] if c in lit.columns]
    if cols:
        # Copy the column subset so the edits below act on an independent
        # frame rather than on a slice of the full one.
        lit = lit[cols].copy()
        # `cols` may hold any subset of the four expected fields, so guard
        # every column-specific step, not just `journal`.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note': 'no PubMed results'}])
display_df
| note | |
|---|---|
| 0 | no PubMed results |
Hypothesis 7: HSP70 Co-chaperone DNAJB6 Universal Cross-Seeding Inhibitor¶
Target genes: DNAJB6 · Composite score: 0.632
Background and Rationale
Protein misfolding and aggregation represent a fundamental pathological mechanism underlying multiple neurodegenerative diseases, including Alzheimer's disease (tau), Parkinson's disease (α-synuclein), and amyotrophic lateral sclerosis/frontotemporal dementia (TDP-43). A critical emerging concept is that these pathological proteins can undergo cross-seeding, where aggregates of one protein can template the misfolding and aggregation of heterologous proteins. This ph
# PubMed evidence table for this hypothesis; falls back to a one-row note
# when the cache holds no results.
hid = 'h-c9486869'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year', 'journal', 'title', 'pmid'] if c in lit.columns]
    if cols:
        # Copy the column subset so the edits below act on an independent
        # frame rather than on a slice of the full one.
        lit = lit[cols].copy()
        # `cols` may hold any subset of the four expected fields, so guard
        # every column-specific step, not just `journal`.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the first raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note': 'no PubMed results'}])
display_df
| note | |
|---|---|
| 0 | no PubMed results |
11. Knowledge graph edges (44 total; 25 shown)¶
# Knowledge-graph edges embedded in the notebook (source, relation, target, strength).
edge_data = [
    {'source': 'DNAJB6', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.55},
    {'source': 'h-54b9e0f5', 'relation': 'targets', 'target': 'HSPG2', 'strength': 0.5},
    {'source': 'h-3460f820', 'relation': 'targets', 'target': 'TREM2', 'strength': 0.5},
    {'source': 'h-c9486869', 'relation': 'targets', 'target': 'DNAJB6', 'strength': 0.5},
    {'source': 'h-8bd89d90', 'relation': 'targets', 'target': 'PHB2', 'strength': 0.5},
    {'source': 'h-7693c291', 'relation': 'targets', 'target': 'TARDBP', 'strength': 0.5},
    {'source': 'PHB2', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.47},
    {'source': 'TREM2', 'relation': 'co_discussed', 'target': 'HSPG2', 'strength': 0.4},
    {'source': 'TREM2', 'relation': 'co_discussed', 'target': 'G3BP1', 'strength': 0.4},
    {'source': 'TREM2', 'relation': 'co_discussed', 'target': 'TARDBP', 'strength': 0.4},
    {'source': 'TREM2', 'relation': 'co_discussed', 'target': 'TGM2', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'G3BP1', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'TARDBP', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'TGM2', 'strength': 0.4},
    {'source': 'TGM2', 'relation': 'co_discussed', 'target': 'HSPG2', 'strength': 0.4},
    {'source': 'TGM2', 'relation': 'co_discussed', 'target': 'TREM2', 'strength': 0.4},
    {'source': 'TARDBP', 'relation': 'co_discussed', 'target': 'HSPG2', 'strength': 0.4},
    {'source': 'G3BP1', 'relation': 'co_discussed', 'target': 'HSPG2', 'strength': 0.4},
    {'source': 'G3BP1', 'relation': 'co_discussed', 'target': 'TREM2', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'TREM2', 'strength': 0.4},
    {'source': 'TREM2', 'relation': 'co_discussed', 'target': 'PHB2', 'strength': 0.4},
    {'source': 'TREM2', 'relation': 'co_discussed', 'target': 'DNAJB6', 'strength': 0.4},
    {'source': 'G3BP1', 'relation': 'co_discussed', 'target': 'PHB2', 'strength': 0.4},
    {'source': 'G3BP1', 'relation': 'co_discussed', 'target': 'DNAJB6', 'strength': 0.4},
    {'source': 'PHB2', 'relation': 'co_discussed', 'target': 'TGM2', 'strength': 0.4},
]

# Holds the table to render; stays None when there are no edges.
edge_table = None
if edge_data:
    edge_table = pd.DataFrame(edge_data).head(25)
else:
    print('No KG edge data available')

# Jupyter only auto-displays a cell's last top-level expression; an expression
# nested inside the `if` branch is evaluated and silently discarded.
edge_table
12. Caveats¶
This notebook uses real Forge tool calls cached from live APIs, but:
- Enrichment is against curated gene-set libraries, not genome-wide screens
- STRING, Reactome, HPA, and MyGene annotations reflect curated knowledge, which can lag behind the primary literature
- PubMed literature is search-relevance ranked, not systematic review
The cached evidence bundle is the minimum viable real-data analysis for this topic.