4R-tau strain-specific spreading patterns in PSP vs CBD¶
Notebook ID: nb-sda-2026-04-01-gap-005 · Analysis: sda-2026-04-01-gap-005 · Generated: 2026-04-17
Research question¶
PSP and CBD both involve 4R-tau but produce distinct neuropathological patterns (tufted astrocytes vs astrocytic plaques). Whether tau strains or regional cellular environments drive these differences is unresolved.
Approach¶
This notebook is generated programmatically from real Forge tool calls and SciDEX debate data. Code cells load cached evidence bundles from `data/forge_cache/seaad/*.json` and query live data from `scidex.db`. To refresh, re-run `python3 scripts/regenerate_notebooks.py --analysis sda-2026-04-01-gap-005 --force`.
7 hypotheses were generated and debated. The knowledge graph has 130 edges.
Debate Summary¶
Quality score: 0.91 · Rounds: 7 · Personas: Theorist, Skeptic, Domain_Expert, Epidemiologist, Computational_Biologist, Clinical_Trialist, Synthesizer
1. Forge tool provenance¶
import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Notebook-wide figure defaults: 110 dpi and a white figure background.
matplotlib.rcParams.update({
    'figure.dpi': 110,
    'figure.facecolor': 'white',
})
# Resolve the repository root from the current working directory and make it
# importable, then point CACHE at the cached evidence bundle for this notebook.
REPO = Path('.').resolve()
sys.path.insert(0, str(REPO))
CACHE_SUB = 'seaad'
CACHE = REPO / 'data' / 'forge_cache' / CACHE_SUB


def load(name):
    """Return the parsed JSON cached under ``CACHE / f'{name}.json'``.

    Args:
        name: Cache entry name without the ``.json`` suffix.

    Returns:
        The decoded JSON value (dict, list, ...), or an empty dict when the
        cache file is missing or does not contain valid JSON.
    """
    path = CACHE / f'{name}.json'
    try:
        # JSON is UTF-8; do not rely on the platform's default encoding.
        text = path.read_text(encoding='utf-8')
    except (FileNotFoundError, NotADirectoryError):
        return {}
    try:
        return json.loads(text)
    except json.JSONDecodeError as exc:
        # A corrupt cache entry should degrade to "no data" (which every
        # caller already handles) rather than abort the notebook run.
        print(f'Ignoring malformed cache file {path.name}: {exc}')
        return {}
# Summarise Forge tool calls from the last 30 days, grouped by skill and status.
db_path = Path('/home/ubuntu/scidex/scidex.db')
PROVENANCE_SQL = '''
SELECT skill_id, status, COUNT(*) AS n_calls,
ROUND(AVG(duration_ms),0) AS mean_ms
FROM tool_calls
WHERE created_at >= date('now','-30 days')
GROUP BY skill_id, status
ORDER BY n_calls DESC
'''
prov_table = None
try:
    # Open read-only through a URI: a plain connect() would silently create an
    # empty database file when scidex.db is missing.
    db = sqlite3.connect(f'{db_path.as_uri()}?mode=ro', uri=True)
    try:
        prov = pd.read_sql_query(PROVENANCE_SQL, db)
    finally:
        # Close the connection even when the query itself fails.
        db.close()
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    prov_table = prov[['tool','status','n_calls','mean_ms']].head(20)
except Exception as e:
    # Best-effort cell: provenance is optional, so report and carry on.
    print(f'Provenance unavailable: {e}')
# Jupyter only renders an expression that is the last top-level statement of a
# cell; an expression nested inside try/except is never displayed.
prov_table
2. Target gene annotations¶
# Target genes of the seven hypotheses listed in section 10 of this notebook.
# The generated cell iterated over an empty list, so the table was always empty.
ANNOTATION_GENES = ['AQP4', 'P2RY12', 'C1QA', 'CERS2', 'HSPG2', 'EPHB4', 'NTN1']
ANNOTATION_COLUMNS = ['gene', 'name', 'protein_class', 'disease_involvement']


def _brief(value, width=55, max_items=2):
    """Render one annotation field as a short display string.

    Lists are reduced to their first ``max_items`` entries joined with ', ';
    any other value is stringified. The text is cut to ``width`` characters
    and empty or missing values are shown as '—'.
    """
    if isinstance(value, list):
        text = ', '.join(str(item) for item in value[:max_items])
    else:
        text = str(value or '')
    return text[:width] or '—'


ann_rows = []
for g in ANNOTATION_GENES:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    # load() returns whatever JSON was cached; only dicts carry the fields
    # read below, so treat any other payload as "no annotation".
    mg = mg if isinstance(mg, dict) else {}
    hpa = hpa if isinstance(hpa, dict) else {}
    ann_rows.append({
        'gene': g,
        'name': _brief(mg.get('name')),
        'protein_class': _brief(hpa.get('protein_class')),
        'disease_involvement': _brief(hpa.get('disease_involvement')),
    })
# Explicit columns keep the table well-formed even if the gene list is empty.
pd.DataFrame(ann_rows, columns=ANNOTATION_COLUMNS)
3. GO Biological Process enrichment (Enrichr)¶
# Tabulate the ten leading GO Biological Process terms from the Enrichr cache.
go_bp = load('enrichr_GO_Biological_Process')
go_table = None
if isinstance(go_bp, list) and go_bp:
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']].copy()
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before the gene lists are flattened into strings.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    go_table = go_df[['term','n_hits','p_value','odds_ratio','genes']]
else:
    print('No GO:BP enrichment data')
# Jupyter only renders the last top-level expression of a cell; the original
# table expression was nested inside the `if` and therefore never shown.
go_table
# Horizontal bar chart of the eight leading GO:BP terms, scored by -log10(p).
go_bp = load('enrichr_GO_Biological_Process')
if not (isinstance(go_bp, list) and go_bp):
    print('No GO:BP data to plot')
else:
    top = go_bp[:8]
    terms = []
    neglogp = []
    # Walk the list backwards so the first cached term is the last bar added.
    for entry in reversed(top):
        terms.append(entry['term'][:45])
        # Floor the p-value so log10 never receives zero.
        neglogp.append(-np.log10(max(entry['p_value'], 1e-300)))
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
4. KEGG pathway enrichment¶
# Tabulate the ten leading KEGG pathway terms from the Enrichr cache.
kegg = load('enrichr_KEGG_Pathways')
kegg_table = None
if isinstance(kegg, list) and kegg:
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']].copy()
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    kegg_table = kegg_df
else:
    print('No KEGG enrichment data')
# Jupyter only renders the last top-level expression of a cell; the original
# table expression was nested inside the `if` and therefore never shown.
kegg_table
5. STRING protein interaction network¶
# List the 20 highest-scoring STRING interaction edges from the cache.
ppi = load('string_network')
ppi_table = None
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi)
    # The display columns below are filtered by presence; apply the same care
    # to the sort key so a payload without 'score' does not raise KeyError.
    if 'score' in ppi_df.columns:
        ppi_df = ppi_df.sort_values('score', ascending=False)
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned')
# Jupyter only renders the last top-level expression of a cell; the original
# table expression was nested inside the `if` and therefore never shown.
ppi_table
# Network figure: proteins placed evenly on a unit circle, one line per edge
# whose opacity and width grow with the edge score.
import math

ppi = load('string_network')
if isinstance(ppi, list) and ppi:
    nodes = sorted({p for e in ppi for p in (e['protein1'], e['protein2'])})
    n = len(nodes)
    pos = {
        name: (math.cos(2 * math.pi * i / n), math.sin(2 * math.pi * i / n))
        for i, name in enumerate(nodes)
    }
    fig, ax = plt.subplots(figsize=(7, 7))
    for e in ppi:
        x1, y1 = pos[e['protein1']]
        x2, y2 = pos[e['protein2']]
        # Clamp the score to [0, 1]: matplotlib rejects alpha values outside
        # that range, so an out-of-range score would abort the whole figure.
        # NOTE(review): assumes scores are on a 0-1 scale - confirm against
        # the cached STRING payload.
        weight = min(max(float(e['score']), 0.0), 1.0)
        ax.plot([x1, x2], [y1, y2], color='#888', alpha=0.3 + 0.5 * weight,
                linewidth=0.5 + 2 * weight)
    for name, (x, y) in pos.items():
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(name, (x, y), ha='center', va='center', fontsize=8,
                    fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()
else:
    print('No STRING data to visualize')
6. Reactome pathway footprint¶
# Target genes of the seven hypotheses listed in section 10 of this notebook.
# The generated cell iterated over an empty list, which left pw_rows empty and
# made sort_values('n_pathways') raise KeyError on a column-less DataFrame.
REACTOME_GENES = ['AQP4', 'P2RY12', 'C1QA', 'CERS2', 'HSPG2', 'EPHB4', 'NTN1']
REACTOME_COLUMNS = ['gene', 'n_pathways', 'top_pathway']

pw_rows = []
for g in REACTOME_GENES:
    pws = load(f'reactome_{g}')
    if isinstance(pws, list) and pws:
        first = pws[0]
        # Tolerate entries without a 'name' instead of raising KeyError.
        top_name = first.get('name') if isinstance(first, dict) else None
        pw_rows.append({'gene': g, 'n_pathways': len(pws),
                        'top_pathway': str(top_name or '—')[:70]})
    else:
        # Missing cache entry, non-list payload, or an empty pathway list.
        pw_rows.append({'gene': g, 'n_pathways': 0, 'top_pathway': '—'})
# Explicit columns guarantee the sort key exists even with no rows.
pd.DataFrame(pw_rows, columns=REACTOME_COLUMNS).sort_values('n_pathways', ascending=False)
7. Allen Brain Atlas ISH regional expression¶
# Target genes of the seven hypotheses listed in section 10 of this notebook.
# The generated cell iterated over an empty list, so the table was always empty.
ISH_GENES = ['AQP4', 'P2RY12', 'C1QA', 'CERS2', 'HSPG2', 'EPHB4', 'NTN1']
ISH_COLUMNS = ['gene', 'n_ish_regions', 'top_region', 'top_energy']

ish_rows = []
for g in ISH_GENES:
    ish = load(f'allen_ish_{g}')
    regions = ish.get('regions') if isinstance(ish, dict) else None
    if not isinstance(regions, list):
        regions = []
    # Only the first region is summarised; guard against non-dict entries.
    top = regions[0] if regions and isinstance(regions[0], dict) else {}
    energy = top.get('expression_energy', 0) if regions else None
    ish_rows.append({
        'gene': g,
        'n_ish_regions': len(regions),
        'top_region': str(top.get('structure') or '—')[:45],
        # round() would raise on None or other non-numeric payloads.
        'top_energy': round(energy, 2) if isinstance(energy, (int, float)) else None,
    })
# Explicit columns keep the table well-formed even if the gene list is empty.
pd.DataFrame(ish_rows, columns=ISH_COLUMNS)
8. Hypothesis ranking (7 hypotheses)¶
# (title, composite score) for each debated hypothesis, best first.
hyp_data = [
    ('Aquaporin-4 Polarization Rescue', 0.732),
    ('Microglial Purinergic Reprogramming', 0.701),
    ('Complement C1q Subtype Switching', 0.665),
    ('Sphingolipid Metabolism Reprogramming', 0.663),
    ('Glial Glycocalyx Remodeling Therapy', 0.649),
    ('Ephrin-B2/EphB4 Axis Manipulation', 0.645),
    ('Netrin-1 Gradient Restoration', 0.595),
]

# Reverse the order so the first (best) hypothesis gets the largest y index.
titles = []
scores = []
for hyp_title, hyp_score in reversed(hyp_data):
    titles.append(hyp_title)
    scores.append(hyp_score)

# Colour bands: >= 0.6, >= 0.5, and everything below.
colors = []
for s in scores:
    if s >= 0.6:
        colors.append('#ef5350')
    elif s >= 0.5:
        colors.append('#ffa726')
    else:
        colors.append('#66bb6a')

bar_positions = range(len(titles))
fig_height = max(8, len(titles) * 0.4)
fig, ax = plt.subplots(figsize=(10, fig_height))
ax.barh(bar_positions, scores, color=colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title('4R-tau strain-specific spreading patterns in PSP vs CBD')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
9. Score dimension heatmap (all 7 hypotheses)¶
# Row labels (one per hypothesis), in the same order as the rows of `matrix`.
labels = [
    'Aquaporin-4 Polarization Rescue',
    'Microglial Purinergic Reprogramming',
    'Complement C1q Subtype Switching',
    'Sphingolipid Metabolism Reprogramming',
    'Glial Glycocalyx Remodeling Therapy',
    'Ephrin-B2/EphB4 Axis Manipulation',
    'Netrin-1 Gradient Restoration',
]
# One row per hypothesis, one column per entry of `dims`.
matrix = np.array([
    [0.72, 0.58, 0.71, 0.75, 0.09, 0.61, 0.54, 0.62, 0.55],
    [0.68, 0.74, 0.71, 0.72, 0.13, 0.62, 0.59, 0.81, 0.52],
    [0.75, 0.4, 0.5, 0.65, 0.623, 0.6, 0.5, 0.35, 0.45],
    [0.7, 0.7, 0.6, 0.5, 0.436, 0.4, 0.5, 0.7, 0.6],
    [0.8, 0.6, 0.5, 0.4, 0.473, 0.4, 0.4, 0.6, 0.4],
    [0.9, 0.6, 0.4, 0.3, 0.513, 0.3, 0.3, 0.6, 0.5],
    [0.9, 0.2, 0.3, 0.2, 0.448, 0.2, 0.2, 0.1, 0.3],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]

if not matrix.size:
    print('No score data available')
else:
    # Turn e.g. 'safety_profile_score' into 'Safety Profile' for the x axis.
    dim_titles = []
    for dim in dims:
        dim_titles.append(dim.replace('_score','').replace('_',' ').title())
    fig, ax = plt.subplots(figsize=(10, 5))
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(dim_titles, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()
10. PubMed evidence per hypothesis¶
Hypothesis 1: Aquaporin-4 Polarization Rescue¶
Target genes: AQP4 · Composite score: 0.732
The aquaporin-4 polarization rescue hypothesis proposes a sophisticated mechanistic framework linking tau pathology to glymphatic dysfunction through strain-specific disruption of astrocytic water channel organization. This hypothesis centers on the differential vulnerability of brainstem versus cortical astrocytes to 4R-tau strains and posits that targeted restoration of AQP4 polarity could serve as a therapeutic intervention to prevent characteristic aggregation patterns in neurodegenerative d… [excerpt truncated]
# PubMed evidence table for this hypothesis, newest papers first.
hid = 'h-c8ccbee8'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # Truncate only the text columns that are present; the original
        # guarded 'journal' but raised KeyError when 'title' was missing.
        for text_col, width in (('title', 80), ('journal', 30)):
            if text_col in lit.columns:
                lit[text_col] = lit[text_col].str[:width]
        # Likewise, sort by year only when the cache provides it.
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the raw records instead.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
Hypothesis 2: Microglial Purinergic Reprogramming¶
Target genes: P2RY12 · Composite score: 0.701
Molecular Mechanism and Rationale
The P2Y12 receptor (encoded by P2RY12) represents a critical nexus in microglial purinergic signaling that governs neuroinflammatory responses and tau pathology propagation in neurodegenerative diseases. P2Y12 is a Gi/Go-coupled metabotropic purinergic receptor that serves as the primary ADP sensor on microglial cells, functioning as a molecular switch between homeostatic surveillance and pathological activation states. Under physiological conditions, P2Y12… [excerpt truncated]
# PubMed evidence table for this hypothesis, newest papers first.
hid = 'h-5daecb6e'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # Truncate only the text columns that are present; the original
        # guarded 'journal' but raised KeyError when 'title' was missing.
        for text_col, width in (('title', 80), ('journal', 30)):
            if text_col in lit.columns:
                lit[text_col] = lit[text_col].str[:width]
        # Likewise, sort by year only when the cache provides it.
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the raw records instead.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
Hypothesis 3: Complement C1q Subtype Switching¶
Target genes: C1QA · Composite score: 0.665
Molecular Mechanism and Rationale
The complement C1q complex represents a critical nexus in neuroinflammation and astrocyte-mediated pathology in neurodegenerative diseases. This trimeric protein complex consists of three distinct subunits—C1qA, C1qB, and C1qC—that assemble in varying stoichiometric ratios to form heterotrimeric complexes with distinct functional properties. In healthy neural tissue, C1q complexes maintain homeostatic balance between immune surveillance and neuroprotection.
# PubMed evidence table for this hypothesis, newest papers first.
hid = 'h-5a55aabc'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # Truncate only the text columns that are present; the original
        # guarded 'journal' but raised KeyError when 'title' was missing.
        for text_col, width in (('title', 80), ('journal', 30)):
            if text_col in lit.columns:
                lit[text_col] = lit[text_col].str[:width]
        # Likewise, sort by year only when the cache provides it.
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the raw records instead.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
Hypothesis 4: Sphingolipid Metabolism Reprogramming¶
Target genes: CERS2 · Composite score: 0.663
Molecular Mechanism and Rationale
The sphingolipid metabolic pathway represents a critical convergence point between membrane biophysics and tau protein aggregation dynamics in neurodegenerative diseases. Ceramide synthases (CERS) constitute the rate-limiting enzymes in de novo ceramide biosynthesis, with six distinct isoforms (CERS1-6) exhibiting unique tissue distribution patterns and acyl-CoA substrate specificities. CERS2 primarily generates very long-chain ceramides (C22-C24), while CE… [excerpt truncated]
# PubMed evidence table for this hypothesis, newest papers first.
hid = 'h-6657f7cd'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # Truncate only the text columns that are present; the original
        # guarded 'journal' but raised KeyError when 'title' was missing.
        for text_col, width in (('title', 80), ('journal', 30)):
            if text_col in lit.columns:
                lit[text_col] = lit[text_col].str[:width]
        # Likewise, sort by year only when the cache provides it.
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the raw records instead.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
Hypothesis 5: Glial Glycocalyx Remodeling Therapy¶
Target genes: HSPG2 · Composite score: 0.649
Molecular Mechanism and Rationale
Progressive supranuclear palsy (PSP) and corticobasal degeneration (CBD) represent distinct 4R tauopathies characterized by specific patterns of tau aggregation in astrocytes, with PSP exhibiting tufted astrocytes and CBD displaying astrocytic plaques. The central hypothesis proposes that these differential pathological presentations result from strain-specific interactions between pathological tau species and region-specific compositions of the glial glyco… [excerpt truncated]
# PubMed evidence table for this hypothesis, newest papers first.
hid = 'h-c35493aa'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # Truncate only the text columns that are present; the original
        # guarded 'journal' but raised KeyError when 'title' was missing.
        for text_col, width in (('title', 80), ('journal', 30)):
            if text_col in lit.columns:
                lit[text_col] = lit[text_col].str[:width]
        # Likewise, sort by year only when the cache provides it.
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the raw records instead.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
Hypothesis 6: Ephrin-B2/EphB4 Axis Manipulation¶
Target genes: EPHB4 · Composite score: 0.645
Molecular Mechanism and Rationale
The ephrin-B2/EphB4 signaling axis represents a critical bidirectional communication system that governs astrocyte-neuron interactions and determines regional susceptibility to tau pathology in neurodegenerative diseases. Ephrin-B2 (EFNB2), a transmembrane ligand predominantly expressed on reactive astrocytes, binds to its cognate receptor EphB4 (EPHB4) expressed on both neurons and astrocytes, initiating complex forward and reverse signaling cascades that… [excerpt truncated]
# PubMed evidence table for this hypothesis, newest papers first.
hid = 'h-e6437136'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # Truncate only the text columns that are present; the original
        # guarded 'journal' but raised KeyError when 'title' was missing.
        for text_col, width in (('title', 80), ('journal', 30)):
            if text_col in lit.columns:
                lit[text_col] = lit[text_col].str[:width]
        # Likewise, sort by year only when the cache provides it.
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the raw records instead.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
Hypothesis 7: Netrin-1 Gradient Restoration¶
Target genes: NTN1 · Composite score: 0.595
Molecular Mechanism and Rationale¶
The netrin-1 guidance system, originally characterized for its role in axon pathfinding during neural development, represents a sophisticated molecular machinery for establishing and maintaining cellular compartmentalization in the central nervous system. Netrin-1 (NTN1) functions as a bifunctional guidance cue, capable of both attracting and repelling cellular processes depending on the receptor repertoire expressed by target cells. The primary receptor… [excerpt truncated]
# PubMed evidence table for this hypothesis, newest papers first.
hid = 'h-05b8894a'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        lit = lit[cols].copy()
        # Truncate only the text columns that are present; the original
        # guarded 'journal' but raised KeyError when 'title' was missing.
        for text_col, width in (('title', 80), ('journal', 30)):
            if text_col in lit.columns:
                lit[text_col] = lit[text_col].str[:width]
        # Likewise, sort by year only when the cache provides it.
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: show the raw records instead.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
11. Knowledge graph edges (130 total; 25 listed below)¶
# Knowledge-graph edges embedded by the notebook generator, one dict per edge.
edge_data = [
    {'source': 'P2RY12', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.65},
    {'source': 'AQP4', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.65},
    {'source': 'C1QA', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.65},
    {'source': 'CERS2', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.55},
    {'source': 'HSPG2', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.51},
    {'source': 'EPHB4', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.48},
    {'source': 'AQP4', 'relation': 'participates_in', 'target': 'Aquaporin-4 water transport / ', 'strength': 0.46},
    {'source': 'P2RY12', 'relation': 'participates_in', 'target': 'Purinergic signaling / microgl', 'strength': 0.46},
    {'source': 'C1QA', 'relation': 'participates_in', 'target': 'Classical complement cascade', 'strength': 0.42},
    {'source': 'CERS2', 'relation': 'participates_in', 'target': 'Sphingolipid metabolism', 'strength': 0.42},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'HSPG2', 'strength': 0.4},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'P2RY12', 'strength': 0.4},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'P2RX7', 'strength': 0.4},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'AQP4', 'strength': 0.4},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'EPHB4', 'strength': 0.4},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'SMPD1', 'strength': 0.4},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'C1QA', 'strength': 0.4},
    {'source': 'NTN1', 'relation': 'co_discussed', 'target': 'CERS2', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'P2RY12', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'P2RX7', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'AQP4', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'EPHB4', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'SMPD1', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'C1QA', 'strength': 0.4},
    {'source': 'HSPG2', 'relation': 'co_discussed', 'target': 'CERS2', 'strength': 0.4},
]
kg_table = None
if edge_data:
    kg_table = pd.DataFrame(edge_data).head(25)
else:
    print('No KG edge data available')
# Jupyter only renders the last top-level expression of a cell; the original
# table expression was nested inside the `if` and therefore never shown.
kg_table
12. Caveats¶
This notebook uses real Forge tool calls cached from live APIs, but:
- Enrichment is against curated gene-set libraries, not genome-wide screens
- STRING/Reactome/HPA/MyGene reflect curated knowledge
- PubMed literature is search-relevance ranked, not systematic review
The cached evidence bundle is the minimum viable real-data analysis for this topic.