TDP-43 phase separation therapeutics for ALS-FTD¶
Notebook ID: nb-sda-2026-04-01-gap-006 · Analysis: sda-2026-04-01-gap-006 · Generated: 2026-04-10
Research question¶
TDP-43 undergoes liquid-liquid phase separation that becomes pathological. Small molecules targeting phase separation properties could be therapeutic but the design principles are undefined.
Approach¶
This notebook is generated programmatically from real Forge tool calls and SciDEX debate data. Code cells load cached evidence bundles from `data/forge_cache/seaad/*.json` and query live data from `scidex.db`. To refresh the notebook, re-run `python3 scripts/regenerate_notebooks.py --analysis sda-2026-04-01-gap-006 --force`.
7 hypotheses were generated and debated. The knowledge graph has 102 edges.
Debate Summary¶
Quality score: 0.2 · Rounds: 6 · Personas: Theorist, Skeptic, Domain_Expert, Medicinal_Chemist, Clinical_Trialist, Synthesizer
1. Forge tool provenance¶
import json, sys, sqlite3
from pathlib import Path
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
matplotlib.rcParams['figure.dpi'] = 110
matplotlib.rcParams['figure.facecolor'] = 'white'
# Treat the current working directory as the repository root and make it importable.
REPO = Path('.').resolve()
sys.path.insert(0, str(REPO))
CACHE_SUB = 'seaad'
CACHE = REPO / 'data' / 'forge_cache' / CACHE_SUB


def load(name):
    """Return the parsed JSON stored at ``CACHE/<name>.json``.

    Args:
        name: Cache entry name without the ``.json`` suffix.

    Returns:
        The decoded JSON value, or an empty dict when the file is missing,
        unreadable, or not valid JSON. Callers in this notebook already treat
        an empty result as "no data", so a damaged cache entry degrades the
        same way a missing one does instead of aborting the cell.
    """
    p = CACHE / f'{name}.json'
    try:
        # JSON is UTF-8 by specification; do not depend on the platform locale.
        return json.loads(p.read_text(encoding='utf-8'))
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f'Cache entry {p.name} unreadable: {exc}')
        return {}
# Summarise the last 30 days of tool calls recorded in scidex.db, grouped by
# skill and status, and show the 20 most frequent aggregates.
from contextlib import closing

db_path = Path('/home/ubuntu/scidex/scidex.db')
prov_table = None  # stays None when the database cannot be queried
try:
    # Open read-only through a URI: a plain connect() would create an empty
    # database file when db_path does not exist. closing() guarantees the
    # connection is released even if the query raises.
    with closing(sqlite3.connect(f'{db_path.as_uri()}?mode=ro', uri=True)) as db:
        prov = pd.read_sql_query('''
            SELECT skill_id, status, COUNT(*) AS n_calls,
            ROUND(AVG(duration_ms),0) AS mean_ms
            FROM tool_calls
            WHERE created_at >= date('now','-30 days')
            GROUP BY skill_id, status
            ORDER BY n_calls DESC
        ''', db)
    prov['tool'] = prov['skill_id'].str.replace('tool_', '', regex=False)
    print(f'{len(prov)} tool-call aggregates (last 30 days):')
    prov_table = prov[['tool','status','n_calls','mean_ms']].head(20)
except Exception as e:
    # Best-effort cell: provenance is optional, so report and carry on.
    print(f'Provenance unavailable: {e}')
# Jupyter renders a bare expression only when it is the last top-level
# statement of the cell; inside the try block the table was never displayed.
prov_table
77 tool-call aggregates (last 30 days):
2. Target gene annotations¶
# Build one annotation row per target gene from the cached MyGene and HPA bundles.
_MISSING = '—'


def _brief(value, width=55):
    """Render an annotation value as a short string.

    Lists are reduced to their first two items joined by ', '; anything else
    is stringified. The result is cut to ``width`` characters, and every
    empty or missing value becomes the same '—' placeholder so the table
    never mixes blank cells with dashes.
    """
    if isinstance(value, list):
        text = ', '.join(str(item) for item in value[:2])
    else:
        text = str(value) if value else ''
    return text[:width] or _MISSING


ann_rows = []
for g in ['TARDBP']:
    mg = load(f'mygene_{g}')
    hpa = load(f'hpa_{g}')
    # A cache miss yields {}; also tolerate any other non-dict payload so the
    # .get() calls below cannot fail.
    mg = mg if isinstance(mg, dict) else {}
    hpa = hpa if isinstance(hpa, dict) else {}
    ann_rows.append({
        'gene': g,
        'name': _brief(mg.get('name')),
        'protein_class': _brief(hpa.get('protein_class')),
        'disease_involvement': _brief(hpa.get('disease_involvement')),
    })
pd.DataFrame(ann_rows)
| | gene | name | protein_class | disease_involvement |
|---|---|---|---|---|
| 0 | TARDBP | — | — | — |
3. GO Biological Process enrichment (Enrichr)¶
# Tabulate the ten leading GO Biological Process terms from the cached Enrichr result.
go_bp = load('enrichr_GO_Biological_Process')
go_table = None  # stays None when there is nothing to show
if isinstance(go_bp, list) and go_bp:
    # .copy() so the column edits below act on an independent frame.
    go_df = pd.DataFrame(go_bp[:10])[['term','p_value','odds_ratio','genes']].copy()
    go_df['p_value'] = go_df['p_value'].apply(lambda p: f'{p:.2e}')
    go_df['odds_ratio'] = go_df['odds_ratio'].round(1)
    go_df['term'] = go_df['term'].str[:60]
    # Count hits before the gene lists are flattened into strings.
    go_df['n_hits'] = go_df['genes'].apply(len)
    go_df['genes'] = go_df['genes'].apply(lambda g: ', '.join(g))
    go_table = go_df[['term','n_hits','p_value','odds_ratio','genes']]
else:
    print('No GO:BP enrichment data')
# Jupyter renders a bare expression only when it is the last top-level
# statement of the cell; inside the if-branch the table was never displayed.
go_table
# Horizontal bar chart of -log10(p) for the eight leading GO:BP terms.
go_bp = load('enrichr_GO_Biological_Process')
if not (isinstance(go_bp, list) and go_bp):
    print('No GO:BP data to plot')
else:
    top = go_bp[:8]
    # Walk the entries back-to-front so the leading term ends up as the last bar.
    terms = [entry['term'][:45] for entry in reversed(top)]
    # Floor p-values at 1e-300 so log10 never receives zero.
    neglogp = [-np.log10(max(entry['p_value'], 1e-300)) for entry in reversed(top)]
    fig, ax = plt.subplots(figsize=(9, 4.5))
    ax.barh(terms, neglogp, color='#4fc3f7')
    ax.set_xlabel('-log10(p-value)')
    ax.set_title('Top GO:BP enrichment (Enrichr)')
    ax.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.show()
4. KEGG pathway enrichment¶
# Tabulate the ten leading KEGG pathways from the cached Enrichr result.
kegg = load('enrichr_KEGG_Pathways')
kegg_table = None  # stays None when there is nothing to show
if isinstance(kegg, list) and kegg:
    # .copy() so the column edits below act on an independent frame.
    kegg_df = pd.DataFrame(kegg[:10])[['term','p_value','odds_ratio','genes']].copy()
    kegg_df['genes'] = kegg_df['genes'].apply(lambda g: ', '.join(g))
    kegg_df['p_value'] = kegg_df['p_value'].apply(lambda p: f'{p:.2e}')
    kegg_df['odds_ratio'] = kegg_df['odds_ratio'].round(1)
    kegg_table = kegg_df
else:
    print('No KEGG enrichment data')
# Jupyter renders a bare expression only when it is the last top-level
# statement of the cell; inside the if-branch the table was never displayed.
kegg_table
No KEGG enrichment data
5. STRING protein interaction network¶
# List the 20 highest-scoring STRING interaction edges from the cache.
ppi = load('string_network')
ppi_table = None  # stays None when there is nothing to show
if isinstance(ppi, list) and ppi:
    ppi_df = pd.DataFrame(ppi)
    # Sort only when a score column exists, matching the tolerance for
    # missing columns applied to display_cols below.
    if 'score' in ppi_df.columns:
        ppi_df = ppi_df.sort_values('score', ascending=False)
    display_cols = [c for c in ['protein1','protein2','score','escore','tscore'] if c in ppi_df.columns]
    print(f'{len(ppi_df)} STRING edges')
    ppi_table = ppi_df[display_cols].head(20)
else:
    print('No STRING edges returned')
# Jupyter renders a bare expression only when it is the last top-level
# statement of the cell; inside the if-branch the table was never displayed.
ppi_table
11 STRING edges
# Network figure: proteins placed evenly on a unit circle, one line per STRING
# edge with opacity and width growing with the edge score.
ppi = load('string_network')
if not (isinstance(ppi, list) and ppi):
    print('No STRING data to visualize')
else:
    import math
    nodes = sorted({edge[end] for edge in ppi for end in ('protein1', 'protein2')})
    n = len(nodes)
    pos = {}
    for i, label in enumerate(nodes):
        angle = 2*math.pi*i/n
        pos[label] = (math.cos(angle), math.sin(angle))
    fig, ax = plt.subplots(figsize=(7, 7))
    for edge in ppi:
        x1, y1 = pos[edge['protein1']]
        x2, y2 = pos[edge['protein2']]
        weight = edge['score']
        ax.plot([x1, x2], [y1, y2], color='#888', alpha=0.3+0.5*weight,
                linewidth=0.5+2*weight)
    for name, (x, y) in pos.items():
        # Markers sit above the edges (zorder 3) and labels above the markers (zorder 4).
        ax.scatter([x], [y], s=450, color='#ffd54f', edgecolors='#333', zorder=3)
        ax.annotate(name, (x, y), ha='center', va='center', fontsize=8, fontweight='bold', zorder=4)
    ax.set_aspect('equal')
    ax.axis('off')
    ax.set_title(f'STRING PPI network ({len(ppi)} edges)')
    plt.tight_layout()
    plt.show()
6. Reactome pathway footprint¶
# Count cached Reactome pathways per gene and report the first one listed.
pw_rows = []
for g in ['TARDBP']:
    pws = load(f'reactome_{g}')
    if not isinstance(pws, list):
        # Anything other than a list (e.g. the {} returned on a cache miss) counts as no pathways.
        row = {'gene': g, 'n_pathways': 0, 'top_pathway': '—'}
    else:
        first_name = pws[0]['name'] if pws else '—'
        row = {'gene': g, 'n_pathways': len(pws), 'top_pathway': first_name[:70]}
    pw_rows.append(row)
pd.DataFrame(pw_rows).sort_values('n_pathways', ascending=False)
| | gene | n_pathways | top_pathway |
|---|---|---|---|
| 0 | TARDBP | 0 | — |
7. Allen Brain Atlas ISH regional expression¶
# Summarise cached Allen Brain Atlas ISH data per gene: region count plus the
# first region's structure name and expression energy.
ish_rows = []
for g in ['TARDBP']:
    ish = load(f'allen_ish_{g}')
    regions = []
    if isinstance(ish, dict):
        regions = ish.get('regions') or []
    if regions:
        lead = regions[0]
        top_region = lead.get('structure', '')
        top_energy = round(lead.get('expression_energy', 0), 2)
    else:
        top_region = '—'
        top_energy = None
    ish_rows.append({
        'gene': g,
        'n_ish_regions': len(regions),
        'top_region': top_region[:45],
        'top_energy': top_energy,
    })
pd.DataFrame(ish_rows)
| | gene | n_ish_regions | top_region | top_energy |
|---|---|---|---|---|
| 0 | TARDBP | 0 | — | — |
8. Hypothesis ranking (7 hypotheses)¶
# Horizontal bar chart of composite scores, one bar per hypothesis.
hyp_data = [
    ('Glycine-Rich Domain Competitive Inhibition', 0.449),
    ('RNA Granule Nucleation Site Modulation', 0.448),
    ('Heat Shock Protein 70 Disaggregase Amplification', 0.443),
    ('Arginine Methylation Enhancement Therapy', 0.428),
    ('Low Complexity Domain Cross-Linking Inhibition', 0.419),
    ('Serine/Arginine-Rich Protein Kinase Modulation', 0.398),
    ('PARP1 Inhibition Therapy', 0.367),
]
# Plot in reverse listing order so the first hypothesis becomes the last bar.
titles = [title for title, _ in reversed(hyp_data)]
scores = [score for _, score in reversed(hyp_data)]
fig, ax = plt.subplots(figsize=(10, max(8, len(titles)*0.4)))
# Colour bands: >= 0.6, >= 0.5, and everything below.
colors = []
for s in scores:
    if s >= 0.6:
        colors.append('#ef5350')
    elif s >= 0.5:
        colors.append('#ffa726')
    else:
        colors.append('#66bb6a')
ax.barh(range(len(titles)), scores, color=colors)
ax.set_yticks(range(len(titles)))
ax.set_yticklabels(titles, fontsize=7)
ax.set_xlabel('Composite Score')
ax.set_title('TDP-43 phase separation therapeutics for ALS-FTD')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()
9. Score dimension heatmap (top 10)¶
# Heatmap of per-dimension scores: one row per hypothesis, one column per dimension.
labels = [
    'Glycine-Rich Domain Competitive Inhibiti',
    'RNA Granule Nucleation Site Modulation',
    'Heat Shock Protein 70 Disaggregase Ampli',
    'Arginine Methylation Enhancement Therapy',
    'Low Complexity Domain Cross-Linking Inhi',
    'Serine/Arginine-Rich Protein Kinase Modu',
    'PARP1 Inhibition Therapy',
]
matrix = np.array([
    [0.7, 0.45, 0.6, 0.65, 0.573, 0.6, 0.45, 0.5, 0.4],
    [0.65, 0.6, 0.7, 0.75, 0.065, 0.75, 0.65, 0.55, 0.5],
    [0.6, 0.9, 0.7, 0.8, 0.09, 0.8, 0.8, 1.0, 0.6],
    [0.9, 0.5, 0.8, 0.6, 0.037, 0.6, 0.6, 0.6, 0.4],
    [0.6, 0.7, 0.5, 0.4, 0.534, 0.4, 0.4, 0.8, 0.4],
    [0.7, 0.6, 0.5, 0.5, 0.452, 0.5, 0.5, 0.7, 0.4],
    [0.7, 1.0, 0.6, 0.4, 0.09, 0.9, 0.7, 1.0, 0.8],
])
dims = [
    'novelty_score',
    'feasibility_score',
    'impact_score',
    'mechanistic_plausibility_score',
    'clinical_relevance_score',
    'data_availability_score',
    'reproducibility_score',
    'druggability_score',
    'safety_profile_score',
]
if not matrix.size:
    print('No score data available')
else:
    fig, ax = plt.subplots(figsize=(10, 5))
    # Colour scale is pinned to the 0..1 range.
    im = ax.imshow(matrix, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
    # Turn e.g. 'safety_profile_score' into 'Safety Profile' for the tick labels.
    tick_text = []
    for d in dims:
        tick_text.append(d.replace('_score','').replace('_',' ').title())
    ax.set_xticks(range(len(dims)))
    ax.set_xticklabels(tick_text, rotation=45, ha='right', fontsize=8)
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels, fontsize=7)
    ax.set_title('Score dimensions — top hypotheses')
    plt.colorbar(im, ax=ax, shrink=0.8)
    plt.tight_layout()
    plt.show()
10. PubMed evidence per hypothesis¶
Hypothesis 1: Glycine-Rich Domain Competitive Inhibition¶
Target genes: TARDBP · Composite score: 0.449
Molecular Mechanism and Rationale
TAR DNA-binding protein 43 (TDP-43), encoded by the TARDBP gene, is a nuclear ribonucleoprotein that plays crucial roles in RNA metabolism, including transcriptional repression, pre-mRNA splicing, and mRNA stability regulation. The protein consists of two RNA recognition motifs (RRM1 and RRM2), a nuclear localization signal, and a C-terminal glycine-rich domain (GRD) spanning amino acids 274-414. Under pathological conditions, TDP-43 undergoes cytoplasmic m
# Show the cached PubMed hits for this hypothesis, newest first.
hid = 'h-7e846ceb'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() so the column edits below act on an independent frame.
        lit = lit[cols].copy()
        # cols may hold any subset of the four names, so guard every
        # column-specific step rather than only 'journal'.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: fall back to the first five raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 2: RNA Granule Nucleation Site Modulation¶
Target genes: G3BP1 · Composite score: 0.448
Molecular Mechanism and Rationale
The pathological aggregation of TAR DNA-binding protein 43 (TDP-43) represents a critical hallmark of numerous neurodegenerative disorders, including amyotrophic lateral sclerosis (ALS), frontotemporal dementia (FTD), and limbic-predominant age-related TDP-43 encephalopathy (LATE). Under physiological conditions, TDP-43 functions as a nuclear RNA-binding protein that regulates transcription, splicing, and mRNA stability. However, in disease states, TDP-43 u
# Show the cached PubMed hits for this hypothesis, newest first.
hid = 'h-fffd1a74'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() so the column edits below act on an independent frame.
        lit = lit[cols].copy()
        # cols may hold any subset of the four names, so guard every
        # column-specific step rather than only 'journal'.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: fall back to the first five raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 3: Heat Shock Protein 70 Disaggregase Amplification¶
Target genes: HSPA1A · Composite score: 0.443
Molecular Mechanism and Rationale¶
The HSP70 chaperone system operates as a protein disaggregation machine through an ATP-dependent cycle involving multiple specialized co-factors. HSPA1A (inducible HSP70) and HSPA8 (constitutive HSC70) work in concert with HSP40 co-chaperones (DNAJA1, DNAJB1) and the nucleotide exchange factor HSP110 (HSPH1) to form a trimeric disaggregase complex capable of extracting individual polypeptide chains from amorphous aggregates and amyloid fibrils through a thre
# Show the cached PubMed hits for this hypothesis, newest first.
hid = 'h-5dbfd3aa'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() so the column edits below act on an independent frame.
        lit = lit[cols].copy()
        # cols may hold any subset of the four names, so guard every
        # column-specific step rather than only 'journal'.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: fall back to the first five raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 4: Arginine Methylation Enhancement Therapy¶
Target genes: PRMT1 · Composite score: 0.428
Molecular Mechanism and Rationale¶
The TAR DNA-binding protein 43 (TDP-43) has emerged as a central pathological player in numerous neurodegenerative diseases, including amyotrophic lateral sclerosis (ALS), frontotemporal dementia (FTD), and chronic traumatic encephalopathy (CTE). Under physiological conditions, TDP-43 functions as a critical RNA-binding protein that regulates splicing, transcription, and RNA metabolism. However, in disease states, TDP-43 undergoes pathological aggregation an
# Show the cached PubMed hits for this hypothesis, newest first.
hid = 'h-19003961'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() so the column edits below act on an independent frame.
        lit = lit[cols].copy()
        # cols may hold any subset of the four names, so guard every
        # column-specific step rather than only 'journal'.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: fall back to the first five raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 5: Low Complexity Domain Cross-Linking Inhibition¶
Target genes: TGM2 · Composite score: 0.419
Molecular Mechanism and Rationale
Transglutaminase 2 (TGM2) represents a critical enzyme in the pathological cascade leading to neurodegeneration through its ability to catalyze the cross-linking of proteins containing low complexity domains (LCDs), particularly TDP-43 (TAR DNA-binding protein 43). TGM2 belongs to a family of calcium-dependent enzymes that catalyze the formation of covalent bonds between glutamine and lysine residues, creating stable ε-(γ-glutamyl)lysine cross-links that re
# Show the cached PubMed hits for this hypothesis, newest first.
hid = 'h-69d383ea'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() so the column edits below act on an independent frame.
        lit = lit[cols].copy()
        # cols may hold any subset of the four names, so guard every
        # column-specific step rather than only 'journal'.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: fall back to the first five raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 6: Serine/Arginine-Rich Protein Kinase Modulation¶
Target genes: SRPK1 · Composite score: 0.398
Molecular Mechanism and Rationale
The serine/arginine-rich protein kinases SRPK1 and CLK1 represent critical regulatory nodes in the post-transcriptional control of RNA metabolism, particularly in the phosphorylation of splicing regulators that govern TDP-43 functionality. TDP-43 (TAR DNA-binding protein 43) is a predominantly nuclear RNA-binding protein that becomes pathologically cytoplasmic and aggregated in numerous neurodegenerative diseases, including amyotrophic lateral sclerosis (AL
# Show the cached PubMed hits for this hypothesis, newest first.
hid = 'h-dca3e907'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() so the column edits below act on an independent frame.
        lit = lit[cols].copy()
        # cols may hold any subset of the four names, so guard every
        # column-specific step rather than only 'journal'.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: fall back to the first five raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
Hypothesis 7: PARP1 Inhibition Therapy¶
Target genes: PARP1 · Composite score: 0.367
Molecular Mechanism and Rationale
The pathophysiology of TDP-43 proteinopathies, including amyotrophic lateral sclerosis (ALS) and frontotemporal dementia (FTD), is fundamentally characterized by the aberrant cytoplasmic mislocalization and aggregation of TAR DNA-binding protein 43 (TDP-43). Under physiological conditions, TDP-43 functions as a nuclear ribonucleoprotein that regulates RNA splicing, transport, and stability. However, in neurodegenerative diseases, TDP-43 forms hyperphosphory
# Show the cached PubMed hits for this hypothesis, newest first.
hid = 'h-69919c49'
papers = load(f'pubmed_{hid}')
if isinstance(papers, list) and papers:
    lit = pd.DataFrame(papers)
    cols = [c for c in ['year','journal','title','pmid'] if c in lit.columns]
    if cols:
        # .copy() so the column edits below act on an independent frame.
        lit = lit[cols].copy()
        # cols may hold any subset of the four names, so guard every
        # column-specific step rather than only 'journal'.
        if 'title' in lit.columns:
            lit['title'] = lit['title'].str[:80]
        if 'journal' in lit.columns:
            lit['journal'] = lit['journal'].str[:30]
        if 'year' in lit.columns:
            lit = lit.sort_values('year', ascending=False)
        display_df = lit
    else:
        # None of the expected columns exist: fall back to the first five raw records.
        display_df = pd.DataFrame(papers[:5])
else:
    display_df = pd.DataFrame([{'note':'no PubMed results'}])
display_df
| | note |
|---|---|
| 0 | no PubMed results |
11. Knowledge graph edges (102 total)¶
# Knowledge-graph edges embedded in the notebook, shown as a table (at most 25 rows).
edge_data = [
    {'source': 'HSPA1A', 'relation': 'encodes', 'target': 'HSP70', 'strength': 0.8},
    {'source': 'HSP70', 'relation': 'participates_in', 'target': 'protein_folding_pathway', 'strength': 0.8},
    {'source': 'PARP1', 'relation': 'encodes', 'target': 'PARP1_protein', 'strength': 0.8},
    {'source': 'PARP1_protein', 'relation': 'mediates', 'target': 'DNA_damage_response', 'strength': 0.8},
    {'source': 'TDP-43', 'relation': 'regulates', 'target': 'RNA_splicing_pathway', 'strength': 0.8},
    {'source': 'PRMT1', 'relation': 'catalyzes', 'target': 'arginine_methylation_pathway', 'strength': 0.8},
    {'source': 'arginine_methylation_pathway', 'relation': 'modifies', 'target': 'TDP-43', 'strength': 0.8},
    {'source': 'G3BP1', 'relation': 'nucleates', 'target': 'stress_granule_formation', 'strength': 0.8},
    {'source': 'SRPK1', 'relation': 'mediates', 'target': 'SR_protein_phosphorylation', 'strength': 0.8},
    {'source': 'TGM2', 'relation': 'catalyzes', 'target': 'protein_crosslinking_pathway', 'strength': 0.8},
    {'source': 'protein_folding_pathway', 'relation': 'dysregulated_in', 'target': 'ALS', 'strength': 0.8},
    {'source': 'stress_granule_formation', 'relation': 'contributes_to', 'target': 'FTD', 'strength': 0.8},
    {'source': 'h-5dbfd3aa', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.7},
    {'source': 'h-fffd1a74', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.7},
    {'source': 'HSPA1A', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.65},
    {'source': 'PARP1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.63},
    {'source': 'PRMT1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.62},
    {'source': 'h-19003961', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.6},
    {'source': 'G3BP1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.57},
    {'source': 'h-7e846ceb', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.55},
    {'source': 'TARDBP', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.53},
    {'source': 'SRPK1', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.52},
    {'source': 'HSPA1A', 'relation': 'participates_in', 'target': 'Heat shock protein / proteosta', 'strength': 0.51},
    {'source': 'TGM2', 'relation': 'associated_with', 'target': 'neurodegeneration', 'strength': 0.5},
    {'source': 'h-69919c49', 'relation': 'implicated_in', 'target': 'neurodegeneration', 'strength': 0.5},
]
edge_table = None  # stays None when there are no edges to show
if edge_data:
    edge_table = pd.DataFrame(edge_data).head(25)
else:
    print('No KG edge data available')
# Jupyter renders a bare expression only when it is the last top-level
# statement of the cell; inside the if-branch the table was never displayed.
edge_table
12. Caveats¶
This notebook uses real Forge tool calls cached from live APIs, but:
- Enrichment is against curated gene-set libraries, not genome-wide screens
- STRING/Reactome/HPA/MyGene reflect curated knowledge
- PubMed literature is search-relevance ranked, not systematic review
The cached evidence bundle is the minimum viable real-data analysis for this topic.