%matplotlib inline
import sqlite3
import sys
import json
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Dark theme for SciDEX notebooks.
# Settings are applied through the top-level matplotlib module; pyplot exposes
# the very same rcParams object, so every later figure picks them up.
matplotlib.rcParams.update({
    # Backgrounds
    'figure.facecolor': '#0a0a14',
    'axes.facecolor': '#151525',
    'legend.facecolor': '#151525',
    # Frame / border lines
    'axes.edgecolor': '#333',
    'legend.edgecolor': '#333',
    # Text and axis labels
    'text.color': '#e0e0e0',
    'axes.labelcolor': '#e0e0e0',
    # Tick marks
    'xtick.color': '#888',
    'ytick.color': '#888',
    # Resolution for inline display and saved files
    'figure.dpi': 120,
    'savefig.dpi': 120,
})

# ── Query SciDEX Database ────────────────────────────────────────────────
# Loads the analysis record, its scored hypotheses and its knowledge-graph
# edges into plain dicts/lists (`analysis`, `hypotheses`, `edges`) and prints
# a short summary.
ANALYSIS_ID = "SDA-2026-04-04-gap-20260404-microglial-priming-early-ad"
DB_PATH = "/home/ubuntu/scidex/scidex.db"

conn = sqlite3.connect(DB_PATH)
try:
    # sqlite3.Row lets each row be converted with dict(row), keyed by column name.
    conn.row_factory = sqlite3.Row

    # Analysis metadata (fetchone() yields None when the id is unknown)
    analysis_row = conn.execute(
        "SELECT id, title, question, domain, created_at FROM analyses WHERE id=?",
        (ANALYSIS_ID,)
    ).fetchone()

    # All hypotheses with scores, best composite score first
    hyp_rows = conn.execute(
        """SELECT id, title, composite_score, target_gene, disease, target_pathway,
               mechanistic_plausibility_score, confidence_score, novelty_score,
               feasibility_score, impact_score, druggability_score,
               safety_profile_score, competitive_landscape_score,
               data_availability_score, reproducibility_score
        FROM hypotheses WHERE analysis_id=? ORDER BY composite_score DESC""",
        (ANALYSIS_ID,)
    ).fetchall()

    # Knowledge graph edges, strongest evidence first
    edge_rows = conn.execute(
        """SELECT source_id, target_id, relation, edge_type, evidence_strength
        FROM knowledge_edges WHERE analysis_id=? ORDER BY evidence_strength DESC""",
        (ANALYSIS_ID,)
    ).fetchall()
finally:
    # Release the database handle even when one of the queries raises.
    conn.close()

# Fail with a clear message instead of the opaque TypeError from dict(None).
if analysis_row is None:
    raise LookupError(f"Analysis {ANALYSIS_ID!r} not found in {DB_PATH}")

analysis = dict(analysis_row)
hypotheses = [dict(r) for r in hyp_rows]
edges = [dict(r) for r in edge_rows]

print(f"Analysis: {analysis['title']}")
# A NULL question column arrives as None; treat it as empty text for the preview.
print(f"Question: {(analysis['question'] or '')[:100]}...")
print(f"Hypotheses: {len(hypotheses)} | Knowledge edges: {len(edges)}")
if hypotheses:
    print(f"Top hypothesis: {hypotheses[0]['title']} (score: {hypotheses[0]['composite_score']:.3f})")
else:
    print("Top hypothesis: none (no hypotheses stored for this analysis)")
print("\nEnvironment ready: sqlite3, numpy, matplotlib, scipy")

Analysis: Neuroinflammation and microglial priming in early Alzheimer's Disease
Question: Investigate mechanistic links between early microglial priming states, neuroinflammatory signaling, ...
Hypotheses: 14 | Knowledge edges: 105
Top hypothesis: Epigenetic Reprogramming of Microglial Memory (score: 0.508)

Environment ready: sqlite3, numpy, matplotlib, scipy

# ── Configure Forge tools ─────────────────────────────────────────────────────
import sys

# Worktree checkout that is put on the import path so `scidex` can be imported.
FORGE_WORKTREE = '/home/ubuntu/scidex/.orchestra-worktrees/task-13a48c3c-bc4e-4a57-9b9f-d36ee1f35bff'

# Re-running this cell must not pile up duplicate sys.path entries, so any
# existing occurrence is dropped before the worktree is placed first.
while FORGE_WORKTREE in sys.path:
    sys.path.remove(FORGE_WORKTREE)
sys.path.insert(0, FORGE_WORKTREE)
from scidex.forge import tools as forge_tools

# Short module-level aliases for the tool functions
pubmed_search        = forge_tools.pubmed_search
get_gene_info        = forge_tools.get_gene_info
string_protein_interactions = forge_tools.string_protein_interactions
reactome_pathways    = forge_tools.reactome_pathways
uniprot_protein_info = forge_tools.uniprot_protein_info

print('Forge tools configured')

Forge tools configured

# ── Hypothesis ranking bar chart ──────────────────────────────────────────
# Collect the per-hypothesis display fields in the order of `hypotheses`.
titles, scores, genes = [], [], []
for hyp in hypotheses:
    full_title = hyp['title']
    # Long titles are cut at 45 characters and marked with an ellipsis.
    if len(full_title) > 45:
        titles.append(full_title[:45] + '...')
    else:
        titles.append(full_title[:45])
    scores.append(hyp['composite_score'])
    genes.append(hyp['target_gene'] or 'Multiple')

# Colour tier per bar, chosen by score band (checked from highest to lowest).
colors = []
for score in scores:
    if score >= 0.45:
        colors.append('#4fc3f7')
    elif score >= 0.42:
        colors.append('#81c784')
    elif score >= 0.40:
        colors.append('#ffd54f')
    else:
        colors.append('#ef5350')

fig, ax = plt.subplots(figsize=(14, 8))
row_positions = range(len(titles))
bars = ax.barh(row_positions, scores, color=colors, alpha=0.85, edgecolor='#333')
ax.set_yticks(row_positions)
ax.set_yticklabels(titles, fontsize=8)
ax.set_xlabel('Composite Score', fontsize=11)
ax.set_xlim(0, 0.6)
ax.set_title('Hypothesis Ranking — Microglial Priming in Early AD', fontsize=14,
             color='#4fc3f7', fontweight='bold')

# Dashed reference lines for the two labelled score thresholds.
for cutoff, line_color, line_label in (
    (0.45, '#4fc3f7', 'Strong (0.45)'),
    (0.40, '#ffd54f', 'Moderate (0.40)'),
):
    ax.axvline(x=cutoff, color=line_color, linestyle='--', alpha=0.4, label=line_label)

# Annotate each bar with its score and (truncated) target gene.
for bar_index, bar in enumerate(bars):
    bar_score = scores[bar_index]
    bar_gene = genes[bar_index]
    ax.text(bar_score + 0.005, bar.get_y() + bar.get_height()/2,
            f'{bar_score:.3f}  [{bar_gene[:20]}]', va='center', fontsize=7, color='#e0e0e0')
ax.legend(fontsize=8)
ax.invert_yaxis()  # first hypothesis in the list ends up at the top
plt.tight_layout()
plt.show()

# Summary table
rule_width = 90
print("\nHypothesis Ranking")
print("=" * rule_width)
print(f"{'Rank':<5} {'Title':<50} {'Score':>7} {'Target Gene':<20}")
print("-" * rule_width)
for rank, hyp in enumerate(hypotheses, start=1):
    title_cell = hyp['title'][:48]
    gene_cell = (hyp['target_gene'] or 'Multiple')[:20]
    print(f"{rank:<5} {title_cell:<50} {hyp['composite_score']:>7.3f} {gene_cell:<20}")

Hypothesis Ranking
==========================================================================================
Rank  Title                                                Score Target Gene         
------------------------------------------------------------------------------------------
1     Epigenetic Reprogramming of Microglial Memory        0.508 DNMT3A, HDAC1/2     
2     Microbiota-Microglia Axis Modulation                 0.476 Multiple            
3     Synaptic Pruning Precision Therapy                   0.465 C1QA, C3, CX3CR1, CX
4     Cardiovascular-Neuroinflammatory Dual Targeting      0.462 TNF/IL6             
5     IGFBPL1-Mediated Homeostatic Restoration             0.446 IGFBPL1             
6     Cardiovascular-Neuroinflammation Crosstalk Inter     0.437 IL1B, TNFA, NLRP3   
7     APOE4-Lipid Metabolism Correction                    0.425 APOE                
8     Gut-Brain Axis Microbiome Modulation                 0.421 GPR43, GPR109A      
9     Perinatal Immune Challenge Prevention                0.416 Multiple            
10    IGFBPL1-Mediated Microglial Reprogramming            0.414 IGFBPL1             
11    Complement-Mediated Synaptic Protection              0.410 C1QA                
12    Temporal Gating of Microglial Responses              0.389 CLOCK, ARNTL        
13    Perinatal Hypoxia-Primed Microglia Targeting         0.385 HIF1A, NFKB1        
14    TREM2-P2RY12 Balance Restoration Therapy             0.366 TREM2

# ── Radar chart: top 3 hypotheses across 10 scoring dimensions ────────────
# Score columns and their display labels (same order in both lists).
dim_keys = ['mechanistic_plausibility_score', 'confidence_score', 'novelty_score',
            'feasibility_score', 'impact_score', 'druggability_score',
            'safety_profile_score', 'competitive_landscape_score',
            'data_availability_score', 'reproducibility_score']
dim_labels = ['Mechanistic', 'Confidence', 'Novelty', 'Feasibility', 'Impact',
              'Druggability', 'Safety', 'Competition', 'Data Avail.', 'Reproducibility']

top3 = hypotheses[:3]
radar_colors = ['#4fc3f7', '#ef5350', '#81c784']

fig, axes = plt.subplots(1, 3, figsize=(18, 6), subplot_kw=dict(polar=True))

# One spoke per dimension; the first angle is repeated so the outline closes.
spoke_angles = np.linspace(0, 2 * np.pi, len(dim_labels), endpoint=False).tolist()
angles = spoke_angles + spoke_angles[:1]
short_dim_labels = [d[:6] for d in dim_labels]

for rank, (hyp, panel, color) in enumerate(zip(top3, axes, radar_colors), start=1):
    # Missing or null scores are plotted as 0.
    dim_scores = [hyp.get(k, 0) or 0 for k in dim_keys]
    closed_scores = dim_scores + dim_scores[:1]
    panel.plot(angles, closed_scores, 'o-', linewidth=2, color=color, alpha=0.8)
    panel.fill(angles, closed_scores, alpha=0.2, color=color)
    panel.set_xticks(angles[:-1])
    panel.set_xticklabels(short_dim_labels, fontsize=6)
    panel.set_ylim(0, 1)
    if len(hyp['title']) > 30:
        short_title = hyp['title'][:30] + '...'
    else:
        short_title = hyp['title'][:30]
    panel.set_title(f"#{rank}: {short_title}\n(Score: {hyp['composite_score']:.3f})",
                    fontsize=9, color=color, fontweight='bold', pad=15)
    panel.set_facecolor('#151525')

plt.suptitle('Top 3 Hypotheses — 10-Dimension Score Profiles', fontsize=14,
             color='#4fc3f7', fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Dimension comparison: one row per dimension, one column per top hypothesis.
print("\nDimension Comparison — Top 3 Hypotheses")
print("=" * 80)
header = f"{'Dimension':<20}" + "".join(f" {hyp['title'][:18]:>20}" for hyp in top3)
print(header)
print("-" * 80)
for k, label in zip(dim_keys, dim_labels):
    cells = "".join(f" {(hyp.get(k, 0) or 0):>20.2f}" for hyp in top3)
    print(f"{label:<20}" + cells)

Dimension Comparison — Top 3 Hypotheses
================================================================================
Dimension              Epigenetic Reprogr   Microbiota-Microgl   Synaptic Pruning P
--------------------------------------------------------------------------------
Mechanistic                          0.70                 0.40                 0.80
Confidence                           0.60                 0.30                 0.70
Novelty                              0.80                 0.60                 0.70
Feasibility                          0.80                 0.60                 0.60
Impact                               0.70                 0.50                 0.80
Druggability                         0.90                 0.70                 0.60
Safety                               0.60                 0.80                 0.50
Competition                          0.70                 0.40                 0.60
Data Avail.                          0.70                 0.40                 0.80
Reproducibility                      0.80                 0.30                 0.70

# ── PubMed Literature Search ──────────────────────────────────────────────
# Runs each query through the Forge PubMed tool, accumulates every returned
# record in `all_papers`, and prints a one-line listing per paper.

queries = [
    "microglial priming Alzheimer neuroinflammation",
    "TREM2 microglia neurodegeneration",
    "complement C1q synaptic pruning Alzheimer",
]

all_papers = []
for query in queries:
    try:
        results = forge_tools.pubmed_search(query, max_results=5)
        if results:
            all_papers.extend(results)
            print(f"\n[PubMed] '{query}' → {len(results)} papers")
            for p in results:
                # A record may carry title=None; treat that as empty text so a
                # single odd record is not reported as a failed query.
                title = str(p.get('title') or '')
                # Only mark titles that were actually cut at 70 characters.
                if len(title) > 70:
                    title = title[:70] + '...'
                print(f"  PMID:{p.get('pmid','')} | {title} ({p.get('year','')})")
        else:
            # Make empty result sets visible instead of skipping them silently.
            print(f"\n[PubMed] '{query}' → 0 papers")
    except Exception as e:
        # Best effort: one failing query must not stop the remaining ones.
        print(f"  [Warning] PubMed query failed: {e}")

print(f"\nTotal papers retrieved: {len(all_papers)}")

Failed to log tool call pubmed_search: no such table: tool_calls

[PubMed] 'microglial priming Alzheimer neuroinflammation' → 5 papers
  PMID:35642214 | Microglia-Mediated Neuroinflammation: A Potential Target for the Treat... (2022)
  PMID:35248147 | Microbiota in neuroinflammation and synaptic dysfunction: a focus on A... (2022)
  PMID:29951498 | Microglial priming in Alzheimer's disease.... (2018)
  PMID:38561809 | The endotoxin hypothesis of Alzheimer's disease.... (2024)
  PMID:34080771 | Acute systemic inflammation exacerbates neuroinflammation in Alzheimer... (2021)

Failed to log tool call pubmed_search: no such table: tool_calls

[PubMed] 'TREM2 microglia neurodegeneration' → 5 papers
  PMID:38769824 | Microglia, Trem2, and Neurodegeneration.... (2025)
  PMID:28930663 | The TREM2-APOE Pathway Drives the Transcriptional Phenotype of Dysfunc... (2017)
  PMID:30258234 | Microglia in neurodegeneration.... (2018)
  PMID:29775591 | Disease-Associated Microglia: A Universal Immune Sensor of Neurodegene... (2018)
  PMID:40122810 | Enhancing TREM2 expression activates microglia and modestly mitigates ... (2025)

Failed to log tool call pubmed_search: no such table: tool_calls

[PubMed] 'complement C1q synaptic pruning Alzheimer' → 5 papers
  PMID:27033548 | Complement and microglia mediate early synapse loss in Alzheimer mouse... (2016)
  PMID:34472455 | Microglia regulation of synaptic plasticity and learning and memory.... (2022)
  PMID:27114033 | Progranulin Deficiency Promotes Circuit-Specific Synaptic Pruning by M... (2016)
  PMID:34595138 | The Role of Complement in Synaptic Pruning and Neurodegeneration.... (2021)
  PMID:38278523 | C5aR1 signaling promotes region- and age-dependent synaptic pruning in... (2024)

Total papers retrieved: 15

# ── Gene Annotations (MyGene.info) ────────────────────────────────────────
# Looks up each target gene through the Forge gene-info tool, stores the
# returned record in `gene_data` and prints name, summary, type and aliases.

target_genes = ['TREM2', 'DNMT3A', 'C1QA', 'IGFBPL1', 'APOE']
gene_data = {}

print("Gene Annotations for Key Microglial Priming Targets")
print("=" * 90)
for gene in target_genes:
    try:
        info = forge_tools.get_gene_info(gene)
        if info:
            gene_data[gene] = info
            print(f"\n{gene} — {info.get('name', 'N/A')}")
            # `.get(key, default)` returns None when the key exists with a null
            # value, which would make the slice below raise; `or` covers both cases.
            summary = info.get('summary') or 'No summary available'
            # Only mark summaries that were actually cut at 120 characters.
            if len(summary) > 120:
                summary = summary[:120] + '...'
            print(f"  Summary: {summary}")
            print(f"  Type: {info.get('type', 'N/A')} | Aliases: {info.get('aliases', 'N/A')}")
        else:
            # Make missing annotations visible instead of skipping them silently.
            print(f"\n{gene} — no annotation returned")
    except Exception as e:
        # Best effort: one failing lookup must not stop the remaining genes.
        print(f"  [Warning] Gene info failed for {gene}: {e}")

print(f"\nAnnotated {len(gene_data)}/{len(target_genes)} target genes")

Gene Annotations for Key Microglial Priming Targets
==========================================================================================

Failed to log tool call get_gene_info: no such table: tool_calls

TREM2 — triggering receptor expressed on myeloid cells 2
  Summary: This gene encodes a membrane protein that forms a receptor signaling complex with the TYRO protein tyrosine kinase bindi...
  Type: protein-coding | Aliases: ['AD17', 'PLOSL2', 'TREM-2', 'Trem2a', 'Trem2b', 'Trem2c']

Failed to log tool call get_gene_info: no such table: tool_calls

DNMT3A — DNA methyltransferase 3 alpha
  Summary: CpG methylation is an epigenetic modification that is important for embryonic development, imprinting, and X-chromosome ...
  Type: protein-coding | Aliases: ['DNMT3A2', 'HESJAS', 'M.HsaIIIA', 'TBRS']

Failed to log tool call get_gene_info: no such table: tool_calls

C1QA — complement C1q A chain
  Summary: This gene encodes the A-chain polypeptide of serum complement subcomponent C1q, which associates with C1r and C1s to yie...
  Type: protein-coding | Aliases: ['C1QD1']

Failed to log tool call get_gene_info: no such table: tool_calls

IGFBPL1 — insulin like growth factor binding protein like 1
  Summary: Predicted to enable insulin-like growth factor binding activity. Involved in cellular response to tumor cell. Located in...
  Type: protein-coding | Aliases: ['IGFBP-RP4', 'IGFBPRP4', 'bA113O24.1']

Failed to log tool call get_gene_info: no such table: tool_calls

# ── STRING Protein-Protein Interactions ───────────────────────────────────
# Queries the Forge STRING tool for the gene list below and prints the 15
# highest-scoring interactions. `interactions` is always left defined.

# Query interactions for key microglial genes
query_genes = ['TREM2', 'C1QA', 'APOE', 'DNMT3A', 'IGFBPL1', 'TNF', 'IL1B', 'NLRP3', 'P2RY12']

# Default used when the query itself fails.
interactions = []
try:
    # `or []` keeps the rest of the cell working if the tool returns None.
    interactions = forge_tools.string_protein_interactions(query_genes, species=9606,
                                               network_type="functional",
                                               score_threshold=400) or []
    print(f"STRING: {len(interactions)} interactions found for {len(query_genes)} query genes")
    print("\nTop interactions by combined score:")
    print(f"{'Protein 1':<15} {'Protein 2':<15} {'Score':>8}")
    print("-" * 40)
    # A null score sorts as 0 instead of breaking the comparison.
    sorted_int = sorted(interactions, key=lambda x: x.get('score') or 0, reverse=True)[:15]
    for ix in sorted_int:
        p1 = ix.get('preferredName_A', ix.get('protein1', '?'))
        p2 = ix.get('preferredName_B', ix.get('protein2', '?'))
        sc = ix.get('score', 0)
        # Fixed precision keeps the column aligned (0.620 rather than 0.62).
        score_text = f"{sc:.3f}" if isinstance(sc, (int, float)) else str(sc)
        print(f"{p1:<15} {p2:<15} {score_text:>8}")
except Exception as e:
    # Best effort: report the problem; interactions fetched before a display
    # error are kept rather than being reset to an empty list.
    print(f"[Warning] STRING query failed: {e}")

Failed to log tool call string_protein_interactions: no such table: tool_calls

STRING: 19 interactions found for 9 query genes

Top interactions by combined score:
Protein 1       Protein 2          Score
----------------------------------------
IL1B            TNF                0.998
APOE            TREM2              0.997
IL1B            NLRP3              0.997
P2RY12          TREM2              0.874
NLRP3           TNF                0.869
TREM2           C1QA               0.854
APOE            IL1B               0.846
APOE            TNF                0.806
P2RY12          C1QA               0.714
APOE            P2RY12             0.684
IL1B            TREM2              0.668
IL1B            P2RY12              0.62
TREM2           TNF                 0.59
APOE            C1QA               0.546
APOE            NLRP3              0.505

# ── Reactome Pathway Enrichment ───────────────────────────────────────────
# For each gene, fetch up to five pathways via the Forge Reactome tool, keep
# them in `all_pathways`, and list the first three.

pathway_genes = ['TREM2', 'C1QA', 'APOE', 'TNF', 'IL1B']
all_pathways = {}

print("Reactome Pathway Associations")
print("=" * 80)
for gene in pathway_genes:
    try:
        pws = forge_tools.reactome_pathways(gene, max_results=5)
        if not pws:
            # Nothing returned for this gene: move on without printing.
            continue
        all_pathways[gene] = pws
        print(f"\n{gene} — {len(pws)} pathways:")
        for pw in pws[:3]:
            # Records may use either of two key spellings for name and id.
            fallback_name = pw.get('displayName', 'Unknown')
            name = pw.get('name', fallback_name)
            fallback_id = pw.get('stId', '')
            pid = pw.get('pathway_id', fallback_id)
            print(f"  {pid}: {name}")
    except Exception as e:
        print(f"  [Warning] Reactome failed for {gene}: {e}")

total = sum(map(len, all_pathways.values()))
print(f"\nTotal pathway associations: {total} across {len(all_pathways)} genes")

Reactome Pathway Associations
================================================================================

Failed to log tool call reactome_pathways: no such table: tool_calls

TREM2 — 4 pathways:
  R-HSA-198933: Immunoregulatory interactions between a Lymphoid and a non-Lymphoid cell
  R-HSA-2172127: DAP12 interactions
  R-HSA-2424491: DAP12 signaling

Failed to log tool call reactome_pathways: no such table: tool_calls

C1QA — 3 pathways:
  R-HSA-166663: Initial triggering of complement
  R-HSA-173623: Classical antibody-mediated complement activation
  R-HSA-977606: Regulation of Complement cascade

Failed to log tool call reactome_pathways: no such table: tool_calls

APOE — 5 pathways:
  R-HSA-1251985: Nuclear signaling by ERBB4
  R-HSA-3000480: Scavenging by Class A Receptors
  R-HSA-381426: Regulation of Insulin-like Growth Factor (IGF) transport and uptake by Insulin-like Growth Factor Binding Proteins (IGFBPs)

Failed to log tool call reactome_pathways: no such table: tool_calls

TNF — 5 pathways:
  R-HSA-381340: Transcriptional regulation of white adipocyte differentiation
  R-HSA-5357786: TNFR1-induced proapoptotic signaling
  R-HSA-5357905: Regulation of TNFR1 signaling

Failed to log tool call reactome_pathways: no such table: tool_calls

# ── UniProt Protein Data ──────────────────────────────────────────────────
# Fetch a UniProt record per gene via the Forge tool, keep it in
# `protein_data`, and print the headline fields.

uniprot_targets = ['TREM2', 'APOE', 'C1QA']
protein_data = {}

print("UniProt Protein Annotations")
print("=" * 80)
for gene in uniprot_targets:
    try:
        info = forge_tools.uniprot_protein_info(gene)
        if not info:
            # Nothing returned for this gene: move on without printing.
            continue
        protein_data[gene] = info
        protein_name = info.get('protein_name', 'N/A')
        accession = info.get('accession', 'N/A')
        seq_length = info.get('sequence_length', 'N/A')
        print(f"\n{gene} — {protein_name}")
        print(f"  Accession: {accession}")
        print(f"  Length: {seq_length} aa")
        func = info.get('function', 'N/A')
        # The function line is printed only when the field is plain text.
        if isinstance(func, str):
            print(f"  Function: {func[:150]}...")
        location_text = str(info.get('subcellular_location', 'N/A'))
        print(f"  Subcellular: {location_text[:80]}")
    except Exception as e:
        print(f"  [Warning] UniProt failed for {gene}: {e}")

print(f"\nAnnotated {len(protein_data)}/{len(uniprot_targets)} proteins")

UniProt Protein Annotations
================================================================================

Failed to log tool call uniprot_protein_info: no such table: tool_calls

TREM2 — Triggering receptor expressed on myeloid cells 2
  Accession: Q9NZC2
  Length: 230 aa
  Function: Forms a receptor signaling complex with TYROBP which mediates signaling and cell activation following ligand binding (PubMed:10799849). Acts as a rece...
  Subcellular: Cell membrane

Failed to log tool call uniprot_protein_info: no such table: tool_calls

APOE — Apolipoprotein E
  Accession: P02649
  Length: 317 aa
  Function: APOE is an apolipoprotein, a protein associating with lipid particles, that mainly functions in lipoprotein-mediated lipid transport between organs vi...
  Subcellular: Secreted, Secreted, extracellular space, Secreted, extracellular space, extracel

Failed to log tool call uniprot_protein_info: no such table: tool_calls

C1QA — Complement C1q subcomponent subunit A
  Accession: P02745
  Length: 245 aa
  Function: Core component of the complement C1 complex, a multiprotein complex that initiates the classical pathway of the complement system, a cascade of protei...
  Subcellular: Secreted, Cell surface

Annotated 3/3 proteins

# ── Build and analyze knowledge graph ─────────────────────────────────────
# Builds an undirected graph `G` from `edges` (hypothesis-level edges are
# excluded), then prints size, component count and the ten highest-degree nodes.
import networkx as nx

G = nx.Graph()
for e in edges:
    src = e['source_id']
    tgt = e['target_id']
    # A null relation is stored as '' so later substring checks on it are safe.
    rel = e['relation'] or ''
    # `.get(key, default)` only falls back when the key is absent; a key that
    # holds None would pass through and later break width = weight * 2.5.
    strength = e.get('evidence_strength')
    if strength is None:
        strength = 0.5
    # Edges without both endpoints cannot be added to the graph.
    if not src or not tgt:
        continue
    # Skip hypothesis-to-target edges for the gene-gene network
    if src.startswith('h-') or tgt.startswith('h-'):
        continue
    # In an undirected simple graph a repeated pair overwrites the earlier attributes.
    G.add_edge(src, tgt, relation=rel, weight=strength)

print("Knowledge Graph (gene/concept level)")
print(f"Nodes: {G.number_of_nodes()} | Edges: {G.number_of_edges()}")
print(f"Connected components: {nx.number_connected_components(G)}")

# Centrality analysis
degree_cent = nx.degree_centrality(G)
betweenness = nx.betweenness_centrality(G)

sorted_nodes = sorted(degree_cent.items(), key=lambda x: x[1], reverse=True)
print("\nHub Analysis (by degree centrality):")
print(f"{'Node':<30} {'Degree':>8} {'Betweenness':>12} {'Connections':>12}")
print("-" * 65)
for node, dc in sorted_nodes[:10]:
    bc = betweenness.get(node, 0)
    deg = G.degree(node)
    print(f"{node[:28]:<30} {dc:>8.3f} {bc:>12.3f} {deg:>12}")

Knowledge Graph (gene/concept level)
Nodes: 39 | Edges: 66
Connected components: 5

Hub Analysis (by degree centrality):
Node                             Degree  Betweenness  Connections
-----------------------------------------------------------------
Alzheimer's disease               0.579        0.457           22
IGFBPL1                           0.368        0.202           14
C1QA, C3, CX3CR1, CX3CL1          0.184        0.000            7
CLOCK, ARNTL                      0.184        0.000            7
DNMT3A, HDAC1/2                   0.184        0.000            7
GPR43, GPR109A                    0.184        0.000            7
HIF1A, NFKB1                      0.184        0.000            7
IL1B, TNFA, NLRP3                 0.184        0.000            7
C1QA                              0.158        0.073            6
Multiple                          0.158        0.005            6

# ── Knowledge graph visualization ─────────────────────────────────────────
from matplotlib.patches import Patch

fig, ax = plt.subplots(figsize=(14, 14))

# Spring layout with a fixed seed
pos = nx.spring_layout(G, k=2.5, iterations=80, seed=42)

# Marker area grows linearly with node degree.
node_sizes = [300 + G.degree(n) * 200 for n in G.nodes()]

# Node colours: blue for gene/protein names, red for disease terms, green otherwise.
gene_nodes = {'TREM2', 'DNMT3A', 'C1QA', 'C3', 'CX3CR1', 'CX3CL1', 'IGFBPL1',
              'APOE', 'TNF', 'IL1B', 'TNFA', 'NLRP3', 'GPR43', 'GPR109A',
              'HIF1A', 'NFKB1', 'CLOCK', 'ARNTL', 'P2RY12', 'IL6', 'HDAC1', 'HDAC2',
              'TNF/IL6', 'IL1B, TNFA, NLRP3', 'DNMT3A, HDAC1/2',
              'C1QA, C3, CX3CR1, CX3CL1', 'GPR43, GPR109A', 'HIF1A, NFKB1',
              'CLOCK, ARNTL'}
# Substrings that also mark a node as gene-like when it is not listed above.
gene_fragments = ('TREM2', 'APOE', 'C1Q', 'TNF', 'IL1', 'NLRP')
node_colors = []
for n in G.nodes():
    if n in gene_nodes or any(fragment in n for fragment in gene_fragments):
        chosen = '#4fc3f7'
    elif 'disease' in n.lower() or 'alzheimer' in n.lower():
        chosen = '#ef5350'
    else:
        chosen = '#81c784'
    node_colors.append(chosen)

# Edge colours: the first keyword group found in the relation text wins.
relation_palette = (
    (('drives', 'promotes'), '#ef5350'),
    (('regulates', 'modulates'), '#ffd54f'),
    (('co_associated',), '#555555'),
)
edge_colors = []
edge_widths = []
for _, _, attrs in G.edges(data=True):
    relation_text = attrs.get('relation', '')
    matched = next(
        (shade for keywords, shade in relation_palette
         if any(word in relation_text for word in keywords)),
        '#4fc3f788',
    )
    edge_colors.append(matched)
    # Line width scales with the stored edge weight.
    edge_widths.append(attrs.get('weight', 0.5) * 2.5)

nx.draw_networkx_edges(G, pos, ax=ax, edge_color=edge_colors, width=edge_widths, alpha=0.5)
nx.draw_networkx_nodes(G, pos, ax=ax, node_size=node_sizes, node_color=node_colors,
                       edgecolors='#333', linewidths=1.5, alpha=0.9)

# Labels: underscores become spaces; anything over 20 characters is shortened.
labels = {}
for n in G.nodes():
    text = n.replace('_', ' ')
    labels[n] = (text[:18] + '..') if len(text) > 20 else text
nx.draw_networkx_labels(G, pos, labels, ax=ax, font_size=7, font_color='#e0e0e0',
                        font_weight='bold')

graph_title = ('Knowledge Graph — Microglial Priming in Early AD\n'
               f'{G.number_of_nodes()} nodes, {G.number_of_edges()} edges')
ax.set_title(graph_title, fontsize=14, color='#4fc3f7', fontweight='bold')
ax.axis('off')

# Legend for the three node colour classes
legend_elements = [
    Patch(facecolor=swatch, label=caption)
    for swatch, caption in (
        ('#4fc3f7', 'Gene/Protein'),
        ('#ef5350', 'Disease'),
        ('#81c784', 'Biological Process'),
    )
]
ax.legend(handles=legend_elements, loc='lower left', fontsize=9)
plt.tight_layout()
plt.show()

# ── Edge type and relation analysis ───────────────────────────────────────
# Counts edges per relation type, plots the counts next to a histogram of
# evidence strengths, and prints a per-relation summary.
from collections import Counter

# `.get(key, default)` only falls back when the key is absent; a key that holds
# None would pass through and break np.mean and the histogram, so None is
# mapped to the same 0.5 default explicitly.
all_strengths = [
    0.5 if e.get('evidence_strength') is None else e['evidence_strength']
    for e in edges
]

relation_counts = Counter(e['relation'] for e in edges)
strength_by_relation = {}
for e, edge_strength in zip(edges, all_strengths):
    strength_by_relation.setdefault(e['relation'], []).append(edge_strength)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Relation type distribution
rels = sorted(relation_counts.items(), key=lambda x: x[1], reverse=True)
# str() keeps the label readable even if a relation value is null.
rel_names = [str(r[0]).replace('_', ' ') for r in rels]
rel_vals = [r[1] for r in rels]
rel_colors = ['#4fc3f7', '#ef5350', '#81c784', '#ffd54f', '#b39ddb', '#ff8a65'][:len(rels)]

ax1.barh(range(len(rel_names)), rel_vals, color=rel_colors, alpha=0.85, edgecolor='#333')
ax1.set_yticks(range(len(rel_names)))
ax1.set_yticklabels(rel_names, fontsize=10)
ax1.set_xlabel('Count', fontsize=11)
ax1.set_title('Edge Relation Types', fontsize=13, color='#4fc3f7', fontweight='bold')
for i, v in enumerate(rel_vals):
    ax1.text(v + 0.5, i, str(v), va='center', fontsize=9, color='#e0e0e0')

# Evidence strength distribution
ax2.hist(all_strengths, bins=10, color='#4fc3f7', alpha=0.8, edgecolor='#333')
ax2.set_xlabel('Evidence Strength', fontsize=11)
ax2.set_ylabel('Count', fontsize=11)
ax2.set_title('Evidence Strength Distribution', fontsize=13, color='#4fc3f7', fontweight='bold')
# The mean marker and summary statistics only make sense with at least one edge.
mean_str = float('nan')
if all_strengths:
    mean_str = np.mean(all_strengths)
    ax2.axvline(x=mean_str, color='#ef5350', linestyle='--', alpha=0.7,
                label=f'Mean: {mean_str:.2f}')
    ax2.legend(fontsize=9)

plt.tight_layout()
plt.show()

print("\nEdge Analysis Summary")
print(f"Total edges: {len(edges)}")
print(f"Unique relation types: {len(relation_counts)}")
if all_strengths:
    print(f"Evidence strength: mean={mean_str:.3f}, min={min(all_strengths):.2f}, max={max(all_strengths):.2f}")
else:
    print("Evidence strength: n/a (no edges)")
for rel, count in sorted(relation_counts.items(), key=lambda x: x[1], reverse=True):
    avg = np.mean(strength_by_relation[rel])
    print(f"  {rel}: {count} edges (avg strength: {avg:.2f})")

Edge Analysis Summary
Total edges: 105
Unique relation types: 12
Evidence strength: mean=0.450, min=0.20, max=0.80
  co_associated_with: 34 edges (avg strength: 0.40)
  targets: 25 edges (avg strength: 0.47)
  associated_with_microglial_priming: 16 edges (avg strength: 0.50)
  implicated_in: 14 edges (avg strength: 0.43)
  associated_with: 9 edges (avg strength: 0.50)
  drives: 1 edges (avg strength: 0.50)
  mediates: 1 edges (avg strength: 0.50)
  regulates: 1 edges (avg strength: 0.50)
  promotes: 1 edges (avg strength: 0.50)
  maintains: 1 edges (avg strength: 0.50)
  modulates: 1 edges (avg strength: 0.50)
  programs: 1 edges (avg strength: 0.50)

# ── Score distribution and statistical tests ──────────────────────────────
# Builds `composite_scores` and the hypotheses × dimensions `dim_matrix`, then
# draws a 2×2 overview: score histogram, per-dimension box plots, dimension
# correlation heatmap, and a per-hypothesis score heatmap.
composite_scores = [h['composite_score'] for h in hypotheses]
dim_keys_short = ['mechanistic_plausibility_score', 'confidence_score', 'novelty_score',
                  'feasibility_score', 'impact_score', 'druggability_score',
                  'safety_profile_score', 'competitive_landscape_score',
                  'data_availability_score', 'reproducibility_score']
dim_labels_short = ['Mech.', 'Conf.', 'Novel.', 'Feas.', 'Impact',
                    'Drug.', 'Safety', 'Comp.', 'Data', 'Reprod.']

# One row per hypothesis, one column per dimension; missing/null scores count as 0.
dim_matrix = np.array([[h.get(k, 0) or 0 for k in dim_keys_short] for h in hypotheses])

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Composite score distribution
ax = axes[0, 0]
ax.hist(composite_scores, bins=8, color='#4fc3f7', alpha=0.8, edgecolor='#333')
ax.axvline(np.mean(composite_scores), color='#ef5350', linestyle='--', linewidth=2,
           label=f'Mean: {np.mean(composite_scores):.3f}')
ax.axvline(np.median(composite_scores), color='#81c784', linestyle='--', linewidth=2,
           label=f'Median: {np.median(composite_scores):.3f}')
ax.set_xlabel('Composite Score', fontsize=11)
ax.set_ylabel('Count', fontsize=11)
# Sample size is taken from the data so the title stays correct for any analysis.
ax.set_title(f'Composite Score Distribution (n={len(composite_scores)})', fontsize=13,
             color='#4fc3f7', fontweight='bold')
ax.legend(fontsize=9)

# 2. Dimension box plots
ax = axes[0, 1]
# Vertical boxes are the default. Tick labels are set afterwards because the
# `labels=` keyword of boxplot was renamed in newer Matplotlib releases.
bp = ax.boxplot(dim_matrix, patch_artist=True)
ax.set_xticklabels(dim_labels_short)
box_colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(dim_labels_short)))
for patch, color in zip(bp['boxes'], box_colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
ax.set_ylabel('Score', fontsize=11)
ax.set_title('Score Distributions by Dimension', fontsize=13,
             color='#4fc3f7', fontweight='bold')
ax.tick_params(axis='x', rotation=45)

# 3. Dimension correlation heatmap (correlation between dimension columns)
ax = axes[1, 0]
corr = np.corrcoef(dim_matrix.T)
im = ax.imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')
ax.set_xticks(range(len(dim_labels_short)))
ax.set_xticklabels(dim_labels_short, rotation=45, ha='right', fontsize=7)
ax.set_yticks(range(len(dim_labels_short)))
ax.set_yticklabels(dim_labels_short, fontsize=7)
ax.set_title('Dimension Correlation', fontsize=13, color='#4fc3f7', fontweight='bold')
plt.colorbar(im, ax=ax, shrink=0.8)

# 4. Score profile heatmap
ax = axes[1, 1]
hyp_labels = [h['title'][:25] for h in hypotheses]
im2 = ax.imshow(dim_matrix, cmap='YlOrRd', aspect='auto', vmin=0, vmax=1)
ax.set_xticks(range(len(dim_labels_short)))
ax.set_xticklabels(dim_labels_short, rotation=45, ha='right', fontsize=7)
ax.set_yticks(range(len(hyp_labels)))
ax.set_yticklabels(hyp_labels, fontsize=6)
ax.set_title('All Hypotheses — Score Heatmap', fontsize=13,
             color='#4fc3f7', fontweight='bold')
plt.colorbar(im2, ax=ax, shrink=0.8)

plt.tight_layout()
plt.show()

# ── Statistical summary ───────────────────────────────────────────────────
# Text report: descriptive statistics of the composite scores, a Shapiro-Wilk
# test, a seeded bootstrap interval for the mean, and per-dimension statistics.
rule = "-" * 70
banner = "=" * 70

print(banner)
print("STATISTICAL ANALYSIS — Microglial Priming in Early AD")
print(banner)

# Composite score stats (np.std is used with its default settings)
n_scores = len(composite_scores)
score_mean = np.mean(composite_scores)
score_median = np.median(composite_scores)
score_std = np.std(composite_scores)
q1, q3 = np.percentile(composite_scores, [25, 75])

print(f"\n1. COMPOSITE SCORE SUMMARY (n={n_scores})")
print(rule)
print(f"Mean:   {score_mean:.4f}")
print(f"Median: {score_median:.4f}")
print(f"Std:    {score_std:.4f}")
print(f"Range:  [{min(composite_scores):.4f}, {max(composite_scores):.4f}]")
print(f"IQR:    [{q1:.4f}, {q3:.4f}]")
print(f"CV:     {score_std/score_mean*100:.1f}%")

# Normality test
stat_w, p_w = stats.shapiro(composite_scores)
if p_w > 0.05:
    normality_verdict = 'Normal'
else:
    normality_verdict = 'Non-normal'
print("\n2. NORMALITY TEST (Shapiro-Wilk)")
print(rule)
print(f"W={stat_w:.4f}, p={p_w:.4f} — {normality_verdict} distribution")

# Bootstrap CI: resample with replacement and record the mean of each resample.
print("\n3. BOOTSTRAP 95% CI (10,000 resamples)")
print(rule)
np.random.seed(42)  # fixed seed so the interval is reproducible
boot = []
for _ in range(10000):
    resample = np.random.choice(composite_scores, n_scores, replace=True)
    boot.append(np.mean(resample))
lo, hi = np.percentile(boot, [2.5, 97.5])
print(f"95% CI for mean composite score: [{lo:.4f}, {hi:.4f}]")

# Dimension analysis: one table row per column of dim_matrix
print("\n4. DIMENSION ANALYSIS")
print(rule)
print(f"{'Dimension':<25} {'Mean':>8} {'Std':>8} {'Min':>8} {'Max':>8}")
print(rule)
for label, col in zip(dim_labels_short, dim_matrix.T):
    print(f"{label:<25} {np.mean(col):>8.3f} {np.std(col):>8.3f} {np.min(col):>8.2f} {np.max(col):>8.2f}")

# Dimensions with the highest and lowest mean score across hypotheses
dim_means = dim_matrix.mean(axis=0)
strongest = dim_labels_short[np.argmax(dim_means)]
weakest = dim_labels_short[np.argmin(dim_means)]
print(f"\nStrongest dimension: {strongest} (mean={np.max(dim_means):.3f})")
print(f"Weakest dimension:  {weakest} (mean={np.min(dim_means):.3f})")

======================================================================
STATISTICAL ANALYSIS — Microglial Priming in Early AD
======================================================================

1. COMPOSITE SCORE SUMMARY (n=14)
----------------------------------------------------------------------
Mean:   0.4300
Median: 0.4230
Std:    0.0373
Range:  [0.3662, 0.5075]
IQR:    [0.4113, 0.4582]
CV:     8.7%

2. NORMALITY TEST (Shapiro-Wilk)
----------------------------------------------------------------------
W=0.9810, p=0.9800 — Normal distribution

3. BOOTSTRAP 95% CI (10,000 resamples)
----------------------------------------------------------------------
95% CI for mean composite score: [0.4111, 0.4499]

4. DIMENSION ANALYSIS
----------------------------------------------------------------------
Dimension                     Mean      Std      Min      Max
----------------------------------------------------------------------
Mech.                        0.529    0.162     0.30     0.80
Conf.                        0.429    0.171     0.20     0.80
Novel.                       0.729    0.153     0.40     0.90
Feas.                        0.450    0.229     0.10     0.80
Impact                       0.621    0.132     0.40     0.80
Drug.                        0.500    0.265     0.10     0.90
Safety                       0.543    0.172     0.20     0.80
Comp.                        0.693    0.187     0.30     0.90
Data                         0.500    0.200     0.20     0.80
Reprod.                      0.464    0.216     0.10     0.80

Strongest dimension: Novel. (mean=0.729)
Weakest dimension:  Conf. (mean=0.429)

Neuroinflammation and Microglial Priming in Early Alzheimer's Disease

Neuroinflammation and Microglial Priming in Early Alzheimer's Disease¶

Research Question¶

Contents¶

2. Hypothesis Landscape¶

3. Evidence Mining with Forge Scientific Tools¶

4. Knowledge Graph Analysis¶

5. Statistical Analysis¶

6. Conclusions¶

Key Findings¶

Research Directions¶