# Install required packages (uncomment if needed)
# !pip install allensdk numpy pandas matplotlib seaborn scipy requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import requests
import json
from typing import List, Dict, Optional

# AllenSDK imports
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache
from allensdk.api.queries.mouse_connectivity_api import MouseConnectivityApi
from allensdk.api.queries.reference_space_api import ReferenceSpaceApi

# Configure plotting
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100

print("✓ Imports successful")

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[2], line 11
      7 import json
      8 from typing import List, Dict, Optional
      9 
     10 # AllenSDK imports
---> 11 from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache
     12 from allensdk.api.queries.mouse_connectivity_api import MouseConnectivityApi
     13 from allensdk.api.queries.reference_space_api import ReferenceSpaceApi
     14 

ModuleNotFoundError: No module named 'allensdk'

# Initialize Allen Brain API connections
mcc = MouseConnectivityCache(manifest_file='connectivity/mouse_connectivity_manifest.json')
mca = MouseConnectivityApi()
rsa = ReferenceSpaceApi()

print("✓ Allen Brain API initialized")

def query_gene_expression(gene_symbols: List[str], structure_ids: Optional[List[int]] = None) -> pd.DataFrame:
    """
    Query Allen Brain API for gene expression data.
    
    Args:
        gene_symbols: List of gene symbols (e.g., ['MAPT', 'APP', 'SNCA'])
        structure_ids: Optional list of Allen Brain structure IDs
    
    Returns:
        DataFrame with gene expression data
    """
    base_url = "http://api.brain-map.org/api/v2/data/query.json"
    
    results = []
    
    for gene in gene_symbols:
        # Query for ISH experiments
        query = (
            f"criteria=model::SectionDataSet,"
            f"rma::criteria,[failed$eq'false'],"
            f"products[abbreviation$eq'Mouse'],"
            f"genes[acronym$eq'{gene}'],"
            f"rma::include,genes,section_images"
        )
        
        try:
            response = requests.get(f"{base_url}?{query}", timeout=30)
            response.raise_for_status()
            data = response.json()
            
            if data['success'] and data['msg']:
                for experiment in data['msg']:
                    results.append({
                        'gene': gene,
                        'experiment_id': experiment['id'],
                        'plane_of_section': experiment.get('plane_of_section_id'),
                        'num_images': len(experiment.get('section_images', []))
                    })
                    
            print(f"✓ Retrieved data for {gene}: {len(data['msg'])} experiments")
            
        except Exception as e:
            print(f"✗ Error querying {gene}: {e}")
    
    return pd.DataFrame(results)

# Example: Query gene expression for AD-related genes
ad_genes = ['MAPT', 'APP', 'APOE', 'SNCA', 'PSEN1']
gene_data = query_gene_expression(ad_genes)

print(f"\nRetrieved {len(gene_data)} total experiments")
gene_data.head()

def get_structure_tree():
    """
    Retrieve the Allen Brain structure ontology tree.
    
    Returns:
        Structure tree with anatomical hierarchy
    """
    return mcc.get_structure_tree()

def extract_expression_matrix(experiment_ids: List[int]) -> pd.DataFrame:
    """
    Extract gene expression values for experiments across structures.
    
    Args:
        experiment_ids: List of Allen Brain experiment IDs
    
    Returns:
        DataFrame with expression values (rows=structures, cols=experiments)
    """
    structure_tree = get_structure_tree()
    
    # Get major brain structures
    structures = structure_tree.get_structures_by_set_id([167587189])  # Summary structures
    structure_ids = [s['id'] for s in structures]
    
    expression_data = {}
    
    for exp_id in experiment_ids[:5]:  # Limit to 5 for demo
        try:
            # Get structure unionizes (aggregated expression per structure)
            unionizes = mca.get_structure_unionizes(
                [exp_id],
                is_injection=False,
                structure_ids=structure_ids,
                hemisphere_ids=[3]  # Both hemispheres
            )
            
            for u in unionizes:
                struct_id = u['structure_id']
                if struct_id not in expression_data:
                    expression_data[struct_id] = {}
                expression_data[struct_id][exp_id] = u.get('expression_energy', 0)
            
            print(f"✓ Extracted expression for experiment {exp_id}")
            
        except Exception as e:
            print(f"✗ Error extracting experiment {exp_id}: {e}")
    
    # Convert to DataFrame
    df = pd.DataFrame(expression_data).T
    df.index.name = 'structure_id'
    
    return df

# Example: Extract expression matrix
if len(gene_data) > 0:
    sample_experiments = gene_data['experiment_id'].head(5).tolist()
    expression_matrix = extract_expression_matrix(sample_experiments)
    print(f"\nExpression matrix shape: {expression_matrix.shape}")
    expression_matrix.head()
else:
    print("No experiments to extract. Use demo data.")
    # Create demo data
    expression_matrix = pd.DataFrame(
        np.random.rand(10, 5) * 100,
        columns=[f'exp_{i}' for i in range(5)],
        index=[f'struct_{i}' for i in range(10)]
    )
    expression_matrix.head()

# Heatmap of expression across structures and experiments
plt.figure(figsize=(12, 8))
sns.heatmap(
    expression_matrix,
    cmap='viridis',
    cbar_kws={'label': 'Expression Energy'},
    xticklabels=True,
    yticklabels=True
)
plt.title('Gene Expression Heatmap Across Brain Structures')
plt.xlabel('Experiment ID')
plt.ylabel('Brain Structure')
plt.tight_layout()
plt.show()

# Bar plot of mean expression per structure
mean_expression = expression_matrix.mean(axis=1).sort_values(ascending=False)

plt.figure(figsize=(10, 6))
mean_expression.plot(kind='barh', color='steelblue')
plt.title('Mean Gene Expression by Brain Structure')
plt.xlabel('Mean Expression Energy')
plt.ylabel('Structure ID')
plt.tight_layout()
plt.show()

# Distribution plot
plt.figure(figsize=(10, 6))
for col in expression_matrix.columns[:3]:  # Plot first 3 experiments
    sns.kdeplot(expression_matrix[col], label=col, fill=True, alpha=0.3)

plt.title('Gene Expression Distribution Across Experiments')
plt.xlabel('Expression Energy')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()

def differential_expression_ttest(group1: pd.Series, group2: pd.Series, alpha: float = 0.05) -> Dict:
    """
    Perform two-sample t-test for differential expression.
    
    Args:
        group1: Expression values for group 1
        group2: Expression values for group 2
        alpha: Significance threshold
    
    Returns:
        Dictionary with test results
    """
    # Remove NaN values
    g1 = group1.dropna()
    g2 = group2.dropna()
    
    # Perform t-test
    t_stat, p_value = stats.ttest_ind(g1, g2)
    
    # Calculate fold change
    mean1 = g1.mean()
    mean2 = g2.mean()
    fold_change = mean2 / mean1 if mean1 != 0 else np.inf
    log2_fc = np.log2(fold_change) if fold_change > 0 else 0
    
    return {
        't_statistic': t_stat,
        'p_value': p_value,
        'significant': p_value < alpha,
        'mean_group1': mean1,
        'mean_group2': mean2,
        'fold_change': fold_change,
        'log2_fold_change': log2_fc
    }

# Example: Compare expression between two experiment groups
if expression_matrix.shape[1] >= 2:
    group_a = expression_matrix.iloc[:, 0]
    group_b = expression_matrix.iloc[:, 1]
    
    de_results = differential_expression_ttest(group_a, group_b)
    
    print("Differential Expression Analysis Results:")
    print(f"  t-statistic: {de_results['t_statistic']:.4f}")
    print(f"  p-value: {de_results['p_value']:.4e}")
    print(f"  Significant: {de_results['significant']}")
    print(f"  Fold change: {de_results['fold_change']:.4f}")
    print(f"  Log2 fold change: {de_results['log2_fold_change']:.4f}")

def anova_multiple_groups(expression_df: pd.DataFrame) -> Dict:
    """
    Perform one-way ANOVA across multiple experiment groups.
    
    Args:
        expression_df: DataFrame with expression values
    
    Returns:
        Dictionary with ANOVA results
    """
    # Prepare data for ANOVA (each column is a group)
    groups = [expression_df[col].dropna() for col in expression_df.columns]
    
    # Perform ANOVA
    f_stat, p_value = stats.f_oneway(*groups)
    
    return {
        'f_statistic': f_stat,
        'p_value': p_value,
        'significant': p_value < 0.05,
        'num_groups': len(groups)
    }

# Example: ANOVA across all experiments
anova_results = anova_multiple_groups(expression_matrix)

print("\nOne-Way ANOVA Results:")
print(f"  F-statistic: {anova_results['f_statistic']:.4f}")
print(f"  p-value: {anova_results['p_value']:.4e}")
print(f"  Significant: {anova_results['significant']}")
print(f"  Number of groups: {anova_results['num_groups']}")

# Volcano plot (log2 fold change vs -log10 p-value)
# For demonstration with multiple comparisons
if expression_matrix.shape[1] >= 2:
    de_all = []
    for idx in expression_matrix.index:
        row = expression_matrix.loc[idx]
        if len(row) >= 2:
            result = differential_expression_ttest(
                row.iloc[:len(row)//2],
                row.iloc[len(row)//2:]
            )
            result['structure'] = idx
            de_all.append(result)
    
    de_df = pd.DataFrame(de_all)
    de_df['neg_log10_p'] = -np.log10(de_df['p_value'] + 1e-300)  # Avoid log(0)
    
    plt.figure(figsize=(10, 6))
    colors = ['red' if sig else 'gray' for sig in de_df['significant']]
    plt.scatter(de_df['log2_fold_change'], de_df['neg_log10_p'], c=colors, alpha=0.6)
    plt.axhline(y=-np.log10(0.05), color='blue', linestyle='--', label='p=0.05')
    plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
    plt.xlabel('Log2 Fold Change')
    plt.ylabel('-Log10 P-value')
    plt.title('Volcano Plot: Differential Expression')
    plt.legend()
    plt.tight_layout()
    plt.show()

def enrichr_submit_genes(gene_list: List[str]) -> str:
    """
    Submit gene list to Enrichr API.
    
    Args:
        gene_list: List of gene symbols
    
    Returns:
        User list ID from Enrichr
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
    
    genes_str = '\n'.join(gene_list)
    payload = {
        'list': (None, genes_str),
        'description': (None, 'Allen Brain Analysis')
    }
    
    try:
        response = requests.post(ENRICHR_URL, files=payload, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        if 'userListId' in data:
            print(f"✓ Gene list submitted successfully (ID: {data['userListId']})")
            return data['userListId']
        else:
            print("✗ Failed to submit gene list")
            return None
            
    except Exception as e:
        print(f"✗ Error submitting to Enrichr: {e}")
        return None

def enrichr_get_results(user_list_id: str, gene_set_library: str = 'KEGG_2021_Human') -> pd.DataFrame:
    """
    Retrieve enrichment results from Enrichr.
    
    Args:
        user_list_id: ID returned from enrichr_submit_genes
        gene_set_library: Enrichr library to query
            Options: 'KEGG_2021_Human', 'GO_Biological_Process_2021',
                     'WikiPathways_2021_Human', 'Reactome_2022'
    
    Returns:
        DataFrame with enrichment results
    """
    ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
    
    query_string = f'?userListId={user_list_id}&backgroundType={gene_set_library}'
    
    try:
        response = requests.get(ENRICHR_URL + query_string, timeout=30)
        response.raise_for_status()
        data = response.json()
        
        if gene_set_library in data:
            results = data[gene_set_library]
            
            df = pd.DataFrame(results, columns=[
                'rank', 'term', 'p_value', 'z_score', 'combined_score',
                'overlapping_genes', 'adjusted_p_value', 'old_p_value', 'old_adjusted_p_value'
            ])
            
            print(f"✓ Retrieved {len(df)} enriched terms")
            return df
        else:
            print(f"✗ No results for {gene_set_library}")
            return pd.DataFrame()
            
    except Exception as e:
        print(f"✗ Error retrieving Enrichr results: {e}")
        return pd.DataFrame()

# Example: Enrichment analysis for AD-related genes
genes_for_enrichment = ['MAPT', 'APP', 'APOE', 'SNCA', 'PSEN1', 'PSEN2', 'GRN', 'LRRK2']

list_id = enrichr_submit_genes(genes_for_enrichment)

if list_id:
    # Query multiple libraries
    enrichment_results = {}
    libraries = ['KEGG_2021_Human', 'GO_Biological_Process_2021', 'WikiPathways_2021_Human']
    
    for lib in libraries:
        print(f"\nQuerying {lib}...")
        results = enrichr_get_results(list_id, lib)
        if not results.empty:
            enrichment_results[lib] = results
            print(f"  Top term: {results.iloc[0]['term']} (p={results.iloc[0]['p_value']:.2e})")

# Visualize enrichment results
if list_id and 'KEGG_2021_Human' in enrichment_results:
    kegg_results = enrichment_results['KEGG_2021_Human'].head(10)
    
    plt.figure(figsize=(12, 6))
    plt.barh(
        range(len(kegg_results)),
        -np.log10(kegg_results['p_value']),
        color='coral'
    )
    plt.yticks(range(len(kegg_results)), kegg_results['term'])
    plt.xlabel('-Log10 P-value')
    plt.title('KEGG Pathway Enrichment (Top 10)')
    plt.axvline(x=-np.log10(0.05), color='blue', linestyle='--', label='p=0.05')
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print("No enrichment results to visualize (using demo mode or API unavailable)")

# Export results to JSON (optional)
output_data = {
    'genes_queried': genes_for_enrichment if list_id else ad_genes,
    'expression_summary': {
        'mean': expression_matrix.mean().to_dict(),
        'std': expression_matrix.std().to_dict()
    },
    'differential_expression': de_results if 'de_results' in locals() else {},
    'anova': anova_results if 'anova_results' in locals() else {}
}

# Uncomment to save results
# with open('allen_brain_analysis_results.json', 'w') as f:
#     json.dump(output_data, f, indent=2)

print("✓ Analysis complete!")
print(f"\nAnalyzed {len(expression_matrix.columns)} experiments across {len(expression_matrix)} structures")

Allen Brain Atlas Analysis Template

Allen Brain Atlas Analysis Template¶

1. Environment Setup¶

2. Querying the Allen Brain API¶

3. Gene Expression Data Extraction¶

4. Data Visualization¶

5. Differential Expression Analysis¶

6. Pathway Enrichment Analysis¶

7. Summary and Next Steps¶