Allen Brain Atlas Analysis Template¶
This notebook provides a comprehensive template for analyzing gene expression data from the Allen Brain Atlas.
Contents:
- Environment setup and AllenSDK installation
- Querying the Allen Brain API (brain-map.org)
- Gene expression data extraction
- Data visualization (matplotlib, seaborn)
- Differential expression analysis (scipy)
- Pathway enrichment analysis (Enrichr API)
Resources:
- AllenSDK Documentation: https://allensdk.readthedocs.io/
- Allen Brain API: https://api.brain-map.org/
- Enrichr API: https://maayanlab.cloud/Enrichr/
1. Environment Setup¶
Install required packages if not already available:
# Install required packages (uncomment if needed)
# !pip install allensdk numpy pandas matplotlib seaborn scipy requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import requests
import json
from typing import List, Dict, Optional
# AllenSDK imports
from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache
from allensdk.api.queries.mouse_connectivity_api import MouseConnectivityApi
from allensdk.api.queries.reference_space_api import ReferenceSpaceApi
# Configure plotting
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100
print("✓ Imports successful")
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[2], line 11 7 import json 8 from typing import List, Dict, Optional 9 10 # AllenSDK imports ---> 11 from allensdk.core.mouse_connectivity_cache import MouseConnectivityCache 12 from allensdk.api.queries.mouse_connectivity_api import MouseConnectivityApi 13 from allensdk.api.queries.reference_space_api import ReferenceSpaceApi 14 ModuleNotFoundError: No module named 'allensdk'
2. Querying the Allen Brain API¶
The Allen Brain API provides programmatic access to gene expression, connectivity, and anatomical data.
# Initialize Allen Brain API connections
mcc = MouseConnectivityCache(manifest_file='connectivity/mouse_connectivity_manifest.json')
mca = MouseConnectivityApi()
rsa = ReferenceSpaceApi()
print("✓ Allen Brain API initialized")
def query_gene_expression(gene_symbols: List[str], structure_ids: Optional[List[int]] = None) -> pd.DataFrame:
"""
Query Allen Brain API for gene expression data.
Args:
gene_symbols: List of gene symbols (e.g., ['MAPT', 'APP', 'SNCA'])
structure_ids: Optional list of Allen Brain structure IDs
Returns:
DataFrame with gene expression data
"""
base_url = "http://api.brain-map.org/api/v2/data/query.json"
results = []
for gene in gene_symbols:
# Query for ISH experiments
query = (
f"criteria=model::SectionDataSet,"
f"rma::criteria,[failed$eq'false'],"
f"products[abbreviation$eq'Mouse'],"
f"genes[acronym$eq'{gene}'],"
f"rma::include,genes,section_images"
)
try:
response = requests.get(f"{base_url}?{query}", timeout=30)
response.raise_for_status()
data = response.json()
if data['success'] and data['msg']:
for experiment in data['msg']:
results.append({
'gene': gene,
'experiment_id': experiment['id'],
'plane_of_section': experiment.get('plane_of_section_id'),
'num_images': len(experiment.get('section_images', []))
})
print(f"✓ Retrieved data for {gene}: {len(data['msg'])} experiments")
except Exception as e:
print(f"✗ Error querying {gene}: {e}")
return pd.DataFrame(results)
# Example: Query gene expression for AD-related genes
ad_genes = ['MAPT', 'APP', 'APOE', 'SNCA', 'PSEN1']
gene_data = query_gene_expression(ad_genes)
print(f"\nRetrieved {len(gene_data)} total experiments")
gene_data.head()
3. Gene Expression Data Extraction¶
Extract and preprocess gene expression values across brain regions.
def get_structure_tree():
"""
Retrieve the Allen Brain structure ontology tree.
Returns:
Structure tree with anatomical hierarchy
"""
return mcc.get_structure_tree()
def extract_expression_matrix(experiment_ids: List[int]) -> pd.DataFrame:
"""
Extract gene expression values for experiments across structures.
Args:
experiment_ids: List of Allen Brain experiment IDs
Returns:
DataFrame with expression values (rows=structures, cols=experiments)
"""
structure_tree = get_structure_tree()
# Get major brain structures
structures = structure_tree.get_structures_by_set_id([167587189]) # Summary structures
structure_ids = [s['id'] for s in structures]
expression_data = {}
for exp_id in experiment_ids[:5]: # Limit to 5 for demo
try:
# Get structure unionizes (aggregated expression per structure)
unionizes = mca.get_structure_unionizes(
[exp_id],
is_injection=False,
structure_ids=structure_ids,
hemisphere_ids=[3] # Both hemispheres
)
for u in unionizes:
struct_id = u['structure_id']
if struct_id not in expression_data:
expression_data[struct_id] = {}
expression_data[struct_id][exp_id] = u.get('expression_energy', 0)
print(f"✓ Extracted expression for experiment {exp_id}")
except Exception as e:
print(f"✗ Error extracting experiment {exp_id}: {e}")
# Convert to DataFrame
df = pd.DataFrame(expression_data).T
df.index.name = 'structure_id'
return df
# Example: Extract expression matrix
if len(gene_data) > 0:
sample_experiments = gene_data['experiment_id'].head(5).tolist()
expression_matrix = extract_expression_matrix(sample_experiments)
print(f"\nExpression matrix shape: {expression_matrix.shape}")
expression_matrix.head()
else:
print("No experiments to extract. Use demo data.")
# Create demo data
expression_matrix = pd.DataFrame(
np.random.rand(10, 5) * 100,
columns=[f'exp_{i}' for i in range(5)],
index=[f'struct_{i}' for i in range(10)]
)
expression_matrix.head()
4. Data Visualization¶
Visualize gene expression patterns using matplotlib and seaborn.
# Heatmap of expression across structures and experiments
plt.figure(figsize=(12, 8))
sns.heatmap(
expression_matrix,
cmap='viridis',
cbar_kws={'label': 'Expression Energy'},
xticklabels=True,
yticklabels=True
)
plt.title('Gene Expression Heatmap Across Brain Structures')
plt.xlabel('Experiment ID')
plt.ylabel('Brain Structure')
plt.tight_layout()
plt.show()
# Bar plot of mean expression per structure
mean_expression = expression_matrix.mean(axis=1).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
mean_expression.plot(kind='barh', color='steelblue')
plt.title('Mean Gene Expression by Brain Structure')
plt.xlabel('Mean Expression Energy')
plt.ylabel('Structure ID')
plt.tight_layout()
plt.show()
# Distribution plot
plt.figure(figsize=(10, 6))
for col in expression_matrix.columns[:3]: # Plot first 3 experiments
sns.kdeplot(expression_matrix[col], label=col, fill=True, alpha=0.3)
plt.title('Gene Expression Distribution Across Experiments')
plt.xlabel('Expression Energy')
plt.ylabel('Density')
plt.legend()
plt.tight_layout()
plt.show()
5. Differential Expression Analysis¶
Perform statistical tests to identify significantly different expression patterns.
def differential_expression_ttest(group1: pd.Series, group2: pd.Series, alpha: float = 0.05) -> Dict:
"""
Perform two-sample t-test for differential expression.
Args:
group1: Expression values for group 1
group2: Expression values for group 2
alpha: Significance threshold
Returns:
Dictionary with test results
"""
# Remove NaN values
g1 = group1.dropna()
g2 = group2.dropna()
# Perform t-test
t_stat, p_value = stats.ttest_ind(g1, g2)
# Calculate fold change
mean1 = g1.mean()
mean2 = g2.mean()
fold_change = mean2 / mean1 if mean1 != 0 else np.inf
log2_fc = np.log2(fold_change) if fold_change > 0 else 0
return {
't_statistic': t_stat,
'p_value': p_value,
'significant': p_value < alpha,
'mean_group1': mean1,
'mean_group2': mean2,
'fold_change': fold_change,
'log2_fold_change': log2_fc
}
# Example: Compare expression between two experiment groups
if expression_matrix.shape[1] >= 2:
group_a = expression_matrix.iloc[:, 0]
group_b = expression_matrix.iloc[:, 1]
de_results = differential_expression_ttest(group_a, group_b)
print("Differential Expression Analysis Results:")
print(f" t-statistic: {de_results['t_statistic']:.4f}")
print(f" p-value: {de_results['p_value']:.4e}")
print(f" Significant: {de_results['significant']}")
print(f" Fold change: {de_results['fold_change']:.4f}")
print(f" Log2 fold change: {de_results['log2_fold_change']:.4f}")
def anova_multiple_groups(expression_df: pd.DataFrame) -> Dict:
"""
Perform one-way ANOVA across multiple experiment groups.
Args:
expression_df: DataFrame with expression values
Returns:
Dictionary with ANOVA results
"""
# Prepare data for ANOVA (each column is a group)
groups = [expression_df[col].dropna() for col in expression_df.columns]
# Perform ANOVA
f_stat, p_value = stats.f_oneway(*groups)
return {
'f_statistic': f_stat,
'p_value': p_value,
'significant': p_value < 0.05,
'num_groups': len(groups)
}
# Example: ANOVA across all experiments
anova_results = anova_multiple_groups(expression_matrix)
print("\nOne-Way ANOVA Results:")
print(f" F-statistic: {anova_results['f_statistic']:.4f}")
print(f" p-value: {anova_results['p_value']:.4e}")
print(f" Significant: {anova_results['significant']}")
print(f" Number of groups: {anova_results['num_groups']}")
# Volcano plot (log2 fold change vs -log10 p-value)
# For demonstration with multiple comparisons
if expression_matrix.shape[1] >= 2:
de_all = []
for idx in expression_matrix.index:
row = expression_matrix.loc[idx]
if len(row) >= 2:
result = differential_expression_ttest(
row.iloc[:len(row)//2],
row.iloc[len(row)//2:]
)
result['structure'] = idx
de_all.append(result)
de_df = pd.DataFrame(de_all)
de_df['neg_log10_p'] = -np.log10(de_df['p_value'] + 1e-300) # Avoid log(0)
plt.figure(figsize=(10, 6))
colors = ['red' if sig else 'gray' for sig in de_df['significant']]
plt.scatter(de_df['log2_fold_change'], de_df['neg_log10_p'], c=colors, alpha=0.6)
plt.axhline(y=-np.log10(0.05), color='blue', linestyle='--', label='p=0.05')
plt.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
plt.xlabel('Log2 Fold Change')
plt.ylabel('-Log10 P-value')
plt.title('Volcano Plot: Differential Expression')
plt.legend()
plt.tight_layout()
plt.show()
6. Pathway Enrichment Analysis¶
Use the Enrichr API to identify enriched biological pathways from gene lists.
def enrichr_submit_genes(gene_list: List[str]) -> str:
"""
Submit gene list to Enrichr API.
Args:
gene_list: List of gene symbols
Returns:
User list ID from Enrichr
"""
ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/addList'
genes_str = '\n'.join(gene_list)
payload = {
'list': (None, genes_str),
'description': (None, 'Allen Brain Analysis')
}
try:
response = requests.post(ENRICHR_URL, files=payload, timeout=30)
response.raise_for_status()
data = response.json()
if 'userListId' in data:
print(f"✓ Gene list submitted successfully (ID: {data['userListId']})")
return data['userListId']
else:
print("✗ Failed to submit gene list")
return None
except Exception as e:
print(f"✗ Error submitting to Enrichr: {e}")
return None
def enrichr_get_results(user_list_id: str, gene_set_library: str = 'KEGG_2021_Human') -> pd.DataFrame:
"""
Retrieve enrichment results from Enrichr.
Args:
user_list_id: ID returned from enrichr_submit_genes
gene_set_library: Enrichr library to query
Options: 'KEGG_2021_Human', 'GO_Biological_Process_2021',
'WikiPathways_2021_Human', 'Reactome_2022'
Returns:
DataFrame with enrichment results
"""
ENRICHR_URL = 'https://maayanlab.cloud/Enrichr/enrich'
query_string = f'?userListId={user_list_id}&backgroundType={gene_set_library}'
try:
response = requests.get(ENRICHR_URL + query_string, timeout=30)
response.raise_for_status()
data = response.json()
if gene_set_library in data:
results = data[gene_set_library]
df = pd.DataFrame(results, columns=[
'rank', 'term', 'p_value', 'z_score', 'combined_score',
'overlapping_genes', 'adjusted_p_value', 'old_p_value', 'old_adjusted_p_value'
])
print(f"✓ Retrieved {len(df)} enriched terms")
return df
else:
print(f"✗ No results for {gene_set_library}")
return pd.DataFrame()
except Exception as e:
print(f"✗ Error retrieving Enrichr results: {e}")
return pd.DataFrame()
# Example: Enrichment analysis for AD-related genes
genes_for_enrichment = ['MAPT', 'APP', 'APOE', 'SNCA', 'PSEN1', 'PSEN2', 'GRN', 'LRRK2']
list_id = enrichr_submit_genes(genes_for_enrichment)
if list_id:
# Query multiple libraries
enrichment_results = {}
libraries = ['KEGG_2021_Human', 'GO_Biological_Process_2021', 'WikiPathways_2021_Human']
for lib in libraries:
print(f"\nQuerying {lib}...")
results = enrichr_get_results(list_id, lib)
if not results.empty:
enrichment_results[lib] = results
print(f" Top term: {results.iloc[0]['term']} (p={results.iloc[0]['p_value']:.2e})")
# Visualize enrichment results
if list_id and 'KEGG_2021_Human' in enrichment_results:
kegg_results = enrichment_results['KEGG_2021_Human'].head(10)
plt.figure(figsize=(12, 6))
plt.barh(
range(len(kegg_results)),
-np.log10(kegg_results['p_value']),
color='coral'
)
plt.yticks(range(len(kegg_results)), kegg_results['term'])
plt.xlabel('-Log10 P-value')
plt.title('KEGG Pathway Enrichment (Top 10)')
plt.axvline(x=-np.log10(0.05), color='blue', linestyle='--', label='p=0.05')
plt.legend()
plt.tight_layout()
plt.show()
else:
print("No enrichment results to visualize (using demo mode or API unavailable)")
7. Summary and Next Steps¶
This template provides a foundation for Allen Brain Atlas analysis:
What we covered:
- ✓ AllenSDK setup and initialization
- ✓ Querying the Allen Brain API for gene expression data
- ✓ Extracting expression matrices across brain structures
- ✓ Visualizing expression patterns with heatmaps and plots
- ✓ Differential expression analysis using t-tests and ANOVA
- ✓ Pathway enrichment analysis with Enrichr API
Next steps:
- Replace demo genes with your genes of interest
- Specify target brain structures using Allen structure IDs
- Apply multiple testing correction (Bonferroni, FDR) to p-values
- Integrate with SciDEX knowledge graph for hypothesis generation
- Link enriched pathways to disease mechanisms
- Export results to database or JSON for downstream analysis
Resources:
- AllenSDK Examples: https://allensdk.readthedocs.io/en/latest/examples.html
- Brain Structure Ontology: http://api.brain-map.org/api/v2/structure_graph_download/1.json
- Enrichr Libraries: https://maayanlab.cloud/Enrichr/#libraries
# Export results to JSON (optional)
output_data = {
'genes_queried': genes_for_enrichment if list_id else ad_genes,
'expression_summary': {
'mean': expression_matrix.mean().to_dict(),
'std': expression_matrix.std().to_dict()
},
'differential_expression': de_results if 'de_results' in locals() else {},
'anova': anova_results if 'anova_results' in locals() else {}
}
# Uncomment to save results
# with open('allen_brain_analysis_results.json', 'w') as f:
# json.dump(output_data, f, indent=2)
print("✓ Analysis complete!")
print(f"\nAnalyzed {len(expression_matrix.columns)} experiments across {len(expression_matrix)} structures")