Tissue Expression Filter Test with TCGA LAML Data
Testing tissue expression functionality with real TCGA Acute Myeloid Leukemia data.
In [1]:
Copied!
import sys
import os
import pandas as pd
from IPython.display import display
# Add src to path
sys.path.insert(0, os.path.join('..', '..', '..', 'src'))
from pyMut.input import read_maf
from pyMut.filters.tissue_expression import tissue_expression
import sys
import os
import pandas as pd
from IPython.display import display
# Add src to path
sys.path.insert(0, os.path.join('..', '..', '..', 'src'))
from pyMut.input import read_maf
from pyMut.filters.tissue_expression import tissue_expression
Load TCGA LAML Dataset
In [2]:
Copied!
# Load real TCGA LAML data.
# The exported page repeated this cell's source twice; it is kept once here so
# the MAF file is read (and the summary printed) a single time.
# The path is relative, so it resolves against the current working directory.
maf_path = os.path.join('..', '..', '..', 'src', 'pyMut', 'data', 'examples', 'MAF', 'tcga_laml.maf.gz')

# TCGA data is typically based on GRCh37 assembly
py_mut = read_maf(maf_path, assembly="37")

# Size summary of the loaded table: row count plus distinct genes and samples.
print(f"Loaded TCGA LAML data: {len(py_mut.data)} variants")
print(f"Unique genes: {py_mut.data['Hugo_Symbol'].nunique()}")
print(f"Unique samples: {py_mut.data['Tumor_Sample_Barcode'].nunique()}")
2025-08-01 02:03:35,885 | INFO | pyMut.input | Starting MAF reading: ../../../src/pyMut/data/examples/MAF/tcga_laml.maf.gz 2025-08-01 02:03:35,886 | INFO | pyMut.input | Loading from cache: ../../../src/pyMut/data/examples/MAF/.pymut_cache/tcga_laml.maf_8bfbda65c4b23428.parquet 2025-08-01 02:03:35,911 | INFO | pyMut.input | Cache loaded successfully in 0.03 seconds
Loaded TCGA LAML data: 2091 variants Unique genes: 1611 Unique samples: 190
Test 1: Individual Gene Expression (with prints)
In [3]:
Copied!
# Test individual gene expression with genes from the dataset.
# The three most frequent Hugo_Symbol values in the loaded table are used.
sample_genes = py_mut.data['Hugo_Symbol'].value_counts().head(3).index.tolist()
print(f"Testing genes: {sample_genes}")

# NOTE(review): the recorded run of this cell printed False for every gene in
# both tissues, including LAML. Confirm that "LAML" and "BLCA" are tissue codes
# the expression reference actually contains before trusting these results.
for gene in sample_genes:
    # Each query is a [tissue_code, threshold] pair for a single gene symbol.
    result_laml = tissue_expression(gene, ["LAML", 5])  # LAML = Acute Myeloid Leukemia
    result_blca = tissue_expression(gene, ["BLCA", 5])  # BLCA = Bladder Cancer
    print(f"{gene} - LAML (>5): {result_laml}, BLCA (>5): {result_blca}")
Testing genes: ['FLT3', 'DNMT3A', 'TET2'] FLT3 - LAML (>5): False, BLCA (>5): False DNMT3A - LAML (>5): False, BLCA (>5): False TET2 - LAML (>5): False, BLCA (>5): False
Test 2: PyMutation Object Filtering
In [4]:
Copied!
# Baseline: number of variants before any tissue-expression filtering.
print(f"Original TCGA data: {len(py_mut.data)} variants")

# Filter by LAML tissue.
# NOTE(review): one would expect many variants to survive, since this is LAML
# data, but the recorded run kept 0. Verify that "LAML" is a tissue code
# present in the expression reference used by the filter.
filtered_laml = py_mut.filter_by_tissue_expression([('LAML', 5)])
print(f"LAML expressed (>5): {len(filtered_laml.data)} variants")

# Filter by multiple tissues, each with its own threshold.
# The results table has an `Expressed_in_Any_Tissue` column, so presumably a
# variant is kept when any listed tissue passes - TODO confirm.
filtered_multi = py_mut.filter_by_tissue_expression([('LAML', 5), ('BRCA', 3), ('LUAD', 4)])
print(f"Multi-tissue expressed: {len(filtered_multi.data)} variants")

# Filter for NOT expressed in LAML with high threshold
filtered_not = py_mut.filter_by_tissue_expression([('LAML', 50)], keep_expressed=False)
print(f"NOT highly expressed in LAML: {len(filtered_not.data)} variants")
Original TCGA data: 2091 variants LAML expressed (>5): 0 variants Multi-tissue expressed: 915 variants NOT highly expressed in LAML: 2091 variants
Test 3: Tissue Expression Results Analysis
In [5]:
Copied!
# Display tissue expression results dataframes
print("📊 Detailed tissue expression analysis results:")

if hasattr(filtered_multi, 'tissue_expression_results'):
    results_df = filtered_multi.tissue_expression_results
    print(f"\nResults dataframe shape: {results_df.shape}")
    print("Columns:", list(results_df.columns))
    print("\n📊 Complete tissue expression results:")
    display(results_df)

    # Create summary statistics table: one record per "<tissue>_expressed" column.
    expressed_suffix = '_expressed'
    # The row count is the same for every column, so compute it once.
    # In the recorded run it equals the variant count, so 'Total_Genes' below
    # counts rows rather than unique gene symbols.
    total = len(results_df)
    summary_stats = []
    for col in results_df.columns:
        if not col.endswith(expressed_suffix):
            continue
        # Strip only the trailing suffix to recover the tissue code.
        tissue = col[:-len(expressed_suffix)]
        # Summing the column counts the rows flagged as expressed.
        count = results_df[col].sum()
        # Guard against an empty results table to avoid dividing by zero.
        percentage = (count / total * 100) if total > 0 else 0
        summary_stats.append({
            'Tissue': tissue,
            'Expressed_Count': count,
            'Total_Genes': total,
            'Percentage': f"{percentage:.1f}%"
        })

    if summary_stats:
        print("\n📊 Summary statistics by tissue:")
        summary_df = pd.DataFrame(summary_stats)
        display(summary_df)
else:
    # The filtered object carries no per-variant results table.
    print("❌ Results dataframe not found")
📊 Detailed tissue expression analysis results: Results dataframe shape: (2091, 9) Columns: ['Index', 'Gene_Symbol', 'Expressed_in_Any_Tissue', 'LAML_expressed', 'LAML_threshold', 'BRCA_expressed', 'BRCA_threshold', 'LUAD_expressed', 'LUAD_threshold'] 📊 Complete tissue expression results:
Index | Gene_Symbol | Expressed_in_Any_Tissue | LAML_expressed | LAML_threshold | BRCA_expressed | BRCA_threshold | LUAD_expressed | LUAD_threshold | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | KIAA1529 | False | False | 5 | False | 3 | False | 4 |
1 | 1 | KIAA1529 | False | False | 5 | False | 3 | False | 4 |
2 | 2 | TBC1D2 | True | False | 5 | True | 3 | True | 4 |
3 | 3 | LPPR1 | False | False | 5 | False | 3 | False | 4 |
4 | 4 | BAAT | False | False | 5 | False | 3 | False | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2086 | 2086 | FRAS1 | False | False | 5 | False | 3 | False | 4 |
2087 | 2087 | GPR78 | False | False | 5 | False | 3 | False | 4 |
2088 | 2088 | MEPE | False | False | 5 | False | 3 | False | 4 |
2089 | 2089 | RAP1GDS1 | True | False | 5 | True | 3 | True | 4 |
2090 | 2090 | SLC2A9 | False | False | 5 | False | 3 | False | 4 |
2091 rows × 9 columns
📊 Summary statistics by tissue:
Tissue | Expressed_Count | Total_Genes | Percentage | |
---|---|---|---|---|
0 | LAML | 0 | 2091 | 0.0% |
1 | BRCA | 886 | 2091 | 42.4% |
2 | LUAD | 754 | 2091 | 36.1% |