In [1]:
Copied!
from pathlib import Path
# Import pyMut functions
from pyMut.input import read_maf, read_vcf
# Define the specific columns we want to display from knownCancer annotation
knowncancer_columns = [
"COSMIC_ROLE_IN_CANCER",
"COSMIC_TIER",
"OncoKB_Is Oncogene",
"OncoKB_Is Tumor Suppressor Gene",
"OncoKB_OncoKB Annotated",
"OncoKB_MSK-IMPACT",
"OncoKB_MSK-HEME",
"OncoKB_FOUNDATION ONE",
"OncoKB_FOUNDATION ONE HEME",
"OncoKB_Vogelstein",
"Is_Oncogene_any"
]
print("✓ Setup complete")
from pathlib import Path
# Import pyMut functions
from pyMut.input import read_maf, read_vcf
# Define the specific columns we want to display from knownCancer annotation
knowncancer_columns = [
"COSMIC_ROLE_IN_CANCER",
"COSMIC_TIER",
"OncoKB_Is Oncogene",
"OncoKB_Is Tumor Suppressor Gene",
"OncoKB_OncoKB Annotated",
"OncoKB_MSK-IMPACT",
"OncoKB_MSK-HEME",
"OncoKB_FOUNDATION ONE",
"OncoKB_FOUNDATION ONE HEME",
"OncoKB_Vogelstein",
"Is_Oncogene_any"
]
print("✓ Setup complete")
✓ Setup complete
Data File Paths¶
Define the paths to our example data files and COSMIC annotation table.
In [2]:
Copied!
# Data file paths
MAF_FILE = "../../../src/pyMut/data/examples/MAF/tcga_laml_VEP_annotated.maf.gz"
VCF_FILE = "../../../src/pyMut/data/examples/VCF/subset_50k_variants_vep_protein_gene_variant_class.vcf.gz"
COSMIC_ANNOTATION = "../../../src/pyMut/data/resources/COSMIC/Cosmic_CancerGeneCensus_Tsv_v102_GRCh38/Cosmic_CancerGeneCensus_v102_GRCh38.tsv.gz"
ONCOKB_ANNOTATION = "../../../src/pyMut/data/resources/OncoKb/cancerGeneList.tsv"
# Verify files exist
for file_path, name in [(MAF_FILE, "MAF"), (VCF_FILE, "VCF"), (COSMIC_ANNOTATION, "COSMIC"), (ONCOKB_ANNOTATION, "OncoKB")]:
if Path(file_path).exists():
print(f"✓ {name} file found: {Path(file_path).name}")
else:
print(f"✗ {name} file not found: {file_path}")
# Data file paths
MAF_FILE = "../../../src/pyMut/data/examples/MAF/tcga_laml_VEP_annotated.maf.gz"
VCF_FILE = "../../../src/pyMut/data/examples/VCF/subset_50k_variants_vep_protein_gene_variant_class.vcf.gz"
COSMIC_ANNOTATION = "../../../src/pyMut/data/resources/COSMIC/Cosmic_CancerGeneCensus_Tsv_v102_GRCh38/Cosmic_CancerGeneCensus_v102_GRCh38.tsv.gz"
ONCOKB_ANNOTATION = "../../../src/pyMut/data/resources/OncoKb/cancerGeneList.tsv"
# Verify files exist
for file_path, name in [(MAF_FILE, "MAF"), (VCF_FILE, "VCF"), (COSMIC_ANNOTATION, "COSMIC"), (ONCOKB_ANNOTATION, "OncoKB")]:
if Path(file_path).exists():
print(f"✓ {name} file found: {Path(file_path).name}")
else:
print(f"✗ {name} file not found: {file_path}")
✓ MAF file found: tcga_laml_VEP_annotated.maf.gz ✓ VCF file found: subset_50k_variants_vep_protein_gene_variant_class.vcf.gz ✓ COSMIC file found: Cosmic_CancerGeneCensus_v102_GRCh38.tsv.gz ✓ OncoKB file found: cancerGeneList.tsv
1. Load MAF Data¶
Load the TCGA LAML MAF file using read_maf
.
In [3]:
Copied!
# Load MAF data
print("Loading MAF data...")
py_mut_maf = read_maf(MAF_FILE,assembly="37")
print("\n📊 MAF Data Summary:")
print(f" Shape: {py_mut_maf.data.shape}")
print(f" Source: {py_mut_maf.metadata.source_format}")
print(f" Unique genes: {py_mut_maf.data['Hugo_Symbol'].nunique()}")
print(f" Unique samples: {py_mut_maf.data['Tumor_Sample_Barcode'].nunique()}")
# Show first few rows
print("\n📋 First 3 rows:")
display(py_mut_maf.data[['Hugo_Symbol', 'Variant_Classification', 'Tumor_Sample_Barcode']].head(3))
# Load MAF data
print("Loading MAF data...")
py_mut_maf = read_maf(MAF_FILE,assembly="37")
print("\n📊 MAF Data Summary:")
print(f" Shape: {py_mut_maf.data.shape}")
print(f" Source: {py_mut_maf.metadata.source_format}")
print(f" Unique genes: {py_mut_maf.data['Hugo_Symbol'].nunique()}")
print(f" Unique samples: {py_mut_maf.data['Tumor_Sample_Barcode'].nunique()}")
# Show first few rows
print("\n📋 First 3 rows:")
display(py_mut_maf.data[['Hugo_Symbol', 'Variant_Classification', 'Tumor_Sample_Barcode']].head(3))
2025-08-01 01:20:21,027 | INFO | pyMut.input | Starting MAF reading: ../../../src/pyMut/data/examples/MAF/tcga_laml_VEP_annotated.maf.gz 2025-08-01 01:20:21,031 | INFO | pyMut.input | Reading MAF with 'pyarrow' engine… 2025-08-01 01:20:21,051 | INFO | pyMut.input | Reading with 'pyarrow' completed. 2025-08-01 01:20:21,058 | INFO | pyMut.input | Detected 193 unique samples.
Loading MAF data...
2025-08-01 01:20:21,148 | INFO | pyMut.input | Consolidating duplicate variants across samples... 2025-08-01 01:20:21,162 | INFO | pyMut.input | Consolidating variants using vectorized operations... 2025-08-01 01:21:15,156 | INFO | pyMut.input | Variant consolidation completed in 54.01 seconds 2025-08-01 01:21:15,162 | INFO | pyMut.input | Consolidated 2207 rows into 2091 unique variants 2025-08-01 01:21:15,171 | INFO | pyMut.input | Saving to cache: ../../../src/pyMut/data/examples/MAF/.pymut_cache/tcga_laml_VEP_annotated.maf_6e67e5a1df3d2694.parquet 2025-08-01 01:21:15,231 | INFO | pyMut.input | MAF processed successfully: 2091 rows, 237 columns in 54.20 seconds
📊 MAF Data Summary: Shape: (2091, 237) Source: MAF Unique genes: 1611 Unique samples: 190 📋 First 3 rows:
Hugo_Symbol | Variant_Classification | Tumor_Sample_Barcode | |
---|---|---|---|
0 | KIAA1529 | SILENT | TCGA-AB-2886 |
1 | KIAA1529 | MISSENSE_MUTATION | TCGA-AB-2917 |
2 | TBC1D2 | MISSENSE_MUTATION | TCGA-AB-2841 |
2. Load VCF Data¶
Load the 1000 Genomes VCF file using read_vcf
.
In [4]:
Copied!
# Load VCF data
print("Loading VCF data...")
py_mut_vcf = read_vcf(VCF_FILE,assembly="38")
print("\n📊 VCF Data Summary:")
print(f" Shape: {py_mut_vcf.data.shape}")
print(f" Source: {py_mut_vcf.metadata.source_format}")
print(f" Unique genes: {py_mut_vcf.data['Hugo_Symbol'].nunique()}")
print(f" Chromosome: {py_mut_vcf.data['CHROM'].unique()[0]}")
# Show first few rows
print("\n📋 First 3 rows:")
display(py_mut_vcf.data[['Hugo_Symbol', 'CHROM', 'POS', 'REF', 'ALT', 'Variant_Classification']].head(3))
# Load VCF data
print("Loading VCF data...")
py_mut_vcf = read_vcf(VCF_FILE,assembly="38")
print("\n📊 VCF Data Summary:")
print(f" Shape: {py_mut_vcf.data.shape}")
print(f" Source: {py_mut_vcf.metadata.source_format}")
print(f" Unique genes: {py_mut_vcf.data['Hugo_Symbol'].nunique()}")
print(f" Chromosome: {py_mut_vcf.data['CHROM'].unique()[0]}")
# Show first few rows
print("\n📋 First 3 rows:")
display(py_mut_vcf.data[['Hugo_Symbol', 'CHROM', 'POS', 'REF', 'ALT', 'Variant_Classification']].head(3))
2025-08-01 01:21:15,252 | INFO | pyMut.input | Starting optimized VCF reading: ../../../src/pyMut/data/examples/VCF/subset_50k_variants_vep_protein_gene_variant_class.vcf.gz 2025-08-01 01:21:15,253 | INFO | pyMut.input | Reading VCF with pandas + pyarrow optimization...
Loading VCF data...
2025-08-01 01:21:27,468 | INFO | pyMut.input | Pandas reading completed. 2025-08-01 01:21:27,498 | INFO | pyMut.input | Starting vectorized genotype conversion before INFO expansion... 2025-08-01 01:23:51,282 | INFO | pyMut.input | GT conversion completed: 143.78 s 2025-08-01 01:23:51,282 | INFO | pyMut.input | Expanding INFO column with vectorized operations... 2025-08-01 01:23:57,835 | INFO | pyMut.input | Expanding VEP CSQ annotations into individual columns... 2025-08-01 01:24:05,372 | INFO | pyMut.input | CSQ expanded into 31 VEP annotation columns in 7.54 s 2025-08-01 01:24:05,373 | INFO | pyMut.input | Generating Hugo_Symbol column from VEP_SYMBOL and VEP_NEAREST... 2025-08-01 01:24:05,401 | INFO | pyMut.input | Hugo_Symbol column generated in 0.03 s 2025-08-01 01:24:05,404 | INFO | pyMut.input | Generating Variant_Classification from VEP_Consequence and VEP_VARIANT_CLASS... 2025-08-01 01:24:10,510 | INFO | pyMut.input | Variant_Classification generated in 5.11 s 2025-08-01 01:24:10,511 | INFO | pyMut.input | Generating Variant_Type from VEP_VARIANT_CLASS... 2025-08-01 01:24:13,887 | INFO | pyMut.input | Variant_Type generated in 3.37 s 2025-08-01 01:24:13,916 | INFO | pyMut.input | Using 2548 sample columns for output organization 2025-08-01 01:24:16,923 | INFO | pyMut.input | Saving to cache: ../../../src/pyMut/data/examples/VCF/.pymut_cache/subset_50k_variants_vep_protein_gene_variant_class.vcf_70ff294aac35bdcb.parquet 2025-08-01 01:24:24,596 | INFO | pyMut.input | VCF processed successfully: 50000 rows, 2601 columns in 189.34 seconds
📊 VCF Data Summary: Shape: (50000, 2601) Source: VCF Unique genes: 13 Chromosome: chr10 📋 First 3 rows:
Hugo_Symbol | CHROM | POS | REF | ALT | Variant_Classification | |
---|---|---|---|---|---|---|
0 | TUBB8 | chr10 | 11501 | C | A | INTRON |
1 | TUBB8 | chr10 | 36097 | G | A | INTRON |
2 | TUBB8 | chr10 | 45900 | C | T | 3'FLANK |
3. Apply COSMIC Cancer Annotation to MAF Data¶
Apply the knownCancer
method to the MAF data.
In [5]:
Copied!
# Apply COSMIC annotation to MAF data
print("🔬 Applying COSMIC cancer annotation to MAF data...")
# Apply annotation (in_place=False to get returned DataFrame)
maf_annotated = py_mut_maf.knownCancer(
annotation_table=COSMIC_ANNOTATION,
oncokb_table=ONCOKB_ANNOTATION,
in_place=False
)
print("\n✅ MAF Annotation Complete!")
print(f" Original shape: {py_mut_maf.data.shape}")
print(f" Annotated shape: {maf_annotated.shape}")
# Show new annotation columns
original_cols = set(py_mut_maf.data.columns)
new_cols = [col for col in maf_annotated.columns if col not in original_cols]
print(f"\n🏷️ New annotation columns ({len(new_cols)}):")
for col in new_cols:
print(f" • {col}")
# Show annotation results for genes with annotations
annotated_genes = maf_annotated[maf_annotated['Is_Oncogene_any'] == True]
if len(annotated_genes) > 0:
print(f"\n🎯 Genes with cancer annotations ({len(annotated_genes)} variants):")
# Use specific knowncancer_columns that are available in the data
available_cols = ['Hugo_Symbol'] + [col for col in knowncancer_columns if col in maf_annotated.columns]
display(annotated_genes[available_cols].drop_duplicates('Hugo_Symbol').head(10))
else:
print("\n⚠️ No genes found with cancer annotations in this dataset")
# Apply COSMIC annotation to MAF data
print("🔬 Applying COSMIC cancer annotation to MAF data...")
# Apply annotation (in_place=False to get returned DataFrame)
maf_annotated = py_mut_maf.knownCancer(
annotation_table=COSMIC_ANNOTATION,
oncokb_table=ONCOKB_ANNOTATION,
in_place=False
)
print("\n✅ MAF Annotation Complete!")
print(f" Original shape: {py_mut_maf.data.shape}")
print(f" Annotated shape: {maf_annotated.shape}")
# Show new annotation columns
original_cols = set(py_mut_maf.data.columns)
new_cols = [col for col in maf_annotated.columns if col not in original_cols]
print(f"\n🏷️ New annotation columns ({len(new_cols)}):")
for col in new_cols:
print(f" • {col}")
# Show annotation results for genes with annotations
annotated_genes = maf_annotated[maf_annotated['Is_Oncogene_any'] == True]
if len(annotated_genes) > 0:
print(f"\n🎯 Genes with cancer annotations ({len(annotated_genes)} variants):")
# Use specific knowncancer_columns that are available in the data
available_cols = ['Hugo_Symbol'] + [col for col in knowncancer_columns if col in maf_annotated.columns]
display(annotated_genes[available_cols].drop_duplicates('Hugo_Symbol').head(10))
else:
print("\n⚠️ No genes found with cancer annotations in this dataset")
2025-08-01 01:24:24,772 | INFO | pyMut.annotate.cosmic_cancer_annotate | DataFrame memory usage: 0.02 GB 2025-08-01 01:24:24,772 | INFO | pyMut.annotate.cosmic_cancer_annotate | Using pandas backend for annotation 2025-08-01 01:24:24,773 | INFO | pyMut.annotate.cosmic_cancer_annotate | Starting pandas annotation for DataFrame: 2091 rows, 237 columns 2025-08-01 01:24:24,786 | INFO | pyMut.annotate.cosmic_cancer_annotate | Using join column: Hugo_Symbol 2025-08-01 01:24:24,786 | INFO | pyMut.annotate.cosmic_cancer_annotate | Reading annotation table: ../../../src/pyMut/data/resources/COSMIC/Cosmic_CancerGeneCensus_Tsv_v102_GRCh38/Cosmic_CancerGeneCensus_v102_GRCh38.tsv.gz 2025-08-01 01:24:24,791 | INFO | pyMut.annotate.cosmic_cancer_annotate | Annotation table loaded: 758 rows, 21 columns 2025-08-01 01:24:24,792 | INFO | pyMut.annotate.cosmic_cancer_annotate | Creating synonyms dictionary from column 'SYNONYMS'... 2025-08-01 01:24:24,818 | INFO | pyMut.annotate.cosmic_cancer_annotate | Created synonyms dictionary with 4710 mappings 2025-08-01 01:24:24,818 | INFO | pyMut.annotate.cosmic_cancer_annotate | Applying synonyms mapping to PyMutation data... 2025-08-01 01:24:24,823 | INFO | pyMut.annotate.cosmic_cancer_annotate | Gene mapping results: 2086 direct matches, 5 synonym matches 2025-08-01 01:24:24,824 | INFO | pyMut.annotate.cosmic_cancer_annotate | Performing annotation merge... 2025-08-01 01:24:24,847 | INFO | pyMut.annotate.cosmic_cancer_annotate | COSMIC annotation completed: 2091 rows, 257 columns 2025-08-01 01:24:24,848 | INFO | pyMut.annotate.cosmic_cancer_annotate | Added 20 COSMIC annotation columns 2025-08-01 01:24:24,849 | INFO | pyMut.annotate.cosmic_cancer_annotate | Reading OncoKB table: ../../../src/pyMut/data/resources/OncoKb/cancerGeneList.tsv 2025-08-01 01:24:24,853 | INFO | pyMut.annotate.cosmic_cancer_annotate | OncoKB table loaded: 1195 rows, 17 columns 2025-08-01 01:24:24,853 | INFO | pyMut.annotate.cosmic_cancer_annotate | Creating OncoKB synonyms dictionary from column 'Gene Aliases'...
🔬 Applying COSMIC cancer annotation to MAF data...
2025-08-01 01:24:24,893 | INFO | pyMut.annotate.cosmic_cancer_annotate | Created OncoKB synonyms dictionary with 3291 mappings 2025-08-01 01:24:24,894 | INFO | pyMut.annotate.cosmic_cancer_annotate | Applying synonyms mapping to PyMutation data... 2025-08-01 01:24:24,907 | INFO | pyMut.annotate.cosmic_cancer_annotate | Gene mapping results: 2074 direct matches, 17 synonym matches 2025-08-01 01:24:24,907 | INFO | pyMut.annotate.cosmic_cancer_annotate | Performing OncoKB annotation merge... 2025-08-01 01:24:24,930 | INFO | pyMut.annotate.cosmic_cancer_annotate | OncoKB annotation completed: 2091 rows, 273 columns 2025-08-01 01:24:24,933 | INFO | pyMut.annotate.cosmic_cancer_annotate | Added 16 OncoKB annotation columns 2025-08-01 01:24:24,934 | INFO | pyMut.annotate.cosmic_cancer_annotate | Total annotation completed: 2091 rows, 273 columns 2025-08-01 01:24:24,934 | INFO | pyMut.annotate.cosmic_cancer_annotate | Pandas annotation completed successfully 2025-08-01 01:24:24,960 | INFO | pyMut.annotate.cosmic_cancer_annotate | KnownCancer annotation completed successfully 2025-08-01 01:24:24,961 | INFO | pyMut.annotate.cosmic_cancer_annotate | Filtered to 10 annotation columns plus Is_Oncogene_any field
✅ MAF Annotation Complete! Original shape: (2091, 237) Annotated shape: (2091, 248) 🏷️ New annotation columns (11): • COSMIC_ROLE_IN_CANCER • COSMIC_TIER • OncoKB_Is Oncogene • OncoKB_Is Tumor Suppressor Gene • OncoKB_OncoKB Annotated • OncoKB_MSK-IMPACT • OncoKB_MSK-HEME • OncoKB_FOUNDATION ONE • OncoKB_FOUNDATION ONE HEME • OncoKB_Vogelstein • Is_Oncogene_any 🎯 Genes with cancer annotations (423 variants):
Hugo_Symbol | COSMIC_ROLE_IN_CANCER | COSMIC_TIER | OncoKB_Is Oncogene | OncoKB_Is Tumor Suppressor Gene | OncoKB_OncoKB Annotated | OncoKB_MSK-IMPACT | OncoKB_MSK-HEME | OncoKB_FOUNDATION ONE | OncoKB_FOUNDATION ONE HEME | OncoKB_Vogelstein | Is_Oncogene_any | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | TNC | oncogene | 2.0 | True | ||||||||
21 | ABL1 | oncogene, fusion | 1.0 | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes | True |
26 | NOTCH1 | oncogene, TSG, fusion | 1.0 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | True |
53 | PTPRD | TSG | 2.0 | No | Yes | Yes | Yes | Yes | No | No | No | True |
59 | FANCC | TSG | 1.0 | No | Yes | Yes | Yes | Yes | Yes | Yes | No | True |
60 | PTCH1 | TSG | 1.0 | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | True |
80 | STAG2 | TSG | 1.0 | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | True |
86 | BCORL1 | oncogene, TSG | 1.0 | No | Yes | Yes | No | Yes | Yes | Yes | No | True |
87 | ELF4 | oncogene, TSG, fusion | 1.0 | Yes | No | Yes | No | No | No | Yes | No | True |
92 | PHF6 | TSG | 1.0 | No | Yes | Yes | Yes | Yes | No | Yes | Yes | True |
4. Apply COSMIC Cancer Annotation to VCF Data¶
Apply the knownCancer
method to the VCF data.
In [6]:
Copied!
# Apply COSMIC annotation to VCF data
print("🔬 Applying COSMIC cancer annotation to VCF data...")
# Apply annotation (in_place=False to get returned DataFrame)
vcf_annotated = py_mut_vcf.knownCancer(
annotation_table=COSMIC_ANNOTATION,
oncokb_table=ONCOKB_ANNOTATION,
in_place=False
)
print("\n✅ VCF Annotation Complete!")
print(f" Original shape: {py_mut_vcf.data.shape}")
print(f" Annotated shape: {vcf_annotated.shape}")
# Show new annotation columns
original_cols = set(py_mut_vcf.data.columns)
new_cols = [col for col in vcf_annotated.columns if col not in original_cols]
print(f"\n🏷️ New annotation columns ({len(new_cols)}):")
for col in new_cols:
print(f" • {col}")
# Show annotation results for genes with annotations
annotated_genes = vcf_annotated[vcf_annotated['Is_Oncogene_any'] == True]
if len(annotated_genes) > 0:
print(f"\n🎯 Genes with cancer annotations ({len(annotated_genes)} variants):")
# Use specific knowncancer_columns that are available in the data, plus VCF-specific columns
available_annotation_cols = [col for col in knowncancer_columns if col in vcf_annotated.columns]
annotation_cols = ['Hugo_Symbol', 'CHROM', 'POS'] + available_annotation_cols
display(annotated_genes[annotation_cols].drop_duplicates('Hugo_Symbol').head(10))
else:
print("\n⚠️ No genes found with cancer annotations in this dataset")
# Apply COSMIC annotation to VCF data
print("🔬 Applying COSMIC cancer annotation to VCF data...")
# Apply annotation (in_place=False to get returned DataFrame)
vcf_annotated = py_mut_vcf.knownCancer(
annotation_table=COSMIC_ANNOTATION,
oncokb_table=ONCOKB_ANNOTATION,
in_place=False
)
print("\n✅ VCF Annotation Complete!")
print(f" Original shape: {py_mut_vcf.data.shape}")
print(f" Annotated shape: {vcf_annotated.shape}")
# Show new annotation columns
original_cols = set(py_mut_vcf.data.columns)
new_cols = [col for col in vcf_annotated.columns if col not in original_cols]
print(f"\n🏷️ New annotation columns ({len(new_cols)}):")
for col in new_cols:
print(f" • {col}")
# Show annotation results for genes with annotations
annotated_genes = vcf_annotated[vcf_annotated['Is_Oncogene_any'] == True]
if len(annotated_genes) > 0:
print(f"\n🎯 Genes with cancer annotations ({len(annotated_genes)} variants):")
# Use specific knowncancer_columns that are available in the data, plus VCF-specific columns
available_annotation_cols = [col for col in knowncancer_columns if col in vcf_annotated.columns]
annotation_cols = ['Hugo_Symbol', 'CHROM', 'POS'] + available_annotation_cols
display(annotated_genes[annotation_cols].drop_duplicates('Hugo_Symbol').head(10))
else:
print("\n⚠️ No genes found with cancer annotations in this dataset")
🔬 Applying COSMIC cancer annotation to VCF data...
2025-08-01 01:24:36,208 | INFO | pyMut.annotate.cosmic_cancer_annotate | DataFrame memory usage: 7.29 GB 2025-08-01 01:24:36,209 | INFO | pyMut.annotate.cosmic_cancer_annotate | Using pandas backend for annotation 2025-08-01 01:24:36,209 | INFO | pyMut.annotate.cosmic_cancer_annotate | Starting pandas annotation for DataFrame: 50000 rows, 2601 columns 2025-08-01 01:24:40,560 | INFO | pyMut.annotate.cosmic_cancer_annotate | Using join column: Hugo_Symbol 2025-08-01 01:24:40,561 | INFO | pyMut.annotate.cosmic_cancer_annotate | Reading annotation table: ../../../src/pyMut/data/resources/COSMIC/Cosmic_CancerGeneCensus_Tsv_v102_GRCh38/Cosmic_CancerGeneCensus_v102_GRCh38.tsv.gz 2025-08-01 01:24:40,565 | INFO | pyMut.annotate.cosmic_cancer_annotate | Annotation table loaded: 758 rows, 21 columns 2025-08-01 01:24:40,565 | INFO | pyMut.annotate.cosmic_cancer_annotate | Creating synonyms dictionary from column 'SYNONYMS'... 2025-08-01 01:24:40,587 | INFO | pyMut.annotate.cosmic_cancer_annotate | Created synonyms dictionary with 4710 mappings 2025-08-01 01:24:40,587 | INFO | pyMut.annotate.cosmic_cancer_annotate | Applying synonyms mapping to PyMutation data... 2025-08-01 01:24:41,534 | INFO | pyMut.annotate.cosmic_cancer_annotate | Gene mapping results: 50000 direct matches, 0 synonym matches 2025-08-01 01:24:41,535 | INFO | pyMut.annotate.cosmic_cancer_annotate | Performing annotation merge... 2025-08-01 01:24:47,923 | INFO | pyMut.annotate.cosmic_cancer_annotate | COSMIC annotation completed: 50000 rows, 2621 columns 2025-08-01 01:24:47,924 | INFO | pyMut.annotate.cosmic_cancer_annotate | Added 20 COSMIC annotation columns 2025-08-01 01:24:47,924 | INFO | pyMut.annotate.cosmic_cancer_annotate | Reading OncoKB table: ../../../src/pyMut/data/resources/OncoKb/cancerGeneList.tsv 2025-08-01 01:24:47,928 | INFO | pyMut.annotate.cosmic_cancer_annotate | OncoKB table loaded: 1195 rows, 17 columns 2025-08-01 01:24:47,929 | INFO | pyMut.annotate.cosmic_cancer_annotate | Creating OncoKB synonyms dictionary from column 'Gene Aliases'... 2025-08-01 01:24:47,961 | INFO | pyMut.annotate.cosmic_cancer_annotate | Created OncoKB synonyms dictionary with 3291 mappings 2025-08-01 01:24:47,962 | INFO | pyMut.annotate.cosmic_cancer_annotate | Applying synonyms mapping to PyMutation data... 2025-08-01 01:24:52,409 | INFO | pyMut.annotate.cosmic_cancer_annotate | Gene mapping results: 50000 direct matches, 0 synonym matches 2025-08-01 01:24:52,410 | INFO | pyMut.annotate.cosmic_cancer_annotate | Performing OncoKB annotation merge... 2025-08-01 01:24:59,643 | INFO | pyMut.annotate.cosmic_cancer_annotate | OncoKB annotation completed: 50000 rows, 2637 columns 2025-08-01 01:24:59,644 | INFO | pyMut.annotate.cosmic_cancer_annotate | Added 16 OncoKB annotation columns 2025-08-01 01:24:59,644 | INFO | pyMut.annotate.cosmic_cancer_annotate | Total annotation completed: 50000 rows, 2637 columns 2025-08-01 01:24:59,645 | INFO | pyMut.annotate.cosmic_cancer_annotate | Pandas annotation completed successfully 2025-08-01 01:25:07,520 | INFO | pyMut.annotate.cosmic_cancer_annotate | KnownCancer annotation completed successfully 2025-08-01 01:25:07,520 | INFO | pyMut.annotate.cosmic_cancer_annotate | Filtered to 10 annotation columns plus Is_Oncogene_any field
✅ VCF Annotation Complete! Original shape: (50000, 2601) Annotated shape: (50000, 2612) 🏷️ New annotation columns (11): • COSMIC_ROLE_IN_CANCER • COSMIC_TIER • OncoKB_Is Oncogene • OncoKB_Is Tumor Suppressor Gene • OncoKB_OncoKB Annotated • OncoKB_MSK-IMPACT • OncoKB_MSK-HEME • OncoKB_FOUNDATION ONE • OncoKB_FOUNDATION ONE HEME • OncoKB_Vogelstein • Is_Oncogene_any 🎯 Genes with cancer annotations (17072 variants):
Hugo_Symbol | CHROM | POS | COSMIC_ROLE_IN_CANCER | COSMIC_TIER | OncoKB_Is Oncogene | OncoKB_Is Tumor Suppressor Gene | OncoKB_OncoKB Annotated | OncoKB_MSK-IMPACT | OncoKB_MSK-HEME | OncoKB_FOUNDATION ONE | OncoKB_FOUNDATION ONE HEME | OncoKB_Vogelstein | Is_Oncogene_any | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
25959 | LARP4B | chr10 | 753787 | TSG | 2.0 | No | No | Yes | No | No | No | No | No | True |
38840 | ADARB2 | chr10 | 1151506 | Yes | No | Yes | No | No | No | No | No | True |
5. Summary and Comparison¶
Compare the annotation results between MAF and VCF data.
In [7]:
Copied!
# Summary comparison
print("📊 COSMIC Cancer Annotation Summary")
print("=" * 50)
# MAF results
maf_oncogenes = maf_annotated[maf_annotated['Is_Oncogene_any'] == True]['Hugo_Symbol'].nunique()
maf_total_genes = maf_annotated['Hugo_Symbol'].nunique()
maf_cosmic_role = maf_annotated['COSMIC_ROLE_IN_CANCER'].value_counts().to_dict() if 'COSMIC_ROLE_IN_CANCER' in maf_annotated.columns else {}
print("\n🧬 MAF Data Results:")
print(f" Total unique genes: {maf_total_genes}")
print(f" Genes with cancer annotations: {maf_oncogenes}")
print(f" Annotation rate: {maf_oncogenes/maf_total_genes*100:.1f}%")
if maf_cosmic_role:
print(f" COSMIC roles found: {list(maf_cosmic_role.keys())}")
# VCF results
vcf_oncogenes = vcf_annotated[vcf_annotated['Is_Oncogene_any'] == True]['Hugo_Symbol'].nunique()
vcf_total_genes = vcf_annotated['Hugo_Symbol'].nunique()
vcf_cosmic_role = vcf_annotated['COSMIC_ROLE_IN_CANCER'].value_counts().to_dict() if 'COSMIC_ROLE_IN_CANCER' in vcf_annotated.columns else {}
print("\n🧬 VCF Data Results:")
print(f" Total unique genes: {vcf_total_genes}")
print(f" Genes with cancer annotations: {vcf_oncogenes}")
print(f" Annotation rate: {vcf_oncogenes/vcf_total_genes*100:.1f}%")
if vcf_cosmic_role:
print(f" COSMIC roles found: {list(vcf_cosmic_role.keys())}")
print("\n✅ Annotation process completed successfully for both datasets!")
# Summary comparison
print("📊 COSMIC Cancer Annotation Summary")
print("=" * 50)
# MAF results
maf_oncogenes = maf_annotated[maf_annotated['Is_Oncogene_any'] == True]['Hugo_Symbol'].nunique()
maf_total_genes = maf_annotated['Hugo_Symbol'].nunique()
maf_cosmic_role = maf_annotated['COSMIC_ROLE_IN_CANCER'].value_counts().to_dict() if 'COSMIC_ROLE_IN_CANCER' in maf_annotated.columns else {}
print("\n🧬 MAF Data Results:")
print(f" Total unique genes: {maf_total_genes}")
print(f" Genes with cancer annotations: {maf_oncogenes}")
print(f" Annotation rate: {maf_oncogenes/maf_total_genes*100:.1f}%")
if maf_cosmic_role:
print(f" COSMIC roles found: {list(maf_cosmic_role.keys())}")
# VCF results
vcf_oncogenes = vcf_annotated[vcf_annotated['Is_Oncogene_any'] == True]['Hugo_Symbol'].nunique()
vcf_total_genes = vcf_annotated['Hugo_Symbol'].nunique()
vcf_cosmic_role = vcf_annotated['COSMIC_ROLE_IN_CANCER'].value_counts().to_dict() if 'COSMIC_ROLE_IN_CANCER' in vcf_annotated.columns else {}
print("\n🧬 VCF Data Results:")
print(f" Total unique genes: {vcf_total_genes}")
print(f" Genes with cancer annotations: {vcf_oncogenes}")
print(f" Annotation rate: {vcf_oncogenes/vcf_total_genes*100:.1f}%")
if vcf_cosmic_role:
print(f" COSMIC roles found: {list(vcf_cosmic_role.keys())}")
print("\n✅ Annotation process completed successfully for both datasets!")
📊 COSMIC Cancer Annotation Summary ================================================== 🧬 MAF Data Results: Total unique genes: 1611 Genes with cancer annotations: 158 Annotation rate: 9.8% COSMIC roles found: ['', 'TSG', 'oncogene', 'oncogene, TSG, fusion', 'oncogene, fusion', 'fusion', 'oncogene, TSG', 'TSG, fusion'] 🧬 VCF Data Results: Total unique genes: 13 Genes with cancer annotations: 2 Annotation rate: 15.4% COSMIC roles found: ['', 'TSG'] ✅ Annotation process completed successfully for both datasets!
6. Detailed Annotation Results¶
Show detailed annotation information for genes that have COSMIC annotations.
In [8]:
Copied!
# Show detailed annotation results
print("🔍 Detailed Annotation Results")
print("=" * 40)
# Function to show annotation details
def show_annotation_details(data, dataset_name):
print(f"\n📋 {dataset_name} - Genes with COSMIC annotations:")
# Get genes with annotations
annotated = data[data['Is_Oncogene_any'] == True]
if len(annotated) == 0:
print(" No genes with COSMIC annotations found.")
return
# Show specific knowncancer annotation columns
available_annotation_cols = [col for col in knowncancer_columns if col in data.columns]
if available_annotation_cols:
gene_annotations = annotated[['Hugo_Symbol'] + available_annotation_cols].drop_duplicates('Hugo_Symbol')
print(f" Found {len(gene_annotations)} unique genes with annotations:")
print(f" Available annotation columns: {', '.join(available_annotation_cols)}")
# Show detailed table with all available annotation columns
if len(gene_annotations) > 0:
print("\n 📋 Detailed annotation table:")
display(gene_annotations.head(10))
if len(gene_annotations) > 10:
print(f" ... and {len(gene_annotations) - 10} more genes")
# Show details for both datasets
show_annotation_details(maf_annotated, "MAF Dataset")
show_annotation_details(vcf_annotated, "VCF Dataset")
# Show detailed annotation results
print("🔍 Detailed Annotation Results")
print("=" * 40)
# Function to show annotation details
def show_annotation_details(data, dataset_name):
print(f"\n📋 {dataset_name} - Genes with COSMIC annotations:")
# Get genes with annotations
annotated = data[data['Is_Oncogene_any'] == True]
if len(annotated) == 0:
print(" No genes with COSMIC annotations found.")
return
# Show specific knowncancer annotation columns
available_annotation_cols = [col for col in knowncancer_columns if col in data.columns]
if available_annotation_cols:
gene_annotations = annotated[['Hugo_Symbol'] + available_annotation_cols].drop_duplicates('Hugo_Symbol')
print(f" Found {len(gene_annotations)} unique genes with annotations:")
print(f" Available annotation columns: {', '.join(available_annotation_cols)}")
# Show detailed table with all available annotation columns
if len(gene_annotations) > 0:
print("\n 📋 Detailed annotation table:")
display(gene_annotations.head(10))
if len(gene_annotations) > 10:
print(f" ... and {len(gene_annotations) - 10} more genes")
# Show details for both datasets
show_annotation_details(maf_annotated, "MAF Dataset")
show_annotation_details(vcf_annotated, "VCF Dataset")
🔍 Detailed Annotation Results ======================================== 📋 MAF Dataset - Genes with COSMIC annotations: Found 158 unique genes with annotations: Available annotation columns: COSMIC_ROLE_IN_CANCER, COSMIC_TIER, OncoKB_Is Oncogene, OncoKB_Is Tumor Suppressor Gene, OncoKB_OncoKB Annotated, OncoKB_MSK-IMPACT, OncoKB_MSK-HEME, OncoKB_FOUNDATION ONE, OncoKB_FOUNDATION ONE HEME, OncoKB_Vogelstein, Is_Oncogene_any 📋 Detailed annotation table:
Hugo_Symbol | COSMIC_ROLE_IN_CANCER | COSMIC_TIER | OncoKB_Is Oncogene | OncoKB_Is Tumor Suppressor Gene | OncoKB_OncoKB Annotated | OncoKB_MSK-IMPACT | OncoKB_MSK-HEME | OncoKB_FOUNDATION ONE | OncoKB_FOUNDATION ONE HEME | OncoKB_Vogelstein | Is_Oncogene_any | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
12 | TNC | oncogene | 2.0 | True | ||||||||
21 | ABL1 | oncogene, fusion | 1.0 | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes | True |
26 | NOTCH1 | oncogene, TSG, fusion | 1.0 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | True |
53 | PTPRD | TSG | 2.0 | No | Yes | Yes | Yes | Yes | No | No | No | True |
59 | FANCC | TSG | 1.0 | No | Yes | Yes | Yes | Yes | Yes | Yes | No | True |
60 | PTCH1 | TSG | 1.0 | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | True |
80 | STAG2 | TSG | 1.0 | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | True |
86 | BCORL1 | oncogene, TSG | 1.0 | No | Yes | Yes | No | Yes | Yes | Yes | No | True |
87 | ELF4 | oncogene, TSG, fusion | 1.0 | Yes | No | Yes | No | No | No | Yes | No | True |
92 | PHF6 | TSG | 1.0 | No | Yes | Yes | Yes | Yes | No | Yes | Yes | True |
... and 148 more genes 📋 VCF Dataset - Genes with COSMIC annotations: Found 2 unique genes with annotations: Available annotation columns: COSMIC_ROLE_IN_CANCER, COSMIC_TIER, OncoKB_Is Oncogene, OncoKB_Is Tumor Suppressor Gene, OncoKB_OncoKB Annotated, OncoKB_MSK-IMPACT, OncoKB_MSK-HEME, OncoKB_FOUNDATION ONE, OncoKB_FOUNDATION ONE HEME, OncoKB_Vogelstein, Is_Oncogene_any 📋 Detailed annotation table:
Hugo_Symbol | COSMIC_ROLE_IN_CANCER | COSMIC_TIER | OncoKB_Is Oncogene | OncoKB_Is Tumor Suppressor Gene | OncoKB_OncoKB Annotated | OncoKB_MSK-IMPACT | OncoKB_MSK-HEME | OncoKB_FOUNDATION ONE | OncoKB_FOUNDATION ONE HEME | OncoKB_Vogelstein | Is_Oncogene_any | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
25959 | LARP4B | TSG | 2.0 | No | No | Yes | No | No | No | No | No | True |
38840 | ADARB2 | Yes | No | Yes | No | No | No | No | No | True |