get_alphamissense_proteingym_data

Download and process AlphaMissense supplementary data for ProteinGym variants.

This loads Table S8 from Cheng et al. 2023 containing AlphaMissense pathogenicity scores for ~1.6M variants that match those in ProteinGym from 87 DMS experiments across 72 proteins.

Parameters: `cache_dir` (`str`, default: `'.cache'`) – Directory to cache downloaded files

Returns: DataFrame with columns:
- DMS_id: DMS assay identifier
- Uniprot_ID: UniProt accession (resolved via UniProt API)
- SwissProt_ID: Original AlphaMissense SwissProt entry name
- variant_id: Variant identifier
- AlphaMissense: Pathogenicity score (0-1, higher = more pathogenic)

Source code in proteingympy/make_alphamissense_supplementary.py

def get_alphamissense_proteingym_data(cache_dir: str = ".cache") -> pd.DataFrame:
    """
    Download and process AlphaMissense supplementary data for ProteinGym variants.

    This loads Table S8 from Cheng et al. 2023 containing AlphaMissense pathogenicity
    scores for ~1.6M variants that match those in ProteinGym from 87 DMS experiments
    across 72 proteins.

    The CSV is read from ``cache_dir`` when already present. Otherwise it is
    extracted from the supplementary zip, which is looked for (in order) next to
    the package, in ``cache_dir``, and finally downloaded from the GitHub mirror
    into ``cache_dir``.

    Args:
        cache_dir: Directory to cache downloaded files

    Returns:
        DataFrame with columns:
        - DMS_id: DMS assay identifier
        - Uniprot_ID: UniProt accession (resolved via UniProt API)
        - SwissProt_ID: Original AlphaMissense SwissProt entry name
        - variant_id: Variant identifier
        - AlphaMissense: Pathogenicity score (0-1, higher = more pathogenic)

    Raises:
        FileNotFoundError: If the CSV is not cached and no zip could be found or
            downloaded, or if the zip does not contain the ProteinGym CSV.
    """
    os.makedirs(cache_dir, exist_ok=True)

    zip_name = "science.adg7492_data_s1_to_s9.zip"
    csv_path = os.path.join(cache_dir, "Supplementary_Data_S8_proteingym.csv")

    # Original location (kept for reference):
    # https://www.science.org/doi/suppl/10.1126/science.adg7492/suppl_file/science.adg7492_data_s1_to_s9.zip
    # Science is blocking requests with TLS fingerprinting, so we rely on a local
    # copy or on the GitHub mirror instead of fetching from the publisher.
    repo_zip_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "..", zip_name)
    )
    cache_zip_path = os.path.join(cache_dir, zip_name)

    # Prefer the zip bundled with the repository; otherwise fall back to the cache.
    zip_path = repo_zip_path if os.path.exists(repo_zip_path) else cache_zip_path

    # Extract the CSV only if it has not been extracted before.
    if not os.path.exists(csv_path):
        download_error = None

        if not os.path.exists(zip_path):
            print("Zip file not found locally. Downloading from GitHub...")
            url = "https://github.com/ccb-hms/ProteinGymPy/blob/main/src/science.adg7492_data_s1_to_s9.zip?raw=true"
            # Stream into a temporary ".part" file and rename only on success, so an
            # interrupted download can never be mistaken for a complete zip later.
            partial_path = cache_zip_path + ".part"
            try:
                # The timeout keeps a stalled connection from hanging the caller
                # forever; the context manager releases the streamed connection.
                with requests.get(url, stream=True, timeout=60) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(partial_path, cache_zip_path)
                zip_path = cache_zip_path
                print(f"Downloaded {zip_path}")
            except (requests.RequestException, OSError) as e:
                # Best effort: report the problem and fall through to the
                # FileNotFoundError below, which carries this error as its cause.
                download_error = e
                print(f"Warning: Failed to download AlphaMissense data: {e}")
                try:
                    if os.path.exists(partial_path):
                        os.remove(partial_path)
                except OSError:
                    # Cleanup failure must not mask the download failure.
                    pass

        if not os.path.exists(zip_path):
            # No cached CSV, no local zip, and the download did not succeed.
            raise FileNotFoundError(
                f"AlphaMissense supplementary CSV not found at {csv_path} and no local zip found at {repo_zip_path} or {cache_zip_path}."
            ) from download_error

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            extracted_name = _extract_proteingym_csv(zip_ref, csv_path)
        if not extracted_name:
            raise FileNotFoundError("Could not find ProteinGym supplementary CSV in zip file (including nested archives)")
        print(f"Extracted {extracted_name} from {zip_path} -> {csv_path}")

    # Load the data
    print("Loading AlphaMissense ProteinGym data...")

    df = pd.read_csv(csv_path)
    df = _add_uniprot_accessions(df, cache_dir)

    # Ensure AlphaMissense column is numeric; unparseable values become NaN.
    df['AlphaMissense'] = pd.to_numeric(df['AlphaMissense'], errors='coerce')

    print(f"Loaded {len(df):,} AlphaMissense scores for ProteinGym variants")
    print(f"Data covers {df['DMS_id'].nunique()} DMS assays")
    print(f"Columns: {list(df.columns)}")

    return df