get_dms_substitution_data

Download and process ProteinGym DMS substitution data.

Returns a dictionary of 217 DMS assays, each as a pandas DataFrame with columns:

- UniProt_id: UniProt accession identifier
- DMS_id: DMS assay identifier
- mutant: substitution description (e.g. A1P:D2N)
- mutated_sequence: full amino acid sequence
- DMS_score: experimental measurement (higher = more fit)
- DMS_score_bin: binary fitness (1=fit, 0=not fit)

Parameters:

- `cache_dir` (`str`, default: `'.cache'`) – Directory to cache downloaded files.
- `use_cache` (`bool`, default: `True`) – If True, use cached file if it exists. If False, force a fresh download.

Returns:	`Dict[str, DataFrame]` – Dictionary mapping DMS study names to DataFrames

Source code in proteingympy/make_dms_substitutions.py

def get_dms_substitution_data(cache_dir: str = ".cache", use_cache: bool = True) -> Dict[str, pd.DataFrame]:
    """
    Download and process ProteinGym DMS substitution data.

    Returns a dictionary of 217 DMS assays, each as a pandas DataFrame with columns:
    - UniProt_id: UniProt accession identifier
    - DMS_id: DMS assay identifier
    - mutant: substitution description (e.g. A1P:D2N)
    - mutated_sequence: full amino acid sequence
    - DMS_score: experimental measurement (higher = more fit)
    - DMS_score_bin: binary fitness (1=fit, 0=not fit)

    The archive is streamed to a temporary ``.part`` file next to the cache
    location and only moved into place once the download has completed, so an
    interrupted download never leaves a truncated zip in the cache.

    Args:
        cache_dir: Directory to cache downloaded files
        use_cache: If True, use cached file if it exists. If False, force a fresh download.

    Returns:
        Dictionary mapping DMS study names to DataFrames

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server responds with an HTTP error status.
        zipfile.BadZipFile: If the cached archive is not a valid zip file.
        KeyError: If a CSV in the archive has no ``DMS_score_bin`` column.
    """
    os.makedirs(cache_dir, exist_ok=True)
    zip_path = os.path.join(cache_dir, "DMS_ProteinGym_substitutions.zip")

    # Download if not cached or if use_cache is False
    if not use_cache or not os.path.exists(zip_path):
        url = "https://zenodo.org/records/15293562/files/DMS_ProteinGym_substitutions.zip"
        print(f"Downloading {url} to {zip_path}...")

        # Write to a sibling temp file first; a partially written archive must
        # never be visible under zip_path, or later cached runs would pick it up.
        partial_path = zip_path + ".part"
        try:
            # The timeout bounds connection setup and each socket read, so a
            # stalled server cannot hang the call forever. The context manager
            # releases the connection even if streaming fails midway.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Atomically promote the finished download, replacing any stale
            # cached copy only after the new one is complete.
            os.replace(partial_path, zip_path)
        finally:
            # After a successful os.replace the temp file no longer exists;
            # on any failure this removes the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")
    else:
        print(f"Using cached file at {zip_path}.")

    # Extract and load data
    progym_tables = {}

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        file_list = [f for f in zip_ref.namelist() if f.endswith('.csv')]

        for csv_file in file_list:
            # Extract DMS study name (remove directory and .csv extension)
            study_name = os.path.splitext(os.path.basename(csv_file))[0]

            # Read CSV directly from the zip without extracting to disk
            with zip_ref.open(csv_file) as f:
                df = pd.read_csv(f)

            # Add DMS_id column
            df['DMS_id'] = study_name

            # Convert DMS_score_bin to categorical
            df['DMS_score_bin'] = df['DMS_score_bin'].astype('category')

            progym_tables[study_name] = df

    # Add UniProt IDs
    progym_tables = _add_uniprot_ids(progym_tables)

    # Reorder columns
    cols = ['UniProt_id', 'DMS_id', 'mutant', 'mutated_sequence', 'DMS_score', 'DMS_score_bin']

    # Only values of existing keys are reassigned, so iterating while
    # updating is safe (the dict's size never changes).
    for study_name, df in progym_tables.items():
        # Select and reorder columns (keep any additional columns at end)
        available_cols = [col for col in cols if col in df.columns]
        other_cols = [col for col in df.columns if col not in cols]
        progym_tables[study_name] = df[available_cols + other_cols]

    return progym_tables