get_dms_substitution_data

Download and process ProteinGym DMS substitution data.

Returns a dictionary of 217 DMS assays, each as a pandas DataFrame with columns:

- UniProt_id: UniProt accession identifier
- DMS_id: DMS assay identifier
- mutant: substitution description (e.g. A1P:D2N)
- mutated_sequence: full amino acid sequence
- DMS_score: experimental measurement (higher = more fit)
- DMS_score_bin: binary fitness (1=fit, 0=not fit)

Parameters:
  • cache_dir (str, default: '.cache' ) –

    Directory to cache downloaded files

  • use_cache (bool, default: True ) –

    If True, use cached file if it exists. If False, force a fresh download.

Returns:
  • Dict[str, DataFrame]

    Dictionary mapping DMS study names to DataFrames

Source code in proteingympy/make_dms_substitutions.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def get_dms_substitution_data(cache_dir: str = ".cache", use_cache: bool = True) -> Dict[str, pd.DataFrame]:
    """
    Download and process ProteinGym DMS substitution data.

    Returns a dictionary of 217 DMS assays, each as a pandas DataFrame with columns:
    - UniProt_id: UniProt accession identifier
    - DMS_id: DMS assay identifier
    - mutant: substitution description (e.g. A1P:D2N)
    - mutated_sequence: full amino acid sequence
    - DMS_score: experimental measurement (higher = more fit)
    - DMS_score_bin: binary fitness (1=fit, 0=not fit)

    Any further columns present in a CSV are kept, after the columns above.

    The archive is downloaded to a temporary ``.part`` file next to the cache
    path and only moved into place once the transfer has finished, so an
    interrupted download never leaves a truncated zip that later calls would
    mistake for a valid cache.

    Args:
        cache_dir: Directory to cache downloaded files
        use_cache: If True, use cached file if it exists. If False, force a fresh download.

    Returns:
        Dictionary mapping DMS study names to DataFrames

    Raises:
        requests.HTTPError: If the download request returns an error status.
        zipfile.BadZipFile: If the cached file is not a valid zip archive.
        KeyError: If a CSV in the archive has no ``DMS_score_bin`` column.
    """
    os.makedirs(cache_dir, exist_ok=True)
    zip_path = os.path.join(cache_dir, "DMS_ProteinGym_substitutions.zip")

    # Download if not cached or if use_cache is False
    if not use_cache or not os.path.exists(zip_path):
        url = "https://zenodo.org/records/15293562/files/DMS_ProteinGym_substitutions.zip"
        partial_path = zip_path + ".part"
        print(f"Downloading {url} to {zip_path}...")
        try:
            # The timeout stops a stalled server from hanging the call forever;
            # the context manager releases the streamed connection.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Promote the finished download in one step; this also overwrites
            # any stale cached copy when use_cache is False.
            os.replace(partial_path, zip_path)
        finally:
            # On failure, discard the incomplete file. After a successful
            # replace the partial path no longer exists.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")
    else:
        print(f"Using cached file at {zip_path}.")

    # Extract and load data
    progym_tables = {}

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        file_list = [f for f in zip_ref.namelist() if f.endswith('.csv')]

        for csv_file in file_list:
            # Extract DMS study name (remove directory and .csv extension)
            study_name = os.path.splitext(os.path.basename(csv_file))[0]

            # Read CSV from zip
            with zip_ref.open(csv_file) as f:
                df = pd.read_csv(f)

            # Add DMS_id column
            df['DMS_id'] = study_name

            # Convert DMS_score_bin to categorical
            df['DMS_score_bin'] = df['DMS_score_bin'].astype('category')

            progym_tables[study_name] = df

    # Add UniProt IDs (helper defined elsewhere in this module)
    progym_tables = _add_uniprot_ids(progym_tables)

    # Reorder columns
    cols = ['UniProt_id', 'DMS_id', 'mutant', 'mutated_sequence', 'DMS_score', 'DMS_score_bin']

    # Only values of existing keys are reassigned, so the dict does not change
    # size while it is being iterated.
    for study_name, df in progym_tables.items():
        # Select and reorder columns (keep any additional columns at end)
        available_cols = [col for col in cols if col in df.columns]
        other_cols = [col for col in df.columns if col not in cols]
        progym_tables[study_name] = df[available_cols + other_cols]

    return progym_tables