Download and process ProteinGym DMS substitution data.
Returns a dictionary of 217 DMS assays, each as a pandas DataFrame with columns:
- UniProt_id: UniProt accession identifier
- DMS_id: DMS assay identifier
- mutant: substitution description (e.g. A1P:D2N)
- mutated_sequence: full amino acid sequence
- DMS_score: experimental measurement (higher = more fit)
- DMS_score_bin: binary fitness (1=fit, 0=not fit)
Parameters:
- cache_dir (str, default: '.cache') – Directory to cache downloaded files.
- use_cache (bool, default: True) – If True, use cached file if it exists. If False, force a fresh download.
Returns:
- Dict[str, DataFrame] – Dictionary mapping DMS study names to DataFrames.
Source code in proteingympy/make_dms_substitutions.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def get_dms_substitution_data(cache_dir: str = ".cache", use_cache: bool = True) -> Dict[str, pd.DataFrame]:
    """
    Download and process ProteinGym DMS substitution data.

    Returns a dictionary of 217 DMS assays, each as a pandas DataFrame with columns:
    - UniProt_id: UniProt accession identifier
    - DMS_id: DMS assay identifier
    - mutant: substitution description (e.g. A1P:D2N)
    - mutated_sequence: full amino acid sequence
    - DMS_score: experimental measurement (higher = more fit)
    - DMS_score_bin: binary fitness (1=fit, 0=not fit)

    Any additional columns present in a CSV are kept, after the columns above.

    Args:
        cache_dir: Directory to cache downloaded files (created if missing).
        use_cache: If True, use cached file if it exists. If False, force a fresh download.

    Returns:
        Dictionary mapping DMS study names (CSV file names without extension) to DataFrames.

    Raises:
        requests.HTTPError: If the server answers the download with an error status.
        KeyError: If a CSV in the archive has no ``DMS_score_bin`` column.
    """
    os.makedirs(cache_dir, exist_ok=True)
    zip_path = os.path.join(cache_dir, "DMS_ProteinGym_substitutions.zip")

    # Download if not cached or if use_cache is False
    if not use_cache or not os.path.exists(zip_path):
        url = "https://zenodo.org/records/15293562/files/DMS_ProteinGym_substitutions.zip"
        print(f"Downloading {url} to {zip_path}...")

        # Stream into a sibling ".part" file and only move it over the cache
        # path once the download has finished. An interrupted or failed
        # download therefore never leaves a truncated zip behind that a later
        # use_cache=True call would pick up, and an existing cached archive
        # survives until its replacement is complete.
        part_path = zip_path + ".part"
        try:
            # timeout: without it a stalled connection would block forever.
            # The context manager releases the connection when done.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(part_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(part_path, zip_path)
        finally:
            # Only present here if the download did not complete.
            if os.path.exists(part_path):
                os.remove(part_path)
        print("Download complete.")
    else:
        print(f"Using cached file at {zip_path}.")

    # Extract and load data
    progym_tables = {}
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        file_list = [f for f in zip_ref.namelist() if f.endswith('.csv')]
        for csv_file in file_list:
            # Extract DMS study name (remove directory part and .csv extension)
            study_name = os.path.splitext(os.path.basename(csv_file))[0]

            # Read CSV straight from the archive, without extracting to disk
            with zip_ref.open(csv_file) as f:
                df = pd.read_csv(f)

            # Add DMS_id column
            df['DMS_id'] = study_name

            # Convert DMS_score_bin to categorical
            df['DMS_score_bin'] = df['DMS_score_bin'].astype('category')

            progym_tables[study_name] = df

    # Add UniProt IDs (delegated to the module's _add_uniprot_ids helper)
    progym_tables = _add_uniprot_ids(progym_tables)

    # Reorder columns
    cols = ['UniProt_id', 'DMS_id', 'mutant', 'mutated_sequence', 'DMS_score', 'DMS_score_bin']
    for study_name, df in progym_tables.items():
        # Select and reorder columns (keep any additional columns at end).
        # Only values are reassigned, never keys, so updating the dict while
        # iterating over it is safe.
        available_cols = [col for col in cols if col in df.columns]
        other_cols = [col for col in df.columns if col not in cols]
        progym_tables[study_name] = df[available_cols + other_cols]

    return progym_tables
|