Download and process ProteinGym zero-shot model scores for DMS substitutions.
This loads zero-shot model predictions across 217 DMS assays for multiple models.
Each assay contains predictions from various protein language models and other
zero-shot approaches.
| Parameters: |
-
cache_dir
(str, default:
'.cache'
)
–
Directory to cache downloaded files
|
| Returns: |
-
Dict[str, DataFrame]
–
Dictionary mapping DMS assay names to DataFrames with columns:
- UniProt_id: UniProt accession identifier
- DMS_id: DMS assay identifier
- mutant: substitution description
- mutated_sequence: full amino acid sequence
- DMS_score: experimental measurement
- DMS_score_bin: binary fitness classification
- [model_name]: Prediction scores from various zero-shot models
|
Source code in proteingympy/make_zero_shot_substitutions.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def get_zero_shot_substitution_data(cache_dir: str = ".cache") -> Dict[str, pd.DataFrame]:
    """
    Download and process ProteinGym zero-shot model scores for DMS substitutions.

    This loads zero-shot model predictions across 217 DMS assays for multiple models.
    Each assay contains predictions from various protein language models and other
    zero-shot approaches.

    The score archive is downloaded into ``cache_dir`` on first use and reused on
    later calls. The download is streamed to a temporary ``.part`` file and only
    renamed to its final name once it has been fully written, so an interrupted
    download is never mistaken for a valid cached archive.

    Args:
        cache_dir: Directory to cache downloaded files

    Returns:
        Dictionary mapping DMS assay names to DataFrames with columns:
        - UniProt_id: UniProt accession identifier
        - DMS_id: DMS assay identifier
        - mutant: substitution description
        - mutated_sequence: full amino acid sequence
        - DMS_score: experimental measurement
        - DMS_score_bin: binary fitness classification
        - [model_name]: Prediction scores from various zero-shot models

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server responds with an HTTP error status.
        OSError: If the cache directory or archive file cannot be written.
    """
    os.makedirs(cache_dir, exist_ok=True)

    # Download zero-shot scores data
    zip_path = os.path.join(cache_dir, "zero_shot_substitutions_scores.zip")
    if not os.path.exists(zip_path):
        # URL from ProteinGym Zenodo v1.2
        url = "https://zenodo.org/records/14997691/files/zero_shot_substitutions_scores.zip?download=1"
        print(f"Downloading zero-shot scores from {url}...")

        # Stream into a sibling temp file first: writing directly to zip_path
        # would leave a truncated archive behind on failure, and the existence
        # check above would then treat it as a valid cache hit forever.
        partial_path = zip_path + ".part"
        try:
            # The timeout bounds connection setup and each wait for data, so a
            # stalled server cannot hang the call indefinitely. The context
            # manager releases the streamed connection even on error.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Atomic promotion: the final path only ever holds a complete file.
            os.replace(partial_path, zip_path)
        finally:
            # After a successful replace the temp file is gone; this only
            # fires when the download or write was interrupted.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")
    else:
        print(f"Zero-shot scores found in cache at {zip_path}")

    # Load zero-shot scores
    zeroshot_tables = _load_zero_shot_data(zip_path)

    # Add UniProt IDs
    zeroshot_tables = _add_uniprot_ids_zeroshot(zeroshot_tables)

    # Clean up column names (replace hyphens with underscores)
    zeroshot_tables = _clean_zeroshot_column_names(zeroshot_tables)

    return zeroshot_tables
|