Download and process raw ProteinGym supervised model substitution scores.
| Parameters: |
-
fold_type
(str, default:
'random_5'
)
–
Type of cross-validation fold ("contiguous_5", "modulo_5", or "random_5")
-
cache_dir
(str, default:
'.cache'
)
–
Directory to cache downloaded files
|
| Returns: |
-
Tuple[Dict[str, DataFrame], DataFrame]
–
Tuple of (supervised_scores_dict, summary_metrics_df)
-
Dict[str, DataFrame]
–
- supervised_scores_dict: Dictionary mapping DMS assay names to DataFrames with model predictions
-
DataFrame
–
- summary_metrics_df: DataFrame with performance metrics across assays and models
|
Source code in proteingympy/make_supervised_scores.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def get_supervised_substitution_data(
    fold_type: str = "random_5",
    cache_dir: str = ".cache"
) -> Tuple[Dict[str, pd.DataFrame], pd.DataFrame]:
    """
    Download and process raw ProteinGym supervised model substitution scores.

    The score archive is downloaded into ``cache_dir`` only when it is not
    already there. The download is written to a temporary ``.part`` file and
    moved into place once complete, so an interrupted transfer never leaves a
    truncated archive that a later call would mistake for a valid cache entry.

    Args:
        fold_type: Type of cross-validation fold ("contiguous_5", "modulo_5", or "random_5")
        cache_dir: Directory to cache downloaded files

    Returns:
        Tuple of (supervised_scores_dict, summary_metrics_df)
        - supervised_scores_dict: Dictionary mapping DMS assay names to DataFrames with model predictions
        - summary_metrics_df: DataFrame with performance metrics across assays and models

    Raises:
        ValueError: If ``fold_type`` is not one of the supported fold types.
        requests.RequestException: If the archive download fails (connection
            error, timeout, or HTTP error status).
    """
    if fold_type not in ("contiguous_5", "modulo_5", "random_5"):
        raise ValueError("fold_type must be one of: 'contiguous_5', 'modulo_5', 'random_5'")

    os.makedirs(cache_dir, exist_ok=True)

    # Download the supervised scores archive from Zenodo unless it is cached.
    zip_path = os.path.join(cache_dir, "DMS_supervised_substitutions_scores.zip")
    if not os.path.exists(zip_path):
        url = "https://zenodo.org/records/14997691/files/DMS_supervised_substitutions_scores.zip?download=1"
        print(f"Downloading supervised scores from {url}...")
        partial_path = zip_path + ".part"
        try:
            # The timeout bounds connection setup and each stalled read, not the
            # whole transfer, so large downloads still succeed on slow links.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Publish the archive only after it has been fully written.
            os.replace(partial_path, zip_path)
        finally:
            # After a successful os.replace the partial file no longer exists;
            # on any failure this removes the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")

    # Extract the summary metrics CSV from the archive if it is not cached yet.
    summary_path = os.path.join(cache_dir, "merged_scores_substitutions_DMS.csv")
    if not os.path.exists(summary_path) and os.path.exists(zip_path):
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                target_file = "merged_scores_substitutions_DMS.csv"
                members = zip_ref.namelist()
                if target_file in members:
                    # File sits at the archive root: extract it directly.
                    zip_ref.extract(target_file, cache_dir)
                else:
                    # Otherwise look for it inside a subfolder and write it
                    # to the flat cache location.
                    nested = next(
                        (m for m in members if m.endswith("/" + target_file)),
                        None,
                    )
                    if nested is not None:
                        # Read fully first so a corrupt member raises before
                        # anything is written to summary_path.
                        data = zip_ref.read(nested)
                        with open(summary_path, "wb") as out:
                            out.write(data)
        except zipfile.BadZipFile:
            # Best effort: the summary metrics loader below is still called.
            print(f"Warning: Could not read {zip_path} to extract summary metrics")

    # Load supervised scores for specific fold type
    supervised_tables = _load_supervised_fold_data(zip_path, fold_type)
    # Add UniProt IDs
    supervised_tables = _add_uniprot_ids_supervised(supervised_tables)
    # Clean up column names (replace hyphens with underscores, remove spaces, etc.)
    supervised_tables = _clean_supervised_column_names(supervised_tables)
    # Load summary metrics
    summary_df = get_supervised_metrics(cache_dir)
    return supervised_tables, summary_df
|