get_supervised_substitution_data

Download and process raw ProteinGym supervised model substitution scores.

Parameters:
  • fold_type (str, default: 'random_5' ) –

    Type of cross-validation fold ("contiguous_5", "modulo_5", or "random_5")

  • cache_dir (str, default: '.cache' ) –

    Directory to cache downloaded files

Returns:
  • Tuple[Dict[str, DataFrame], DataFrame] –

    Tuple of (supervised_scores_dict, summary_metrics_df)

    • supervised_scores_dict: Dictionary mapping DMS assay names to DataFrames with model predictions
    • summary_metrics_df: DataFrame with performance metrics across assays and models
Source code in proteingympy/make_supervised_scores.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def get_supervised_substitution_data(
    fold_type: str = "random_5",
    cache_dir: str = ".cache"
) -> Tuple[Dict[str, pd.DataFrame], pd.DataFrame]:
    """
    Download and process raw ProteinGym supervised model substitution scores.

    The score archive is downloaded into ``cache_dir`` on first use and reused
    afterwards. The summary CSV is copied out of the archive (if present) so
    that ``get_supervised_metrics(cache_dir)`` can be called on the same
    directory.

    Args:
        fold_type: Type of cross-validation fold ("contiguous_5", "modulo_5", or "random_5")
        cache_dir: Directory to cache downloaded files

    Returns:
        Tuple of (supervised_scores_dict, summary_metrics_df)
        - supervised_scores_dict: Dictionary mapping DMS assay names to DataFrames with model predictions
        - summary_metrics_df: DataFrame with performance metrics across assays and models

    Raises:
        ValueError: If ``fold_type`` is not one of the supported fold types.
        requests.RequestException: If the archive download fails or times out.
    """
    if fold_type not in ("contiguous_5", "modulo_5", "random_5"):
        raise ValueError("fold_type must be one of: 'contiguous_5', 'modulo_5', 'random_5'")

    os.makedirs(cache_dir, exist_ok=True)

    # Archive of supervised scores hosted on Zenodo.
    zip_path = os.path.join(cache_dir, "DMS_supervised_substitutions_scores.zip")

    if not os.path.exists(zip_path):
        url = "https://zenodo.org/records/14997691/files/DMS_supervised_substitutions_scores.zip?download=1"
        print(f"Downloading supervised scores from {url}...")

        # Stream into a temporary ".part" file and rename only on success, so an
        # interrupted download never leaves a truncated zip that later calls
        # would mistake for a valid cache entry.
        partial_path = zip_path + ".part"
        try:
            # The timeout bounds connect time and each wait for data, not the
            # total transfer, so large downloads still complete.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, zip_path)
        finally:
            # No-op after a successful rename; removes leftovers on failure.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")

    # Extract the summary metrics CSV next to the archive if it is not cached yet.
    target_file = "merged_scores_substitutions_DMS.csv"
    summary_path = os.path.join(cache_dir, target_file)
    if not os.path.exists(summary_path) and os.path.exists(zip_path):
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                names = zip_ref.namelist()
                # Prefer a root-level member, otherwise accept the file inside
                # any subfolder of the archive.
                if target_file in names:
                    member = target_file
                else:
                    member = next(
                        (name for name in names if name.endswith("/" + target_file)),
                        None,
                    )
                if member is not None:
                    # Copy the bytes to a flat path rather than using extract(),
                    # which would recreate the archive's subfolder under cache_dir.
                    with zip_ref.open(member) as src, open(summary_path, "wb") as dst:
                        for block in iter(lambda: src.read(1024 * 1024), b""):
                            dst.write(block)
        except zipfile.BadZipFile:
            print(f"Warning: Could not read {zip_path} to extract summary metrics")

    # Load supervised scores for specific fold type
    supervised_tables = _load_supervised_fold_data(zip_path, fold_type)

    # Add UniProt IDs
    supervised_tables = _add_uniprot_ids_supervised(supervised_tables)

    # Clean up column names (replace hyphens with underscores, remove spaces, etc.)
    supervised_tables = _clean_supervised_column_names(supervised_tables)

    # Load summary metrics
    summary_df = get_supervised_metrics(cache_dir)

    return supervised_tables, summary_df