get_supervised_substitution_data¶

Download and process raw ProteinGym supervised model substitution scores.

Parameters:	`fold_type` (`str`, default: `'random_5'` ) – Type of cross-validation fold ("contiguous_5", "modulo_5", or "random_5") `cache_dir` (`str`, default: `'.cache'` ) – Directory to cache downloaded files

Returns:	`Tuple[Dict[str, DataFrame], DataFrame]` – Tuple of (supervised_scores_dict, summary_metrics_df) `Dict[str, DataFrame]` – supervised_scores_dict: Dictionary mapping DMS assay names to DataFrames with model predictions `DataFrame` – summary_metrics_df: DataFrame with performance metrics across assays and models

Source code in proteingympy/make_supervised_scores.py

def get_supervised_substitution_data(
    fold_type: str = "random_5",
    cache_dir: str = ".cache"
) -> Tuple[Dict[str, pd.DataFrame], pd.DataFrame]:
    """
    Download and process raw ProteinGym supervised model substitution scores.

    The scores archive is downloaded into ``cache_dir`` on first use and reused
    on later calls. The download is written to a temporary ``.part`` file and
    only renamed to its final name once it has completed, so an interrupted
    download never leaves a truncated archive behind as a "cached" file.

    Args:
        fold_type: Type of cross-validation fold ("contiguous_5", "modulo_5", or "random_5")
        cache_dir: Directory to cache downloaded files (created if missing)

    Returns:
        Tuple of (supervised_scores_dict, summary_metrics_df)
        - supervised_scores_dict: Dictionary mapping DMS assay names to DataFrames with model predictions
        - summary_metrics_df: DataFrame with performance metrics across assays and models

    Raises:
        ValueError: If ``fold_type`` is not one of the supported fold types.
        requests.RequestException: If the archive download fails (HTTP error
            status, connection problem, or timeout).
    """
    valid_fold_types = ("contiguous_5", "modulo_5", "random_5")
    if fold_type not in valid_fold_types:
        raise ValueError("fold_type must be one of: 'contiguous_5', 'modulo_5', 'random_5'")

    os.makedirs(cache_dir, exist_ok=True)

    # Download the supervised scores archive from Zenodo unless already cached
    zip_path = os.path.join(cache_dir, "DMS_supervised_substitutions_scores.zip")

    if not os.path.exists(zip_path):
        url = "https://zenodo.org/records/14997691/files/DMS_supervised_substitutions_scores.zip?download=1"
        print(f"Downloading supervised scores from {url}...")

        # Stream into a temporary file first; it is moved into place only after
        # the whole body has been written successfully.
        partial_path = zip_path + ".part"
        try:
            # (connect, read) timeouts: without them a stalled server would
            # block this call indefinitely. The context manager releases the
            # connection even when an error occurs mid-stream.
            with requests.get(url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, zip_path)
        finally:
            # Still present only if the download failed before the rename.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Download complete.")

    # Check if we need to extract summary metrics
    summary_path = os.path.join(cache_dir, "merged_scores_substitutions_DMS.csv")
    if not os.path.exists(summary_path) and os.path.exists(zip_path):
        try:
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Only an entry at the archive root is matched; a copy nested
                # inside a subfolder of the zip is not extracted here.
                target_file = "merged_scores_substitutions_DMS.csv"
                if target_file in zip_ref.namelist():
                    zip_ref.extract(target_file, cache_dir)
        except zipfile.BadZipFile:
            # Best effort: the summary metrics loader below is still attempted.
            print(f"Warning: Could not read {zip_path} to extract summary metrics")

    # Load supervised scores for specific fold type
    supervised_tables = _load_supervised_fold_data(zip_path, fold_type)

    # Add UniProt IDs
    supervised_tables = _add_uniprot_ids_supervised(supervised_tables)

    # Clean up column names (replace hyphens with underscores, remove spaces, etc.)
    supervised_tables = _clean_supervised_column_names(supervised_tables)

    # Load summary metrics
    summary_df = get_supervised_metrics(cache_dir)

    return supervised_tables, summary_df