get_dms_metadata

Download and process DMS substitutions metadata/reference file.

Parameters:	`cache_dir` (`str`, default: `'.cache'`) – Directory to cache downloaded files

Returns:	`DataFrame` – DataFrame with metadata for 217 DMS assays

Source code in proteingympy/make_dms_substitutions.py

def get_dms_metadata(cache_dir: str = ".cache") -> pd.DataFrame:
    """
    Download (if not cached) and process DMS substitutions metadata/reference file.

    The file is stored as ``DMS_substitutions.csv`` inside ``cache_dir``. When
    that file already exists it is loaded directly and no network request is
    made.

    Args:
        cache_dir: Directory to cache downloaded files. Created if missing.

    Returns:
        DataFrame with metadata for 217 DMS assays. The descriptive columns
        listed in ``categorical_cols`` below are converted to ``category``
        dtype when they are present in the file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On other network failures, including
            connect/read timeouts.
    """
    os.makedirs(cache_dir, exist_ok=True)
    metadata_path = os.path.join(cache_dir, "DMS_substitutions.csv")

    if not os.path.exists(metadata_path):
        url = "https://zenodo.org/records/15293562/files/DMS_substitutions.csv"
        # Stream into a sibling ".part" file and only move it into place once
        # the download has fully succeeded. Otherwise an interrupted download
        # would leave a truncated CSV that later calls mistake for a valid cache.
        partial_path = metadata_path + ".part"
        print(f"Downloading metadata from {url}...")
        try:
            # The timeout bounds connection setup and each wait for data, so a
            # stalled server cannot hang the caller forever. The context
            # manager releases the streamed connection even on error.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, metadata_path)
        finally:
            # After a successful os.replace the partial file no longer exists;
            # on any failure, remove the leftover so it cannot accumulate.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Metadata download complete.")

    # Load and process metadata
    df = pd.read_csv(metadata_path)

    # Convert categorical columns
    categorical_cols = [
        'taxon', 'source_organism', 'DMS_binarization_method',
        'selection_type', 'selection_assay', 'raw_DMS_phenotype_name',
        'raw_DMS_directionality', 'raw_DMS_mutant_column',
        'ProteinGym_version', 'coarse_selection_type',
    ]

    # Columns missing from the file are skipped rather than raising KeyError.
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df