get_dms_metadata

Download and process DMS substitutions metadata/reference file.

Parameters:
  • cache_dir (str, default: '.cache') –

    Directory to cache downloaded files

Returns:
  • DataFrame

    DataFrame with metadata for 217 DMS assays

Source code in proteingympy/make_dms_substitutions.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
def get_dms_metadata(cache_dir: str = ".cache") -> pd.DataFrame:
    """
    Download and process DMS substitutions metadata/reference file.

    On the first call the CSV is fetched from Zenodo and stored as
    ``DMS_substitutions.csv`` inside ``cache_dir``; later calls read the
    cached copy and perform no network access. Known descriptive columns
    are converted to pandas ``category`` dtype when present.

    Args:
        cache_dir: Directory to cache downloaded files

    Returns:
        DataFrame with metadata for 217 DMS assays

    Raises:
        requests.RequestException: If the download fails, times out, or
            the server answers with an HTTP error status.
    """
    os.makedirs(cache_dir, exist_ok=True)
    metadata_path = os.path.join(cache_dir, "DMS_substitutions.csv")

    if not os.path.exists(metadata_path):
        url = "https://zenodo.org/records/15293562/files/DMS_substitutions.csv"
        # Stream into a sibling ".part" file and promote it only once the
        # transfer finished; otherwise an interrupted download would leave a
        # truncated CSV that every later call would treat as a valid cache.
        partial_path = metadata_path + ".part"
        print(f"Downloading metadata from {url}...")
        try:
            # (connect, read) timeouts: without them a stalled server would
            # block the caller forever. The context manager releases the
            # connection even when an error interrupts the stream.
            with requests.get(url, stream=True, timeout=(10, 60)) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Rename within the same directory so the cache file appears
            # in a single step.
            os.replace(partial_path, metadata_path)
        finally:
            # Only still present when the download did not complete.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print("Metadata download complete.")

    # Load and process metadata
    df = pd.read_csv(metadata_path)

    # Convert categorical columns
    categorical_cols = [
        'taxon', 'source_organism', 'DMS_binarization_method',
        'selection_type', 'selection_assay', 'raw_DMS_phenotype_name',
        'raw_DMS_directionality', 'raw_DMS_mutant_column',
        'ProteinGym_version', 'coarse_selection_type',
    ]

    # Columns missing from the file are skipped rather than raising.
    dtype_map = {col: 'category' for col in categorical_cols if col in df.columns}
    return df.astype(dtype_map)