Download and process DMS substitutions metadata/reference file.
Parameters:
- cache_dir (str, default: '.cache') – Directory to cache downloaded files
Returns:
- DataFrame – DataFrame with metadata for 217 DMS assays
Source code in proteingympy/make_dms_substitutions.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
def get_dms_metadata(cache_dir: str = ".cache") -> pd.DataFrame:
    """
    Download and process DMS substitutions metadata/reference file.

    The CSV is fetched once and stored as ``DMS_substitutions.csv`` inside
    ``cache_dir``; later calls read the cached copy without touching the
    network.

    Args:
        cache_dir: Directory to cache downloaded files. Created if missing.

    Returns:
        DataFrame with metadata for 217 DMS assays. Known descriptive
        columns that are present in the file are converted to the pandas
        ``category`` dtype.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    os.makedirs(cache_dir, exist_ok=True)
    metadata_path = os.path.join(cache_dir, "DMS_substitutions.csv")

    if not os.path.exists(metadata_path):
        url = "https://zenodo.org/records/15293562/files/DMS_substitutions.csv"
        print(f"Downloading metadata from {url}...")

        # Stream into a sibling ".part" file and only move it into place once
        # the transfer has finished, so an interrupted download can never be
        # mistaken for a valid cached file on the next call.
        partial_path = metadata_path + ".part"
        try:
            # The timeout keeps a stalled server from hanging the caller
            # forever; the context manager releases the connection.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_path, metadata_path)
        finally:
            # After a successful os.replace the partial file is gone, so this
            # only cleans up after a failed or interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)

        print("Metadata download complete.")

    # Load and process metadata
    df = pd.read_csv(metadata_path)

    # Convert categorical columns (only those actually present in the file)
    categorical_cols = [
        'taxon', 'source_organism', 'DMS_binarization_method',
        'selection_type', 'selection_assay', 'raw_DMS_phenotype_name',
        'raw_DMS_directionality', 'raw_DMS_mutant_column',
        'ProteinGym_version', 'coarse_selection_type',
    ]
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')

    return df
|