Skip to content

Commit 2e8013c

Browse files
committed
Small update to optimize memory usage and clean up bugs.
1 parent 9fde47f commit 2e8013c

12 files changed

+123
-45
lines changed

CHANGELOG.md

+18
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,24 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
9+
## [0.1.3] - 2024-05-15
10+
11+
### Changed
12+
13+
- Updated the logic for `detect_outliers` in phenotype transforms to actually reflect the function
14+
name (before it was returning true for inliers...).
15+
- Updated `quantize` and `dequantize` to minimize data copying as much as possible.
16+
- Updated `LDMatrix.load_rows()` method to minimize data copying.
17+
- Fixed bug in `LDMatrix.n_neighbors` implementation.
18+
- Updated `dask` version in `requirements.txt` to avoid installing `dask-expr`.
19+
20+
21+
### Added
22+
23+
- Added `get_peak_memory_usage` to `system_utils` to inspect peak memory usage of a process.
24+
- Placeholder method to perform QC on `SumstatsTable` objects (needs to be implemented still).
25+
826
## [0.1.2] - 2024-04-24
927

1028
### Changed

bin/magenpy_ld

+11-11
Original file line numberDiff line numberDiff line change
@@ -31,17 +31,17 @@ from magenpy.utils.system_utils import valid_url
3131
from magenpy.GenotypeMatrix import xarrayGenotypeMatrix, plinkBEDGenotypeMatrix
3232

3333
print(fr"""
34-
**********************************************
35-
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
36-
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
37-
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
38-
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
39-
|___/ |_| |___/
40-
Modeling and Analysis of Genetics data in python
41-
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
42-
Author: Shadi Zabad, McGill University
43-
**********************************************
44-
< Compute LD matrix and output in Zarr format >
34+
**********************************************
35+
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
36+
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
37+
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
38+
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
39+
|___/ |_| |___/
40+
Modeling and Analysis of Genetics data in python
41+
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
42+
Author: Shadi Zabad, McGill University
43+
**********************************************
44+
< Compute LD matrix and output in Zarr format >
4545
""")
4646

4747
parser = argparse.ArgumentParser(description="""

bin/magenpy_simulate

+11-11
Original file line numberDiff line numberDiff line change
@@ -33,17 +33,17 @@ import argparse
3333

3434

3535
print(fr"""
36-
**********************************************
37-
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
38-
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
39-
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
40-
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
41-
|___/ |_| |___/
42-
Modeling and Analysis of Genetics data in python
43-
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
44-
Author: Shadi Zabad, McGill University
45-
**********************************************
46-
< Simulate complex quantitative or case-control traits >
36+
********************************************************
37+
_ __ ___ __ _ __ _ ___ _ __ _ __ _ _
38+
| '_ ` _ \ / _` |/ _` |/ _ \ '_ \| '_ \| | | |
39+
| | | | | | (_| | (_| | __/ | | | |_) | |_| |
40+
|_| |_| |_|\__,_|\__, |\___|_| |_| .__/ \__, |
41+
|___/ |_| |___/
42+
Modeling and Analysis of Genetics data in python
43+
Version: {mgp.__version__} | Release date: {mgp.__release_date__}
44+
Author: Shadi Zabad, McGill University
45+
********************************************************
46+
< Simulate complex quantitative or case-control traits >
4747
""")
4848

4949
# --------- Options ---------

magenpy/LDMatrix.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ def from_csr(cls,
122122
"""
123123
Initialize an LDMatrix object from a sparse CSR matrix.
124124
125+
TODO: Determine the chunksize based on the avg neighborhood size?
126+
125127
:param csr_mat: The sparse CSR matrix.
126128
:param store_path: The path to the Zarr LD store where the data will be stored.
127129
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -173,8 +175,9 @@ def from_plink_table(cls,
173175
compressor_name='lz4',
174176
compression_level=5):
175177
"""
176-
Construct a Zarr LD matrix using output tables from plink1.9.
177-
This class method takes the following inputs:
178+
Construct a Zarr LD matrix using LD tables generated by plink1.9.
179+
180+
TODO: Determine the chunksize based on the avg neighborhood size?
178181
179182
:param plink_ld_file: The path to the plink LD table file.
180183
:param snps: An iterable containing the list of SNPs in the LD matrix.
@@ -265,6 +268,8 @@ def from_dense_zarr_matrix(cls,
265268
useful for converting a dense LD matrix computed using Dask (or other distributed computing
266269
software) to a sparse or banded one.
267270
271+
TODO: Determine the chunksize based on the avg neighborhood size?
272+
268273
:param dense_zarr: The path to the dense Zarr array object.
269274
:param ld_boundaries: The LD boundaries for each SNP in the LD matrix (delineates the indices of
270275
the leftmost and rightmost neighbors of each SNP).
@@ -364,6 +369,8 @@ def from_ragged_zarr_matrix(cls,
364369
This utility function will also copy some of the stored attributes
365370
associated with the matrix in the old format.
366371
372+
TODO: Determine the chunksize based on the avg neighborhood size?
373+
367374
:param ragged_zarr: The path to the ragged Zarr array object.
368375
:param store_path: The path where to store the new LD matrix.
369376
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -722,7 +729,7 @@ def n_neighbors(self):
722729
:return: The number of variants in the LD window for each SNP.
723730
724731
"""
725-
return self.window_size()
732+
return self.window_size
726733

727734
@property
728735
def csr_matrix(self):
@@ -1150,8 +1157,10 @@ def low_memory_load(self, dtype=None):
11501157
from .stats.ld.c_utils import filter_ut_csr_matrix_low_memory
11511158

11521159
data_mask, indptr = filter_ut_csr_matrix_low_memory(indptr, mask)
1153-
# Unfortunately, .vindex is very slow in Zarr right now (~order of magnitude)
1154-
# So for now, we load the entire data array before performing the mask selection:
1160+
# .oindex and .vindex are slow and likely convert to integer indices in the background,
1161+
# which unnecessarily increases memory usage. Unfortunately, here we have to load the entire
1162+
# data and index it using the boolean array afterward.
1163+
# Something to be improved in the future...
11551164
data = self._zg['matrix/data'][:][data_mask]
11561165
else:
11571166
data = self._zg['matrix/data'][:]
@@ -1282,7 +1291,7 @@ def load_rows(self,
12821291
mat.data[mat.data == 0] = invalid_value
12831292

12841293
# Add the matrix transpose to make it symmetric:
1285-
mat = (mat + mat.T).astype(dtype)
1294+
mat += mat.T
12861295

12871296
# If the user requested filling the diagonals, do it here:
12881297
if fill_diag:
@@ -1458,7 +1467,7 @@ def __iter__(self):
14581467
TODO: Add a flag to allow for chunked iterator, with limited memory footprint.
14591468
"""
14601469
self.index = 0
1461-
self.load(return_symmetric=self.is_symmetric)
1470+
self.load(return_symmetric=self.is_symmetric, fill_diag=self.is_symmetric)
14621471
return self
14631472

14641473
def __next__(self):

magenpy/SumstatsTable.py

+10
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,16 @@ def set_sample_size(self, n):
378378
"""
379379
self.table['N'] = n
380380

381+
def run_quality_control(self, reference_table=None):
382+
"""
383+
Run quality control checks on the summary statistics table.
384+
TODO: Implement quality control checks following recommendations given by Prive et al.:
385+
https://doi.org/10.1016/j.xhgg.2022.100136
386+
Given user fine-control over which checks to run and which to skip.
387+
Maybe move parts of this implementation to a module in `stats` (TBD)
388+
"""
389+
pass
390+
381391
def match(self, reference_table, correct_flips=True):
382392
"""
383393
Match the summary statistics table with a reference table,

magenpy/__init__.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
from .utils.data_utils import *
1818

19-
__version__ = '0.1.2'
20-
__release_date__ = 'April 2024'
19+
__version__ = '0.1.3'
20+
__release_date__ = 'May 2024'
2121

2222

2323
config = configparser.ConfigParser()

magenpy/stats/transforms/phenotype.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def rint(phenotype, offset=3./8):
3535
return norm.ppf((ranked_pheno - offset) / (len(ranked_pheno) - 2 * offset + 1))
3636

3737

38-
def detect_outliers(phenotype, sigma_threshold=5):
38+
def detect_outliers(phenotype, sigma_threshold=5, nan_policy='omit'):
3939
"""
4040
Detect samples with outlier phenotype values.
4141
This function takes a vector of quantitative phenotypes,
@@ -45,11 +45,14 @@ def detect_outliers(phenotype, sigma_threshold=5):
4545
:param phenotype: A numpy vector of continuous or quantitative phenotypes.
4646
:param sigma_threshold: The multiple of standard deviations or sigmas after
4747
which we consider the phenotypic value an outlier.
48+
:param nan_policy: The policy to use when encountering NaN values in the phenotype vector.
49+
By default, we compute the z-scores ignoring NaN values.
4850
49-
:return: A boolean array indicating whether the phenotype value is an outlier.
51+
:return: A boolean array indicating whether the phenotype value is an outlier (i.e.
52+
True indicates outlier).
5053
"""
5154
from scipy.stats import zscore
52-
return np.abs(zscore(phenotype)) < sigma_threshold
55+
return np.abs(zscore(phenotype, nan_policy=nan_policy)) > sigma_threshold
5356

5457

5558
def standardize(phenotype):
@@ -109,8 +112,8 @@ def chained_transform(sample_table,
109112
# Remove outlier samples whose phenotypes are more than `threshold` standard deviations from the mean:
110113
if outlier_sigma_threshold is not None:
111114
# Find outliers:
112-
mask = detect_outliers(phenotype, outlier_sigma_threshold)
113-
# Filter phenotype vector:
115+
mask = ~detect_outliers(phenotype, outlier_sigma_threshold)
116+
# Filter phenotype vector to exclude outliers:
114117
phenotype = phenotype[mask]
115118

116119
return phenotype, mask

magenpy/utils/model_utils.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -374,16 +374,21 @@ def quantize(floats, int_dtype=np.int8):
374374
# See discussions on Scale Quantization Mapping.
375375
scale = 2. / (info.max - (info.min + 1))
376376

377-
# Quantize the floats to int
378-
return np.clip((floats / scale).round(), info.min, info.max).astype(int_dtype)
377+
# Use as much in-place operations as possible
378+
# (Currently, we copy the data twice)
379+
scaled_floats = floats / scale
380+
np.round(scaled_floats, out=scaled_floats)
381+
np.clip(scaled_floats, info.min, info.max, out=scaled_floats)
382+
383+
return scaled_floats.astype(int_dtype)
379384

380385

381386
def dequantize(ints, float_dtype=np.float32):
382387
"""
383388
Dequantize integers to the specified floating point type.
384389
NOTE: Assumes original floats are in the range [-1, 1].
385390
:param ints: A numpy array of integers
386-
:param float_dtype: The floating point type to dequantize to.
391+
:param float_dtype: The floating point data type to dequantize the integers to.
387392
"""
388393

389394
# Infer the boundaries from the integer type
@@ -394,7 +399,10 @@ def dequantize(ints, float_dtype=np.float32):
394399
# See discussions on Scale Quantization Mapping.
395400
scale = 2. / (info.max - (info.min + 1))
396401

397-
return ints.astype(float_dtype) * scale
402+
dq = ints.astype(float_dtype)
403+
dq *= scale # in-place multiplication
404+
405+
return dq
398406

399407

400408
def multinomial_rvs(n, p):

magenpy/utils/system_utils.py

+32-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import subprocess
55
import glob
66
import psutil
7+
import sys
78

89

910
def available_cpu():
@@ -13,9 +14,38 @@ def available_cpu():
1314
return psutil.cpu_count() - 1
1415

1516

17+
def get_peak_memory_usage(include_children=False):
18+
"""
19+
Get the peak memory usage of the running process in Mega Bytes (MB).
20+
21+
!!! warning
22+
This function is only available on Unix-based systems for now.
23+
24+
:param include_children: A boolean flag to include the memory usage of the child processes.
25+
:return: The peak memory usage of the running process in Mega Bytes (MB).
26+
"""
27+
28+
try:
29+
import resource
30+
except ImportError:
31+
return
32+
33+
mem_usage_self = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
34+
35+
if include_children:
36+
mem_usage_self = max(mem_usage_self, resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss)
37+
38+
if sys.platform != 'darwin':
39+
mem_usage_self /= 1024
40+
else:
41+
mem_usage_self /= 1024**2
42+
43+
return mem_usage_self
44+
45+
1646
def get_memory_usage():
1747
"""
18-
Get the memory usage of the current process in Mega Bytes (MB)
48+
Get the current memory usage of the running process in Mega Bytes (MB)
1949
"""
2050
process = psutil.Process(os.getpid())
2151
mem_info = process.memory_info()
@@ -145,5 +175,5 @@ def delete_temp_files(prefix):
145175
for f in glob.glob(f"{prefix}*"):
146176
try:
147177
os.remove(f)
148-
except Exception as e:
178+
except Exception:
149179
continue

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
dask
1+
dask<=2024.1.0 # Seen installation issues with newer versions
22
scipy
33
numpy
44
pandas

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def no_cythonize(extensions, **_ignore):
7979

8080
setup(
8181
name="magenpy",
82-
version="0.1.2",
82+
version="0.1.3",
8383
author="Shadi Zabad",
8484
author_email="[email protected]",
8585
description="Modeling and Analysis of Statistical Genetics data in python",

tests/conda_manual_testing.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ do
2929

3030
# Install magenpy
3131
make clean
32-
python -m pip install -v -e .[test]
32+
python -m pip install --no-cache-dir -v -e .[test]
3333

3434
# List the installed packages:
3535
python -m pip list

0 commit comments

Comments
 (0)