@@ -122,6 +122,8 @@ def from_csr(cls,
122
122
"""
123
123
Initialize an LDMatrix object from a sparse CSR matrix.
124
124
125
+ TODO: Determine the chunk size based on the average neighborhood size?
126
+
125
127
:param csr_mat: The sparse CSR matrix.
126
128
:param store_path: The path to the Zarr LD store where the data will be stored.
127
129
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -173,8 +175,9 @@ def from_plink_table(cls,
173
175
compressor_name = 'lz4' ,
174
176
compression_level = 5 ):
175
177
"""
176
- Construct a Zarr LD matrix using output tables from plink1.9.
177
- This class method takes the following inputs:
178
+ Construct a Zarr LD matrix using LD tables generated by plink1.9.
179
+
180
+ TODO: Determine the chunk size based on the average neighborhood size?
178
181
179
182
:param plink_ld_file: The path to the plink LD table file.
180
183
:param snps: An iterable containing the list of SNPs in the LD matrix.
@@ -265,6 +268,8 @@ def from_dense_zarr_matrix(cls,
265
268
useful for converting a dense LD matrix computed using Dask (or other distributed computing
266
269
software) to a sparse or banded one.
267
270
271
+ TODO: Determine the chunk size based on the average neighborhood size?
272
+
268
273
:param dense_zarr: The path to the dense Zarr array object.
269
274
:param ld_boundaries: The LD boundaries for each SNP in the LD matrix (delineates the indices of
270
275
the leftmost and rightmost neighbors of each SNP).
@@ -364,6 +369,8 @@ def from_ragged_zarr_matrix(cls,
364
369
This utility function will also copy some of the stored attributes
365
370
associated with the matrix in the old format.
366
371
372
+ TODO: Determine the chunk size based on the average neighborhood size?
373
+
367
374
:param ragged_zarr: The path to the ragged Zarr array object.
368
375
:param store_path: The path where the new LD matrix will be stored.
369
376
:param overwrite: If True, it overwrites the LD store at `store_path`.
@@ -722,7 +729,7 @@ def n_neighbors(self):
722
729
:return: The number of variants in the LD window for each SNP.
723
730
724
731
"""
725
- return self .window_size ()
732
+ return self .window_size
726
733
727
734
@property
728
735
def csr_matrix (self ):
@@ -1150,8 +1157,10 @@ def low_memory_load(self, dtype=None):
1150
1157
from .stats .ld .c_utils import filter_ut_csr_matrix_low_memory
1151
1158
1152
1159
data_mask , indptr = filter_ut_csr_matrix_low_memory (indptr , mask )
1153
- # Unfortunately, .vindex is very slow in Zarr right now (~order of magnitude)
1154
- # So for now, we load the entire data array before performing the mask selection:
1160
+ # .oindex and .vindex are slow and likely convert to integer indices in the background,
1161
+ # which unnecessarily increases memory usage. Unfortunately, here we have to load the entire
1162
+ # data and index it using the boolean array afterward.
1163
+ # TODO: Find a way to apply the mask without loading the entire data array.
1155
1164
data = self ._zg ['matrix/data' ][:][data_mask ]
1156
1165
else :
1157
1166
data = self ._zg ['matrix/data' ][:]
@@ -1282,7 +1291,7 @@ def load_rows(self,
1282
1291
mat .data [mat .data == 0 ] = invalid_value
1283
1292
1284
1293
# Add the matrix transpose to make it symmetric:
1285
- mat = ( mat + mat . T ). astype ( dtype )
1294
+ mat += mat . T
1286
1295
1287
1296
# If the user requested filling the diagonals, do it here:
1288
1297
if fill_diag :
@@ -1458,7 +1467,7 @@ def __iter__(self):
1458
1467
TODO: Add a flag to allow for a chunked iterator with a limited memory footprint.
1459
1468
"""
1460
1469
self .index = 0
1461
- self .load (return_symmetric = self .is_symmetric )
1470
+ self .load (return_symmetric = self .is_symmetric , fill_diag = self . is_symmetric )
1462
1471
return self
1463
1472
1464
1473
def __next__ (self ):
0 commit comments