Skip to content

Commit e6d4481

Browse files
committed
Updating defaults for LD matrix construction
1 parent 9cdc274 commit e6d4481

File tree

6 files changed

+51
-43
lines changed

6 files changed

+51
-43
lines changed

Diff for: bin/magenpy_ld

+11-3
Original file line numberDiff line numberDiff line change
@@ -82,16 +82,16 @@ parser.add_argument('--metadata', dest='metadata', type=str,
8282

8383
# Argument for the float precision:
8484
parser.add_argument('--storage-dtype', dest='storage_dtype', type=str,
85-
default='int16', help='The data type for the entries of the LD matrix.',
85+
default='int8', help='The data type for the entries of the LD matrix.',
8686
choices={'float32', 'float64', 'int16', 'int8'})
8787

8888
# Add arguments for the compressor:
8989
parser.add_argument('--compressor', dest='compressor', type=str,
90-
default='lz4', help='The compressor name or compression algorithm to use for the LD matrix.',
90+
default='zstd', help='The compressor name or compression algorithm to use for the LD matrix.',
9191
choices={'lz4', 'zstd', 'gzip', 'zlib'})
9292

9393
parser.add_argument('--compression-level', dest='compression_level', type=int,
94-
default=5, help='The compression level to use for the entries of the LD matrix (1-9).')
94+
default=7, help='The compression level to use for the entries of the LD matrix (1-9).')
9595

9696
# Options for the various LD estimators:
9797

@@ -229,6 +229,7 @@ ld_mat = g.compute_ld(args.estimator,
229229
**ld_kwargs)
230230

231231
# Store metadata (if provided):
232+
232233
if args.metadata is not None:
233234
parsed_metadata = {
234235
k: v for entry in args.metadata.split(',') for k, v in [entry.strip().split('=')]
@@ -239,6 +240,13 @@ if args.metadata is not None:
239240
for k, v in parsed_metadata.items():
240241
ld_mat.set_store_attr(k, v)
241242

243+
if 'Date' not in parsed_metadata:
244+
# Store the date when the computation was done:
245+
ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
246+
247+
else:
248+
# Store the date when the computation was done:
249+
ld_mat.set_store_attr('Date', time.strftime("%Y-%m-%d"))
242250

243251
# Clean up all intermediate files and directories:
244252
g.cleanup()

Diff for: magenpy/GWADataLoader.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -415,13 +415,13 @@ def read_genotypes(self,
415415
gmat_class = plinkBEDGenotypeMatrix
416416

417417
if self.verbose and len(bed_files) < 2:
418-
print("> Reading BED file...")
418+
print("> Reading genotype metadata...")
419419

420420
self.genotype = {}
421421

422422
for bfile in tqdm(bed_files,
423423
total=len(bed_files),
424-
desc="Reading BED files",
424+
desc="Reading genotype metadata",
425425
disable=not self.verbose or len(bed_files) < 2):
426426
# Read BED file and update the genotypes dictionary:
427427
self.genotype.update(gmat_class.from_file(bfile,
@@ -615,9 +615,9 @@ def release_ld(self):
615615
def compute_ld(self,
616616
estimator,
617617
output_dir,
618-
dtype='int16',
619-
compressor_name='lz4',
620-
compression_level=5,
618+
dtype='int8',
619+
compressor_name='zstd',
620+
compression_level=7,
621621
**ld_kwargs):
622622
"""
623623
Compute the Linkage-Disequilibrium (LD) matrix or SNP-by-SNP Pearson

Diff for: magenpy/GenotypeMatrix.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -341,9 +341,9 @@ def get_snp_attribute(self, attr):
341341
def compute_ld(self,
342342
estimator,
343343
output_dir,
344-
dtype='int16',
345-
compressor_name='lz4',
346-
compression_level=5,
344+
dtype='int8',
345+
compressor_name='zstd',
346+
compression_level=7,
347347
**ld_kwargs):
348348
"""
349349

Diff for: magenpy/LDMatrix.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -116,9 +116,9 @@ def from_csr(cls,
116116
csr_mat,
117117
store_path,
118118
overwrite=False,
119-
dtype='int16',
120-
compressor_name='lz4',
121-
compression_level=5):
119+
dtype='int8',
120+
compressor_name='zstd',
121+
compression_level=7):
122122
"""
123123
Initialize an LDMatrix object from a sparse CSR matrix.
124124
@@ -171,9 +171,9 @@ def from_plink_table(cls,
171171
store_path,
172172
pandas_chunksize=None,
173173
overwrite=False,
174-
dtype='int16',
175-
compressor_name='lz4',
176-
compression_level=5):
174+
dtype='int8',
175+
compressor_name='zstd',
176+
compression_level=7):
177177
"""
178178
Construct a Zarr LD matrix using LD tables generated by plink1.9.
179179
@@ -260,9 +260,9 @@ def from_dense_zarr_matrix(cls,
260260
store_path,
261261
overwrite=False,
262262
delete_original=False,
263-
dtype='int16',
264-
compressor_name='lz4',
265-
compression_level=5):
263+
dtype='int8',
264+
compressor_name='zstd',
265+
compression_level=7):
266266
"""
267267
Initialize a new LD matrix object using a Zarr array object. This method is
268268
useful for converting a dense LD matrix computed using Dask (or other distributed computing
@@ -359,9 +359,9 @@ def from_ragged_zarr_matrix(cls,
359359
store_path,
360360
overwrite=False,
361361
delete_original=False,
362-
dtype='int16',
363-
compressor_name='lz4',
364-
compression_level=5):
362+
dtype='int8',
363+
compressor_name='zstd',
364+
compression_level=7):
365365
"""
366366
Initialize a new LD matrix object using a Zarr array object
367367
conforming to the old LD Matrix format from magenpy v<=0.0.12.

Diff for: magenpy/stats/ld/estimator.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ def compute(self,
6161
temp_dir='temp',
6262
overwrite=True,
6363
delete_original=True,
64-
dtype='int16',
65-
compressor_name='lz4',
66-
compression_level=5):
64+
dtype='int8',
65+
compressor_name='zstd',
66+
compression_level=7):
6767
"""
6868
A utility method to compute the LD matrix and store in Zarr array format.
6969
This computes the LD matrix, stores it in Zarr array format, sets its attributes,
@@ -238,9 +238,9 @@ def compute(self,
238238
temp_dir='temp',
239239
overwrite=True,
240240
delete_original=True,
241-
dtype='int16',
242-
compressor_name='lz4',
243-
compression_level=5):
241+
dtype='int8',
242+
compressor_name='zstd',
243+
compression_level=7):
244244
"""
245245
246246
Compute the windowed LD matrix and store in Zarr array format.
@@ -346,9 +346,9 @@ def compute(self,
346346
temp_dir='temp',
347347
overwrite=True,
348348
delete_original=True,
349-
dtype='int16',
350-
compressor_name='lz4',
351-
compression_level=5,
349+
dtype='int8',
350+
compressor_name='zstd',
351+
compression_level=7,
352352
chunk_size=1000):
353353
"""
354354
@@ -465,9 +465,9 @@ def compute(self,
465465
temp_dir='temp',
466466
overwrite=True,
467467
delete_original=True,
468-
dtype='int16',
469-
compressor_name='lz4',
470-
compression_level=5):
468+
dtype='int8',
469+
compressor_name='zstd',
470+
compression_level=7):
471471
"""
472472
473473
Compute the block-based LD matrix and store in Zarr array format.

Diff for: magenpy/stats/ld/utils.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ def harmonic_series_sum(n):
233233
return ld_mat_obj
234234

235235

236-
def estimate_rows_per_chunk(rows, cols, dtype='int16', mem_size=128):
236+
def estimate_rows_per_chunk(rows, cols, dtype='int8', mem_size=128):
237237
"""
238238
Estimate the number of rows per chunk for matrices conditional on the desired size of the chunk in MB.
239239
The estimator takes as input the number of rows, columns, data type, and projected size of the chunk in memory.
@@ -255,9 +255,9 @@ def compute_ld_plink1p9(genotype_matrix,
255255
output_dir,
256256
temp_dir='temp',
257257
overwrite=True,
258-
dtype='int16',
259-
compressor_name='lz4',
260-
compression_level=5):
258+
dtype='int8',
259+
compressor_name='zstd',
260+
compression_level=7):
261261

262262
"""
263263
Compute LD matrices using plink 1.9.
@@ -354,7 +354,7 @@ def compute_ld_plink1p9(genotype_matrix,
354354
plink1.execute(cmd)
355355

356356
# Convert from PLINK LD files to Zarr:
357-
fin_ld_store = osp.join(output_dir, 'ld', 'chr_' + str(genotype_matrix.chromosome))
357+
fin_ld_store = osp.join(output_dir, 'chr_' + str(genotype_matrix.chromosome))
358358

359359
# Compute the pandas chunk_size
360360
# The goal of this is to process chunks of the LD table without overwhelming memory resources:
@@ -382,9 +382,9 @@ def compute_ld_xarray(genotype_matrix,
382382
temp_dir='temp',
383383
overwrite=True,
384384
delete_original=True,
385-
dtype='int16',
386-
compressor_name='lz4',
387-
compression_level=5):
385+
dtype='int8',
386+
compressor_name='zstd',
387+
compression_level=7):
388388

389389
"""
390390
Compute the Linkage Disequilibrium matrix or snp-by-snp

0 commit comments

Comments
 (0)