Skip to content

Commit 1e3c1f2

Browse files
radioflash authored and timholy committed
feat: allow writing compressed files (#123)
1 parent bb9356e commit 1e3c1f2

File tree

4 files changed

+69
-35
lines changed

4 files changed

+69
-35
lines changed

README.md

+4-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ To load the module:
2323
using MAT
2424
```
2525

26-
To read a single variable from a MAT file:
26+
To read a single variable from a MAT file (compressed files are detected and handled automatically):
2727

2828
```julia
2929
file = matopen("matfile.mat")
@@ -45,13 +45,14 @@ To read all variables from a MAT file as a Dict:
4545
vars = matread("matfile.mat")
4646
```
4747

48-
To write a Dict to a MAT file, using its keys as variable names:
48+
To write a Dict to a MAT file, using its keys as variable names.
49+
The `compress` keyword argument is optional, and compression is off by default:
4950

5051
```julia
5152
matwrite("matfile.mat", Dict(
5253
"myvar1" => 0,
5354
"myvar2" => 1
54-
))
55+
); compress = true)
5556
```
5657

5758
To get a list of variable names in a MAT file:

src/MAT.jl

+22-18
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@ export matopen, matread, matwrite, names, exists, @read, @write
3535

3636
# Open a MATLAB file
3737
const HDF5_HEADER = UInt8[0x89, 0x48, 0x44, 0x46, 0x0d, 0x0a, 0x1a, 0x0a]
38-
function matopen(filename::AbstractString, rd::Bool, wr::Bool, cr::Bool, tr::Bool, ff::Bool)
38+
function matopen(filename::AbstractString, rd::Bool, wr::Bool, cr::Bool, tr::Bool, ff::Bool, compress::Bool)
3939
# When creating new files, create as HDF5 by default
4040
fs = filesize(filename)
4141
if cr && (tr || fs == 0)
42-
return MAT_HDF5.matopen(filename, rd, wr, cr, tr, ff)
42+
return MAT_HDF5.matopen(filename, rd, wr, cr, tr, ff, compress)
4343
elseif fs == 0
4444
error("File \"$filename\" does not exist and create was not specified")
4545
end
@@ -76,28 +76,28 @@ function matopen(filename::AbstractString, rd::Bool, wr::Bool, cr::Bool, tr::Boo
7676
seek(rawfid, offset)
7777
if read!(rawfid, Vector{UInt8}(undef, 8)) == HDF5_HEADER
7878
close(rawfid)
79-
return MAT_HDF5.matopen(filename, rd, wr, cr, tr, ff)
79+
return MAT_HDF5.matopen(filename, rd, wr, cr, tr, ff, compress)
8080
end
8181
end
8282

8383
close(rawfid)
8484
error("\"$filename\" is not a MAT file")
8585
end
8686

87-
function matopen(fname::AbstractString, mode::AbstractString)
88-
mode == "r" ? matopen(fname, true , false, false, false, false) :
89-
mode == "r+" ? matopen(fname, true , true , false, false, false) :
90-
mode == "w" ? matopen(fname, false, true , true , true , false) :
91-
# mode == "w+" ? matopen(fname, true , true , true , true , false) :
92-
# mode == "a" ? matopen(fname, false, true , true , false, true ) :
93-
# mode == "a+" ? matopen(fname, true , true , true , false, true ) :
87+
# Translate a textual open mode into the positional flags
# (rd, wr, cr, tr, ff, compress) expected by the seven-argument `matopen`.
# Any mode other than "r", "r+" or "w" raises an error.
function matopen(fname::AbstractString, mode::AbstractString; compress::Bool = false)
    if mode == "r"
        # The read-only mode always passes `false` for the compress flag.
        return matopen(fname, true, false, false, false, false, false)
    elseif mode == "r+"
        return matopen(fname, true, true, false, false, false, compress)
    elseif mode == "w"
        return matopen(fname, false, true, true, true, false, compress)
    end
    # Modes that are currently not enabled, with the flags they would use:
    #   "w+" => (true , true, true, true , false, compress)
    #   "a"  => (false, true, true, false, true , compress)
    #   "a+" => (true , true, true, false, true , compress)
    error("invalid open mode: ", mode)
end
9696

97-
matopen(fname::AbstractString) = matopen(fname, "r")
97+
# Single-argument form: open `fname` with mode "r", forwarding any keyword
# arguments to the two-argument method.
function matopen(fname::AbstractString; kwargs...)
    return matopen(fname, "r"; kwargs...)
end
9898

99-
function matopen(f::Function, args...)
100-
fid = matopen(args...)
99+
function matopen(f::Function, args...; kwargs...)
100+
fid = matopen(args...; kwargs...)
101101
try
102102
f(fid)
103103
finally
@@ -106,12 +106,15 @@ function matopen(f::Function, args...)
106106
end
107107

108108
"""
109-
matopen(filename [, mode]) -> handle
110-
matopen(f::Function, filename [, mode]) -> f(handle)
109+
matopen(filename [, mode]; compress = false) -> handle
110+
matopen(f::Function, filename [, mode]; compress = false) -> f(handle)
111111
112112
Mode defaults to "r" for read. It can also be "w" for write, or "r+" for
113113
read or write without creation or truncation.
114114
115+
Compression on reading is detected/handled automatically; the compress
116+
keyword argument only affects write operations.
117+
115118
Use with `read`, `write`, `close`, `names`, and `exists`.
116119
"""
117120
matopen
@@ -136,13 +139,13 @@ end
136139

137140
# Write a dict to a MATLAB file
138141
"""
139-
matwrite(filename, d::Dict)
142+
matwrite(filename, d::Dict; compress::Bool = false)
140143
141144
Write a dictionary containing variable names as keys and values as values
142145
to a Matlab file, opening and closing it automatically.
143146
"""
144-
function matwrite(filename::AbstractString, dict::AbstractDict{S, T}) where {S, T}
145-
file = matopen(filename, "w")
147+
function matwrite(filename::AbstractString, dict::AbstractDict{S, T}; compress::Bool = false) where {S, T}
148+
file = matopen(filename, "w"; compress = compress)
146149
try
147150
for (k, v) in dict
148151
local kstring
@@ -158,3 +161,4 @@ function matwrite(filename::AbstractString, dict::AbstractDict{S, T}) where {S,
158161
end
159162
end
160163
end
164+

src/MAT_HDF5.jl

+21-12
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,10 @@ mutable struct MatlabHDF5File <: HDF5.DataFile
4141
toclose::Bool
4242
writeheader::Bool
4343
refcounter::Int
44+
compress::Bool
4445

45-
function MatlabHDF5File(plain, toclose::Bool=true, writeheader::Bool=false, refcounter::Int=0)
46-
f = new(plain, toclose, writeheader, refcounter)
46+
function MatlabHDF5File(plain, toclose::Bool=true, writeheader::Bool=false, refcounter::Int=0, compress::Bool=false)
47+
f = new(plain, toclose, writeheader, refcounter, compress)
4748
if toclose
4849
finalizer(close, f)
4950
end
@@ -79,7 +80,7 @@ function close(f::MatlabHDF5File)
7980
nothing
8081
end
8182

82-
function matopen(filename::AbstractString, rd::Bool, wr::Bool, cr::Bool, tr::Bool, ff::Bool)
83+
function matopen(filename::AbstractString, rd::Bool, wr::Bool, cr::Bool, tr::Bool, ff::Bool, compress::Bool)
8384
local f
8485
if ff && !wr
8586
error("Cannot append to a write-only file")
@@ -101,7 +102,7 @@ function matopen(filename::AbstractString, rd::Bool, wr::Bool, cr::Bool, tr::Boo
101102
writeheader = false
102103
end
103104
close(pa)
104-
fid = MatlabHDF5File(HDF5File(f, filename), true, writeheader, 0)
105+
fid = MatlabHDF5File(HDF5File(f, filename), true, writeheader, 0, compress)
105106
pathrefs = "/#refs#"
106107
if exists(fid.plain, pathrefs)
107108
g = fid.plain[pathrefs]
@@ -326,8 +327,8 @@ function m_writeempty(parent::HDF5Parent, name::String, data::AbstractArray)
326327
end
327328

328329
# Write an array to a dataset in a MATLAB file, returning the dataset
329-
function m_writearray(parent::HDF5Parent, name::String, adata::AbstractArray{T}) where {T<:HDF5BitsOrBool}
330-
dset, dtype = d_create(parent, name, adata)
330+
function m_writearray(parent::HDF5Parent, name::String, adata::AbstractArray{T}, compress::Bool) where {T<:HDF5BitsOrBool}
331+
dset, dtype = compress ? d_create(parent, name, adata, "chunk",HDF5.heuristic_chunk(adata), "compress",3) : d_create(parent, name, adata)
331332
try
332333
HDF5.writearray(dset, dtype.id, adata)
333334
dset
@@ -338,11 +339,19 @@ function m_writearray(parent::HDF5Parent, name::String, adata::AbstractArray{T})
338339
close(dtype)
339340
end
340341
end
341-
function m_writearray(parent::HDF5Parent, name::String, adata::AbstractArray{Complex{T}}) where {T<:HDF5BitsOrBool}
342+
function m_writearray(parent::HDF5Parent, name::String, adata::AbstractArray{Complex{T}}, compress::Bool) where {T<:HDF5BitsOrBool}
342343
dtype = build_datatype_complex(T)
343344
try
344345
stype = dataspace(adata)
345-
obj_id = HDF5.h5d_create(parent.id, name, dtype.id, stype.id)
346+
if compress
347+
p = p_create(HDF5.H5P_DATASET_CREATE)
348+
p["compress"] = 3
349+
p["chunk"] = HDF5.heuristic_chunk(adata)
350+
obj_id = HDF5.h5d_create(parent.id, name, dtype.id, stype.id,
351+
HDF5._link_properties(name),p.id,HDF5.H5P_DEFAULT)
352+
else
353+
obj_id = HDF5.h5d_create(parent.id, name, dtype.id, stype.id)
354+
end
346355
dset = HDF5Dataset(obj_id, file(parent))
347356
try
348357
arr = reshape(reinterpret(T, adata), tuple(2, size(adata)...))
@@ -365,7 +374,7 @@ function m_write(mfile::MatlabHDF5File, parent::HDF5Parent, name::String, data::
365374
m_writeempty(parent, name, data)
366375
return
367376
end
368-
dset = m_writearray(parent, name, toarray(data))
377+
dset = m_writearray(parent, name, toarray(data), mfile.compress)
369378
try
370379
m_writetypeattr(dset, T)
371380
finally
@@ -380,10 +389,10 @@ function m_write(mfile::MatlabHDF5File, parent::HDF5Parent, name::String, data::
380389
m_writetypeattr(g, T)
381390
a_write(g, sparse_attr_matlab, UInt64(size(data, 1)))
382391
if !isempty(data.nzval)
383-
close(m_writearray(g, "data", toarray(data.nzval)))
384-
close(m_writearray(g, "ir", add!(isa(data.rowval, Vector{UInt64}) ? copy(data.rowval) : convert(Vector{UInt64}, data.rowval), typemax(UInt64))))
392+
close(m_writearray(g, "data", toarray(data.nzval), mfile.compress))
393+
close(m_writearray(g, "ir", add!(isa(data.rowval, Vector{UInt64}) ? copy(data.rowval) : convert(Vector{UInt64}, data.rowval), typemax(UInt64)), mfile.compress))
385394
end
386-
close(m_writearray(g, "jc", add!(isa(data.colptr, Vector{UInt64}) ? copy(data.colptr) : convert(Vector{UInt64}, data.colptr), typemax(UInt64))))
395+
close(m_writearray(g, "jc", add!(isa(data.colptr, Vector{UInt64}) ? copy(data.colptr) : convert(Vector{UInt64}, data.colptr), typemax(UInt64)), mfile.compress))
387396
finally
388397
close(g)
389398
end

test/write.jl

+22-2
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ using MAT
22

33
tmpfile = string(tempname(), ".mat")
44

5-
function test_write(data)
6-
matwrite(tmpfile, data)
5+
function test_write(data; kwargs...)
6+
matwrite(tmpfile, data; kwargs...)
77

88
fid = matopen(tmpfile, "r")
99
local result
@@ -18,6 +18,22 @@ function test_write(data)
1818
end
1919
end
2020

21+
# Run the keyword form of `test_write` on `data` twice: first with
# `compress = false`, then with `compress = true`. The value of the second
# call is returned.
# NOTE(review): this method has the same positional signature as
# `test_write(data; kwargs...)`; the keyword calls below presumably still reach
# that earlier definition -- confirm.
function test_write(data)
    outcome = nothing
    for flag in (false, true)
        outcome = test_write(data; compress = flag)
    end
    return outcome
end
25+
26+
# Run `test_write` on `data` without and then with compression, recording the
# size of `tmpfile` after each run. Raise an error unless the size recorded
# after the compressed run is strictly smaller than the uncompressed one.
function test_compression_effective(data)
    test_write(data; compress = false)
    plain_bytes = filesize(tmpfile)

    test_write(data; compress = true)
    packed_bytes = filesize(tmpfile)

    # Strictly smaller is required; an equal size counts as ineffective.
    packed_bytes < plain_bytes && return nothing
    error("Compression was not effective")
end
36+
2137
test_write(Dict(
2238
"int8" => Int8(1),
2339
"uint8" => UInt8(1),
@@ -109,3 +125,7 @@ sd = SortedDict(Dict(
109125
"sparse_empty" => sparse(Matrix{Float64}(undef, 0, 0))
110126
))
111127
test_write(sd)
128+
129+
# note: compression is NOT effective when the dict contains many duplicate entries
130+
# which are not compressible by themselves!
131+
test_compression_effective(Dict("data" => fill(1.0, 1000)))

0 commit comments

Comments
 (0)