Skip to content

Commit afc31d4

Browse files
rename _convert to maybe_encode (#64)
* rename _convert to maybe_encode * add tests for maybe_encode * ignore CRLF/LF differences Co-authored-by: Lyndon White <[email protected]>
1 parent ab01585 commit afc31d4

12 files changed

+212
-16
lines changed

src/fileio.jl

+37-15
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ function loadfile(T, file::File)
99
end
1010

1111
function loadfile(T, file::TextFile)
12-
replace(read(file.filename, String), "\r"=>"") # ignore CRLF/LF difference
12+
_ignore_CR(read(file.filename, String))
1313
end
1414

1515
function loadfile(::Type{<:Number}, file::File{format"TXT"})
@@ -24,7 +24,7 @@ function savefile(file::TextFile, content)
2424
write(file.filename, string(content))
2525
end
2626

27-
function query_extended(filename)
27+
function query_extended(filename::AbstractString)
2828
file, ext = splitext(filename)
2929
# TODO: make this less hacky
3030
if uppercase(ext) == ".SHA256"
@@ -38,20 +38,28 @@ function query_extended(filename)
3838
res
3939
end
4040

41+
# Some target formats are not supported by FileIO and thus require an encoding/compression process
42+
# before saving. For other formats, we should trust IO backends and make as few changes as possible.
43+
# Otherwise, the reference becomes unfaithful. The encoding process helps make the actual data match
44+
# the reference data, which is loaded from reference file via IO backends.
45+
#
46+
# TODO: split `maybe_encode` to `maybe_preprocess` and `maybe_encode`
4147
"""
42-
_convert(T::Type{<:DataFormat}, x; kw...) -> out
48+
maybe_encode(T::Type{<:DataFormat}, x; kw...) -> out
4349
44-
Convert `x` to a validate content for file data format `T`.
50+
If needed, encode `x` to a valid content that matches format `T`.
51+
52+
If there is no known method to encode `x`, then `x` is returned unchanged, without warning.
4553
"""
46-
_convert(::Type{<:DataFormat}, x; kw...) = x
54+
function maybe_encode(::Type{<:DataFormat}, x; kw...)
    # Fallback: no encoding method is defined for this format, hand `x` back untouched.
    return x
end
4755

4856
# plain TXT
49-
_convert(::Type{DataFormat{:TXT}}, x; kw...) = replace(string(x), "\r"=>"") # ignore CRLF/LF difference
50-
_convert(::Type{DataFormat{:TXT}}, x::Number; kw...) = x
51-
function _convert(::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...)
52-
return join(x, '\n')
53-
end
54-
function _convert(
57+
function maybe_encode(::Type{DataFormat{:TXT}}, x; kw...)
    # Generic case: take the `string` representation, then normalize CRLF line endings.
    text = string(x)
    return _ignore_CR(text)
end
58+
function maybe_encode(
    ::Type{DataFormat{:TXT}}, x::AbstractArray{<:AbstractString}; kw...
)
    # An array of strings is encoded as one multi-line text via `_join`.
    return _join(x)
end
59+
function maybe_encode(::Type{DataFormat{:TXT}}, x::AbstractString; kw...)
    # Already a string: only the line endings are normalized.
    return _ignore_CR(x)
end
60+
function maybe_encode(::Type{DataFormat{:TXT}}, x::Number; kw...)
    # Numbers are passed through unchanged.
    return x  # TODO: Change this to string(x) ?
end
61+
62+
function maybe_encode(
5563
::Type{DataFormat{:TXT}}, img::AbstractArray{<:Colorant};
5664
size = (20,40), kw...)
5765

@@ -65,11 +73,25 @@ function _convert(
6573
end
6674

6775
# SHA256
68-
_convert(::Type{DataFormat{:SHA256}}, x; kw...) = bytes2hex(sha256(string(x)))
69-
function _convert(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...)
76+
function maybe_encode(::Type{DataFormat{:SHA256}}, x; kw...)
    # Generic case: hash the `string` representation of `x`.
    text = string(x)
    return _sha256(text)
end
77+
# Strings have their CRLF line endings normalized to LF before hashing.
# `kw...` is accepted and ignored, like the generic SHA256 method, because the caller
# forwards its keyword arguments (`maybe_encode(F, raw_actual; kw...)`); without it any
# keyword would raise a `MethodError` for string input.
maybe_encode(::Type{DataFormat{:SHA256}}, x::AbstractString; kw...) = _sha256(_ignore_CR(x))
78+
# String arrays are joined into one multi-line string (see `_join`) before hashing.
# `kw...` is accepted and ignored, like the generic SHA256 method, because the caller
# forwards its keyword arguments (`maybe_encode(F, raw_actual; kw...)`); without it any
# keyword would raise a `MethodError` for string-array input.
function maybe_encode(
    ::Type{DataFormat{:SHA256}}, x::AbstractArray{<:AbstractString}; kw...
)
    return _sha256(_join(x))
end
79+
function maybe_encode(::Type{DataFormat{:SHA256}}, img::AbstractArray{<:Colorant}; kw...)
7080
# encode image into SHA256
71-
size_str = bytes2hex(sha256(reinterpret(UInt8,[map(Int64,size(img))...])))
72-
img_str = bytes2hex(sha256(reinterpret(UInt8,vec(rawview(channelview(img))))))
81+
size_str = _sha256(reinterpret(UInt8,[map(Int64,size(img))...]))
82+
img_str = _sha256(reinterpret(UInt8,vec(rawview(channelview(img)))))
7383

7484
return size_str * img_str
7585
end
86+
87+
# Helpers
88+
# Concatenate the elements with LF separators, then normalize CRLF inside the result.
function _join(x::AbstractArray{<:AbstractString})
    joined = join(x, "\n")
    return _ignore_CR(joined)
end
89+
# Hex string of the SHA-256 digest of `x` (anything `sha256` accepts).
function _sha256(x)
    digest = sha256(x)
    return bytes2hex(digest)
end
90+
"""
91+
_ignore_CR(x::AbstractString)
92+
93+
Ignore the CRLF(`\\r\\n`) and LF(`\\n`) difference by replacing every `\\r\\n` with `\\n` in the given string. A `\\r` that is not followed by `\\n` is left unchanged.
94+
95+
CRLF format is widely used by Windows while LF format is mainly used by Linux.
96+
"""
97+
function _ignore_CR(x::AbstractString)
    # issue #39: only CRLF pairs are rewritten to LF; a lone `\r` is kept as-is
    return replace(x, "\r\n" => "\n")
end

src/test_reference.jl

+1-1
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ function test_reference(
107107
rendermode = default_rendermode(F, raw_actual)
108108
end
109109

110-
actual = _convert(F, raw_actual; kw...)
110+
actual = maybe_encode(F, raw_actual; kw...)
111111
# preprocessing when reference file doesn't exist
112112
if !isfile(path)
113113
@info("Reference file for \"$filename\" does not exist. It will be created")

test/fileio.jl

+165
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
refdir = joinpath(refroot, "fileio")
2+
3+
@testset "query" begin
4+
check_types = [
5+
# text types
6+
("textfile_with_no_extension", format"TXT"),
7+
("textfile.txt", format"TXT"),
8+
("textfile.unknown", format"TXT"),
9+
("textfile.sha256", format"SHA256"),
10+
11+
# image types
12+
("imagefile.jpg", format"JPEG"),
13+
("imagefile.jpeg", format"JPEG"),
14+
("imagefile.png", format"PNG"),
15+
("imagefile.tif", format"TIFF"),
16+
("imagefile.tiff", format"TIFF"),
17+
18+
# dataframe types
19+
("dataframe_file.csv", format"CSV")
20+
]
21+
for (file, fmt) in check_types
22+
@test ReferenceTests.query_extended(file) == File{fmt}(file)
23+
@test ReferenceTests.query_extended(abspath(file)) == File{fmt}(abspath(file))
24+
end
25+
end
26+
27+
@testset "maybe_encode" begin
28+
@testset "string" begin
29+
str1 = "Hello world"
30+
str1_sha256 = "64ec88ca00b268e5ba1a35678a1b5316d212f4f366b2477232534a8aeca37f3c"
31+
str2 = "Hello\n world"
32+
str2_sha256 = "60b65ab310480818c4289227f2ec68f1714743db8571b4cb190e100c0085be3d" # bytes2hex(SHA.sha256(str2))
33+
str2_crlf = "Hello\r\n world"
34+
str3 = "Hello\nworld"
35+
str3_sha256 = "46e0ea795802f17d0b340983ca7d7068c94d7d9172ee4daea37a1ab1168649ec" # bytes2hex(SHA.sha256(str3))
36+
str3_arr1 = ["Hello", "world"]
37+
str3_arr2 = ["Hello" "world"]
38+
str4 = "Hello\n world1\nHello\n world2"
39+
str4_sha256 = "c7dc8b82c3a6fed4afa0c8790a0586b73df0e4f35524efe6810e5d78b6b6a611" # bytes2hex(SHA.sha256(str4))
40+
str4_arr = ["Hello\r\n world1", "Hello\n world2"]
41+
42+
# string as plain text
43+
fmt = format"TXT"
44+
# convert should respect whitespaces
45+
@test str1 == ReferenceTests.maybe_encode(fmt, str1)
46+
@test str2 == ReferenceTests.maybe_encode(fmt, str2)
47+
# but ignore CRLF/LF differences
48+
@test str2 == ReferenceTests.maybe_encode(fmt, str2_crlf)
49+
# string arrays are treated as multi-line strings, even for UNKNOWN format
50+
@test str3 == ReferenceTests.maybe_encode(fmt, str3)
51+
@test str3 == ReferenceTests.maybe_encode(fmt, str3_arr1)
52+
@test str3 == ReferenceTests.maybe_encode(fmt, str3_arr2)
53+
# string arrays should ignore CRLF/LF differences, too
54+
@test str4 == ReferenceTests.maybe_encode(fmt, str4_arr)
55+
56+
# string as SHA256 should also ignore CRLF/LF differences
57+
fmt = format"SHA256"
58+
@test str1_sha256 == ReferenceTests.maybe_encode(fmt, str1)
59+
@test str2_sha256 == ReferenceTests.maybe_encode(fmt, str2)
60+
# but ignore CRLF/LF differences
61+
@test str2_sha256 == ReferenceTests.maybe_encode(fmt, str2_crlf)
62+
# string arrays are treated as multi-line strings, even for UNKNOWN format
63+
@test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3)
64+
@test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3_arr1)
65+
@test str3_sha256 == ReferenceTests.maybe_encode(fmt, str3_arr2)
66+
# string arrays should ignore CRLF/LF differences, too
67+
@test str4_sha256 == ReferenceTests.maybe_encode(fmt, str4_arr)
68+
69+
# unknown formats
70+
fmt = format"PNG"
71+
for str in (str1, str2, str2_crlf, str3, str3_arr1, str3_arr2)
72+
@test str === ReferenceTests.maybe_encode(fmt, str)
73+
end
74+
end
75+
76+
@testset "numbers" begin
77+
for num in (0x01, 1, 1.0f0, 1.0)
78+
for fmt in (format"TXT", format"UNKNOWN")
79+
@test num === ReferenceTests.maybe_encode(fmt, num)
80+
end
81+
fmt = format"SHA256"
82+
@test ReferenceTests.maybe_encode(fmt, num) == ReferenceTests.maybe_encode(fmt, string(num))
83+
end
84+
85+
86+
for (fmt, a, ref) in [
87+
# if target is TXT, convert it to string
88+
(format"TXT", [1, 2], "[1, 2]"),
89+
(format"TXT", [1,2], "[1, 2]"),
90+
(format"TXT", [1;2], "[1, 2]"),
91+
(format"TXT", [1 2], "[1 2]"),
92+
(format"TXT", [1 2; 3 4], "[1 2; 3 4]"),
93+
# if target is Unknown, make no change
94+
(format"UNKNOWN", [1, 2], [1, 2]),
95+
(format"UNKNOWN", [1,2], [1, 2]),
96+
(format"UNKNOWN", [1;2], [1, 2]),
97+
(format"UNKNOWN", [1 2], [1 2]),
98+
(format"UNKNOWN", [1 2; 3 4], [1 2; 3 4]),
99+
]
100+
@test ref == ReferenceTests.maybe_encode(fmt, a)
101+
end
102+
103+
for a in [[1, 2], [1 2], [1 2; 3 4]]
104+
fmt = format"SHA256"
105+
@test ReferenceTests.maybe_encode(fmt, a) == ReferenceTests.maybe_encode(fmt, string(a))
106+
end
107+
108+
end
109+
110+
@testset "image" begin
111+
gray_1d = Gray{N0f8}.(0.0:0.1:0.9)
112+
rgb_1d = RGB.(gray_1d)
113+
gray_2d = Gray{N0f8}.(reshape(0.0:0.1:0.9, 2, 5))
114+
rgb_2d = RGB.(gray_2d)
115+
gray_3d = Gray{N0f8}.(reshape(0.0:0.02:0.95, 2, 4, 6))
116+
rgb_3d = RGB.(gray_3d)
117+
118+
# any common image types
119+
for img in (gray_1d, gray_2d, gray_3d, rgb_1d, rgb_2d, rgb_3d)
120+
for fmt in (format"JPEG", format"PNG", format"TIFF", format"UNKNOWN")
121+
@test img === ReferenceTests.maybe_encode(fmt, img)
122+
end
123+
end
124+
125+
# image as text file
126+
fmt = format"TXT"
127+
# TODO: support n-D image encoding
128+
# @test_reference joinpath(refdir, "gray_1d_as_txt.txt") ReferenceTests.maybe_encode(fmt, gray_1d)
129+
# @test_reference joinpath(refdir, "rgb_1d_as_txt.txt") ReferenceTests.maybe_encode(fmt, rgb_1d)
130+
@test_reference joinpath(refdir, "gray_2d_as_txt.txt") ReferenceTests.maybe_encode(fmt, gray_2d)
131+
@test_reference joinpath(refdir, "rgb_2d_as_txt.txt") ReferenceTests.maybe_encode(fmt, rgb_2d)
132+
# @test_reference joinpath(refdir, "gray_3d_as_txt.txt") ReferenceTests.maybe_encode(fmt, gray_3d)
133+
# @test_reference joinpath(refdir, "rgb_3d_as_txt.txt") ReferenceTests.maybe_encode(fmt, rgb_3d)
134+
135+
# image as SHA256
136+
fmt = format"SHA256"
137+
for (file, img) in [
138+
("gray_1d", gray_1d),
139+
("gray_2d", gray_2d),
140+
("gray_3d", gray_3d),
141+
("rgb_1d", rgb_1d),
142+
("rgb_2d", rgb_2d),
143+
("rgb_3d", rgb_3d)
144+
]
145+
reffile = joinpath(refdir, "$(file)_as_sha256.txt")
146+
@test_reference reffile ReferenceTests.maybe_encode(fmt, img)
147+
end
148+
end
149+
150+
# dataframe
151+
@testset "dataframe" begin
152+
df = DataFrame(v1=[1,2,3], v2=["a","b","c"])
153+
154+
@test string(df) == ReferenceTests.maybe_encode(format"TXT", df)
155+
for fmt in (format"CSV", format"UNKNOWN")
156+
@test df === ReferenceTests.maybe_encode(fmt, df)
157+
end
158+
159+
fmt = format"SHA256"
160+
@test_reference joinpath(refdir, "dataframe_as_sha256.txt") ReferenceTests.maybe_encode(fmt, df)
161+
162+
end
163+
end
164+
165+
# TODO: savefile & loadfile
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2cf7c4edcafc27a5eb1b74fb0af704edc0d9bbef91a1b55d3b7350fa4b54cd18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eed10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233d10ef43c7fcace84c4d0d54b8e92c0c9be2d14a6bf3dd7647254a3cc0c4a04297
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
▀▀▀▀▀
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe4baf0f53196e8d5270c0b0b2da82bbbb4676edbb0ebf84ec0dcbd8c0bf4d9af68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
a111f275cc2e7588000001d300a31e76336d15b9d314cd1a1d8f3d3556975eedebd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
26cfbb315c316a0b15516434f90284e5011dcb58503fe39eb036bf669bd8233debd6b0ad29dd5402ce5745bb5b48d4c59b7f8da0cdf8d2f287befd9094f6ac89
+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
▀▀▀▀▀
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
72307e420b5460c03a1c167060ed336407c26ea74aabf8fab76dd8e9dbe8cbe45465bcbf50acdbe5600207e3266eedef6548bc4d244e55d7a1af0f1af09e019f

0 commit comments

Comments
 (0)