Skip to content

Commit 0a8526c

Browse files
c42f and stevengj authored
Port ruby data_generator.rb to Julia (#258)
* Port ruby data_generator.rb to Julia

  This reduces the number of dependencies needed when regenerating the C code. The new code also separates C code generation from unicode data analysis somewhat more cleanly which should be better factored for generating a Julia version of the data files in the future.

  The output is identical to the original Ruby script, for now. Some bugs which were found in the process are noted as FIXMEs in the Julia source and can be fixed next.

* Replace some explicit loops with a utility function
* fixup! Port ruby data_generator.rb to Julia
* Update Makefile
* Update data/Makefile
* Update data/Makefile
* Update data/Makefile
* Update data/Makefile
* Update data/data_generator.jl

---------

Co-authored-by: Steven G. Johnson <[email protected]>
1 parent a9c6332 commit 0a8526c

File tree

7 files changed

+639
-654
lines changed

7 files changed

+639
-654
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ manifest: MANIFEST.new
7373

7474
# real targets
7575

76-
data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.rb data/charwidths.jl
76+
data/utf8proc_data.c.new: libutf8proc.$(SHLIB_EXT) data/data_generator.jl
7777
$(MAKE) -C data utf8proc_data.c.new
7878

7979
utf8proc.o: utf8proc.h utf8proc.c utf8proc_data.c

data/Makefile

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
# Unicode data generation rules. Except for the test data files, most
22
# users will not use these Makefile rules, which are primarily to re-generate
33
# unicode_data.c when we get a new Unicode version or charwidth data; they
4-
# require ruby and julia to be installed.
4+
# require julia to be installed.
55

66
# programs
77
CURL=curl
8-
RUBY=ruby
98
PERL=perl
109
MAKE=make
1110
JULIA=julia
@@ -15,11 +14,11 @@ CURLFLAGS = --retry 5 --location
1514

1615
.DELETE_ON_ERROR:
1716

18-
utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
19-
$(RUBY) data_generator.rb < UnicodeData.txt > $@
17+
RAWDATA = UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt EastAsianWidth.txt emoji-data.txt
2018

21-
CharWidths.txt: charwidths.jl EastAsianWidth.txt
22-
$(JULIA) charwidths.jl > $@
19+
utf8proc_data.c.new: data_generator.jl $(RAWDATA)
20+
$(JULIA) --project=. -e 'using Pkg; Pkg.instantiate()'
21+
$(JULIA) --project=. data_generator.jl > $@
2322

2423
# Unicode data version (must also update utf8proc_unicode_version function)
2524
UNICODE_VERSION=15.1.0
@@ -52,12 +51,12 @@ emoji-data.txt:
5251
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
5352

5453
Uppercase.txt: DerivedCoreProperties.txt
55-
$(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@
54+
$(JULIA) -e 'print(match(r"# Derived Property: Uppercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@
5655

5756
Lowercase.txt: DerivedCoreProperties.txt
58-
$(RUBY) -e 'puts File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@
57+
$(JULIA) -e 'print(match(r"# Derived Property: Lowercase.*?# Total code points:"s, read("DerivedCoreProperties.txt", String)).match)' > $@
5958

6059
clean:
61-
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
60+
rm -f $(RAWDATA) NormalizationTest.txt GraphemeBreakTest.txt
6261
rm -f Uppercase.txt Lowercase.txt
6362
rm -f utf8proc_data.c.new

data/Manifest.toml

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# This file is machine-generated - editing it directly is not advised
2+
3+
julia_version = "1.9.3"
4+
manifest_format = "2.0"
5+
project_hash = "bc0740aa2247b17bd49ba693fb87f41bbbddead6"
6+
7+
[[deps.Adapt]]
8+
deps = ["LinearAlgebra", "Requires"]
9+
git-tree-sha1 = "cde29ddf7e5726c9fb511f340244ea3481267608"
10+
uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
11+
version = "3.7.2"
12+
13+
[deps.Adapt.extensions]
14+
AdaptStaticArraysExt = "StaticArrays"
15+
16+
[deps.Adapt.weakdeps]
17+
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
18+
19+
[[deps.Artifacts]]
20+
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
21+
22+
[[deps.CompilerSupportLibraries_jll]]
23+
deps = ["Artifacts", "Libdl"]
24+
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
25+
version = "1.0.5+0"
26+
27+
[[deps.Libdl]]
28+
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
29+
30+
[[deps.LinearAlgebra]]
31+
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
32+
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
33+
34+
[[deps.OffsetArrays]]
35+
deps = ["Adapt"]
36+
git-tree-sha1 = "2ac17d29c523ce1cd38e27785a7d23024853a4bb"
37+
uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
38+
version = "1.12.10"
39+
40+
[[deps.OpenBLAS_jll]]
41+
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
42+
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
43+
version = "0.3.21+4"
44+
45+
[[deps.Random]]
46+
deps = ["SHA", "Serialization"]
47+
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
48+
49+
[[deps.Requires]]
50+
deps = ["UUIDs"]
51+
git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
52+
uuid = "ae029012-a4dd-5104-9daa-d747884805df"
53+
version = "1.3.0"
54+
55+
[[deps.SHA]]
56+
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
57+
version = "0.7.0"
58+
59+
[[deps.Serialization]]
60+
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
61+
62+
[[deps.UUIDs]]
63+
deps = ["Random", "SHA"]
64+
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
65+
66+
[[deps.libblastrampoline_jll]]
67+
deps = ["Artifacts", "Libdl"]
68+
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
69+
version = "5.8.0+0"

data/Project.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[deps]
2+
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"

data/charwidths.jl

Lines changed: 0 additions & 169 deletions
This file was deleted.

0 commit comments

Comments
 (0)