Skip to content

Commit 7a3bc21

Browse files
committed
add function to check alphabetic or numeric
1 parent 370f1f8 commit 7a3bc21

File tree

4 files changed

+135
-19
lines changed

4 files changed

+135
-19
lines changed

gcc/rust/util/make-rust-unicode.py

+8-10
Original file line numberDiff line numberDiff line change
@@ -191,8 +191,9 @@ def write_decomposition():
191191

192192

193193
def write_recomposition():
194-
# non const.
195-
print("std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{")
194+
print(
195+
"const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
196+
)
196197
print(" // clang-format off")
197198
for cp in decomposition_map:
198199
if binary_search_ranges(composition_exclusion_ranges, cp) != -1:
@@ -219,9 +220,7 @@ def write_ccc():
219220

220221
def write_alphabetic():
221222
print(
222-
"const std::array<std::pair<uint32_t, uint32_t>, {}> ALPHABETIC_RANGES = {{{{".format(
223-
len(alphabetic_ranges)
224-
)
223+
"const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
225224
)
226225
print(" // clang-format off")
227226
for r in alphabetic_ranges:
@@ -231,11 +230,7 @@ def write_alphabetic():
231230

232231

233232
def write_numeric():
234-
print(
235-
"const std::array<uint32_t, {}> NUMERIC_CODEPOINTS = {{{{".format(
236-
len(numeric_codepoints)
237-
)
238-
)
233+
print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
239234
print(" // clang-format off")
240235
for i, cp in enumerate(numeric_codepoints):
241236
if i % 16 == 0:
@@ -268,6 +263,9 @@ def main():
268263
print()
269264
print("namespace Rust {")
270265
print()
266+
print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
267+
print("const uint32_t NUM_NUMERIC_CODEPOINTS = {};".format(len(numeric_codepoints)))
268+
print()
271269

272270
write_decomposition()
273271
print()

gcc/rust/util/rust-unicode-data.h

+11-7
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020

2121
namespace Rust {
2222

23+
const uint32_t NUM_ALPHABETIC_RANGES = 1117;
24+
const uint32_t NUM_NUMERIC_CODEPOINTS = 1831;
25+
2326
const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
2427
// clang-format off
2528
{0x00c0, {0x0041, 0x0300, }},
@@ -2086,7 +2089,7 @@ const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {
20862089
// clang-format on
20872090
};
20882091

2089-
std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{
2092+
const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{
20902093
// clang-format off
20912094
{{0x0041, 0x0300}, 0x00c0},
20922095
{{0x0041, 0x0301}, 0x00c1},
@@ -3959,8 +3962,9 @@ const std::map<uint32_t, int32_t> CCC_TABLE = {
39593962
// clang-format on
39603963
};
39613964

3962-
const std::array<std::pair<uint32_t, uint32_t>, 1117> ALPHABETIC_RANGES = {{
3963-
// clang-format off
3965+
const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES>
3966+
ALPHABETIC_RANGES = {{
3967+
// clang-format off
39643968
{0x0041, 0x005b},
39653969
{0x0061, 0x007b},
39663970
{0x00aa, 0x00ab},
@@ -5078,10 +5082,10 @@ const std::array<std::pair<uint32_t, uint32_t>, 1117> ALPHABETIC_RANGES = {{
50785082
{0x2ceb0, 0x2ebe1},
50795083
{0x2f800, 0x2fa1e},
50805084
{0x30000, 0x3134b},
5081-
// clang-format on
5082-
}};
5085+
// clang-format on
5086+
}};
50835087

5084-
const std::array<uint32_t, 1831> NUMERIC_CODEPOINTS = {{
5088+
const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{
50855089
// clang-format off
50865090
0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x00b2, 0x00b3, 0x00b9, 0x00bc, 0x00bd, 0x00be,
50875091
0x0660, 0x0661, 0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667, 0x0668, 0x0669, 0x06f0, 0x06f1, 0x06f2, 0x06f3, 0x06f4, 0x06f5,
@@ -5197,7 +5201,7 @@ const std::array<uint32_t, 1831> NUMERIC_CODEPOINTS = {{
51975201
0x1ed1d, 0x1ed1e, 0x1ed1f, 0x1ed20, 0x1ed21, 0x1ed22, 0x1ed23, 0x1ed24, 0x1ed25, 0x1ed26, 0x1ed27, 0x1ed28, 0x1ed29, 0x1ed2a, 0x1ed2b, 0x1ed2c,
51985202
0x1ed2d, 0x1ed2f, 0x1ed30, 0x1ed31, 0x1ed32, 0x1ed33, 0x1ed34, 0x1ed35, 0x1ed36, 0x1ed37, 0x1ed38, 0x1ed39, 0x1ed3a, 0x1ed3b, 0x1ed3c, 0x1ed3d,
51995203
0x1f100, 0x1f101, 0x1f102, 0x1f103, 0x1f104, 0x1f105, 0x1f106, 0x1f107, 0x1f108, 0x1f109, 0x1f10a, 0x1f10b, 0x1f10c, 0x1fbf0, 0x1fbf1, 0x1fbf2,
5200-
0x1fbf3, 0x1fbf4, 0x1fbf5, 0x1fbf6, 0x1fbf7, 0x1fbf8, 0x1fbf9,
5204+
0x1fbf3, 0x1fbf4, 0x1fbf5, 0x1fbf6, 0x1fbf7, 0x1fbf8, 0x1fbf9,
52015205
// clang-format on
52025206
}};
52035207

gcc/rust/util/rust-unicode.cc

+106
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,51 @@ namespace Rust {
99
typedef uint32_t codepoint_t;
1010
typedef std::vector<codepoint_t> string_t;
1111

12+
// Binary-search RANGES for the entry that contains TARGET_CP.
//
// Each entry is treated as a half-open interval [first, second): a code
// point matches when first <= TARGET_CP < second.  The search requires
// RANGES to be sorted in ascending order with non-overlapping entries.
//
// Returns the index of the matching entry, or -1 when no entry contains
// TARGET_CP (which includes the case of an empty RANGES).
template <std::size_t SIZE>
int64_t
binary_search_ranges (
  const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
  uint32_t target_cp)
{
  // Search the half-open index window [low, high).  Keeping HIGH exclusive
  // means it never has to go below zero, so the unsigned indices cannot wrap
  // around, and MID is always strictly less than SIZE.
  std::size_t low = 0;
  std::size_t high = SIZE;
  while (low < high)
    {
      const std::size_t mid = low + (high - low) / 2;
      const uint32_t start = ranges[mid].first;
      const uint32_t end = ranges[mid].second;
      if (target_cp < start)
        high = mid;
      else if (target_cp >= end)
        // Compare against END directly; `end - 1` would wrap for end == 0.
        low = mid + 1;
      else
        return static_cast<int64_t> (mid);
    }
  return -1;
}
35+
36+
// Binary-search ARRAY for TARGET.
//
// ARRAY must be sorted in ascending order.  Returns 1 when TARGET is present
// and 0 otherwise (including for an empty ARRAY); the int64_t return type is
// kept for existing callers, which use the result as a boolean.
template <std::size_t SIZE>
int64_t
binary_search_sorted_array (const std::array<std::uint32_t, SIZE> &array,
                            uint32_t target)
{
  // Search the half-open index window [low, high).  An exclusive upper bound
  // avoids computing `SIZE - 1` or `mid - 1`, both of which wrap around for
  // unsigned indices (empty array, or TARGET smaller than the first element).
  std::size_t low = 0;
  std::size_t high = SIZE;
  while (low < high)
    {
      const std::size_t mid = low + (high - low) / 2;
      if (array[mid] == target)
        return 1;
      else if (array[mid] < target)
        low = mid + 1;
      else
        high = mid;
    }
  return 0;
}
56+
1257
int
1358
lookup_cc (codepoint_t c)
1459
{
@@ -155,6 +200,22 @@ nfc_normalize (string_t s)
155200
return r;
156201
}
157202

203+
bool
204+
is_alphabetic (uint32_t codepoint)
205+
{
206+
int64_t res = binary_search_ranges (ALPHABETIC_RANGES, codepoint);
207+
if (res < 0)
208+
return false;
209+
else
210+
return true;
211+
}
212+
213+
bool
214+
is_numeric (uint32_t codepoint)
215+
{
216+
return binary_search_sorted_array (NUMERIC_CODEPOINTS, codepoint);
217+
}
218+
158219
} // namespace Rust
159220

160221
namespace selftest {
@@ -191,4 +252,49 @@ rust_utf8_normalize_test ()
191252
// https://unicode.org/Public/UNIDATA/NormalizationTest.txt
192253
}
193254

255+
void
256+
rust_utf8_property_test ()
257+
{
258+
ASSERT_TRUE (Rust::is_alphabetic ('A'));
259+
ASSERT_TRUE (Rust::is_alphabetic ('B'));
260+
ASSERT_TRUE (Rust::is_alphabetic ('x'));
261+
ASSERT_TRUE (Rust::is_alphabetic ('z'));
262+
ASSERT_TRUE (Rust::is_alphabetic (0x00b5)); // µ
263+
ASSERT_TRUE (Rust::is_alphabetic (0x3093)); //
264+
ASSERT_TRUE (Rust::is_alphabetic (0xa8f2)); //
265+
ASSERT_TRUE (Rust::is_alphabetic (0x2b743)); // 𫝃
266+
267+
ASSERT_FALSE (Rust::is_numeric ('\v'));
268+
ASSERT_FALSE (Rust::is_alphabetic ('0'));
269+
ASSERT_FALSE (Rust::is_alphabetic ('9'));
270+
ASSERT_FALSE (Rust::is_alphabetic (0xa720)); //
271+
ASSERT_FALSE (Rust::is_alphabetic (0xaac1)); // ◌꫁
272+
273+
// `Nd`s
274+
ASSERT_TRUE (Rust::is_numeric ('0'));
275+
ASSERT_TRUE (Rust::is_numeric ('1'));
276+
ASSERT_TRUE (Rust::is_numeric ('7'));
277+
ASSERT_TRUE (Rust::is_numeric ('9'));
278+
ASSERT_TRUE (Rust::is_numeric (0x07c2)); // ߂
279+
ASSERT_TRUE (Rust::is_numeric (0x096d)); //
280+
// `Nl`s
281+
ASSERT_TRUE (Rust::is_numeric (0x16e6)); //
282+
ASSERT_TRUE (Rust::is_numeric (0xa6e6)); //
283+
ASSERT_TRUE (Rust::is_numeric (0x12400)); // 𒐀
284+
ASSERT_TRUE (Rust::is_numeric (0x1243a)); // 𒐺
285+
286+
// `No`s
287+
ASSERT_TRUE (Rust::is_numeric (0x00b2)); // ²
288+
ASSERT_TRUE (Rust::is_numeric (0x32b1)); //
289+
290+
ASSERT_FALSE (Rust::is_numeric ('\n'));
291+
ASSERT_FALSE (Rust::is_numeric ('z'));
292+
ASSERT_FALSE (Rust::is_numeric (';'));
293+
ASSERT_FALSE (Rust::is_numeric (0x03f4)); // ϴ
294+
ASSERT_FALSE (Rust::is_numeric (0x0628)); // ب
295+
ASSERT_FALSE (Rust::is_numeric (0x0975)); //
296+
ASSERT_FALSE (Rust::is_numeric (0x18f0)); //
297+
ASSERT_FALSE (Rust::is_numeric (0x2f30)); //
298+
}
299+
194300
} // namespace selftest

gcc/rust/util/rust-unicode.h

+10-2
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,13 @@
2020

2121
namespace Rust {
2222

23-
std::string
24-
nfc_normalize (std::string s);
23+
// TODO: add function nfc_normalize
24+
25+
bool
26+
is_alphabetic (uint32_t codepoint);
27+
28+
bool
29+
is_numeric (uint32_t codepoint);
2530

2631
} // namespace Rust
2732

@@ -32,6 +37,9 @@ namespace selftest {
3237
void
3338
rust_utf8_normalize_test ();
3439

40+
void
41+
rust_utf8_property_test ();
42+
3543
} // namespace selftest
3644

3745
#endif // CHECKING_P

0 commit comments

Comments
 (0)