Skip to content

Commit dcf654d

Browse files
committed
gccrs: Add function Rust::encode_punycode
gcc/rust/ChangeLog: * Make-lang.in: Add rust-punycode.o. * rust-lang.cc (run_rust_tests): Add selftest. * util/rust-punycode.cc: New file. * util/rust-punycode.h: New file. Signed-off-by: Raiki Tamura <[email protected]>
1 parent 35b67c3 commit dcf654d

File tree

4 files changed

+230
-0
lines changed

4 files changed

+230
-0
lines changed

gcc/rust/Make-lang.in

+1
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ GRS_OBJS = \
186186
rust/rust-feature-gate.o \
187187
rust/rust-dir-owner.o \
188188
rust/rust-unicode.o \
189+
rust/rust-punycode.o \
189190
$(END)
190191
# removed object files from here
191192

gcc/rust/rust-lang.cc

+2
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include "rust-lex.h"
4141
#include "optional.h"
4242
#include "rust-unicode.h"
43+
#include "rust-punycode.h"
4344

4445
#include <mpfr.h>
4546
// note: header files must be in this order or else forward declarations don't
@@ -456,6 +457,7 @@ run_rust_tests ()
456457
// Call tests for the rust frontend here
457458
rust_input_source_test ();
458459
rust_utf8_normalize_test ();
460+
rust_punycode_encode_test ();
459461
rust_cfg_parser_test ();
460462
rust_privacy_ctx_test ();
461463
rust_crate_name_validation_test ();

gcc/rust/util/rust-punycode.cc

+181
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
2+
3+
// This file is part of GCC.
4+
5+
// GCC is free software; you can redistribute it and/or modify it under
6+
// the terms of the GNU General Public License as published by the Free
7+
// Software Foundation; either version 3, or (at your option) any later
8+
// version.
9+
10+
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11+
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
12+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13+
// for more details.
14+
15+
// You should have received a copy of the GNU General Public License
16+
// along with GCC; see the file COPYING3. If not see
17+
// <http://www.gnu.org/licenses/>.
18+
19+
// This file provides functions for punycode conversion
20+
// See https://datatracker.ietf.org/doc/html/rfc3492
21+
22+
#include "rust-system.h"
23+
#include "rust-unicode.h"
24+
#include "optional.h"
25+
#include "selftest.h"
26+
27+
namespace Rust {
28+
29+
// Bootstring parameter values used by Punycode; see https://datatracker.ietf.org/doc/html/rfc3492#section-5
30+
constexpr uint32_t BASE = 36; // radix of the variable-length integer digits
31+
constexpr uint32_t TMIN = 1; // lower clamp of the per-digit threshold
32+
constexpr uint32_t TMAX = 26; // upper clamp of the per-digit threshold
33+
constexpr uint32_t SKEW = 38; // used by adapt_bias when scaling the new bias
34+
constexpr uint32_t DAMP = 700; // divisor applied to the very first delta in adapt_bias
35+
constexpr uint32_t INITIAL_BIAS = 72; // bias the encoder starts with
36+
constexpr uint32_t INITIAL_N = 128; // first code point value the encoder searches from (0x80)
37+
constexpr char DELIMITER = '-'; // appended after the copied basic code points
38+
39+
constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; // largest code point copied verbatim to the output
40+
41+
std::string
42+
extract_basic_string (const std::vector<Codepoint> &src)
43+
{
44+
std::string basic_string;
45+
for (auto c : src)
46+
{
47+
if (c.value <= MAX_ASCII_CODEPOINT)
48+
basic_string += c.as_string ();
49+
}
50+
return basic_string;
51+
}
52+
53+
uint32_t
54+
adapt_bias (uint32_t delta, const uint32_t n_points, const bool is_first)
55+
{
56+
delta /= is_first ? DAMP : 2;
57+
delta += delta / n_points;
58+
uint32_t k = 0;
59+
60+
while (delta > (BASE - TMIN) * TMAX / 2)
61+
{
62+
delta /= BASE - TMIN;
63+
k += BASE;
64+
}
65+
return k + (BASE - TMIN + 1) * delta / (delta + SKEW);
66+
}
67+
68+
// Return LHS - RHS restricted to the range [MIN, MAX].  The comparisons are
// written as additions so that the subtraction is only performed when it
// cannot go below zero.
uint32_t
clamped_sub (const uint32_t min, const uint32_t lhs, const uint32_t rhs,
             const uint32_t max)
{
  const bool at_or_below_floor = lhs <= min + rhs;
  const bool at_or_above_ceiling = lhs >= max + rhs;

  return at_or_below_floor ? min : at_or_above_ceiling ? max : lhs - rhs;
}
79+
80+
uint32_t
81+
min_gt_or_eq (const std::vector<Codepoint> &l, const uint32_t threshold)
82+
{
83+
uint32_t min = UINT32_MAX;
84+
for (auto c : l)
85+
if (c.value >= threshold && c.value < min)
86+
min = c.value;
87+
return min;
88+
}
89+
90+
// Map a digit value to its character: 0..25 become 'a'..'z' and 26..35
// become '0'..'9'.
char
encode_digit (const uint32_t d)
{
  // 97 is 'a'; 22 is '0' (48) minus the 26 letters that precede the digits.
  const uint32_t offset = d < 26 ? 97 : 22;
  return static_cast<char> (d + offset);
}
95+
96+
/* Encode INPUT as punycode, following the encoding procedure of RFC 3492.

   The basic (ASCII) code points of INPUT are copied to the output first,
   followed by DELIMITER when at least one was copied.  Every remaining code
   point is then emitted, in increasing code point order, as a generalized
   variable-length integer.

   Returns the encoded string, or tl::nullopt when the running DELTA would
   overflow uint32_t.  */
tl::optional<std::string>
encode_punycode (const Utf8String &input)
{
  std::vector<Codepoint> input_chars = input.get_chars ();

  uint32_t n = INITIAL_N;
  uint32_t delta = 0;
  uint32_t bias = INITIAL_BIAS;

  std::string output = extract_basic_string (input_chars);
  // h counts the code points handled so far, b the basic ones among them.
  uint32_t h = output.size ();
  const uint32_t b = h;
  if (b > 0)
    output += DELIMITER;

  while (h < input_chars.size ())
    {
      // Next code point value to encode: the smallest one not yet handled.
      const uint32_t m = min_gt_or_eq (input_chars, n);

      // Refuse inputs for which the addition below would overflow.
      if (m - n > ((UINT32_MAX - delta) / (h + 1)))
        return tl::nullopt;

      delta += (m - n) * (h + 1);
      n = m;

      for (const auto c : input_chars)
        {
          if (c.value < n)
            {
              // The addition above may have left delta just below the
              // maximum, so this increment has to be guarded as well;
              // otherwise delta would silently wrap around and a wrong
              // string would be produced.
              if (delta == UINT32_MAX)
                return tl::nullopt;
              delta++;
            }
          else if (c.value == n)
            {
              uint32_t q = delta;
              // encode as a variable length integer
              for (uint32_t k = 1;; k++)
                {
                  const uint32_t kb = k * BASE;
                  const uint32_t t = clamped_sub (TMIN, kb, bias, TMAX);
                  if (q < t)
                    break;

                  output += encode_digit (t + (q - t) % (BASE - t));
                  q = (q - t) / (BASE - t);
                }
              // The remaining value is the final, most significant digit.
              output += encode_digit (q);

              bias = adapt_bias (delta, h + 1, h == b);
              delta = 0;
              h++;
            }
        }
      delta++;
      n++;
    }

  return {output};
}
152+
153+
} // namespace Rust
154+
155+
// The selftests are only built when checking is enabled, matching the
// CHECKING_P guard around their declaration in rust-punycode.h.
#if CHECKING_P

namespace selftest {

// Encode INPUT (given as UTF-8) with Rust::encode_punycode and assert that
// the result equals EXPECTED.  INPUT must be valid UTF-8 and must be
// encodable, since both optionals are unwrapped unconditionally.
void
encode_assert (const std::string &input, const std::string &expected)
{
  Rust::Utf8String input_utf8
    = Rust::Utf8String::make_utf8_string (input).value ();
  std::string actual = Rust::encode_punycode (input_utf8).value ();
  ASSERT_EQ (actual, expected);
}

// Selftest entry point for the punycode encoder.
void
rust_punycode_encode_test ()
{
  // Inputs made only of basic code points end with the delimiter.
  encode_assert ("abc", "abc-");
  encode_assert ("12345", "12345-");
  encode_assert ("香港", "j6w193g");

  // Examples from https://datatracker.ietf.org/doc/html/rfc3492#section-7.1
  encode_assert ("ليهمابتكلموشعربي؟", "egbpdaj6bu4bxfgehfvwxn");
  encode_assert ("他们为什么不说中文", "ihqwcrb4cv8a8dqg056pqjye");
  encode_assert ("他們爲什麽不說中文", "ihqwctvzc91f659drss3x8bo0yb");
  encode_assert ("Pročprostěnemluvíčesky", "Proprostnemluvesky-uyb24dma41a");
}

} // namespace selftest

#endif // CHECKING_P

gcc/rust/util/rust-punycode.h

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright (C) 2020-2023 Free Software Foundation, Inc.
2+
3+
// This file is part of GCC.
4+
5+
// GCC is free software; you can redistribute it and/or modify it under
6+
// the terms of the GNU General Public License as published by the Free
7+
// Software Foundation; either version 3, or (at your option) any later
8+
// version.
9+
10+
// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
11+
// WARRANTY; without even the implied warranty of MERCHANTABILITY or
12+
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13+
// for more details.
14+
15+
// You should have received a copy of the GNU General Public License
16+
// along with GCC; see the file COPYING3. If not see
17+
// <http://www.gnu.org/licenses/>.
18+
19+
#ifndef RUST_PUNYCODE_H
20+
#define RUST_PUNYCODE_H
21+
22+
#include "rust-unicode.h"
23+
#include "optional.h"
24+
25+
namespace Rust {
26+
27+
/* Encode the code points of SRC as punycode (RFC 3492). Returns the encoded
28+
* string on success, or nullopt when the encoder detects an arithmetic
29+
* overflow. The returned string contains only ASCII characters and does not start with `xn--`. */
30+
tl::optional<std::string>
31+
encode_punycode (const Utf8String &src);
32+
33+
} // namespace Rust
34+
35+
#if CHECKING_P
36+
37+
namespace selftest {
38+
39+
void
40+
rust_punycode_encode_test (); // selftest for Rust::encode_punycode, called from run_rust_tests
41+
42+
} // namespace selftest
43+
44+
#endif // CHECKING_P
45+
46+
#endif

0 commit comments

Comments
 (0)