Skip to content

Commit 467dc47

Browse files
committed
gccrs: Add Unicode check for crate_name attributes
gcc/rust/ChangeLog:

	* lex/rust-codepoint.h: Add comment
	* lex/rust-lex.h: New method to get decoded characters
	* rust-session-manager.cc (validate_crate_name): Modify unicode check
	(rust_crate_name_validation_test): Add testcases
	* util/rust-unicode.h (RUST_UNICODE_H): New class Utf8String.
	(class Utf8String): New class.
	* util/rust-unicode.cc (binary_search_sorted_array): Add comment.
	(recursive_decomp_cano): Add comment.
	(recomp): Remove dead code.
	(dump_string): Removed.

gcc/testsuite/ChangeLog:

	* rust/compile/bad-crate-name.rs: Moved to...
	* rust/compile/bad-crate-name1.rs: ...here.
	* rust/compile/bad-crate-name2.rs: New test.

Signed-off-by: Raiki Tamura <[email protected]>
1 parent 5ad0feb commit 467dc47

File tree

7 files changed

+59
-29
lines changed

7 files changed

+59
-29
lines changed

gcc/rust/lex/rust-codepoint.h

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include "rust-system.h"
2323

2424
namespace Rust {
25+
26+
// FIXME: move this to rust-unicode.h?
2527
struct Codepoint
2628
{
2729
uint32_t value;

gcc/rust/lex/rust-lex.h

+8
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,14 @@ class Lexer
334334
return c;
335335
}
336336
}
337+
338+
tl::optional<std::vector<Codepoint>> get_chars ()
339+
{
340+
if (is_valid ())
341+
return {chars};
342+
else
343+
return tl::nullopt;
344+
}
337345
};
338346

339347
class FileInputSource : public InputSource

gcc/rust/rust-session-manager.cc

+24-10
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include "rust-early-name-resolver.h"
4343
#include "rust-cfg-strip.h"
4444
#include "rust-expand-visitor.h"
45+
#include "rust-unicode.h"
4546

4647
#include "diagnostic.h"
4748
#include "input.h"
@@ -107,30 +108,39 @@ infer_crate_name (const std::string &filename)
107108
return crate;
108109
}
109110

110-
/* Validate the crate name using the ASCII rules
111-
TODO: Support Unicode version of the rules */
111+
/* Validate the crate name: it must be valid UTF-8, non-empty, and consist only of Unicode alphabetic or numeric characters and '_' */
112112

113113
static bool
114114
validate_crate_name (const std::string &crate_name, Error &error)
115115
{
116-
if (crate_name.empty ())
116+
Utf8String utf8_name = {crate_name};
117+
tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars ();
118+
119+
if (!uchars_opt.has_value ())
120+
{
121+
error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string");
122+
return false;
123+
}
124+
125+
std::vector<Codepoint> uchars = uchars_opt.value ();
126+
if (uchars.empty ())
117127
{
118128
error = Error (UNDEF_LOCATION, "crate name cannot be empty");
119129
return false;
120130
}
121-
if (crate_name.length () > kMaxNameLength)
131+
if (uchars.size () > kMaxNameLength)
122132
{
123133
error = Error (UNDEF_LOCATION, "crate name cannot exceed %lu characters",
124134
(unsigned long) kMaxNameLength);
125135
return false;
126136
}
127-
for (auto &c : crate_name)
137+
for (Codepoint &c : uchars)
128138
{
129-
if (!(ISALNUM (c) || c == '_'))
139+
if (!(is_alphabetic (c.value) || is_numeric (c.value) || c.value == '_'))
130140
{
131141
error = Error (UNDEF_LOCATION,
132-
"invalid character %<%c%> in crate name: %<%s%>", c,
133-
crate_name.c_str ());
142+
"invalid character %<%s%> in crate name: %<%s%>",
143+
c.as_string ().c_str (), crate_name.c_str ());
134144
return false;
135145
}
136146
}
@@ -1273,13 +1283,17 @@ rust_crate_name_validation_test (void)
12731283
ASSERT_TRUE (Rust::validate_crate_name ("example", error));
12741284
ASSERT_TRUE (Rust::validate_crate_name ("abcdefg_1234", error));
12751285
ASSERT_TRUE (Rust::validate_crate_name ("1", error));
1276-
// FIXME: The next test does not pass as of current implementation
1277-
// ASSERT_TRUE (Rust::CompileOptions::validate_crate_name ("惊吓"));
1286+
ASSERT_TRUE (Rust::validate_crate_name ("クレート", error));
1287+
ASSERT_TRUE (Rust::validate_crate_name ("Sōkrátēs", error));
1288+
ASSERT_TRUE (Rust::validate_crate_name ("惊吓", error));
1289+
12781290
// NOTE: - is not allowed in the crate name ...
12791291

12801292
ASSERT_FALSE (Rust::validate_crate_name ("abcdefg-1234", error));
12811293
ASSERT_FALSE (Rust::validate_crate_name ("a+b", error));
12821294
ASSERT_FALSE (Rust::validate_crate_name ("/a+b/", error));
1295+
ASSERT_FALSE (Rust::validate_crate_name ("😸++", error));
1296+
ASSERT_FALSE (Rust::validate_crate_name ("", error));
12831297

12841298
/* Tests for crate name inference */
12851299
ASSERT_EQ (Rust::infer_crate_name ("c.rs"), "c");

gcc/rust/util/rust-unicode.cc

+4-19
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ typedef std::vector<codepoint_t> string_t;
1212
template <std::size_t SIZE>
1313
int64_t
1414
binary_search_ranges (
15+
// FIXME: use binary search function from <algorithm>
1516
const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
1617
uint32_t target_cp)
1718
{
@@ -49,6 +50,7 @@ int64_t
4950
binary_search_sorted_array (const std::array<std::uint32_t, SIZE> &array,
5051
uint32_t target)
5152
{
53+
// FIXME: use binary search function from <algorithm>
5254
if (SIZE == 0)
5355
return -1;
5456

@@ -104,9 +106,7 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
104106
{
105107
string_t decomped = it->second;
106108
for (codepoint_t cp : decomped)
107-
{
108-
recursive_decomp_cano (cp, buf);
109-
}
109+
recursive_decomp_cano (cp, buf);
110110
}
111111
else
112112
buf.push_back (c);
@@ -152,8 +152,7 @@ recomp (string_t s)
152152
if (s.size () > 0)
153153
{
154154
int last_class = -1;
155-
// int starter_pos = 0; // Assume the first character is Starter. Correct?
156-
// int target_pos = 1;
155+
// Assume the first character is Starter.
157156
codepoint_t starter_ch = s[0];
158157
for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
159158
{
@@ -189,20 +188,6 @@ recomp (string_t s)
189188
return buf;
190189
}
191190

192-
// TODO: remove
193-
/*
194-
void
195-
dump_string (std::vector<uint32_t> s)
196-
{
197-
std::cout << "dump=";
198-
for (auto c : s)
199-
{
200-
std::cout << std::hex << c << ", ";
201-
}
202-
std::cout << std::endl;
203-
}
204-
*/
205-
206191
string_t
207192
nfc_normalize (string_t s)
208193
{

gcc/rust/util/rust-unicode.h

+19
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,29 @@
1919
#ifndef RUST_UNICODE_H
2020
#define RUST_UNICODE_H
2121

22+
#include "optional.h"
2223
#include "rust-system.h"
24+
#include "rust-lex.h"
2325

2426
namespace Rust {
2527

28+
class Utf8String
29+
{
30+
private:
31+
tl::optional<std::vector<Codepoint>> chars;
32+
33+
public:
34+
Utf8String (const std::string &maybe_utf8)
35+
{
36+
Lexer::BufferInputSource input_source = {maybe_utf8, 0};
37+
chars = input_source.get_chars ();
38+
}
39+
40+
// Returns Unicode codepoints when the string is valid UTF-8, returns nullopt
41+
// otherwise.
42+
tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
43+
};
44+
2645
// TODO: add function nfc_normalize
2746

2847
bool
gcc/testsuite/rust/compile/bad-crate-name2.rs

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#![crate_name = "😅"] // { dg-error "invalid character ...." "" }
2+
fn main() {}

0 commit comments

Comments
 (0)