9
9
#include " misc/Interval.h"
10
10
#include " IntStream.h"
11
11
12
- #include " support/StringUtils .h"
12
+ #include " support/Utf8 .h"
13
13
#include " support/CPPUtils.h"
14
14
15
15
#include " ANTLRInputStream.h"
@@ -23,15 +23,9 @@ ANTLRInputStream::ANTLRInputStream() {
23
23
InitializeInstanceFields ();
24
24
}
25
25
26
- #if __cplusplus >= 201703L
27
- ANTLRInputStream::ANTLRInputStream (const std::string_view &input): ANTLRInputStream() {
26
+ ANTLRInputStream::ANTLRInputStream (std::string_view input): ANTLRInputStream() {
28
27
load (input.data (), input.length ());
29
28
}
30
- #endif
31
-
32
- ANTLRInputStream::ANTLRInputStream (const std::string &input): ANTLRInputStream() {
33
- load (input.data (), input.size ());
34
- }
35
29
36
30
ANTLRInputStream::ANTLRInputStream (const char *data, size_t length) {
37
31
load (data, length);
@@ -41,28 +35,37 @@ ANTLRInputStream::ANTLRInputStream(std::istream &stream): ANTLRInputStream() {
41
35
load (stream);
42
36
}
43
37
44
- void ANTLRInputStream::load (const std::string &input) {
45
- load (input.data (), input.size ());
38
+ void ANTLRInputStream::load (const std::string &input, bool lenient ) {
39
+ load (input.data (), input.size (), lenient );
46
40
}
47
41
48
- void ANTLRInputStream::load (const char *data, size_t length) {
42
+ void ANTLRInputStream::load (const char *data, size_t length, bool lenient ) {
49
43
// Remove the UTF-8 BOM if present.
50
44
const char *bom = " \xef\xbb\xbf " ;
51
- if (length >= 3 && strncmp (data, bom, 3 ) == 0 )
52
- _data = antlrcpp::utf8_to_utf32 (data + 3 , data + length);
53
- else
54
- _data = antlrcpp::utf8_to_utf32 (data, data + length);
45
+ if (length >= 3 && strncmp (data, bom, 3 ) == 0 ) {
46
+ data += 3 ;
47
+ length -= 3 ;
48
+ }
49
+ if (lenient) {
50
+ _data = Utf8::lenientDecode (std::string_view (data, length));
51
+ } else {
52
+ auto maybe_utf32 = Utf8::strictDecode (std::string_view (data, length));
53
+ if (!maybe_utf32.has_value ()) {
54
+ throw IllegalArgumentException (" UTF-8 string contains an illegal byte sequence" );
55
+ }
56
+ _data = std::move (maybe_utf32).value ();
57
+ }
55
58
p = 0 ;
56
59
}
57
60
58
- void ANTLRInputStream::load (std::istream &stream) {
61
+ void ANTLRInputStream::load (std::istream &stream, bool lenient ) {
59
62
if (!stream.good () || stream.eof ()) // No fail, bad or EOF.
60
63
return ;
61
64
62
65
_data.clear ();
63
66
64
67
std::string s ((std::istreambuf_iterator<char >(stream)), std::istreambuf_iterator<char >());
65
- load (s.data (), s.length ());
68
+ load (s.data (), s.length (), lenient );
66
69
}
67
70
68
71
void ANTLRInputStream::reset () {
@@ -150,7 +153,11 @@ std::string ANTLRInputStream::getText(const Interval &interval) {
150
153
return " " ;
151
154
}
152
155
153
- return antlrcpp::utf32_to_utf8 (_data.substr (start, count));
156
+ auto maybeUtf8 = Utf8::strictEncode (std::u32string_view (_data).substr (start, count));
157
+ if (!maybeUtf8.has_value ()) {
158
+ throw IllegalArgumentException (" Input stream contains invalid Unicode code points" );
159
+ }
160
+ return std::move (maybeUtf8).value ();
154
161
}
155
162
156
163
std::string ANTLRInputStream::getSourceName () const {
@@ -161,7 +168,11 @@ std::string ANTLRInputStream::getSourceName() const {
161
168
}
162
169
163
170
std::string ANTLRInputStream::toString () const {
164
- return antlrcpp::utf32_to_utf8 (_data);
171
+ auto maybeUtf8 = Utf8::strictEncode (_data);
172
+ if (!maybeUtf8.has_value ()) {
173
+ throw IllegalArgumentException (" Input stream contains invalid Unicode code points" );
174
+ }
175
+ return std::move (maybeUtf8).value ();
165
176
}
166
177
167
178
void ANTLRInputStream::InitializeInstanceFields () {
0 commit comments