Skip to content

Commit 35407d6

Browse files
committed
JSON.dump / String#to_json: raise on invalid encoding
This regressed since 2.7.2.
1 parent 3b0bfe7 commit 35407d6

File tree

4 files changed

+52
-24
lines changed

4 files changed

+52
-24
lines changed

ext/json/ext/generator/generator.c

+33-7
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
#define RB_UNLIKELY(cond) (cond)
66
#endif
77

8-
static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError;
8+
static VALUE mJSON, cState, mString_Extend, eGeneratorError, eNestingError, Encoding_UTF_8;
99

10-
static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
10+
static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend, i_encode;
1111

1212
/* Converts in_string to a JSON string (without the wrapping '"'
1313
* characters) in FBuffer out_buffer.
@@ -735,20 +735,41 @@ static void generate_json_array(FBuffer *buffer, VALUE Vstate, JSON_Generator_St
735735
fbuffer_append_char(buffer, ']');
736736
}
737737

738-
static int usascii_encindex, utf8_encindex;
738+
static int usascii_encindex, utf8_encindex, binary_encindex;
739739

740-
static int enc_utf8_compatible_p(int enc_idx)
740+
static inline int enc_utf8_compatible_p(int enc_idx)
741741
{
742742
if (enc_idx == usascii_encindex) return 1;
743743
if (enc_idx == utf8_encindex) return 1;
744744
return 0;
745745
}
746746

747+
static inline VALUE ensure_valid_encoding(VALUE str)
748+
{
749+
int encindex = RB_ENCODING_GET(str);
750+
VALUE utf8_string;
751+
if (RB_UNLIKELY(!enc_utf8_compatible_p(encindex))) {
752+
if (encindex == binary_encindex) {
753+
// For historical reasons, we silently reinterpret binary strings as UTF-8 if it would work.
754+
// TODO: Deprecate in 2.8.0
755+
// TODO: Remove in 3.0.0
756+
utf8_string = rb_enc_associate_index(rb_str_dup(str), utf8_encindex);
757+
switch (rb_enc_str_coderange(utf8_string)) {
758+
case ENC_CODERANGE_7BIT:
759+
case ENC_CODERANGE_VALID:
760+
return utf8_string;
761+
break;
762+
}
763+
}
764+
765+
str = rb_funcall(str, i_encode, 1, Encoding_UTF_8);
766+
}
767+
return str;
768+
}
769+
747770
static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj)
748771
{
749-
if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) {
750-
obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
751-
}
772+
obj = ensure_valid_encoding(obj);
752773

753774
fbuffer_append_char(buffer, '"');
754775

@@ -1462,14 +1483,19 @@ void Init_generator(void)
14621483
VALUE mNilClass = rb_define_module_under(mGeneratorMethods, "NilClass");
14631484
rb_define_method(mNilClass, "to_json", mNilClass_to_json, -1);
14641485

1486+
rb_global_variable(&Encoding_UTF_8);
1487+
Encoding_UTF_8 = rb_const_get(rb_path2class("Encoding"), rb_intern("UTF_8"));
1488+
14651489
i_to_s = rb_intern("to_s");
14661490
i_to_json = rb_intern("to_json");
14671491
i_new = rb_intern("new");
14681492
i_pack = rb_intern("pack");
14691493
i_unpack = rb_intern("unpack");
14701494
i_create_id = rb_intern("create_id");
14711495
i_extend = rb_intern("extend");
1496+
i_encode = rb_intern("encode");
14721497

14731498
usascii_encindex = rb_usascii_encindex();
14741499
utf8_encindex = rb_utf8_encindex();
1500+
binary_encindex = rb_ascii8bit_encindex();
14751501
}

ext/json/ext/parser/parser.c

+11-8
Original file line numberDiff line numberDiff line change
@@ -1794,6 +1794,9 @@ static VALUE convert_encoding(VALUE source)
17941794
}
17951795

17961796
if (encindex == binary_encindex) {
1797+
// For historical reasons, we silently reinterpret binary strings as UTF-8 if it would work.
1798+
// TODO: Deprecate in 2.8.0
1799+
// TODO: Remove in 3.0.0
17971800
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
17981801
}
17991802

@@ -1943,15 +1946,15 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
19431946
}
19441947

19451948

1946-
#line 1947 "parser.c"
1949+
#line 1950 "parser.c"
19471950
enum {JSON_start = 1};
19481951
enum {JSON_first_final = 10};
19491952
enum {JSON_error = 0};
19501953

19511954
enum {JSON_en_main = 1};
19521955

19531956

1954-
#line 855 "parser.rl"
1957+
#line 858 "parser.rl"
19551958

19561959

19571960
/*
@@ -1969,16 +1972,16 @@ static VALUE cParser_parse(VALUE self)
19691972
GET_PARSER;
19701973

19711974

1972-
#line 1973 "parser.c"
1975+
#line 1976 "parser.c"
19731976
{
19741977
cs = JSON_start;
19751978
}
19761979

1977-
#line 872 "parser.rl"
1980+
#line 875 "parser.rl"
19781981
p = json->source;
19791982
pe = p + json->len;
19801983

1981-
#line 1982 "parser.c"
1984+
#line 1985 "parser.c"
19821985
{
19831986
if ( p == pe )
19841987
goto _test_eof;
@@ -2012,7 +2015,7 @@ case 1:
20122015
cs = 0;
20132016
goto _out;
20142017
tr2:
2015-
#line 847 "parser.rl"
2018+
#line 850 "parser.rl"
20162019
{
20172020
char *np = JSON_parse_value(json, p, pe, &result, 0);
20182021
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@@ -2022,7 +2025,7 @@ cs = 0;
20222025
if ( ++p == pe )
20232026
goto _test_eof10;
20242027
case 10:
2025-
#line 2026 "parser.c"
2028+
#line 2029 "parser.c"
20262029
switch( (*p) ) {
20272030
case 13: goto st10;
20282031
case 32: goto st10;
@@ -2111,7 +2114,7 @@ case 9:
21112114
_out: {}
21122115
}
21132116

2114-
#line 875 "parser.rl"
2117+
#line 878 "parser.rl"
21152118

21162119
if (cs >= JSON_first_final && p == pe) {
21172120
return result;

ext/json/ext/parser/parser.rl

+3
Original file line numberDiff line numberDiff line change
@@ -689,6 +689,9 @@ static VALUE convert_encoding(VALUE source)
689689
}
690690

691691
if (encindex == binary_encindex) {
692+
// For historical reasons, we silently reinterpret binary strings as UTF-8 if it would work.
693+
// TODO: Deprecate in 2.8.0
694+
// TODO: Remove in 3.0.0
692695
return rb_enc_associate_index(rb_str_dup(source), utf8_encindex);
693696
}
694697

test/json/json_generator_test.rb

+5-9
Original file line numberDiff line numberDiff line change
@@ -449,16 +449,12 @@ def test_invalid_encoding_string
449449
end
450450
assert_includes error.message, "source sequence is illegal/malformed utf-8"
451451

452-
# These pass on the pure-Ruby generator but not with the native extension
453-
# https://github.com/ruby/json/issues/634
454-
if defined?(JSON::Pure)
455-
assert_raise(Encoding::UndefinedConversionError) do
456-
"\x82\xAC\xEF".b.to_json
457-
end
452+
assert_raise(Encoding::UndefinedConversionError) do
453+
"\x82\xAC\xEF".b.to_json
454+
end
458455

459-
assert_raise(Encoding::UndefinedConversionError) do
460-
JSON.dump("\x82\xAC\xEF".b)
461-
end
456+
assert_raise(Encoding::UndefinedConversionError) do
457+
JSON.dump("\x82\xAC\xEF".b)
462458
end
463459
end
464460

0 commit comments

Comments
 (0)