Skip to content

Commit bf36d53

Browse files
author
jan.nijtmans
committed
Fix [17a1cb8d6e2a51bd]. From now on, noncharacters are no longer rejected in -strict mode
2 parents 49f1054 + 2a05601 commit bf36d53

File tree

3 files changed

+9
-88
lines changed

3 files changed

+9
-88
lines changed

generic/tclEncoding.c

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2468,13 +2468,12 @@ UtfToUtfProc(
24682468
src += len;
24692469
dst += Tcl_UniCharToUtf(ch, dst);
24702470
ch = low;
2471-
} else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && !Tcl_UniCharIsUnicode(ch)
2472-
&& (((ch & ~0x7FF) == 0xD800) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
2471+
} else if (STOPONERROR && !(flags & TCL_ENCODING_MODIFIED) && (((ch & ~0x7FF) == 0xD800))) {
24732472
result = TCL_CONVERT_UNKNOWN;
24742473
src = saveSrc;
24752474
break;
24762475
} else if (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
2477-
&& (flags & TCL_ENCODING_MODIFIED) && !Tcl_UniCharIsUnicode(ch)) {
2476+
&& (flags & TCL_ENCODING_MODIFIED) && ((ch & ~0x7FF) == 0xD800)) {
24782477
result = TCL_CONVERT_SYNTAX;
24792478
src = saveSrc;
24802479
break;
@@ -2566,7 +2565,7 @@ Utf32ToUtfProc(
25662565
ch = (src[0] & 0xFF) << 24 | (src[1] & 0xFF) << 16 | (src[2] & 0xFF) << 8 | (src[3] & 0xFF);
25672566
}
25682567
if ((unsigned)ch > 0x10FFFF || (((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT)
2569-
&& !Tcl_UniCharIsUnicode(ch))) {
2568+
&& ((ch & ~0x7FF) == 0xD800))) {
25702569
if (STOPONERROR) {
25712570
result = TCL_CONVERT_SYNTAX;
25722571
break;
@@ -2662,7 +2661,7 @@ UtfToUtf32Proc(
26622661
break;
26632662
}
26642663
len = TclUtfToUCS4(src, &ch);
2665-
if (!Tcl_UniCharIsUnicode(ch) && (((ch & ~0x7FF) == 0xD800) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
2664+
if ((ch & ~0x7FF) == 0xD800) {
26662665
if (STOPONERROR) {
26672666
result = TCL_CONVERT_UNKNOWN;
26682667
break;
@@ -2864,7 +2863,7 @@ UtfToUtf16Proc(
28642863
break;
28652864
}
28662865
len = TclUtfToUCS4(src, &ch);
2867-
if (!Tcl_UniCharIsUnicode(ch) && (((ch & ~0x7FF) == 0xD800) || ((flags & TCL_ENCODING_STRICT) == TCL_ENCODING_STRICT))) {
2866+
if ((ch & ~0x7FF) == 0xD800) {
28682867
if (STOPONERROR) {
28692868
result = TCL_CONVERT_UNKNOWN;
28702869
break;

tests/encoding.test

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -703,18 +703,18 @@ test encoding-24.28 {Parse invalid utf-8 with -strict} -body {
703703
test encoding-24.29 {Parse invalid utf-8} -body {
704704
encoding convertfrom utf-8 \xEF\xBF\xBF
705705
} -result \uFFFF
706-
test encoding-24.30 {Parse invalid utf-8 with -strict} -body {
706+
test encoding-24.30 {Parse noncharacter with -strict} -body {
707707
encoding convertfrom -strict utf-8 \xEF\xBF\xBF
708-
} -returnCodes 1 -result {unexpected byte sequence starting at index 0: '\xEF'}
708+
} -result \uFFFF
709709
test encoding-24.31 {Parse invalid utf-8 with -nocomplain} -body {
710710
encoding convertfrom -nocomplain utf-8 \xEF\xBF\xBF
711711
} -result \uFFFF
712712
test encoding-24.32 {Try to generate invalid utf-8} -body {
713713
encoding convertto utf-8 \uFFFF
714714
} -result \xEF\xBF\xBF
715-
test encoding-24.33 {Try to generate invalid utf-8 with -strict} -body {
715+
test encoding-24.33 {Try to generate noncharacter with -strict} -body {
716716
encoding convertto -strict utf-8 \uFFFF
717-
} -returnCodes 1 -result {unexpected character at index 0: 'U+00FFFF'}
717+
} -result \xEF\xBF\xBF
718718
test encoding-24.34 {Try to generate invalid utf-8 with -nocomplain} -body {
719719
encoding convertto -nocomplain utf-8 \uFFFF
720720
} -result \xEF\xBF\xBF

tests/io.test

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -9201,84 +9201,6 @@ test io-75.13 {invalid utf-8 encoding read is not ignored (-strictencoding 1)} -
92019201
removeFile io-75.13
92029202
} -match glob -result {41 1 {error reading "*": illegal byte sequence}}
92039203

9204-
# Test case for Rolf's use case (detecting an invalid byte sequence, but allowing noncharacters)
9205-
test io-75.14 {How to use -strict, but allow non-characters} -setup {
9206-
set fn [makeFile {} io-75.14]
9207-
set f [open $fn w+]
9208-
fconfigure $f -encoding binary
9209-
# Noncharacter (U+FFFE) followed by a single stray continuation byte (\x81)
9210-
puts -nonewline $f pre\xEF\xBF\xBE\x81post
9211-
flush $f
9212-
seek $f 0
9213-
fconfigure stdout -nocomplainencoding 1
9214-
catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9
9215-
fconfigure $f -encoding utf-8 -buffering none -translation lf -strictencoding 1
9216-
} -body {
9217-
set hd {}
9218-
catch {
9219-
while {![eof $f]} {
9220-
if {[catch {
9221-
append hd [read $f]
9222-
}]} {
9223-
fconfigure $f -nocomplainencoding 1 -strictencoding 0
9224-
set char [read $f 1]
9225-
if {[string is unicode $char]} {
9226-
error "InvalidByteSequence"
9227-
} elseif {$char >= "\uD800" && $char < "\uE000"} {
9228-
error "Surrogate"
9229-
} else {
9230-
append hd $char
9231-
}
9232-
catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9
9233-
fconfigure $f -strictencoding 1 -encoding utf-8
9234-
}
9235-
}
9236-
} msg
9237-
close $f
9238-
append hd +$msg
9239-
} -cleanup {
9240-
removeFile io-75.14
9241-
} -result "pre\uFFFE+InvalidByteSequence"
9242-
9243-
# Test case for Rolf's use case (detecting a surrogate, but allowing noncharacters)
9244-
test io-75.15 {How to use -strict, but allow non-characters} -setup {
9245-
set fn [makeFile {} io-75.14]
9246-
set f [open $fn w+]
9247-
fconfigure $f -encoding utf-8 -nocomplainencoding 1
9248-
# Noncharacter (U+FFFE) followed by a single unpaired surrogate (U+D800)
9249-
puts -nonewline $f pre\uFFFE\uD800post
9250-
flush $f
9251-
seek $f 0
9252-
fconfigure stdout -nocomplainencoding 1
9253-
catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9
9254-
fconfigure $f -buffering none -translation lf -strictencoding 1
9255-
} -body {
9256-
set hd {}
9257-
catch {
9258-
while {![eof $f]} {
9259-
if {[catch {
9260-
append hd [read $f]
9261-
}]} {
9262-
fconfigure $f -nocomplainencoding 1 -strictencoding 0
9263-
set char [read $f 1]
9264-
if {[string is unicode $char]} {
9265-
error "Invalid Byte Sequence"
9266-
} elseif {$char >= "\uD800" && $char < "\uE000"} {
9267-
error "Surrogate"
9268-
} else {
9269-
append hd $char
9270-
}
9271-
catch {fconfigure $f -nocomplainencoding 0};# Only needed on Tcl 9
9272-
fconfigure $f -strictencoding 1
9273-
}
9274-
}
9275-
} msg
9276-
close $f
9277-
append hd +$msg
9278-
} -cleanup {
9279-
removeFile io-75.15
9280-
} -result "pre\uFFFE+Surrogate"
9281-
92829204
# ### ### ### ######### ######### #########
92839205

92849206

0 commit comments

Comments
 (0)