Skip to content

Commit 4cacfde

Browse files
committed
various tokenizer improvements (catching up with the test suite)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401208
1 parent b1460dc commit 4cacfde

File tree

1 file changed

+26
-16
lines changed

1 file changed

+26
-16
lines changed

lib/html5/tokenizer.rb

+26-16
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,22 @@ def process_solidus_in_tag
6565

6666
# We need to consume another character to make sure it's a ">"
6767
data = @stream.char
68-
68+
rv = false
6969
if @current_token[:type] == :StartTag and data == ">"
7070
@current_token[:type] = :EmptyTag
71+
elsif data == :EOF
72+
@token_queue << ({:type => :ParseError, :data => "eof-following-solidus"})
73+
@state = :data_state
74+
emit_current_token
75+
rv = true
7176
else
7277
@token_queue << {:type => :ParseError, :data => "incorrectly-placed-solidus"}
7378
end
7479

7580
# The character we just consumed needs to be put back on the stack so it
7681
# doesn't get lost...
7782
@stream.unget(data)
83+
rv
7884
end
7985

8086
# This function returns either U+FFFD or the character based on the
@@ -117,7 +123,8 @@ def consume_number_entity(isHex)
117123
charAsInt = ENTITIES_WINDOWS1252[charAsInt - 128]
118124
end
119125

120-
if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343)
126+
if 0 < charAsInt && charAsInt <= 1114111 && !(55296 <= charAsInt && charAsInt <= 57343) &&
127+
![0x10FFFF].include?(charAsInt) # TODO add more entity replacements here
121128
if String.method_defined? :force_encoding
122129
char = charAsInt.chr('utf-8')
123130
else
@@ -475,8 +482,9 @@ def attribute_name_state
475482
elsif SPACE_CHARACTERS.include? data
476483
@state = :after_attribute_name_state
477484
elsif data == "/"
478-
process_solidus_in_tag
479-
@state = :before_attribute_name_state
485+
if !process_solidus_in_tag
486+
@state = :before_attribute_name_state
487+
end
480488
elsif data == "'" or data == '"':
481489
@token_queue.push({:type => :ParseError, :data => "invalid-character-in-attribute-name"})
482490
@current_token[:data][-1][0] += data
@@ -520,8 +528,9 @@ def after_attribute_name_state
520528
@current_token[:data].push([data, ""])
521529
@state = :attribute_name_state
522530
elsif data == "/"
523-
process_solidus_in_tag
524-
@state = :before_attribute_name_state
531+
if !process_solidus_in_tag
532+
@state = :before_attribute_name_state
533+
end
525534
else
526535
@current_token[:data].push([data, ""])
527536
@state = :attribute_name_state
@@ -592,7 +601,7 @@ def attribute_value_unquoted_state
592601
if SPACE_CHARACTERS.include? data
593602
@state = :before_attribute_name_state
594603
elsif data == "&"
595-
process_entity_in_attribute
604+
process_entity_in_attribute ''
596605
elsif data == ">"
597606
emit_current_token
598607
elsif data == '"' || data == "'" || data == "=":
@@ -615,12 +624,13 @@ def after_attribute_value_state
615624
emit_current_token
616625
@state = :data_state
617626
elsif data == "/"
618-
process_solidus_in_tag
619-
@state = :before_attribute_name_state
627+
if !process_solidus_in_tag
628+
@state = :before_attribute_name_state
629+
end
620630
else
621-
@tokenQueue.push({:type => :ParseError, :data => "unexpected-character-after-attribute-value"})
631+
@token_queue.push({:type => :ParseError, :data => "unexpected-character-after-attribute-value"})
622632
@stream.unget(data)
623-
@state = :before_attribute_name
633+
@state = :before_attribute_name_state
624634
end
625635
true
626636
end
@@ -629,7 +639,7 @@ def bogus_comment_state
629639
# Make a new comment token and give it as value all the characters
630640
# until the first > or :EOF (chars_until checks for :EOF automatically)
631641
# and emit it.
632-
@token_queue << {:type => :Comment, :data => @stream.chars_until((">"))}
642+
@token_queue << {:type => :Comment, :data => @stream.chars_until([">"])}
633643

634644
# Eat the character directly after the bogus comment which is either a
635645
# ">" or an :EOF.
@@ -824,6 +834,7 @@ def after_doctype_name_state
824834
else
825835
@stream.unget(char_stack)
826836
@token_queue << {:type => :ParseError, :data => "expected-space-or-right-bracket-in-doctype", "datavars" => {"data" => data}}
837+
@current_token[:correct] = false
827838
@state = :bogus_doctype_state
828839
end
829840
end
@@ -852,6 +863,7 @@ def before_doctype_public_identifier_state
852863
@state = :data_state
853864
else
854865
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
866+
@current_token[:correct] = false
855867
@state = :bogus_doctype_state
856868
end
857869

@@ -917,6 +929,7 @@ def after_doctype_public_identifier_state
917929
@state = :data_state
918930
else
919931
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
932+
@current_token[:correct] = false
920933
@state = :bogus_doctype_state
921934
end
922935
return true
@@ -943,6 +956,7 @@ def before_doctype_system_identifier_state
943956
@state = :data_state
944957
else
945958
@token_queue << {:type => :ParseError, :data => "unexpected-char-in-doctype"}
959+
@current_token[:correct] = false
946960
@state = :bogus_doctype_state
947961
end
948962
return true
@@ -1008,15 +1022,11 @@ def after_doctype_system_identifier_state
10081022

10091023
def bogus_doctype_state
10101024
data = @stream.char
1011-
@current_token[:correct] = false
10121025
if data == ">"
10131026
@token_queue << @current_token
10141027
@state = :data_state
10151028
elsif data == :EOF
1016-
# XXX EMIT
10171029
@stream.unget(data)
1018-
@token_queue << {:type => :ParseError, :data => "eof-in-doctype"}
1019-
@current_token[:correct] = false
10201030
@token_queue << @current_token
10211031
@state = :data_state
10221032
end

0 commit comments

Comments
 (0)