From bce6b87b0ab8b0c02de62b86da0c75f680ea5df6 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Wed, 25 Jan 2023 10:44:50 -0500 Subject: [PATCH] Handle invalid byte sequences in UTF-8 --- lib/syntax_tree/parser.rb | 21 +++++++++++++++++++-- test/parser_test.rb | 9 +++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/syntax_tree/parser.rb b/lib/syntax_tree/parser.rb index 602bb98f..99b703d0 100644 --- a/lib/syntax_tree/parser.rb +++ b/lib/syntax_tree/parser.rb @@ -1103,6 +1103,7 @@ def on_command_call(receiver, operator, message, arguments) # :call-seq: # on_comment: (String value) -> Comment def on_comment(value) + # char is the index of the # character in the source. char = char_pos location = Location.token( @@ -1112,8 +1113,24 @@ def on_comment(value) size: value.size - 1 ) - index = source.rindex(/[^\t ]/, char - 1) if char != 0 - inline = index && (source[index] != "\n") + # Loop backward in the source string, starting from the beginning of the + # comment, and find the first character that is not a space or a tab. If + # index is -1, this indicates that we've checked all of the characters + # back to the start of the source, so this comment must be at the + # beginning of the file. + # + # We are purposefully not using rindex or regular expressions here because + # they check if there are invalid characters, which is actually possible + # with the use of __END__ syntax. + index = char - 1 + while index > -1 && (source[index] == "\t" || source[index] == " ") + index -= 1 + end + + # If we found a character that was not a space or a tab before the comment + # and it's not a newline, then this comment is inline. Otherwise, it stands on + # its own and can be attached as its own node in the tree. 
+ inline = index != -1 && source[index] != "\n" comment = Comment.new(value: value.chomp, inline: inline, location: location) diff --git a/test/parser_test.rb b/test/parser_test.rb index 6048cf11..8d6c0a16 100644 --- a/test/parser_test.rb +++ b/test/parser_test.rb @@ -65,5 +65,14 @@ def foo end RUBY end + + def test_does_not_choke_on_invalid_characters_in_source_string + SyntaxTree.parse(<<~RUBY) + # comment + # comment + __END__ + \xC5 + RUBY + end end end