From bce6b87b0ab8b0c02de62b86da0c75f680ea5df6 Mon Sep 17 00:00:00 2001
From: Kevin Newton <kddnewton@gmail.com>
Date: Wed, 25 Jan 2023 10:44:50 -0500
Subject: [PATCH] Handle invalid byte sequences in UTF-8

---
 lib/syntax_tree/parser.rb | 21 +++++++++++++++++++--
 test/parser_test.rb       |  9 +++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/lib/syntax_tree/parser.rb b/lib/syntax_tree/parser.rb
index 602bb98f..99b703d0 100644
--- a/lib/syntax_tree/parser.rb
+++ b/lib/syntax_tree/parser.rb
@@ -1103,6 +1103,7 @@ def on_command_call(receiver, operator, message, arguments)
     # :call-seq:
     #   on_comment: (String value) -> Comment
     def on_comment(value)
+      # char is the index of the # character in the source.
       char = char_pos
       location =
         Location.token(
@@ -1112,8 +1113,24 @@ def on_comment(value)
           size: value.size - 1
         )
 
-      index = source.rindex(/[^\t ]/, char - 1) if char != 0
-      inline = index && (source[index] != "\n")
+      # Loop backward in the source string, starting from the beginning of the
+      # comment, and find the first character that is not a space or a tab. If
+      # index is -1, this indicates that we've checked all of the characters
+      # back to the start of the source, so this comment must be at the
+      # beginning of the file.
+      #
+      # We are purposefully not using rindex or regular expressions here because
+      # they raise on strings containing invalid byte sequences, which can
+      # legitimately appear in the source after an __END__ marker.
+      index = char - 1
+      while index > -1 && (source[index] == "\t" || source[index] == " ")
+        index -= 1
+      end
+
+      # If we found a character before the comment that is not a space or a tab
+      # and it is not a newline, then this comment is inline. Otherwise, it
+      # stands on its own and can be attached as its own node in the tree.
+      inline = index != -1 && source[index] != "\n"
       comment =
         Comment.new(value: value.chomp, inline: inline, location: location)
 
diff --git a/test/parser_test.rb b/test/parser_test.rb
index 6048cf11..8d6c0a16 100644
--- a/test/parser_test.rb
+++ b/test/parser_test.rb
@@ -65,5 +65,14 @@ def foo
         end
       RUBY
     end
+
+    def test_does_not_choke_on_invalid_characters_in_source_string
+      SyntaxTree.parse(<<~RUBY)
+        # comment
+        # comment
+        __END__
+        \xC5
+      RUBY
+    end
   end
 end