From 6fa4b18bb41a32c3e1fe6bad1dea4fe3623a399d Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Tue, 11 Oct 2022 17:29:20 +0200 Subject: [PATCH 1/4] Expand IO#set_encoding specs --- spec/ruby/core/io/set_encoding_spec.rb | 32 +++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/spec/ruby/core/io/set_encoding_spec.rb b/spec/ruby/core/io/set_encoding_spec.rb index bc448acfceef..22d901763570 100644 --- a/spec/ruby/core/io/set_encoding_spec.rb +++ b/spec/ruby/core/io/set_encoding_spec.rb @@ -1,7 +1,7 @@ require_relative '../../spec_helper' describe :io_set_encoding_write, shared: true do - it "sets the encodings to nil" do + it "sets the encodings to nil when they were set previously" do @io = new_io @name, "#{@object}:ibm437:ibm866" @io.set_encoding nil, nil @@ -9,6 +9,19 @@ @io.internal_encoding.should be_nil end + it "sets the encodings to nil when the IO is built with no explicit encoding" do + @io = new_io @name, @object + + # Checking our assumptions first + @io.external_encoding.should be_nil + @io.internal_encoding.should be_nil + + @io.set_encoding nil, nil + + @io.external_encoding.should be_nil + @io.internal_encoding.should be_nil + end + it "prevents the encodings from changing when Encoding defaults are changed" do @io = new_io @name, "#{@object}:utf-8:us-ascii" @io.set_encoding nil, nil @@ -38,6 +51,7 @@ @external = Encoding.default_external @internal = Encoding.default_internal + # The defaults Encoding.default_external = Encoding::UTF_8 Encoding.default_internal = nil @@ -113,6 +127,22 @@ describe "with 'a+' mode" do it_behaves_like :io_set_encoding_write, nil, "a+" end + + describe "with standard IOs" do + it "correctly resets them" do + STDOUT.external_encoding.should == nil + STDOUT.internal_encoding.should == nil + + begin + STDOUT.set_encoding(Encoding::US_ASCII, Encoding::ISO_8859_1) + ensure + STDOUT.set_encoding(nil, nil) + end + + STDOUT.external_encoding.should == nil + STDOUT.internal_encoding.should == nil + end + end end describe "IO#set_encoding" do From 7c628f9824a08b606b4d7831ea22ee609f37afd3 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Tue, 1 Nov 2022 16:36:45 +0100 Subject: [PATCH 2/4] Rewrite IO#{set_encoding,internal_encoding,external_encoding} by using the logic in CRuby 3.1.2 --- src/main/ruby/truffleruby/core/io.rb | 125 ++++++------------ .../truffleruby/core/truffle/io_operations.rb | 47 ++++++- 2 files changed, 86 insertions(+), 86 deletions(-) diff --git a/src/main/ruby/truffleruby/core/io.rb b/src/main/ruby/truffleruby/core/io.rb index 0d7ce236273f..4f57bc973053 100644 --- a/src/main/ruby/truffleruby/core/io.rb +++ b/src/main/ruby/truffleruby/core/io.rb @@ -678,7 +678,7 @@ def self.open(*args) end def self.pipe(external = nil, internal = nil, options = nil) - lhs, rhs = Truffle::IOOperations.create_pipe(self, self, external, internal, options) + lhs, rhs = Truffle::IOOperations.create_pipe(self, self, external, internal) if block_given? begin @@ -743,7 +743,7 @@ def self.popen(*args) if readable and writable pipe = pa_read - pipe.instance_variable_set(:@write, pa_write) + Primitive.object_ivar_set pipe, :@write, pa_write elsif readable pipe = pa_read elsif writable @@ -780,7 +780,7 @@ def self.popen(*args) pid = Truffle::ProcessOperations.spawn(env || {}, *cmd, options) end - pipe.instance_variable_set :@pid, pid + Primitive.object_ivar_set pipe, :@pid, pid ch_write.close if readable ch_read.close if writable @@ -920,12 +920,12 @@ def self.setup(io, fd, mode, sync) io.close if Primitive.io_fd(io) != -1 Primitive.io_set_fd(io, fd) - io.instance_variable_set :@mode, Truffle::IOOperations.translate_omode_to_fmode(mode) + Primitive.object_ivar_set io, :@mode, Truffle::IOOperations.translate_omode_to_fmode(mode) io.sync = sync io.autoclose = true ibuffer = mode != WRONLY ? IO::InternalBuffer.new : nil - io.instance_variable_set :@ibuffer, ibuffer - io.instance_variable_set :@lineno, 0 + Primitive.object_ivar_set io, :@ibuffer, ibuffer + Primitive.object_ivar_set io, :@lineno, 0 end # @@ -950,30 +950,6 @@ def initialize(fd, mode=nil, options=undefined) set_encoding external, internal @autoclose = autoclose_tmp - - if @external && !external - @external = nil - end - - if @internal - if Encoding.default_external == Encoding.default_internal or - (@external || Encoding.default_external) == Encoding::ASCII_8BIT - @internal = nil - end - elsif !mode_read_only? - if Encoding.default_external != Encoding.default_internal - @internal = Encoding.default_internal - end - end - - unless @external - if @binmode - @external = Encoding::ASCII_8BIT - elsif @internal or Encoding.default_internal - @external = Encoding.default_external - end - end - @pipe = false end @@ -1418,10 +1394,14 @@ def eof? def external_encoding ensure_open - if @mode & FMODE_WRITABLE == 0 - @external || Encoding.default_external + + external = @external + return external if external + + if @mode.anybits?(FMODE_WRITABLE) + @internal else - @external + @internal || Encoding.default_external end end @@ -1452,7 +1432,9 @@ def fcntl(command, arg=0) def internal_encoding ensure_open - @internal + + return nil unless @external + @internal || Encoding.default_external end ## @@ -2061,65 +2043,40 @@ def seek(amount, whence=SEEK_SET) 0 end - def set_encoding(external, internal=nil, options=undefined) - case external - when Encoding - @external = external - when String - @external = nil - when nil - if (@mode & FMODE_WRITABLE == 0) || @external - @external = nil - else - @external = Encoding.default_external - end - else - @external = nil - external = StringValue(external) - end - - if Primitive.nil?(@external) && !Primitive.nil?(external) - if index = external.index(':') - internal = external[index+1..-1] - external = external[0, index] - end + # MRI: io_encoding_set + # enc = internal, enc2 = external (see struct rb_io_enc_t) + def set_encoding(external, internal = nil, **options) + if !Primitive.nil?(internal) + external = Encoding.find(external) - if external[3] == ?| - if encoding = strip_bom - external = encoding + unless Primitive.object_kind_of?(internal, Encoding) + internal = StringValue(internal) + if internal == '-' # Special case - "-" => no transcoding + internal = external + external = nil else - external = external[4..-1] + internal = Encoding.find(internal) end end - @external = Encoding.find external - end - - unless Primitive.undefined? options - # TODO: set the encoding options on the IO instance - if options and not Primitive.object_kind_of?(options, Hash) - _options = Truffle::Type.coerce_to options, Hash, :to_hash + if internal == external # Special case => no transcoding + external = nil end - end - - case internal - when Encoding - @internal = nil if @external == internal - when String - # do nothing - when nil - internal = Encoding.default_internal else - internal = StringValue(internal) - end - - if Primitive.object_kind_of?(internal, String) - return self if internal == '-' - internal = Encoding.find internal + if Primitive.nil?(external) # Set to default encodings + external, internal = Truffle::IOOperations.rb_io_ext_int_to_encs(@mode, nil, nil) + else + if !Primitive.object_kind_of?(external, Encoding) and + external = StringValue(external) and external.encoding.ascii_compatible? + external, internal = Truffle::IOOperations.parse_mode_enc(@mode, external) + else + external, internal = Truffle::IOOperations.rb_io_ext_int_to_encs(@mode, Encoding.find(external), nil) + end + end end - @internal = internal unless internal && @external == internal - + @internal = internal + @external = external self end diff --git a/src/main/ruby/truffleruby/core/truffle/io_operations.rb b/src/main/ruby/truffleruby/core/truffle/io_operations.rb index 38c471e0b5d4..267df3189ec0 100644 --- a/src/main/ruby/truffleruby/core/truffle/io_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/io_operations.rb @@ -106,7 +106,7 @@ def self.pipe_end_setup(io) CLASS_NEW = Class.instance_method(:new) - def self.create_pipe(read_class, write_class, external = nil, internal = nil, options = nil) + def self.create_pipe(read_class, write_class, external = nil, internal = nil) fds = Truffle::FFI::MemoryPointer.new(:int, 2) do |ptr| res = Truffle::POSIX.pipe(ptr) Errno.handle if res == -1 @@ -117,7 +117,7 @@ def self.create_pipe(read_class, write_class, external = nil, internal = nil, op rhs = pipe_end_setup(CLASS_NEW.bind_call(write_class, fds[1], IO::WRONLY)) lhs.set_encoding external || Encoding.default_external, - internal || Encoding.default_internal, options + internal || Encoding.default_internal [lhs, rhs] end @@ -368,5 +368,48 @@ def self.parse_mode(mode) ret end + # MRI: parse_mode_enc + # Parses string as "enc" or "external:internal" or "enc:-" + def self.parse_mode_enc(mode, string) + external, internal = string.split(':', 2) + external = Encoding.find(external) + if internal + if internal == '-' # Special case - "-" => no transcoding + internal = nil + else + internal = Encoding.find(internal) + end + if mode.nobits?(FMODE_SETENC_BY_BOM) && internal == external + internal = nil + end + end + + rb_io_ext_int_to_encs(mode, external, internal) + end + + # MRI: rb_io_ext_int_to_encs + def self.rb_io_ext_int_to_encs(mode, external, internal) + default_ext = false + if Primitive.nil?(external) + external = Encoding.default_external + default_ext = true + end + + if external == Encoding::BINARY + # If external is BINARY, no transcoding + internal = nil + elsif Primitive.nil?(internal) + internal = Encoding.default_internal + end + + if Primitive.nil?(internal) or + (mode.nobits?(FMODE_SETENC_BY_BOM) && internal == external) + # No internal encoding => use external + no transcoding + internal = (default_ext && internal != external) ? nil : external + external = nil + end + + [external, internal] + end end end From b5f1899242d070e4287dd5cc14dace6b909791d9 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Tue, 1 Nov 2022 17:03:18 +0100 Subject: [PATCH 3/4] Use a sensible representation when only the external encoding is set --- src/main/ruby/truffleruby/core/io.rb | 65 +++++++++++-------- .../truffleruby/core/truffle/io_operations.rb | 22 +++++-- 2 files changed, 56 insertions(+), 31 deletions(-) diff --git a/src/main/ruby/truffleruby/core/io.rb b/src/main/ruby/truffleruby/core/io.rb index 4f57bc973053..81b9132aae49 100644 --- a/src/main/ruby/truffleruby/core/io.rb +++ b/src/main/ruby/truffleruby/core/io.rb @@ -619,7 +619,7 @@ def self.normalize_options(mode, perm, options) end if Primitive.object_kind_of?(mode, String) - mode, external, internal = mode.split(':') + mode, external, internal = mode.split(':', 3) raise ArgumentError, 'invalid access mode' unless mode binary = true if mode.include?(?b) @@ -653,10 +653,11 @@ def self.normalize_options(mode, perm, options) external = encoding elsif !Primitive.nil?(encoding) encoding = StringValue(encoding) - external, internal = encoding.split(':') + external, internal = encoding.split(':', 2) end end end + external = Encoding::BINARY if binary and !external and !internal perm ||= 0666 [mode, binary, external, internal, autoclose, perm] end @@ -677,7 +678,7 @@ def self.open(*args) end end - def self.pipe(external = nil, internal = nil, options = nil) + def self.pipe(external = nil, internal = nil, **options) lhs, rhs = Truffle::IOOperations.create_pipe(self, self, external, internal) if block_given? @@ -753,7 +754,7 @@ def self.popen(*args) end pipe.binmode if binary - pipe.set_encoding(external || Encoding.default_external, internal) + pipe.set_encoding(external, internal) if cmd == '-' Kernel.fork # will throw an error @@ -1394,14 +1395,10 @@ def eof? def external_encoding ensure_open - - external = @external - return external if external - if @mode.anybits?(FMODE_WRITABLE) - @internal + @external else - @internal || Encoding.default_external + @external || Encoding.default_external end end @@ -1432,9 +1429,7 @@ def fcntl(command, arg=0) def internal_encoding ensure_open - - return nil unless @external - @internal || Encoding.default_external + @internal end ## @@ -2044,23 +2039,41 @@ def seek(amount, whence=SEEK_SET) end # MRI: io_encoding_set - # enc = internal, enc2 = external (see struct rb_io_enc_t) + # + # Note that `enc` and `enc2` in MRI code, + # despite the confusing comments on struct rb_io_enc_t's fields, + # (see https://github.com/ruby/ruby/commit/f7bdac01c2) + # seem to mean: + # (enc=NULL, enc2=NULL) external = nil, internal = nil + # (enc=e1, enc2=NULL) external = e1, internal = nil + # (enc=e1, enc2=e2 ) external = e2, internal = e1 + # In other words, + # enc means internal if both are set, but external otherwise + # enc2 means external if both are set, but nothing (or internal) otherwise + # So a possible mapping is: + # Both enc/enc2 set => enc is internal, enc2 is external + # Otherwise => enc is external, enc2 is internal + # + # We use the internal and external terminology only because enc/enc2 is so confusing. def set_encoding(external, internal = nil, **options) if !Primitive.nil?(internal) - external = Encoding.find(external) + unless Primitive.nil?(external) || Primitive.object_kind_of?(external, Encoding) + external = Truffle::IOOperations.parse_external_enc(self, StringValue(external)) + end unless Primitive.object_kind_of?(internal, Encoding) internal = StringValue(internal) if internal == '-' # Special case - "-" => no transcoding - internal = external - external = nil + internal = nil else internal = Encoding.find(internal) end end - if internal == external # Special case => no transcoding - external = nil + if external == Encoding::BINARY # If external is BINARY, no transcoding + internal = nil + elsif internal == external # Special case => no transcoding + internal = nil end else if Primitive.nil?(external) # Set to default encodings @@ -2068,7 +2081,7 @@ def set_encoding(external, internal = nil, **options) else if !Primitive.object_kind_of?(external, Encoding) and external = StringValue(external) and external.encoding.ascii_compatible? - external, internal = Truffle::IOOperations.parse_mode_enc(@mode, external) + external, internal = Truffle::IOOperations.parse_mode_enc(self, @mode, external) else external, internal = Truffle::IOOperations.rb_io_ext_int_to_encs(@mode, Encoding.find(external), nil) end @@ -2095,7 +2108,7 @@ def set_encoding_by_bom external = strip_bom if external - @external = Encoding.find(external) + @external = external end end @@ -2111,7 +2124,7 @@ def set_encoding_by_bom if b3 == 0xFE b4 = getbyte if b4 == 0xFF - return +'UTF-32BE' + return Encoding::UTF_32BE end ungetbyte b4 end @@ -2126,12 +2139,12 @@ def set_encoding_by_bom if b3 == 0x00 b4 = getbyte if b4 == 0x00 - return +'UTF-32LE' + return Encoding::UTF_32LE end ungetbyte b4 else ungetbyte b3 - return +'UTF-16LE' + return Encoding::UTF_16LE end ungetbyte b3 end @@ -2140,7 +2153,7 @@ def set_encoding_by_bom when 0xFE b2 = getbyte if b2 == 0xFF - return +'UTF-16BE' + return Encoding::UTF_16BE end ungetbyte b2 @@ -2149,7 +2162,7 @@ def set_encoding_by_bom if b2 == 0xBB b3 = getbyte if b3 == 0xBF - return +'UTF-8' + return Encoding::UTF_8 end ungetbyte b3 end diff --git a/src/main/ruby/truffleruby/core/truffle/io_operations.rb b/src/main/ruby/truffleruby/core/truffle/io_operations.rb index 267df3189ec0..15911245ffb3 100644 --- a/src/main/ruby/truffleruby/core/truffle/io_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/io_operations.rb @@ -368,11 +368,23 @@ def self.parse_mode(mode) ret end + BOM = /\Abom\|/i + + def self.parse_external_enc(io, external) + if BOM.match?(external) + external = external[4..-1] + io.__send__(:strip_bom) || Encoding.find(external) + else + Encoding.find(external) + end + end + # MRI: parse_mode_enc - # Parses string as "enc" or "external:internal" or "enc:-" - def self.parse_mode_enc(mode, string) + # Parses string as "external" or "external:internal" or "external:-" + def self.parse_mode_enc(io, mode, string) external, internal = string.split(':', 2) - external = Encoding.find(external) + external = parse_external_enc(io, external) + if internal if internal == '-' # Special case - "-" => no transcoding internal = nil @@ -405,8 +417,8 @@ def self.rb_io_ext_int_to_encs(mode, external, internal) if Primitive.nil?(internal) or (mode.nobits?(FMODE_SETENC_BY_BOM) && internal == external) # No internal encoding => use external + no transcoding - internal = (default_ext && internal != external) ? nil : external - external = nil + external = (default_ext && internal != external) ? nil : external + internal = nil end [external, internal] From e60d3ebc31965d9a70ca63db4246e9d807cd50c7 Mon Sep 17 00:00:00 2001 From: Benoit Daloze Date: Mon, 31 Oct 2022 13:28:33 +0100 Subject: [PATCH 4/4] Use Encoding::BINARY for clarity in core --- src/main/ruby/truffleruby/core/argf.rb | 2 +- src/main/ruby/truffleruby/core/io.rb | 12 ++++++------ src/main/ruby/truffleruby/core/marshal.rb | 4 ++-- src/main/ruby/truffleruby/core/transcoding.rb | 4 ++-- .../ruby/truffleruby/core/truffle/dir_operations.rb | 2 +- .../truffleruby/core/truffle/string_operations.rb | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main/ruby/truffleruby/core/argf.rb b/src/main/ruby/truffleruby/core/argf.rb index b88547a527ac..4c42e0e9b191 100644 --- a/src/main/ruby/truffleruby/core/argf.rb +++ b/src/main/ruby/truffleruby/core/argf.rb @@ -83,7 +83,7 @@ def initialize(argv = ARGV, *others) # def binmode @binmode = true - set_encoding(Encoding::ASCII_8BIT) + set_encoding(Encoding::BINARY) self end diff --git a/src/main/ruby/truffleruby/core/io.rb b/src/main/ruby/truffleruby/core/io.rb index 81b9132aae49..596fe8b5067b 100644 --- a/src/main/ruby/truffleruby/core/io.rb +++ b/src/main/ruby/truffleruby/core/io.rb @@ -208,7 +208,7 @@ def unseek!(io) # Returns +count+ bytes from the +start+ of the buffer as a new String. # If +count+ is +nil+, returns all available bytes in the buffer. - def shift(count=nil, encoding=Encoding::ASCII_8BIT) + def shift(count=nil, encoding=Encoding::BINARY) TruffleRuby.synchronized(self) do total = size total = count if count and count < total @@ -228,7 +228,7 @@ def read_to_char_boundary(io, str) peek_ahead = 0 while size > 0 and peek_ahead < PEEK_AHEAD_LIMIT - str.force_encoding Encoding::ASCII_8BIT + str.force_encoding Encoding::BINARY str << @storage[@start] @start += 1 peek_ahead += 1 @@ -260,7 +260,7 @@ def getchar(io) TruffleRuby.synchronized(self) do char = +'' while size > 0 - char.force_encoding Encoding::ASCII_8BIT + char.force_encoding Encoding::BINARY char << @storage[@start] @start += 1 @@ -481,7 +481,7 @@ def self.read_encode(io, str) internal = io.internal_encoding external = io.external_encoding || Encoding.default_external - if external.equal? Encoding::ASCII_8BIT + if external.equal? Encoding::BINARY str.force_encoding external elsif internal and external ec = Encoding::Converter.new external, internal @@ -2102,7 +2102,7 @@ def set_encoding_by_bom raise ArgumentError, 'encoding conversion is set' end - if external_encoding && external_encoding != Encoding::ASCII_8BIT + if external_encoding && external_encoding != Encoding::BINARY raise ArgumentError, "encoding is set to #{external_encoding} already" end @@ -2330,7 +2330,7 @@ def write(*data) if !binmode? && external_encoding && external_encoding != data.encoding && - external_encoding != Encoding::ASCII_8BIT + external_encoding != Encoding::BINARY unless data.ascii_only? && external_encoding.ascii_compatible? data = data.encode(external_encoding) end diff --git a/src/main/ruby/truffleruby/core/marshal.rb b/src/main/ruby/truffleruby/core/marshal.rb index ec523bff0aee..0585393bcf47 100644 --- a/src/main/ruby/truffleruby/core/marshal.rb +++ b/src/main/ruby/truffleruby/core/marshal.rb @@ -227,7 +227,7 @@ def set_instance_variables(obj) STRING_ALLOCATE = String.method(:__allocate__).unbind def construct_string - bytes = get_byte_sequence.force_encoding(Encoding::ASCII_8BIT) + bytes = get_byte_sequence.force_encoding(Encoding::BINARY) if @user_class cls = get_user_class @@ -243,7 +243,7 @@ def construct_string end end - Primitive.string_initialize(obj, bytes, Encoding::ASCII_8BIT) + Primitive.string_initialize(obj, bytes, Encoding::BINARY) else obj = bytes end diff --git a/src/main/ruby/truffleruby/core/transcoding.rb b/src/main/ruby/truffleruby/core/transcoding.rb index 857f8c04d1a3..b5f0bd72cba9 100644 --- a/src/main/ruby/truffleruby/core/transcoding.rb +++ b/src/main/ruby/truffleruby/core/transcoding.rb @@ -294,10 +294,10 @@ def last_error end if result == :invalid_byte_sequence or result == :incomplete_input - exc.__send__ :error_bytes=, error_bytes.force_encoding(Encoding::ASCII_8BIT) + exc.__send__ :error_bytes=, error_bytes.force_encoding(Encoding::BINARY) if bytes = read_again_bytes - exc.__send__ :readagain_bytes=, bytes.force_encoding(Encoding::ASCII_8BIT) + exc.__send__ :readagain_bytes=, bytes.force_encoding(Encoding::BINARY) end end diff --git a/src/main/ruby/truffleruby/core/truffle/dir_operations.rb b/src/main/ruby/truffleruby/core/truffle/dir_operations.rb index fd6bfbcc00fa..94fd64ab22aa 100644 --- a/src/main/ruby/truffleruby/core/truffle/dir_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/dir_operations.rb @@ -73,7 +73,7 @@ def self.fix_entry_encoding(dir,str) str = str.force_encoding(Primitive.object_ivar_get(dir, :@encoding)) if Encoding.default_external == Encoding::US_ASCII && !str.valid_encoding? - str.force_encoding Encoding::ASCII_8BIT + str.force_encoding Encoding::BINARY else enc = Encoding.default_internal begin diff --git a/src/main/ruby/truffleruby/core/truffle/string_operations.rb b/src/main/ruby/truffleruby/core/truffle/string_operations.rb index ad9d7720b212..dd8f74f38a19 100644 --- a/src/main/ruby/truffleruby/core/truffle/string_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/string_operations.rb @@ -158,7 +158,7 @@ def self.concat_internal(string, other) unless Primitive.object_kind_of?(other, String) if Primitive.object_kind_of?(other, Integer) if string.encoding == Encoding::US_ASCII and other >= 128 and other < 256 - string.force_encoding(Encoding::ASCII_8BIT) + string.force_encoding(Encoding::BINARY) end other = other.chr(string.encoding)