Skip to content

Commit

Permalink
[GR-50320] [GR-50321] [GR-50322] Backports for 23.1.
Browse files Browse the repository at this point in the history
PullRequest: truffleruby/4067
  • Loading branch information
eregon committed Nov 23, 2023
2 parents e976a4d + 152a876 commit e86d372
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 68 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# 23.1.2

Bug fixes:

* Fix `rb_enc_left_char_head()` so it no longer always raises `ArgumentError` (#3267, @eregon).
* Fix `IO.copy_stream` with a `Tempfile` destination (#3280, @eregon).
* Fix `Regexp.union` negotiating the wrong result encoding (#3287, @nirvdrum, @simonlevasseur).

# 23.1.0

New features:
Expand Down
2 changes: 1 addition & 1 deletion lib/cext/ABI_check.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2
3
33 changes: 27 additions & 6 deletions spec/ruby/core/io/copy_stream_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,12 @@
end

it "raises an IOError if the destination IO is not open for writing" do
@to_io.close
@to_io = new_io @to_name, "r"
-> { IO.copy_stream @object.from, @to_io }.should raise_error(IOError)
to_io = new_io __FILE__, "r"
begin
-> { IO.copy_stream @object.from, to_io }.should raise_error(IOError)
ensure
to_io.close
end
end

it "does not close the destination IO" do
Expand Down Expand Up @@ -109,7 +112,8 @@
end

after :each do
rm_r @to_name, @from_bigfile
rm_r @to_name if @to_name
rm_r @from_bigfile
end

describe "from an IO" do
Expand Down Expand Up @@ -164,6 +168,25 @@
it_behaves_like :io_copy_stream_to_io, nil, IOSpecs::CopyStream
it_behaves_like :io_copy_stream_to_io_with_offset, nil, IOSpecs::CopyStream
end

# Runs the shared destination-IO examples of IO.copy_stream with a Tempfile
# instance as the destination object.
describe "to a Tempfile" do
before :all do
require 'tempfile'
end

before :each do
# NOTE(review): File::RDONLY presumably does not make the destination read-only,
# since Tempfile is expected to OR in its own read-write/create flags -- confirm
# against the Tempfile documentation.
@to_io = Tempfile.new("rubyspec_copy_stream", encoding: Encoding::BINARY, mode: File::RDONLY)
# Keep the path of the temporary file available to the examples as @to_name.
@to_name = @to_io.path
end

after :each do
# close! both closes and unlinks the temporary file.
@to_io.close!
@to_name = nil # do not rm_r it, already done by Tempfile#close!
end

it_behaves_like :io_copy_stream_to_io, nil, IOSpecs::CopyStream
it_behaves_like :io_copy_stream_to_io_with_offset, nil, IOSpecs::CopyStream
end
end

describe "from a file name" do
Expand Down Expand Up @@ -277,10 +300,8 @@
@io.should_not_receive(:pos)
IO.copy_stream(@io, @to_name)
end

end


describe "with a destination that does partial reads" do
before do
@from_out, @from_in = IO.pipe
Expand Down
51 changes: 37 additions & 14 deletions spec/ruby/core/regexp/union_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,27 @@
Regexp.union("\u00A9".encode("ISO-8859-1"), "a".encode("UTF-8")).encoding.should == Encoding::ISO_8859_1
end

it "returns ASCII-8BIT if the regexp encodings are ASCII-8BIT and at least one has non-ASCII characters" do
  us_ascii_implicit = /abc/
  us_ascii_explicit = /[\x00-\x7f]/n
  binary = /[\x80-\xBF]/n

  # Preconditions: only the pattern containing bytes >= 0x80 is BINARY,
  # the two ASCII-only patterns are US-ASCII.
  us_ascii_implicit.encoding.should == Encoding::US_ASCII
  us_ascii_explicit.encoding.should == Encoding::US_ASCII
  binary.encoding.should == Encoding::BINARY

  # The negotiated encoding must not depend on argument order, so check
  # every ordering of the three patterns.
  [us_ascii_implicit, us_ascii_explicit, binary].permutation.each do |patterns|
    Regexp.union(*patterns).encoding.should == Encoding::BINARY
  end
end

# Each pair below uses a different regexp encoding modifier
# (e = EUC-JP, n = ASCII-8BIT, s = Windows-31J, u = UTF-8, per Ruby's Regexp
# documentation). All patterns contain only ASCII characters, and the union is
# expected to be US-ASCII in every case.
it "return US-ASCII if all patterns are ASCII-only" do
Regexp.union(/abc/e, /def/e).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/n, /def/n).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/s, /def/s).encoding.should == Encoding::US_ASCII
Regexp.union(/abc/u, /def/u).encoding.should == Encoding::US_ASCII
end

it "returns a Regexp with UTF-8 if one part is UTF-8" do
Regexp.union(/probl[éeè]me/i, /help/i).encoding.should == Encoding::UTF_8
end
Expand All @@ -54,83 +75,83 @@
it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Strings" do
-> {
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-16BE"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
end

it "raises ArgumentError if the arguments include conflicting ASCII-incompatible Regexps" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")),
Regexp.new("b".encode("UTF-16BE")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and UTF-16BE')
end

it "raises ArgumentError if the arguments include conflicting fixed encoding Regexps" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
Regexp.new("b".encode("US-ASCII"), Regexp::FIXEDENCODING))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and US-ASCII')
end

it "raises ArgumentError if the arguments include a fixed encoding Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING),
"\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-8 and ISO-8859-1')
end

it "raises ArgumentError if the arguments include a String containing non-ASCII-compatible characters and a fixed encoding Regexp in a different encoding" do
-> {
Regexp.union("\u00A9".encode("ISO-8859-1"),
Regexp.new("a".encode("UTF-8"), Regexp::FIXEDENCODING))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: ISO-8859-1 and UTF-8')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only String" do
-> {
Regexp.union("a".encode("UTF-16LE"), "b".encode("UTF-8"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only String" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "b".encode("UTF-8"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and an ASCII-only Regexp" do
-> {
Regexp.union("a".encode("UTF-16LE"), Regexp.new("b".encode("UTF-8")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and an ASCII-only Regexp" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("b".encode("UTF-8")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, /ASCII incompatible encoding: UTF-16LE|incompatible encodings: UTF-16LE and US-ASCII/)
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union("a".encode("UTF-16LE"), "\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a String containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), "\u00A9".encode("ISO-8859-1"))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible String and a Regexp containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union("a".encode("UTF-16LE"), Regexp.new("\u00A9".encode("ISO-8859-1")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "raises ArgumentError if the arguments include an ASCII-incompatible Regexp and a Regexp containing non-ASCII-compatible characters in a different encoding" do
-> {
Regexp.union(Regexp.new("a".encode("UTF-16LE")), Regexp.new("\u00A9".encode("ISO-8859-1")))
}.should raise_error(ArgumentError)
}.should raise_error(ArgumentError, 'incompatible encodings: UTF-16LE and ISO-8859-1')
end

it "uses to_str to convert arguments (if not Regexp)" do
Expand All @@ -154,6 +175,8 @@
not_supported_on :opal do
Regexp.union([/dogs/, /cats/i]).should == /(?-mix:dogs)|(?i-mx:cats)/
end
->{Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])}.should raise_error(TypeError)
-> {
Regexp.union(["skiing", "sledding"], [/dogs/, /cats/i])
}.should raise_error(TypeError, 'no implicit conversion of Array into String')
end
end
16 changes: 16 additions & 0 deletions spec/ruby/optional/capi/encoding_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,22 @@
end
end

# Specs for the C API function rb_enc_left_char_head. The @s helper is called with
# a String and an offset and its result is compared against an expected offset.
# NOTE(review): offsets are presumably byte offsets from the start of the string,
# and the string literals presumably UTF-8 -- confirm against the C extension helper.
describe "rb_enc_left_char_head" do
it 'returns the head position of a character' do
# Assuming "é" is two bytes (UTF-8): offset 1 is inside the character, head is 0.
@s.rb_enc_left_char_head("é", 1).should == 0
# Assuming 4 x 2 bytes: offset 7 is the trailing byte of the last character, which starts at 6.
@s.rb_enc_left_char_head("éééé", 7).should == 6

# Offset 0 is the start of the string.
@s.rb_enc_left_char_head("a", 0).should == 0

# Offset 1 is one past the only byte of "a", i.e. the end of the string.
# unclear if this is intended to work
@s.rb_enc_left_char_head("a", 1).should == 1

# Offset 88 is far beyond the one-byte string; the offset is expected back unchanged.
# Works because for single-byte encodings rb_enc_left_char_head() just returns the pointer
@s.rb_enc_left_char_head("a".force_encoding(Encoding::US_ASCII), 88).should == 88
@s.rb_enc_left_char_head("a".b, 88).should == 88
end
end

describe "ONIGENC_MBC_CASE_FOLD" do
it "returns the correct case fold for the given string" do
@s.ONIGENC_MBC_CASE_FOLD("lower").should == ["l", 1]
Expand Down
7 changes: 7 additions & 0 deletions spec/ruby/optional/capi/ext/encoding_spec.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,12 @@ static VALUE encoding_spec_rb_enc_strlen(VALUE self, VALUE str, VALUE length, VA
return LONG2FIX(rb_enc_strlen(p, e, rb_to_encoding(encoding)));
}

/* Spec helper: calls rb_enc_left_char_head() on `str`, using its own encoding,
 * for the position `offset` bytes past the start of the string, and returns the
 * resulting position as an offset from the start of the string. */
static VALUE encoding_spec_rb_enc_left_char_head(VALUE self, VALUE str, VALUE offset) {
  char *start = RSTRING_PTR(str);
  char *position = start + NUM2INT(offset);
  char *head = rb_enc_left_char_head(start, position, RSTRING_END(str), rb_enc_get(str));
  return LONG2NUM(head - start);
}

void Init_encoding_spec(void) {
VALUE cls;
native_rb_encoding_pointer = (rb_encoding**) malloc(sizeof(rb_encoding*));
Expand Down Expand Up @@ -364,6 +370,7 @@ void Init_encoding_spec(void) {
rb_define_method(cls, "rb_enc_str_asciionly_p", encoding_spec_rb_enc_str_asciionly_p, 1);
rb_define_method(cls, "rb_uv_to_utf8", encoding_spec_rb_uv_to_utf8, 2);
rb_define_method(cls, "ONIGENC_MBC_CASE_FOLD", encoding_spec_ONIGENC_MBC_CASE_FOLD, 1);
rb_define_method(cls, "rb_enc_left_char_head", encoding_spec_rb_enc_left_char_head, 2);
}

#ifdef __cplusplus
Expand Down
5 changes: 4 additions & 1 deletion src/main/c/cext/encoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,10 @@ int rb_enc_get_index(VALUE obj) {
}

char* rb_enc_left_char_head(const char *start, const char *p, const char *end, rb_encoding *enc) {
int length = start - end;
if (p <= start || p >= end) {
return p;
}
int length = end - start;
int position = polyglot_as_i32(polyglot_invoke(RUBY_CEXT, "rb_enc_left_char_head",
rb_tr_unwrap(rb_enc_from_encoding(enc)),
rb_tr_unwrap(rb_str_new(start, length)),
Expand Down
26 changes: 11 additions & 15 deletions src/main/ruby/truffleruby/core/io.rb
Original file line number Diff line number Diff line change
Expand Up @@ -356,24 +356,20 @@ def initialize(from, to, length, offset)
@method = read_method @from
end

# From copy_stream_body in io.c in CRuby
# The first element is true if obj can be used as an IO directly
def to_io(obj, mode)
if Primitive.is_a?(obj, IO)
flag = true
io = obj
else
flag = false

if Primitive.is_a?(obj, String)
io = File.open obj, mode
elsif obj.respond_to? :to_path
path = Truffle::Type.coerce_to obj, String, :to_path
io = File.open path, mode
else
io = obj
end
unless Primitive.is_a?(obj, IO) || Primitive.is_a?(obj, String) || obj.respond_to?(:to_path)
return [false, obj]
end

[flag, io]
if io = IO.try_convert(obj)
[true, io]
else
path = Truffle::Type.coerce_to obj, String, :to_path
io = File.open path, mode
[false, io]
end
end

def read_method(obj)
Expand Down
65 changes: 34 additions & 31 deletions src/main/ruby/truffleruby/core/regexp.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,22 +55,27 @@ def self.try_convert(obj)
Truffle::Type.try_convert obj, Regexp, :to_regexp
end

def self.convert(pattern)
return pattern if Primitive.is_a?(pattern, Regexp)
if Primitive.is_a?(pattern, Array)
union(*pattern)
else
Regexp.quote(pattern.to_s)
end
end
def self.negotiate_union_encoding(*patterns)
compatible_enc = nil

patterns.each do |pattern|
converted = Primitive.is_a?(pattern, Regexp) ? pattern : Regexp.quote(pattern)

enc = converted.encoding

if Primitive.nil?(compatible_enc)
compatible_enc = enc
else
if test = Primitive.encoding_compatible?(enc, compatible_enc)
compatible_enc = test
else
raise ArgumentError, "incompatible encodings: #{compatible_enc} and #{enc}"
end

def self.compatible?(*patterns)
encodings = patterns.map { |r| convert(r).encoding }
last_enc = encodings.pop
encodings.each do |encoding|
raise ArgumentError, "incompatible encodings: #{encoding} and #{last_enc}" unless Primitive.encoding_compatible?(last_enc, encoding)
last_enc = encoding
end
end

compatible_enc
end

def self.last_match(index = nil)
Expand All @@ -96,37 +101,35 @@ def self.last_match(index = nil)
def self.union(*patterns)
case patterns.size
when 0
return %r/(?!)/
%r/(?!)/
when 1
pattern = patterns.first
case pattern
when Array
return union(*pattern)
union(*pattern)
else
converted = Truffle::Type.rb_check_convert_type(pattern, Regexp, :to_regexp)
if Primitive.nil? converted
return Regexp.new(Regexp.quote(pattern))
Regexp.new(Regexp.quote(pattern))
else
return converted
converted
end
end
else
compatible?(*patterns)
enc = convert(patterns.first).encoding
end
patterns = patterns.map do |pat|
if Primitive.is_a?(pat, Regexp)
pat
else
StringValue(pat)
end
end

sep = '|'.encode(enc)
str = ''.encode(enc)
enc = negotiate_union_encoding(*patterns)
sep = '|'.encode(enc)
str = ''.encode(enc)

patterns = patterns.map do |pat|
if Primitive.is_a?(pat, Regexp)
pat
else
StringValue(pat)
end
Truffle::RegexpOperations.union(str, sep, *patterns)
end

Truffle::RegexpOperations.union(str, sep, *patterns)
end
Truffle::Graal.always_split(method(:union))

Expand Down

0 comments on commit e86d372

Please sign in to comment.