Skip to content

Commit

Permalink
YAML compile cache: encoding aware symbols
Browse files Browse the repository at this point in the history
Ref: msgpack/msgpack-ruby#211

The default msgpack Symbol packer/unpacker is not encoding
aware, which causes all non-ASCII symbols to be unpacked with
ASCII-8BIT encoding, aka BINARY.

So we define a custom packer that prefixes the symbol name with the encoding
index. Note that the encoding index isn't fixed across Ruby platforms
and versions, but the cache versioning should protect us from that.

An alternative could be to simply assume non-ASCII symbols are UTF-8,
but it wouldn't work for people with non UTF-8 source files.
  • Loading branch information
byroot committed Jan 28, 2022
1 parent e3ef615 commit 19ccc01
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 6 deletions.
2 changes: 1 addition & 1 deletion ext/bootsnap/bootsnap.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct bs_cache_key {
STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE);

/* Effectively a schema version. Bumping invalidates all previous caches */
static const uint32_t current_version = 4;
static const uint32_t current_version = 5;

/* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a
* new OS ABI, etc. */
Expand Down
44 changes: 40 additions & 4 deletions lib/bootsnap/compile_cache/yaml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
module Bootsnap
module CompileCache
module YAML
UnsupportedTags = Class.new(StandardError)
Uncompilable = Class.new(StandardError)
UnsupportedTags = Class.new(Uncompilable)
UnsupportedEncoding = Class.new(Uncompilable)

class << self
attr_accessor(:msgpack_factory, :supported_options)
Expand All @@ -29,6 +31,35 @@ def install!(cache_dir)
::YAML.singleton_class.prepend(@implementation::Patch)
end

module EncodingAwareSymbols
extend self

ENCODINGS = Encoding.list.freeze
ENCODINGS_INDEX = {}.compare_by_identity
ENCODINGS_INDEX.default_proc = ->(_hash, _encoding) { raise UnsupportedEncoding }
ENCODINGS.each_with_index do |encoding, index|
ENCODINGS_INDEX[encoding] = -index.chr.b
end
ENCODINGS_INDEX.freeze

if Symbol.method_defined?(:name)
def pack(symbol)
ENCODINGS_INDEX[symbol.encoding].dup << symbol.name
end
else
def pack(symbol)
ENCODINGS_INDEX[symbol.encoding].dup << symbol.to_s
end
end

def unpack(payload)
payload.freeze
string = payload.byteslice(1..-1)
string.force_encoding(ENCODINGS[payload.ord])
string.to_sym
end
end

def init!
require("yaml")
require("msgpack")
Expand All @@ -43,7 +74,12 @@ def init!
# We want them to roundtrip cleanly, so we use a custom factory.
# see: https://github.com/msgpack/msgpack-ruby/pull/122
factory = MessagePack::Factory.new
factory.register_type(0x00, Symbol)
factory.register_type(
0x00,
Symbol,
packer: EncodingAwareSymbols.method(:pack).to_proc,
unpacker: EncodingAwareSymbols.method(:unpack).to_proc,
)

if defined? MessagePack::Timestamp
factory.register_type(
Expand Down Expand Up @@ -124,7 +160,7 @@ def input_to_storage(contents, _)
packer.pack(false) # not safe loaded
packer.pack(obj)
packer.to_s
rescue NoMethodError, RangeError, UnsupportedTags
rescue NoMethodError, RangeError, Uncompilable
UNCOMPILABLE # The object included things that we can't serialize
end

Expand Down Expand Up @@ -233,7 +269,7 @@ def input_to_storage(contents, _)
packer.pack(false) # not safe loaded
packer.pack(obj)
packer.to_s
rescue NoMethodError, RangeError, UnsupportedTags
rescue NoMethodError, RangeError, Uncompilable
UNCOMPILABLE # The object included things that we can't serialize
end

Expand Down
31 changes: 31 additions & 0 deletions test/compile_cache/yaml_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ def test_yaml_tags
assert_equal "YAML tags are not supported: !ruby/object", error.message
end

def test_symbols_encoding
symbols = [:ascii, :utf8_fée]
Help.set_file("a.yml", YAML.dump(symbols), 100)

loaded_symbols = FakeYaml.load_file("a.yml")
assert_equal(symbols, loaded_symbols)
assert_equal(symbols.map(&:encoding), loaded_symbols.map(&:encoding))
end

def test_custom_symbols_encoding
sym = "壁に耳あり、障子に目あり".to_sym
Help.set_file("a.yml", YAML.dump(sym), 100)
# YAML is limited to UTF-8 and UTF-16 by spec, but Psych does respect Encoding.default_internal
# so strings and symbol can actually be of any encoding.
with_default_encoding_internal(Encoding::EUC_JP) do
assert_equal Encoding::EUC_JP, ::YAML.load_file("a.yml").encoding
assert_equal ::YAML.load_file("a.yml"), FakeYaml.load_file("a.yml")
assert_equal ::YAML.load_file("a.yml").encoding, FakeYaml.load_file("a.yml").encoding
end
end

if YAML::VERSION >= "4"
def test_load_psych_4_with_alias
Help.set_file("a.yml", "foo: &foo\n bar: 42\nplop:\n <<: *foo", 100)
Expand Down Expand Up @@ -170,4 +191,14 @@ def test_unsafe_load_file
assert_equal({"foo" => {"bar" => 42}, "plop" => {"bar" => 42}}, FakeYaml.unsafe_load_file("a.yml"))
end
end

private

def with_default_encoding_internal(encoding)
original_internal = Encoding.default_internal
Encoding.default_internal = encoding
yield
ensure
Encoding.default_internal = original_internal
end
end
2 changes: 1 addition & 1 deletion test/compile_cache_key_format_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CompileCacheKeyFormatTest < Minitest::Test

def test_key_version
key = cache_key_for_file(FILE)
exp = [4].pack("L")
exp = [5].pack("L")
assert_equal(exp, key[R[:version]])
end

Expand Down

0 comments on commit 19ccc01

Please sign in to comment.