Skip to content

Commit

Permalink
YAML compile cache: encoding aware symbols
Browse files Browse the repository at this point in the history
Ref: msgpack/msgpack-ruby#211

The default msgpack Symbol packer/unpacker is not encoding
aware which cause all non-ASCII symbols to be unpacked with
ASCII-8BIT encoding aka BINARY.

So we define a custom packer that prefix the symbol name with `1`
for UTF-8 symbols, and `0` for the others (ASCII or binary)

If `Encoding.default_internal` is set to something MessagePack
doesn't support, we entirely disable the YAML cache.
  • Loading branch information
byroot committed Jan 28, 2022
1 parent e3ef615 commit f0304dd
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 27 deletions.
2 changes: 1 addition & 1 deletion ext/bootsnap/bootsnap.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct bs_cache_key {
STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE);

/* Effectively a schema version. Bumping invalidates all previous caches */
static const uint32_t current_version = 4;
static const uint32_t current_version = 5;

/* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a
* new OS ABI, etc. */
Expand Down
127 changes: 102 additions & 25 deletions lib/bootsnap/compile_cache/yaml.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,16 @@
module Bootsnap
module CompileCache
module YAML
UnsupportedTags = Class.new(StandardError)
Uncompilable = Class.new(StandardError)
UnsupportedTags = Class.new(Uncompilable)
UnsupportedEncoding = Class.new(Uncompilable)

SUPPORTED_INTERNAL_ENCODINGS = [
nil, # UTF-8
Encoding::UTF_8,
Encoding::ASCII,
Encoding::BINARY,
].freeze

class << self
attr_accessor(:msgpack_factory, :supported_options)
Expand All @@ -16,7 +25,9 @@ def cache_dir=(cache_dir)
end

def precompile(path)
Bootsnap::CompileCache::Native.precompile(
return false unless CompileCache::YAML.supported_internal_encoding?

CompileCache::Native.precompile(
cache_dir,
path.to_s,
@implementation,
Expand All @@ -29,6 +40,59 @@ def install!(cache_dir)
::YAML.singleton_class.prepend(@implementation::Patch)
end

# Psych coerce strings to `Encoding.default_internal` but Message Pack only support
# UTF-8, US-ASCII and BINARY. So if Encoding.default_internal is set to anything else
# we can't safely use the cache
def supported_internal_encoding?
SUPPORTED_INTERNAL_ENCODINGS.include?(Encoding.default_internal)
end

module EncodingAwareSymbols
extend self

ASCII_SYMBOL = -"\x00".b
UTF8_SYMBOL = -"\x01".b

ENCODINGS = [
Encoding::UTF_8,
Encoding::ASCII,
Encoding::BINARY,
]
ENCODINGS_INDEX = {}.compare_by_identity
ENCODINGS_INDEX.default_proc = ->(_hash, _encoding) { raise UnsupportedEncoding }
ENCODINGS.each_with_index do |encoding, index|
ENCODINGS_INDEX[encoding] = -index.chr.b
end
ENCODINGS_INDEX.freeze

if Symbol.method_defined?(:name)
def pack(symbol)
if symbol.encoding == Encoding::UTF_8
1.chr << symbol.name
else
0.chr << symbol.name
end
end
else
def pack(symbol)
if symbol.encoding == Encoding::UTF_8
1.chr << symbol.to_s
else
0.chr << symbol.to_s
end
end
end

def unpack(payload)
payload.freeze
string = payload.byteslice(1..-1)
if payload.ord == 1 # Encoding::UTF_8
string.force_encoding(Encoding::UTF_8)
end
string.to_sym
end
end

def init!
require("yaml")
require("msgpack")
Expand All @@ -43,7 +107,12 @@ def init!
# We want them to roundtrip cleanly, so we use a custom factory.
# see: https://github.com/msgpack/msgpack-ruby/pull/122
factory = MessagePack::Factory.new
factory.register_type(0x00, Symbol)
factory.register_type(
0x00,
Symbol,
packer: EncodingAwareSymbols.method(:pack).to_proc,
unpacker: EncodingAwareSymbols.method(:unpack).to_proc,
)

if defined? MessagePack::Timestamp
factory.register_type(
Expand Down Expand Up @@ -124,7 +193,7 @@ def input_to_storage(contents, _)
packer.pack(false) # not safe loaded
packer.pack(obj)
packer.to_s
rescue NoMethodError, RangeError, UnsupportedTags
rescue NoMethodError, RangeError, Uncompilable
UNCOMPILABLE # The object included things that we can't serialize
end

Expand Down Expand Up @@ -179,44 +248,48 @@ def input_to_output(data, kwargs)

module Patch
def load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych4::SafeLoad,
CompileCache::YAML::Psych4::SafeLoad,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

ruby2_keywords :load_file if respond_to?(:ruby2_keywords, true)

def unsafe_load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych4::UnsafeLoad,
CompileCache::YAML::Psych4::UnsafeLoad,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

Expand All @@ -233,7 +306,7 @@ def input_to_storage(contents, _)
packer.pack(false) # not safe loaded
packer.pack(obj)
packer.to_s
rescue NoMethodError, RangeError, UnsupportedTags
rescue NoMethodError, RangeError, Uncompilable
UNCOMPILABLE # The object included things that we can't serialize
end

Expand All @@ -253,44 +326,48 @@ def input_to_output(data, kwargs)

module Patch
def load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych3,
CompileCache::YAML::Psych3,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

ruby2_keywords :load_file if respond_to?(:ruby2_keywords, true)

def unsafe_load_file(path, *args)
return super unless CompileCache::YAML.supported_internal_encoding?

return super if args.size > 1

if (kwargs = args.first)
return super unless kwargs.is_a?(Hash)
return super unless (kwargs.keys - ::Bootsnap::CompileCache::YAML.supported_options).empty?
return super unless (kwargs.keys - CompileCache::YAML.supported_options).empty?
end

begin
::Bootsnap::CompileCache::Native.fetch(
Bootsnap::CompileCache::YAML.cache_dir,
CompileCache::Native.fetch(
CompileCache::YAML.cache_dir,
File.realpath(path),
::Bootsnap::CompileCache::YAML::Psych3,
CompileCache::YAML::Psych3,
kwargs,
)
rescue Errno::EACCES
::Bootsnap::CompileCache.permission_error(path)
CompileCache.permission_error(path)
end
end

Expand Down
37 changes: 37 additions & 0 deletions test/compile_cache/yaml_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ def test_yaml_tags
assert_equal "YAML tags are not supported: !ruby/object", error.message
end

def test_symbols_encoding
symbols = [:ascii, :utf8_fée]
Help.set_file("a.yml", YAML.dump(symbols), 100)

loaded_symbols = FakeYaml.load_file("a.yml")
assert_equal(symbols, loaded_symbols)
assert_equal(symbols.map(&:encoding), loaded_symbols.map(&:encoding))
end

def test_custom_symbols_encoding
sym = "壁に耳あり、障子に目あり".to_sym
Help.set_file("a.yml", YAML.dump(sym), 100)
# YAML is limited to UTF-8 and UTF-16 by spec, but Psych does respect Encoding.default_internal
# so strings and symbol can actually be of any encoding.
assert_raises FakeYaml::Fallback do
with_default_encoding_internal(Encoding::EUC_JP) do
FakeYaml.load_file("a.yml")
end
end
end

if YAML::VERSION >= "4"
def test_load_psych_4_with_alias
Help.set_file("a.yml", "foo: &foo\n bar: 42\nplop:\n <<: *foo", 100)
Expand Down Expand Up @@ -170,4 +191,20 @@ def test_unsafe_load_file
assert_equal({"foo" => {"bar" => 42}, "plop" => {"bar" => 42}}, FakeYaml.unsafe_load_file("a.yml"))
end
end

private

def with_default_encoding_internal(encoding)
original_internal = Encoding.default_internal
$VERBOSE = false
Encoding.default_internal = encoding
$VERBOSE = true
begin
yield
ensure
$VERBOSE = false
Encoding.default_internal = original_internal
$VERBOSE = true
end
end
end
2 changes: 1 addition & 1 deletion test/compile_cache_key_format_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CompileCacheKeyFormatTest < Minitest::Test

def test_key_version
key = cache_key_for_file(FILE)
exp = [4].pack("L")
exp = [5].pack("L")
assert_equal(exp, key[R[:version]])
end

Expand Down

0 comments on commit f0304dd

Please sign in to comment.