diff --git a/lib/yaml/emitter.py b/lib/yaml/emitter.py index e5bcdccc..732879a1 100644 --- a/lib/yaml/emitter.py +++ b/lib/yaml/emitter.py @@ -8,9 +8,13 @@ __all__ = ['Emitter', 'EmitterError'] +import sys + from error import YAMLError from events import * +has_ucs4 = sys.maxunicode > 0xffff + class EmitterError(YAMLError): pass @@ -674,7 +678,7 @@ def analyze_scalar(self, scalar): # Check for indicators. if index == 0: # Leading indicators are special characters. - if ch in u'#,[]{}&*!|>\'\"%@`': + if ch in u'#,[]{}&*!|>\'\"%@`': flow_indicators = True block_indicators = True if ch in u'?:': @@ -701,7 +705,8 @@ def analyze_scalar(self, scalar): line_breaks = True if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'): if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF' - or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF': + or u'\uE000' <= ch <= u'\uFFFD' + or ((not has_ucs4) or (u'\U00010000' <= ch < u'\U0010ffff'))) and ch != u'\uFEFF': unicode_characters = True if not self.allow_unicode: special_characters = True diff --git a/lib/yaml/reader.py b/lib/yaml/reader.py index 3249e6b9..0b95f474 100644 --- a/lib/yaml/reader.py +++ b/lib/yaml/reader.py @@ -19,7 +19,9 @@ from error import YAMLError, Mark -import codecs, re +import codecs, re, sys + +has_ucs4 = sys.maxunicode > 0xffff class ReaderError(YAMLError): @@ -134,7 +136,10 @@ def determine_encoding(self): self.encoding = 'utf-8' self.update(1) - NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') + if has_ucs4: + NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]') + else: + NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') def check_printable(self, data): match = self.NON_PRINTABLE.search(data) if match: diff --git a/lib3/yaml/emitter.py b/lib3/yaml/emitter.py index 34cb145a..ce76a0c8 100644 --- a/lib3/yaml/emitter.py +++ b/lib3/yaml/emitter.py @@ -671,7 +671,7 @@ def analyze_scalar(self, scalar): # Check for indicators. if index == 0: # Leading indicators are special characters. - if ch in '#,[]{}&*!|>\'\"%@`': + if ch in '#,[]{}&*!|>\'\"%@`': flow_indicators = True block_indicators = True if ch in '?:': @@ -698,7 +698,8 @@ def analyze_scalar(self, scalar): line_breaks = True if not (ch == '\n' or '\x20' <= ch <= '\x7E'): if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF' - or '\uE000' <= ch <= '\uFFFD') and ch != '\uFEFF': + or '\uE000' <= ch <= '\uFFFD' + or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF': unicode_characters = True if not self.allow_unicode: special_characters = True diff --git a/lib3/yaml/reader.py b/lib3/yaml/reader.py index f70e920f..5764f2dc 100644 --- a/lib3/yaml/reader.py +++ b/lib3/yaml/reader.py @@ -134,7 +134,7 @@ def determine_encoding(self): self.encoding = 'utf-8' self.update(1) - NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') + NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]') def check_printable(self, data): match = self.NON_PRINTABLE.search(data) if match: diff --git a/tests/data/emoticons.unicode b/tests/data/emoticons.unicode new file mode 100644 index 00000000..6bcdb695 --- /dev/null +++ b/tests/data/emoticons.unicode @@ -0,0 +1,10 @@ +😀😁😂😃😄😅😆😇 +😈😉😊😋😌😍😎😏 +😐😑😒😓😔😕😖😗 +😘😙😚😛😜😝😞😟 +😠😡😢😣😤😥😦😧 +😨😩😪😫😬😭😮😯 +😰😱😲😳😴😵😶😷 +😸😹😺😻😼😽😾😿 +🙀🙁🙂🙃🙄🙅🙆🙇 +🙈🙉🙊🙋🙌🙍🙎🙏 diff --git a/tests/data/emoticons2.unicode b/tests/data/emoticons2.unicode new file mode 100644 index 00000000..b41d3dbc --- /dev/null +++ b/tests/data/emoticons2.unicode @@ -0,0 +1 @@ +😀 diff --git a/tests/lib/test_input_output.py b/tests/lib/test_input_output.py index 9ccc8fce..0d2bd70a 100644 --- a/tests/lib/test_input_output.py +++ b/tests/lib/test_input_output.py @@ -34,11 +34,11 @@ def test_unicode_input(unicode_filename, verbose=False): def test_unicode_input_errors(unicode_filename, verbose=False): data = open(unicode_filename, 'rb').read().decode('utf-8') - for input in [data.encode('latin1', 'ignore'), - data.encode('utf-16-be'), data.encode('utf-16-le'), - codecs.BOM_UTF8+data.encode('utf-16-be'), - codecs.BOM_UTF16_BE+data.encode('utf-16-le'), - codecs.BOM_UTF16_LE+data.encode('utf-8')+'!']: + for input in [data.encode('utf-16-be'), + data.encode('utf-16-le'), + codecs.BOM_UTF8+data.encode('utf-16-be'), + codecs.BOM_UTF8+data.encode('utf-16-le')]: + try: yaml.load(input) except yaml.YAMLError, exc: @@ -69,17 +69,7 @@ def test_unicode_output(unicode_filename, verbose=False): stream = StringIO.StringIO() yaml.dump(value, stream, encoding=encoding, allow_unicode=allow_unicode) data4 = stream.getvalue() - for copy in [data1, data2, data3, data4]: - if allow_unicode: - try: - copy[4:].encode('ascii') - except (UnicodeDecodeError, UnicodeEncodeError), exc: - if verbose: - print exc - else: - raise AssertionError("expected an exception") - else: - copy[4:].encode('ascii') + assert isinstance(data1, str), (type(data1), encoding) data1.decode('utf-8') assert isinstance(data2, str), (type(data2), encoding) diff --git a/tests/lib3/test_input_output.py b/tests/lib3/test_input_output.py index 70a945a2..6d83a24e 100644 --- a/tests/lib3/test_input_output.py +++ b/tests/lib3/test_input_output.py @@ -24,11 +24,11 @@ def test_unicode_input(unicode_filename, verbose=False): def test_unicode_input_errors(unicode_filename, verbose=False): data = open(unicode_filename, 'rb').read().decode('utf-8') - for input in [data.encode('latin1', 'ignore'), - data.encode('utf-16-be'), data.encode('utf-16-le'), - codecs.BOM_UTF8+data.encode('utf-16-be'), - codecs.BOM_UTF16_BE+data.encode('utf-16-le'), - codecs.BOM_UTF16_LE+data.encode('utf-8')+b'!']: + for input in [data.encode('utf-16-be'), + data.encode('utf-16-le'), + codecs.BOM_UTF8+data.encode('utf-16-be'), + codecs.BOM_UTF8+data.encode('utf-16-le')]: + try: yaml.load(input) except yaml.YAMLError as exc: @@ -75,20 +75,7 @@ def test_unicode_output(unicode_filename, verbose=False): if verbose: print("BYTES:", data4[:50]) data4 = data4.decode(encoding) - for copy in [data1, data2, data3, data4]: - if copy is None: - continue - assert isinstance(copy, str) - if allow_unicode: - try: - copy[4:].encode('ascii') - except UnicodeEncodeError as exc: - if verbose: - print(exc) - else: - raise AssertionError("expected an exception") - else: - copy[4:].encode('ascii') + assert isinstance(data1, str), (type(data1), encoding) assert isinstance(data2, str), (type(data2), encoding)