Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support Unicode literals over code point 0xffff #65

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 7 additions & 2 deletions lib/yaml/emitter.py
Expand Up @@ -8,9 +8,13 @@

__all__ = ['Emitter', 'EmitterError']

import sys

from error import YAMLError
from events import *

has_ucs4 = sys.maxunicode > 0xffff

class EmitterError(YAMLError):
pass

Expand Down Expand Up @@ -674,7 +678,7 @@ def analyze_scalar(self, scalar):
# Check for indicators.
if index == 0:
# Leading indicators are special characters.
if ch in u'#,[]{}&*!|>\'\"%@`':
if ch in u'#,[]{}&*!|>\'\"%@`':
flow_indicators = True
block_indicators = True
if ch in u'?:':
Expand All @@ -701,7 +705,8 @@ def analyze_scalar(self, scalar):
line_breaks = True
if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'):
if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF'
or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF':
or u'\uE000' <= ch <= u'\uFFFD'
or ((not has_ucs4) or (u'\U00010000' <= ch < u'\U0010ffff'))) and ch != u'\uFEFF':
unicode_characters = True
if not self.allow_unicode:
special_characters = True
Expand Down
9 changes: 7 additions & 2 deletions lib/yaml/reader.py
Expand Up @@ -19,7 +19,9 @@

from error import YAMLError, Mark

import codecs, re
import codecs, re, sys

has_ucs4 = sys.maxunicode > 0xffff

class ReaderError(YAMLError):

Expand Down Expand Up @@ -134,7 +136,10 @@ def determine_encoding(self):
self.encoding = 'utf-8'
self.update(1)

NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
if has_ucs4:
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
else:
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
def check_printable(self, data):
match = self.NON_PRINTABLE.search(data)
if match:
Expand Down
5 changes: 3 additions & 2 deletions lib3/yaml/emitter.py
Expand Up @@ -671,7 +671,7 @@ def analyze_scalar(self, scalar):
# Check for indicators.
if index == 0:
# Leading indicators are special characters.
if ch in '#,[]{}&*!|>\'\"%@`':
if ch in '#,[]{}&*!|>\'\"%@`':
flow_indicators = True
block_indicators = True
if ch in '?:':
Expand All @@ -698,7 +698,8 @@ def analyze_scalar(self, scalar):
line_breaks = True
if not (ch == '\n' or '\x20' <= ch <= '\x7E'):
if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF'
or '\uE000' <= ch <= '\uFFFD') and ch != '\uFEFF':
or '\uE000' <= ch <= '\uFFFD'
or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF':
unicode_characters = True
if not self.allow_unicode:
special_characters = True
Expand Down
2 changes: 1 addition & 1 deletion lib3/yaml/reader.py
Expand Up @@ -134,7 +134,7 @@ def determine_encoding(self):
self.encoding = 'utf-8'
self.update(1)

NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
def check_printable(self, data):
match = self.NON_PRINTABLE.search(data)
if match:
Expand Down
10 changes: 10 additions & 0 deletions tests/data/emoticons.unicode
@@ -0,0 +1,10 @@
😀😁😂😃😄😅😆😇
😈😉😊😋😌😍😎😏
😐😑😒😓😔😕😖😗
😘😙😚😛😜😝😞😟
😠😡😢😣😤😥😦😧
😨😩😪😫😬😭😮😯
😰😱😲😳😴😵😶😷
😸😹😺😻😼😽😾😿
🙀🙁🙂🙃🙄🙅🙆🙇
🙈🙉🙊🙋🙌🙍🙎🙏
1 change: 1 addition & 0 deletions tests/data/emoticons2.unicode
@@ -0,0 +1 @@
😀
22 changes: 6 additions & 16 deletions tests/lib/test_input_output.py
Expand Up @@ -34,11 +34,11 @@ def test_unicode_input(unicode_filename, verbose=False):

def test_unicode_input_errors(unicode_filename, verbose=False):
data = open(unicode_filename, 'rb').read().decode('utf-8')
for input in [data.encode('latin1', 'ignore'),
data.encode('utf-16-be'), data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
codecs.BOM_UTF16_LE+data.encode('utf-8')+'!']:
for input in [data.encode('utf-16-be'),
data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF8+data.encode('utf-16-le')]:

try:
yaml.load(input)
except yaml.YAMLError, exc:
Expand Down Expand Up @@ -69,17 +69,7 @@ def test_unicode_output(unicode_filename, verbose=False):
stream = StringIO.StringIO()
yaml.dump(value, stream, encoding=encoding, allow_unicode=allow_unicode)
data4 = stream.getvalue()
for copy in [data1, data2, data3, data4]:
if allow_unicode:
try:
copy[4:].encode('ascii')
except (UnicodeDecodeError, UnicodeEncodeError), exc:
if verbose:
print exc
else:
raise AssertionError("expected an exception")
else:
copy[4:].encode('ascii')

assert isinstance(data1, str), (type(data1), encoding)
data1.decode('utf-8')
assert isinstance(data2, str), (type(data2), encoding)
Expand Down
25 changes: 6 additions & 19 deletions tests/lib3/test_input_output.py
Expand Up @@ -24,11 +24,11 @@ def test_unicode_input(unicode_filename, verbose=False):

def test_unicode_input_errors(unicode_filename, verbose=False):
data = open(unicode_filename, 'rb').read().decode('utf-8')
for input in [data.encode('latin1', 'ignore'),
data.encode('utf-16-be'), data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
codecs.BOM_UTF16_LE+data.encode('utf-8')+b'!']:
for input in [data.encode('utf-16-be'),
data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF8+data.encode('utf-16-le')]:

try:
yaml.load(input)
except yaml.YAMLError as exc:
Expand Down Expand Up @@ -75,20 +75,7 @@ def test_unicode_output(unicode_filename, verbose=False):
if verbose:
print("BYTES:", data4[:50])
data4 = data4.decode(encoding)
for copy in [data1, data2, data3, data4]:
if copy is None:
continue
assert isinstance(copy, str)
if allow_unicode:
try:
copy[4:].encode('ascii')
except UnicodeEncodeError as exc:
if verbose:
print(exc)
else:
raise AssertionError("expected an exception")
else:
copy[4:].encode('ascii')

assert isinstance(data1, str), (type(data1), encoding)
assert isinstance(data2, str), (type(data2), encoding)

Expand Down