Skip to content

Commit

Permalink
Support unicode literals over codepoint 0xffff
Browse files: browse the repository at this point in the history
Fixes #25. Rebase and tidy up of #63.
  • Loading branch information
peterkmurphy authored and adamchainz committed May 16, 2017
1 parent c5b135f commit 421c9b3
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 48 deletions.
10 changes: 7 additions & 3 deletions lib/yaml/emitter.py
Expand Up @@ -8,9 +8,13 @@

__all__ = ['Emitter', 'EmitterError']

import sys

from error import YAMLError
from events import *

has_ucs4 = sys.maxunicode > 0xffff

class EmitterError(YAMLError):
pass

Expand Down Expand Up @@ -674,7 +678,7 @@ def analyze_scalar(self, scalar):
# Check for indicators.
if index == 0:
# Leading indicators are special characters.
if ch in u'#,[]{}&*!|>\'\"%@`':
if ch in u'#,[]{}&*!|>\'\"%@`':
flow_indicators = True
block_indicators = True
if ch in u'?:':
Expand All @@ -701,7 +705,8 @@ def analyze_scalar(self, scalar):
line_breaks = True
if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'):
if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF'
or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF':
or u'\uE000' <= ch <= u'\uFFFD'
or ((not has_ucs4) or (u'\U00010000' <= ch < u'\U0010ffff'))) and ch != u'\uFEFF':
unicode_characters = True
if not self.allow_unicode:
special_characters = True
Expand Down Expand Up @@ -1137,4 +1142,3 @@ def write_plain(self, text, split=True):
spaces = (ch == u' ')
breaks = (ch in u'\n\x85\u2028\u2029')
end += 1

10 changes: 7 additions & 3 deletions lib/yaml/reader.py
Expand Up @@ -19,7 +19,9 @@

from error import YAMLError, Mark

import codecs, re
import codecs, re, sys

has_ucs4 = sys.maxunicode > 0xffff

class ReaderError(YAMLError):

Expand Down Expand Up @@ -134,7 +136,10 @@ def determine_encoding(self):
self.encoding = 'utf-8'
self.update(1)

NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
if has_ucs4:
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
else:
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
def check_printable(self, data):
match = self.NON_PRINTABLE.search(data)
if match:
Expand Down Expand Up @@ -187,4 +192,3 @@ def update_raw(self, size=1024):
# psyco.bind(Reader)
#except ImportError:
# pass

6 changes: 3 additions & 3 deletions lib3/yaml/emitter.py
Expand Up @@ -671,7 +671,7 @@ def analyze_scalar(self, scalar):
# Check for indicators.
if index == 0:
# Leading indicators are special characters.
if ch in '#,[]{}&*!|>\'\"%@`':
if ch in '#,[]{}&*!|>\'\"%@`':
flow_indicators = True
block_indicators = True
if ch in '?:':
Expand All @@ -698,7 +698,8 @@ def analyze_scalar(self, scalar):
line_breaks = True
if not (ch == '\n' or '\x20' <= ch <= '\x7E'):
if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF'
or '\uE000' <= ch <= '\uFFFD') and ch != '\uFEFF':
or '\uE000' <= ch <= '\uFFFD'
or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF':
unicode_characters = True
if not self.allow_unicode:
special_characters = True
Expand Down Expand Up @@ -1134,4 +1135,3 @@ def write_plain(self, text, split=True):
spaces = (ch == ' ')
breaks = (ch in '\n\x85\u2028\u2029')
end += 1

3 changes: 1 addition & 2 deletions lib3/yaml/reader.py
Expand Up @@ -134,7 +134,7 @@ def determine_encoding(self):
self.encoding = 'utf-8'
self.update(1)

NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
def check_printable(self, data):
match = self.NON_PRINTABLE.search(data)
if match:
Expand Down Expand Up @@ -189,4 +189,3 @@ def update_raw(self, size=4096):
# psyco.bind(Reader)
#except ImportError:
# pass

10 changes: 10 additions & 0 deletions tests/data/emoticons.unicode
@@ -0,0 +1,10 @@
😀😁😂😃😄😅😆😇
😈😉😊😋😌😍😎😏
😐😑😒😓😔😕😖😗
😘😙😚😛😜😝😞😟
😠😡😢😣😤😥😦😧
😨😩😪😫😬😭😮😯
😰😱😲😳😴😵😶😷
😸😹😺😻😼😽😾😿
🙀🙁🙂🙃🙄🙅🙆🙇
🙈🙉🙊🙋🙌🙍🙎🙏
1 change: 1 addition & 0 deletions tests/data/emoticons2.unicode
@@ -0,0 +1 @@
😀
23 changes: 6 additions & 17 deletions tests/lib/test_input_output.py
Expand Up @@ -34,11 +34,11 @@ def test_unicode_input(unicode_filename, verbose=False):

def test_unicode_input_errors(unicode_filename, verbose=False):
data = open(unicode_filename, 'rb').read().decode('utf-8')
for input in [data.encode('latin1', 'ignore'),
data.encode('utf-16-be'), data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
codecs.BOM_UTF16_LE+data.encode('utf-8')+'!']:
for input in [data.encode('utf-16-be'),
data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF8+data.encode('utf-16-le')]:

try:
yaml.load(input)
except yaml.YAMLError, exc:
Expand Down Expand Up @@ -69,17 +69,7 @@ def test_unicode_output(unicode_filename, verbose=False):
stream = StringIO.StringIO()
yaml.dump(value, stream, encoding=encoding, allow_unicode=allow_unicode)
data4 = stream.getvalue()
for copy in [data1, data2, data3, data4]:
if allow_unicode:
try:
copy[4:].encode('ascii')
except (UnicodeDecodeError, UnicodeEncodeError), exc:
if verbose:
print exc
else:
raise AssertionError("expected an exception")
else:
copy[4:].encode('ascii')

assert isinstance(data1, str), (type(data1), encoding)
data1.decode('utf-8')
assert isinstance(data2, str), (type(data2), encoding)
Expand Down Expand Up @@ -148,4 +138,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
if __name__ == '__main__':
import test_appliance
test_appliance.run(globals())

26 changes: 6 additions & 20 deletions tests/lib3/test_input_output.py
Expand Up @@ -24,11 +24,11 @@ def test_unicode_input(unicode_filename, verbose=False):

def test_unicode_input_errors(unicode_filename, verbose=False):
data = open(unicode_filename, 'rb').read().decode('utf-8')
for input in [data.encode('latin1', 'ignore'),
data.encode('utf-16-be'), data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
codecs.BOM_UTF16_LE+data.encode('utf-8')+b'!']:
for input in [data.encode('utf-16-be'),
data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF8+data.encode('utf-16-le')]:

try:
yaml.load(input)
except yaml.YAMLError as exc:
Expand Down Expand Up @@ -75,20 +75,7 @@ def test_unicode_output(unicode_filename, verbose=False):
if verbose:
print("BYTES:", data4[:50])
data4 = data4.decode(encoding)
for copy in [data1, data2, data3, data4]:
if copy is None:
continue
assert isinstance(copy, str)
if allow_unicode:
try:
copy[4:].encode('ascii')
except UnicodeEncodeError as exc:
if verbose:
print(exc)
else:
raise AssertionError("expected an exception")
else:
copy[4:].encode('ascii')

assert isinstance(data1, str), (type(data1), encoding)
assert isinstance(data2, str), (type(data2), encoding)

Expand Down Expand Up @@ -147,4 +134,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
if __name__ == '__main__':
import test_appliance
test_appliance.run(globals())

0 comments on commit 421c9b3

Please sign in to comment.