Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support to Unicode characters over codepoint 0xffff #63

Merged
merged 5 commits into from Aug 8, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 0 additions & 1 deletion README
Expand Up @@ -32,4 +32,3 @@ Submit bug reports and feature requests to the PyYAML bug tracker:

PyYAML is written by Kirill Simonov <xi@resolvent.net>. It is released
under the MIT license. See the file LICENSE for more details.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing trailing newlines is bad form, as it's diff noise. I'd undo it for all the files you've added it to, which seems to be every file you've touched. Also, files should end with a trailing newline to be POSIX-valid.

10 changes: 7 additions & 3 deletions lib/yaml/emitter.py
Expand Up @@ -8,9 +8,13 @@

__all__ = ['Emitter', 'EmitterError']

import sys

from error import YAMLError
from events import *

has_ucs4 = sys.maxunicode > 0xffff

class EmitterError(YAMLError):
pass

Expand Down Expand Up @@ -674,7 +678,7 @@ def analyze_scalar(self, scalar):
# Check for indicators.
if index == 0:
# Leading indicators are special characters.
if ch in u'#,[]{}&*!|>\'\"%@`':
if ch in u'#,[]{}&*!|>\'\"%@`':

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Trimming trailing whitespace is bad form, as it's diff noise.

flow_indicators = True
block_indicators = True
if ch in u'?:':
Expand All @@ -701,7 +705,8 @@ def analyze_scalar(self, scalar):
line_breaks = True
if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'):
if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF'
or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF':
or u'\uE000' <= ch <= u'\uFFFD'
or ((not has_ucs4) or (u'\U00010000' <= ch < u'\U0010ffff'))) and ch != u'\uFEFF':
unicode_characters = True
if not self.allow_unicode:
special_characters = True
Expand Down Expand Up @@ -1137,4 +1142,3 @@ def write_plain(self, text, split=True):
spaces = (ch == u' ')
breaks = (ch in u'\n\x85\u2028\u2029')
end += 1

10 changes: 7 additions & 3 deletions lib/yaml/reader.py
Expand Up @@ -19,7 +19,9 @@

from error import YAMLError, Mark

import codecs, re
import codecs, re, sys

has_ucs4 = sys.maxunicode > 0xffff

class ReaderError(YAMLError):

Expand Down Expand Up @@ -134,7 +136,10 @@ def determine_encoding(self):
self.encoding = 'utf-8'
self.update(1)

NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
if has_ucs4:
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
else:
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
def check_printable(self, data):
match = self.NON_PRINTABLE.search(data)
if match:
Expand Down Expand Up @@ -187,4 +192,3 @@ def update_raw(self, size=1024):
# psyco.bind(Reader)
#except ImportError:
# pass

6 changes: 3 additions & 3 deletions lib3/yaml/emitter.py
Expand Up @@ -671,7 +671,7 @@ def analyze_scalar(self, scalar):
# Check for indicators.
if index == 0:
# Leading indicators are special characters.
if ch in '#,[]{}&*!|>\'\"%@`':
if ch in '#,[]{}&*!|>\'\"%@`':
flow_indicators = True
block_indicators = True
if ch in '?:':
Expand All @@ -698,7 +698,8 @@ def analyze_scalar(self, scalar):
line_breaks = True
if not (ch == '\n' or '\x20' <= ch <= '\x7E'):
if (ch == '\x85' or '\xA0' <= ch <= '\uD7FF'
or '\uE000' <= ch <= '\uFFFD') and ch != '\uFEFF':
or '\uE000' <= ch <= '\uFFFD'
or '\U00010000' <= ch < '\U0010ffff') and ch != '\uFEFF':
unicode_characters = True
if not self.allow_unicode:
special_characters = True
Expand Down Expand Up @@ -1134,4 +1135,3 @@ def write_plain(self, text, split=True):
spaces = (ch == ' ')
breaks = (ch in '\n\x85\u2028\u2029')
end += 1

3 changes: 1 addition & 2 deletions lib3/yaml/reader.py
Expand Up @@ -134,7 +134,7 @@ def determine_encoding(self):
self.encoding = 'utf-8'
self.update(1)

NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
def check_printable(self, data):
match = self.NON_PRINTABLE.search(data)
if match:
Expand Down Expand Up @@ -189,4 +189,3 @@ def update_raw(self, size=4096):
# psyco.bind(Reader)
#except ImportError:
# pass

10 changes: 10 additions & 0 deletions tests/data/emoticons.unicode
@@ -0,0 +1,10 @@
😀😁😂😃😄😅😆😇
😈😉😊😋😌😍😎😏
😐😑😒😓😔😕😖😗
😘😙😚😛😜😝😞😟
😠😡😢😣😤😥😦😧
😨😩😪😫😬😭😮😯
😰😱😲😳😴😵😶😷
😸😹😺😻😼😽😾😿
🙀🙁🙂🙃🙄🙅🙆🙇
🙈🙉🙊🙋🙌🙍🙎🙏
1 change: 1 addition & 0 deletions tests/data/emoticons2.unicode
@@ -0,0 +1 @@
😀
23 changes: 6 additions & 17 deletions tests/lib/test_input_output.py
Expand Up @@ -34,11 +34,11 @@ def test_unicode_input(unicode_filename, verbose=False):

def test_unicode_input_errors(unicode_filename, verbose=False):
data = open(unicode_filename, 'rb').read().decode('utf-8')
for input in [data.encode('latin1', 'ignore'),
data.encode('utf-16-be'), data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
codecs.BOM_UTF16_LE+data.encode('utf-8')+'!']:
for input in [data.encode('utf-16-be'),
data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF8+data.encode('utf-16-le')]:

try:
yaml.load(input)
except yaml.YAMLError, exc:
Expand Down Expand Up @@ -69,17 +69,7 @@ def test_unicode_output(unicode_filename, verbose=False):
stream = StringIO.StringIO()
yaml.dump(value, stream, encoding=encoding, allow_unicode=allow_unicode)
data4 = stream.getvalue()
for copy in [data1, data2, data3, data4]:
if allow_unicode:
try:
copy[4:].encode('ascii')
except (UnicodeDecodeError, UnicodeEncodeError), exc:
if verbose:
print exc
else:
raise AssertionError("expected an exception")
else:
copy[4:].encode('ascii')

assert isinstance(data1, str), (type(data1), encoding)
data1.decode('utf-8')
assert isinstance(data2, str), (type(data2), encoding)
Expand Down Expand Up @@ -148,4 +138,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
if __name__ == '__main__':
import test_appliance
test_appliance.run(globals())

26 changes: 6 additions & 20 deletions tests/lib3/test_input_output.py
Expand Up @@ -24,11 +24,11 @@ def test_unicode_input(unicode_filename, verbose=False):

def test_unicode_input_errors(unicode_filename, verbose=False):
data = open(unicode_filename, 'rb').read().decode('utf-8')
for input in [data.encode('latin1', 'ignore'),
data.encode('utf-16-be'), data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF16_BE+data.encode('utf-16-le'),
codecs.BOM_UTF16_LE+data.encode('utf-8')+b'!']:
for input in [data.encode('utf-16-be'),
data.encode('utf-16-le'),
codecs.BOM_UTF8+data.encode('utf-16-be'),
codecs.BOM_UTF8+data.encode('utf-16-le')]:

try:
yaml.load(input)
except yaml.YAMLError as exc:
Expand Down Expand Up @@ -75,20 +75,7 @@ def test_unicode_output(unicode_filename, verbose=False):
if verbose:
print("BYTES:", data4[:50])
data4 = data4.decode(encoding)
for copy in [data1, data2, data3, data4]:
if copy is None:
continue
assert isinstance(copy, str)
if allow_unicode:
try:
copy[4:].encode('ascii')
except UnicodeEncodeError as exc:
if verbose:
print(exc)
else:
raise AssertionError("expected an exception")
else:
copy[4:].encode('ascii')

assert isinstance(data1, str), (type(data1), encoding)
assert isinstance(data2, str), (type(data2), encoding)

Expand Down Expand Up @@ -147,4 +134,3 @@ def test_unicode_transfer(unicode_filename, verbose=False):
if __name__ == '__main__':
import test_appliance
test_appliance.run(globals())