Improve object repr slicing on Python 2. Fixes #1407 (#1429)

microsoft · May 17, 2019 · f115036 · f115036
1 parent 343ad5f
commit f115036
Show file tree

Hide file tree

Showing 4 changed files with 235 additions and 9 deletions.
diff --git a/src/ptvsd/_vendored/pydevd/_pydev_runfiles/pydev_runfiles_pytest2.py b/src/ptvsd/_vendored/pydevd/_pydev_runfiles/pydev_runfiles_pytest2.py
@@ -9,7 +9,6 @@
 import sys
 import time
 
-
 #=========================================================================
 # Load filters with tests we should skip
 #=========================================================================
@@ -35,6 +34,8 @@ def is_in_xdist_node():
 
 
 connected = False
+
+
 def connect_to_server_for_communication_to_xml_rpc_on_xdist():
     global connected
     if connected:
@@ -68,7 +69,9 @@ def start_redirect():
 
 
 def get_curr_output():
-    return State.buf_out.getvalue(), State.buf_err.getvalue()
+    buf_out = State.buf_out
+    buf_err = State.buf_err
+    return buf_out.getvalue() if buf_out is not None else '', buf_err.getvalue() if buf_err is not None else ''
 
 
 def pytest_unconfigure():
@@ -136,6 +139,7 @@ def pytest_collection_modifyitems(session, config, items):
 
 from py.io import TerminalWriter
 
+
 def _get_error_contents_from_report(report):
     if report.longrepr is not None:
         tw = TerminalWriter(stringio=True)
@@ -148,11 +152,13 @@ def _get_error_contents_from_report(report):
 
     return ''
 
+
 def pytest_collectreport(report):
     error_contents = _get_error_contents_from_report(report)
     if error_contents:
         report_test('fail', '<collect errors>', '<collect errors>', '', error_contents, 0.0)
 
+
 def append_strings(s1, s2):
     if s1.__class__ == s2.__class__:
         return s1 + s2
@@ -183,7 +189,6 @@ def append_strings(s1, s2):
         return s1 + s2
 
 
-
 def pytest_runtest_logreport(report):
     if is_in_xdist_node():
         # When running with xdist, we don't want the report to be called from the node, only
@@ -254,9 +259,11 @@ def report_test(status, filename, test, captured_output, error_contents, duratio
     pydev_runfiles_xml_rpc.notifyTest(
         status, captured_output, error_contents, filename, test, time_str)
 
+
 if not hasattr(pytest, 'hookimpl'):
     raise AssertionError('Please upgrade pytest (the current version of pytest: %s is unsupported)' % (pytest.__version__,))
 
+
 @pytest.hookimpl(hookwrapper=True)
 def pytest_runtest_makereport(item, call):
     outcome = yield

diff --git a/src/ptvsd/_vendored/pydevd/_pydevd_bundle/pydevd_safe_repr.py b/src/ptvsd/_vendored/pydevd/_pydevd_bundle/pydevd_safe_repr.py
@@ -4,6 +4,9 @@
 
 # Gotten from ptvsd for supporting the format expected there.
 import sys
+from _pydevd_bundle.pydevd_constants import IS_PY2
+import locale
+import json
 
 # Py3 compat - alias unicode to str, and xrange to range
 try:
@@ -17,6 +20,12 @@
 
 
 class SafeRepr(object):
+    # Can be used to override the encoding from locale.getpreferredencoding()
+    locale_preferred_encoding = None
+
+    # Can be used to override the encoding taken from sys.stdout.encoding
+    sys_stdout_encoding = None
+
     # String types are truncated to maxstring_outer when at the outer-
     # most level, and truncated to maxstring_inner characters inside
     # collections.
@@ -74,8 +83,18 @@ class SafeRepr(object):
     raw_value = False
 
     def __call__(self, obj):
+        '''
+        :param object obj:
+            The object for which we want a representation.
+
+        :return str:
+            Returns bytes encoded as utf-8 on py2 and str on py3.
+        '''
         try:
-            return ''.join(self._repr(obj, 0))
+            if IS_PY2:
+                return ''.join((x.encode('utf-8') if isinstance(x, unicode) else x) for x in self._repr(obj, 0))
+            else:
+                return ''.join(self._repr(obj, 0))
         except Exception:
             try:
                 return 'An exception was raised: %r' % sys.exc_info()[1]
@@ -271,10 +290,14 @@ def _repr_obj(self, obj, level, limit_inner, limit_outer):
         try:
             if self.raw_value:
                 # For raw value retrieval, ignore all limits.
+                if isinstance(obj, bytes):
+                    yield obj.decode('latin-1')
+                    return
+
                 try:
                     mv = memoryview(obj)
                 except Exception:
-                    yield unicode(obj)
+                    yield self._convert_to_unicode_or_bytes_repr(repr(obj))
                     return
                 else:
                     # Map bytes to Unicode codepoints with same values.
@@ -296,14 +319,88 @@ def _repr_obj(self, obj, level, limit_inner, limit_outer):
         limit = limit_inner if level > 0 else limit_outer
 
         if limit >= len(obj_repr):
-            yield obj_repr
+            yield self._convert_to_unicode_or_bytes_repr(obj_repr)
             return
 
         # Slightly imprecise calculations - we may end up with a string that is
         # up to 3 characters longer than limit. If you need precise formatting,
         # you are using the wrong class.
         left_count, right_count = max(1, int(2 * limit / 3)), max(1, int(limit / 3))  # noqa
 
+        if IS_PY2 and isinstance(obj_repr, bytes):
+            # If we can convert to unicode before slicing, that's better (but don't do
+            # it if it's not possible as we may be dealing with actual binary data).
+
+            obj_repr = self._bytes_as_unicode_if_possible(obj_repr)
+            if isinstance(obj_repr, unicode):
+                # Deal with high-surrogate leftovers on Python 2.
+                try:
+                    if left_count > 0 and unichr(0xD800) <= obj_repr[left_count - 1] <= unichr(0xDBFF):
+                        left_count -= 1
+                except ValueError:
+                    # On Jython unichr(0xD800) will throw an error:
+                    # ValueError: unichr() arg is a lone surrogate in range (0xD800, 0xDFFF) (Jython UTF-16 encoding)
+                    # Just ignore it in this case.
+                    pass
+
+                start = obj_repr[:left_count]
+
+                # Note: yielding unicode is fine (it'll be properly converted to utf-8 if needed).
+                yield start
+                yield '...'
+
+                # Deal with high-surrogate leftovers on Python 2.
+                try:
+                    if right_count > 0 and unichr(0xD800) <= obj_repr[-right_count - 1] <= unichr(0xDBFF):
+                        right_count -= 1
+                except ValueError:
+                    # On Jython unichr(0xD800) will throw an error:
+                    # ValueError: unichr() arg is a lone surrogate in range (0xD800, 0xDFFF) (Jython UTF-16 encoding)
+                    # Just ignore it in this case.
+                    pass
+
+                yield obj_repr[-right_count:]
+                return
+            else:
+                # We can't decode it (binary string). Use repr() of bytes.
+                obj_repr = repr(obj_repr)
+
         yield obj_repr[:left_count]
         yield '...'
         yield obj_repr[-right_count:]
+
+    def _convert_to_unicode_or_bytes_repr(self, obj_repr):
+        if IS_PY2 and isinstance(obj_repr, bytes):
+            obj_repr = self._bytes_as_unicode_if_possible(obj_repr)
+            if isinstance(obj_repr, bytes):
+                # If we haven't been able to decode it this means it's some binary data
+                # we can't make sense of, so, we need its repr() -- otherwise json
+                # encoding may break later on.
+                obj_repr = repr(obj_repr)
+        return obj_repr
+
+    def _bytes_as_unicode_if_possible(self, obj_repr):
+        # We try to decode with 3 possible encodings (sys.stdout.encoding,
+        # locale.getpreferredencoding() and 'utf-8'). If no encoding can decode
+        # the input, we return the original bytes.
+        try_encodings = []
+        encoding = self.sys_stdout_encoding or getattr(sys.stdout, 'encoding', '')
+        if encoding:
+            try_encodings.append(encoding.lower())
+
+        preferred_encoding = self.locale_preferred_encoding or locale.getpreferredencoding()
+        if preferred_encoding:
+            preferred_encoding = preferred_encoding.lower()
+            if preferred_encoding not in try_encodings:
+                try_encodings.append(preferred_encoding)
+
+        if 'utf-8' not in try_encodings:
+            try_encodings.append('utf-8')
+
+        for encoding in try_encodings:
+            try:
+                return obj_repr.decode(encoding)
+            except UnicodeDecodeError:
+                pass
+
+        return obj_repr  # Return the original version (in bytes)
diff --git a/src/ptvsd/_vendored/pydevd/tests_python/test_debugger_json.py b/src/ptvsd/_vendored/pydevd/tests_python/test_debugger_json.py
@@ -270,10 +270,11 @@ def write_step_in(self, thread_id):
         arguments = pydevd_schema.StepInArguments(threadId=thread_id)
         self.wait_for_response(self.write_request(pydevd_schema.StepInRequest(arguments)))
 
-    def write_step_next(self, thread_id):
+    def write_step_next(self, thread_id, wait_for_response=True):
         next_request = self.write_request(
             pydevd_schema.NextRequest(pydevd_schema.NextArguments(thread_id)))
-        self.wait_for_response(next_request)
+        if wait_for_response:
+            self.wait_for_response(next_request)
 
     def write_step_out(self, thread_id):
         stepout_request = self.write_request(
@@ -628,7 +629,7 @@ def test_case_skipping_filters(case_setup, custom_setup):
         if IS_JYTHON:
             json_facade.write_continue(wait_for_response=False)
         else:
-            json_facade.write_step_next(json_hit.thread_id)
+            json_facade.write_step_next(json_hit.thread_id, wait_for_response=False)
 
         writer.finished_ok = True
 

diff --git a/src/ptvsd/_vendored/pydevd/tests_python/test_safe_repr.py b/src/ptvsd/_vendored/pydevd/tests_python/test_safe_repr.py
@@ -1,8 +1,11 @@
+# coding: utf-8
 import collections
 import sys
 import re
 import pytest
 from _pydevd_bundle.pydevd_safe_repr import SafeRepr
+import json
+from _pydevd_bundle.pydevd_constants import IS_JYTHON, IS_PY2
 
 try:
     import numpy as np
@@ -593,3 +596,121 @@ def test_zeros(self):
         value = np.zeros(SafeRepr.maxcollection[0] + 1)
 
         self.assert_unchanged(value, repr(value))
+
+
+@pytest.mark.parametrize('params', [
+    # In python 2, unicode slicing may or may not work well depending on whether it's a ucs-2 or
+    # ucs-4 build (so, we have to strip the high-surrogate if it's ucs-2 and the number of chars
+    # will be different).
+
+    {'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄F😄FF😄F", 'output': (u"😄😄😄😄😄😄...FF😄F", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...F😄FF😄F")},
+
+    {'maxother_outer': 20, 'input': u"😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄😄😄😄...FFFFFF", u"😄😄😄😄😄😄😄😄😄😄😄😄😄...FFFFFF")},
+    {'maxother_outer': 20, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐🌐🌐🌐...FFFFFF", u"🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐🌐...FFFFFF")},
+    {'maxother_outer': 10, 'input': u"😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': (u"😄😄😄...FFF", u"😄😄😄😄😄😄...FFF")},
+    {'maxother_outer': 10, 'input': u"🌐🌐🌐🌐🌐🌐🌐🌐🌐FFFFFFFF", 'output': (u"🌐🌐🌐...FFF", u"🌐🌐🌐🌐🌐🌐...FFF")},
+
+    # Regular unicode
+    {'maxother_outer': 20, 'input': u"ωωωωωωωωωωωωωωωωωωωωωωωFFFFFFFF", 'output': u"ωωωωωωωωωωωωω...FFFFFF"},
+    {'maxother_outer': 20, 'input': u"������������FFFFFFFF", 'output': u"������������F...FFFFFF"},
+    {'maxother_outer': 10, 'input': u"������������FFFFFFFF", 'output': u"������...FFF"},
+
+    # Note that we actually get the repr() in this case as we can't decode it with any of the available encodings.
+    {'maxother_outer': 10, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\...fd'"},
+    {'maxother_outer': 20, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10, 'output': b"'\\xed\\xbd\\xbf...a\\xfd'"},
+    # Check that we use repr() even if it fits the maxother_outer limit.
+    {'maxother_outer': 100, 'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd', 'output': "'\\xed\\xbd\\xbf\\xff\\xfe\\xfa\\xfd'"},
+
+    # Note that with latin1 encoding we can actually decode the string but when encoding back to utf-8 we have garbage
+    # (couldn't find a good approach to know what to do here as we've actually been able to decode it as
+    # latin-1 because it's a very permissive encoding).
+    {
+        'maxother_outer': 10,
+        'sys_stdout_encoding': 'latin1',
+        'input': b'\xed\xbd\xbf\xff\xfe\xfa\xfd' * 10,
+        'output': b'\xc3\xad\xc2\xbd\xc2\xbf\xc3\xbf\xc3\xbe\xc3\xba...\xc3\xbe\xc3\xba\xc3\xbd'
+    },
+])
+@pytest.mark.skipif(not IS_PY2, reason='Py2 specific test.')
+def test_py2_bytes_slicing(params):
+    safe_repr = SafeRepr()
+    safe_repr.locale_preferred_encoding = 'ascii'
+    safe_repr.sys_stdout_encoding = params.get('sys_stdout_encoding', 'ascii')
+
+    safe_repr.maxother_outer = params['maxother_outer']
+
+    # This is the encoding that we expect back (because json needs to be able to encode it
+    # later on, so, the return from SafeRepr must always be utf-8 regardless of the input).
+    encoding = 'utf-8'
+
+    class MyObj(object):
+
+        def __repr__(self):
+            ret = params['input']
+            if isinstance(ret, unicode):
+                ret = ret.encode(encoding)
+            return ret
+
+    expected_output = params['output']
+    computed = safe_repr(MyObj())
+
+    expect_unicode = False
+    if isinstance(expected_output, unicode):
+        expect_unicode = True
+    if isinstance(expected_output, tuple) and isinstance(expected_output[0], unicode):
+        expect_unicode = True
+
+    if expect_unicode:
+        computed = computed.decode(encoding)
+        if isinstance(expected_output, tuple):
+            assert computed in expected_output
+        else:
+            assert computed == expected_output
+    else:
+        assert repr(computed) == repr(expected_output)
+
+    # Check that we can json-encode the return.
+    assert json.dumps(computed)
+
+
+@pytest.mark.parametrize('params', [
+    {'maxother_outer': 20, 'input': "😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': '😄😄😄😄😄😄😄😄😄😄😄😄😄...FFFFFF'},
+    {'maxother_outer': 10, 'input': "😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄😄FFFFFFFF", 'output': '😄😄😄😄😄😄...FFF'},
+    {'maxother_outer': 10, 'input': u"������������FFFFFFFF", 'output': u"������...FFF"},
+
+    # Because we can't return bytes, byte-related tests aren't needed (and str works as it should).
+])
+@pytest.mark.skipif(IS_PY2, reason='Py3 specific test')
+def test_py3_str_slicing(params):
+    # Note: much simpler in Python 3 because __repr__ is required to return str
+    # (which is actually unicode).
+    safe_repr = SafeRepr()
+    safe_repr.locale_preferred_encoding = 'ascii'
+    safe_repr.sys_stdout_encoding = params.get('sys_stdout_encoding', 'ascii')
+
+    safe_repr.maxother_outer = params['maxother_outer']
+
+    class MyObj(object):
+
+        def __repr__(self):
+            return params['input']
+
+    expected_output = params['output']
+    computed = safe_repr(MyObj())
+    assert repr(computed) == repr(expected_output)
+
+    # Check that we can json-encode the return.
+    assert json.dumps(computed)
+
+
+def test_raw():
+    safe_repr = SafeRepr()
+    safe_repr.raw_value = True
+    obj = b'\xed\xbd\xbf\xff\xfe\xfa\xfd'
+    raw_value_repr = safe_repr(obj)
+    assert isinstance(raw_value_repr, str)  # bytes on py2, str on py3
+    if IS_PY2:
+        assert raw_value_repr == obj.decode('latin1').encode('utf-8')
+    else:
+        assert raw_value_repr == obj.decode('latin1')
+