From 73511775b27747e3fa75695b7b92342d9d0b836b Mon Sep 17 00:00:00 2001 From: harshasrinivas Date: Wed, 19 Jul 2017 02:39:02 +0530 Subject: [PATCH 1/5] Add textwrap.shorten functionality and update tests --- parsel/selector.py | 4 ++-- parsel/utils.py | 10 +++++++++- tests/test_selector.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index bbd4289a..83d80fde 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -7,7 +7,7 @@ import six from lxml import etree, html -from .utils import flatten, iflatten, extract_regex +from .utils import flatten, iflatten, extract_regex, selector_data_shorten from .csstranslator import HTMLTranslator, GenericTranslator @@ -358,6 +358,6 @@ def __bool__(self): __nonzero__ = __bool__ def __str__(self): - data = repr(self.get()[:40]) + data = repr(selector_data_shorten(self.get(), width=40)) return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data) __repr__ = __str__ diff --git a/parsel/utils.py b/parsel/utils.py index 56bb105d..fa05e0ae 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -1,5 +1,6 @@ import re import six +import textwrap from w3lib.html import replace_entities as w3lib_replace_entities @@ -80,4 +81,11 @@ def extract_regex(regex, text, replace_entities=True): strings = flatten(strings) if not replace_entities: return strings - return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings] \ No newline at end of file + return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings] + +def selector_data_shorten(data, width): + """Shortens the preview of extracted data by adding placeholder '...' + """ + if six.PY2: + return data[:40] + return textwrap.shorten(data, width=width, placeholder="...") \ No newline at end of file diff --git a/tests/test_selector.py b/tests/test_selector.py index e504166a..2487a438 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -133,7 +133,7 @@ def test_representation_slice(self): body = u"

".format(50 * 'b') sel = self.sscls(text=body) - representation = "".format(40 * 'b') + representation = "".format('...') if six.PY2: representation = "".format(40 * 'b') From db4ff4890654cd1d3f9ac6f80ccb307db8ea3f2b Mon Sep 17 00:00:00 2001 From: harshasrinivas Date: Wed, 19 Jul 2017 04:05:16 +0530 Subject: [PATCH 2/5] Fix compatilibility issues with Python 3.3 or lower --- parsel/utils.py | 7 ++++--- tests/test_selector.py | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/parsel/utils.py b/parsel/utils.py index fa05e0ae..3c0e97c4 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -86,6 +86,7 @@ def extract_regex(regex, text, replace_entities=True): def selector_data_shorten(data, width): """Shortens the preview of extracted data by adding placeholder '...' """ - if six.PY2: - return data[:40] - return textwrap.shorten(data, width=width, placeholder="...") \ No newline at end of file + if six.PY34: + return textwrap.shorten(data, width=width, placeholder="...") + else: + return data[:40] \ No newline at end of file diff --git a/tests/test_selector.py b/tests/test_selector.py index 2487a438..7e27beca 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -133,9 +133,12 @@ def test_representation_slice(self): body = u"

".format(50 * 'b') sel = self.sscls(text=body) - representation = "".format('...') if six.PY2: representation = "".format(40 * 'b') + elif six.PY34: + representation = "".format('...') + else: + representation = "".format(40 * 'b') self.assertEqual( [repr(it) for it in sel.xpath('//input/@name')], From a5c6b61ad977af407ba92a8bdbfa2cd243a6ced0 Mon Sep 17 00:00:00 2001 From: harshasrinivas Date: Thu, 20 Jul 2017 06:45:15 +0530 Subject: [PATCH 3/5] Customize text-wrapping function --- parsel/selector.py | 4 ++-- parsel/utils.py | 8 ++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/parsel/selector.py b/parsel/selector.py index 83d80fde..6d221628 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -7,7 +7,7 @@ import six from lxml import etree, html -from .utils import flatten, iflatten, extract_regex, selector_data_shorten +from .utils import flatten, iflatten, extract_regex, shorten_selector_data from .csstranslator import HTMLTranslator, GenericTranslator @@ -358,6 +358,6 @@ def __bool__(self): __nonzero__ = __bool__ def __str__(self): - data = repr(selector_data_shorten(self.get(), width=40)) + data = repr(shorten_selector_data(self.get(), width=40)) return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data) __repr__ = __str__ diff --git a/parsel/utils.py b/parsel/utils.py index 3c0e97c4..8bda2b33 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -1,6 +1,5 @@ import re import six -import textwrap from w3lib.html import replace_entities as w3lib_replace_entities @@ -83,10 +82,7 @@ def extract_regex(regex, text, replace_entities=True): return strings return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings] -def selector_data_shorten(data, width): +def shorten_selector_data(data, width): """Shortens the preview of extracted data by adding placeholder '...' """ - if six.PY34: - return textwrap.shorten(data, width=width, placeholder="...") - else: - return data[:40] \ No newline at end of file + return data[:width - 3] + "..." if (len(data) > width - 3) else data \ No newline at end of file From ac5ec5eab185c82a29e22363b51cdcc2eac6a0e8 Mon Sep 17 00:00:00 2001 From: harshasrinivas Date: Thu, 20 Jul 2017 06:47:54 +0530 Subject: [PATCH 4/5] Update tests --- tests/test_selector.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_selector.py b/tests/test_selector.py index 7e27beca..c8845a5f 100644 --- a/tests/test_selector.py +++ b/tests/test_selector.py @@ -133,12 +133,9 @@ def test_representation_slice(self): body = u"

".format(50 * 'b') sel = self.sscls(text=body) + representation = "".format(37 * 'b') if six.PY2: - representation = "".format(40 * 'b') - elif six.PY34: - representation = "".format('...') - else: - representation = "".format(40 * 'b') + representation = "".format(37 * 'b') self.assertEqual( [repr(it) for it in sel.xpath('//input/@name')], From 63fe1890e6fa63875f4ebeed57120e2d9d4fda12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 10 Jul 2019 21:51:39 +0200 Subject: [PATCH 5/5] Rename shorten_selector_data to shorten and improve its implementation --- parsel/selector.py | 4 ++-- parsel/utils.py | 14 ++++++++++---- tests/test_utils.py | 26 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 tests/test_utils.py diff --git a/parsel/selector.py b/parsel/selector.py index 6d221628..666a3036 100644 --- a/parsel/selector.py +++ b/parsel/selector.py @@ -7,7 +7,7 @@ import six from lxml import etree, html -from .utils import flatten, iflatten, extract_regex, shorten_selector_data +from .utils import flatten, iflatten, extract_regex, shorten from .csstranslator import HTMLTranslator, GenericTranslator @@ -358,6 +358,6 @@ def __bool__(self): __nonzero__ = __bool__ def __str__(self): - data = repr(shorten_selector_data(self.get(), width=40)) + data = repr(shorten(self.get(), width=40)) return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data) __repr__ = __str__ diff --git a/parsel/utils.py b/parsel/utils.py index 8bda2b33..458bc6cc 100644 --- a/parsel/utils.py +++ b/parsel/utils.py @@ -82,7 +82,13 @@ def extract_regex(regex, text, replace_entities=True): return strings return [w3lib_replace_entities(s, keep=['lt', 'amp']) for s in strings] -def shorten_selector_data(data, width): - """Shortens the preview of extracted data by adding placeholder '...' - """ - return data[:width - 3] + "..." if (len(data) > width - 3) else data \ No newline at end of file + +def shorten(text, width, suffix='...'): + """Truncate the given text to fit in the given width.""" + if len(text) <= width: + return text + if width > len(suffix): + return text[:width-len(suffix)] + suffix + if width >= 0: + return suffix[len(suffix)-width:] + raise ValueError('width must be equal or greater than 0') diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..da20ec2f --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,26 @@ +from parsel.utils import shorten + +from pytest import mark, raises +import six + + +@mark.parametrize( + 'width,expected', + ( + (-1, ValueError), + (0, u''), + (1, u'.'), + (2, u'..'), + (3, u'...'), + (4, u'f...'), + (5, u'fo...'), + (6, u'foobar'), + (7, u'foobar'), + ) +) +def test_shorten(width, expected): + if isinstance(expected, six.string_types): + assert shorten(u'foobar', width) == expected + else: + with raises(expected): + shorten(u'foobar', width)