From 7731814cc25c57fe31db9ba749450cd5a27eed39 Mon Sep 17 00:00:00 2001 From: elacuesta Date: Mon, 28 Oct 2019 06:53:53 -0300 Subject: [PATCH] ItemLoader: improve handling of initial item (#4036) --- docs/topics/loaders.rst | 10 +- scrapy/loader/__init__.py | 31 ++-- tests/test_loader.py | 319 +++++++++++++++++++++++++++++--------- 3 files changed, 272 insertions(+), 88 deletions(-) diff --git a/docs/topics/loaders.rst b/docs/topics/loaders.rst index 1c2f1da4d8d..0318e37aa28 100644 --- a/docs/topics/loaders.rst +++ b/docs/topics/loaders.rst @@ -35,6 +35,12 @@ Then, you start collecting values into the Item Loader, typically using the same item field; the Item Loader will know how to "join" those values later using a proper processing function. +.. note:: Collected data is internally stored as lists, + allowing to add several values to the same field. + If an ``item`` argument is passed when creating a loader, + each of the item's values will be stored as-is if it's already + an iterable, or wrapped with a list if it's a single value. + Here is a typical Item Loader usage in a :ref:`Spider `, using the :ref:`Product item ` declared in the :ref:`Items chapter `:: @@ -128,9 +134,9 @@ So what happens is: It's worth noticing that processors are just callable objects, which are called with the data to be parsed, and return a parsed value. So you can use any function as input or output processor. The only requirement is that they must -accept one (and only one) positional argument, which will be an iterator. +accept one (and only one) positional argument, which will be an iterable. -.. note:: Both input and output processors must receive an iterator as their +.. note:: Both input and output processors must receive an iterable as their first argument. The output of those functions can be anything. The result of input processors will be appended to an internal list (in the Loader) containing the collected values (for that field). The result of the output diff --git a/scrapy/loader/__init__.py b/scrapy/loader/__init__.py index 6665eba168a..60fd6d22293 100644 --- a/scrapy/loader/__init__.py +++ b/scrapy/loader/__init__.py @@ -1,19 +1,19 @@ -"""Item Loader +""" +Item Loader See documentation in docs/topics/loaders.rst - """ from collections import defaultdict + import six from scrapy.item import Item +from scrapy.loader.common import wrap_loader_context +from scrapy.loader.processors import Identity from scrapy.selector import Selector from scrapy.utils.misc import arg_to_iter, extract_regex from scrapy.utils.python import flatten -from .common import wrap_loader_context -from .processors import Identity - class ItemLoader(object): @@ -33,10 +33,9 @@ def __init__(self, item=None, selector=None, response=None, parent=None, **conte self.parent = parent self._local_item = context['item'] = item self._local_values = defaultdict(list) - # Preprocess values if item built from dict - # Values need to be added to item._values if added them from dict (not with add_values) + # values from initial item for field_name, value in item.items(): - self._values[field_name] = self._process_input_value(field_name, value) + self._values[field_name] += arg_to_iter(value) @property def _values(self): @@ -132,8 +131,8 @@ def get_output_value(self, field_name): try: return proc(self._values[field_name]) except Exception as e: - raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" % \ - (field_name, self._values[field_name], type(e).__name__, str(e))) + raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" % + (field_name, self._values[field_name], type(e).__name__, str(e))) def get_collected_values(self, field_name): return self._values[field_name] @@ -141,15 +140,15 @@ def get_collected_values(self, field_name): def get_input_processor(self, field_name): proc = getattr(self, '%s_in' % field_name, None) if not proc: - proc = self._get_item_field_attr(field_name, 'input_processor', \ - self.default_input_processor) + proc = self._get_item_field_attr(field_name, 'input_processor', + self.default_input_processor) return proc def get_output_processor(self, field_name): proc = getattr(self, '%s_out' % field_name, None) if not proc: - proc = self._get_item_field_attr(field_name, 'output_processor', \ - self.default_output_processor) + proc = self._get_item_field_attr(field_name, 'output_processor', + self.default_output_processor) return proc def _process_input_value(self, field_name, value): @@ -174,8 +173,8 @@ def _get_item_field_attr(self, field_name, key, default=None): def _check_selector_method(self): if self.selector is None: raise RuntimeError("To use XPath or CSS selectors, " - "%s must be instantiated with a selector " - "or a response" % self.__class__.__name__) + "%s must be instantiated with a selector " + "or a response" % self.__class__.__name__) def add_xpath(self, field_name, xpath, *processors, **kw): values = self._get_xpathvalues(xpath, **kw) diff --git a/tests/test_loader.py b/tests/test_loader.py index 2725b001a18..4a4264a2a38 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -1,13 +1,15 @@ +from functools import partial import unittest + import six -from functools import partial -from scrapy.loader import ItemLoader -from scrapy.loader.processors import Join, Identity, TakeFirst, \ - Compose, MapCompose, SelectJmes +from scrapy.http import HtmlResponse from scrapy.item import Item, Field +from scrapy.loader import ItemLoader +from scrapy.loader.processors import (Compose, Identity, Join, + MapCompose, SelectJmes, TakeFirst) from scrapy.selector import Selector -from scrapy.http import HtmlResponse + # test items class NameItem(Item): @@ -61,7 +63,7 @@ def test_load_item_using_default_loader(self): il.add_value('name', u'marta') item = il.load_item() assert item is i - self.assertEqual(item['summary'], u'lala') + self.assertEqual(item['summary'], [u'lala']) self.assertEqual(item['name'], [u'marta']) def test_load_item_using_custom_loader(self): @@ -419,43 +421,6 @@ class TestItemLoader(NameItemLoader): self.assertEqual(item['url'], u'rabbit.hole') self.assertEqual(item['summary'], u'rabbithole') - def test_create_item_from_dict(self): - class TestItem(Item): - title = Field() - - class TestItemLoader(ItemLoader): - default_item_class = TestItem - - input_item = {'title': 'Test item title 1'} - il = TestItemLoader(item=input_item) - # Getting output value mustn't remove value from item - self.assertEqual(il.load_item(), { - 'title': 'Test item title 1', - }) - self.assertEqual(il.get_output_value('title'), 'Test item title 1') - self.assertEqual(il.load_item(), { - 'title': 'Test item title 1', - }) - - input_item = {'title': 'Test item title 2'} - il = TestItemLoader(item=input_item) - # Values from dict must be added to item _values - self.assertEqual(il._values.get('title'), 'Test item title 2') - - input_item = {'title': [u'Test item title 3', u'Test item 4']} - il = TestItemLoader(item=input_item) - # Same rules must work for lists - self.assertEqual(il._values.get('title'), - [u'Test item title 3', u'Test item 4']) - self.assertEqual(il.load_item(), { - 'title': [u'Test item title 3', u'Test item 4'], - }) - self.assertEqual(il.get_output_value('title'), - [u'Test item title 3', u'Test item 4']) - self.assertEqual(il.load_item(), { - 'title': [u'Test item title 3', u'Test item 4'], - }) - def test_error_input_processor(self): class TestItem(Item): name = Field() @@ -493,6 +458,220 @@ class TestItemLoader(ItemLoader): [u'marta', u'other'], Compose(float)) +class InitializationTestMixin(object): + + item_class = None + + def test_keep_single_value(self): + """Loaded item should contain values from the initial item""" + input_item = self.item_class(name='foo') + il = ItemLoader(item=input_item) + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(dict(loaded_item), {'name': ['foo']}) + + def test_keep_list(self): + """Loaded item should contain values from the initial item""" + input_item = self.item_class(name=['foo', 'bar']) + il = ItemLoader(item=input_item) + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar']}) + + def test_add_value_singlevalue_singlevalue(self): + """Values added after initialization should be appended""" + input_item = self.item_class(name='foo') + il = ItemLoader(item=input_item) + il.add_value('name', 'bar') + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar']}) + + def test_add_value_singlevalue_list(self): + """Values added after initialization should be appended""" + input_item = self.item_class(name='foo') + il = ItemLoader(item=input_item) + il.add_value('name', ['item', 'loader']) + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(dict(loaded_item), {'name': ['foo', 'item', 'loader']}) + + def test_add_value_list_singlevalue(self): + """Values added after initialization should be appended""" + input_item = self.item_class(name=['foo', 'bar']) + il = ItemLoader(item=input_item) + il.add_value('name', 'qwerty') + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar', 'qwerty']}) + + def test_add_value_list_list(self): + """Values added after initialization should be appended""" + input_item = self.item_class(name=['foo', 'bar']) + il = ItemLoader(item=input_item) + il.add_value('name', ['item', 'loader']) + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(dict(loaded_item), {'name': ['foo', 'bar', 'item', 'loader']}) + + def test_get_output_value_singlevalue(self): + """Getting output value must not remove value from item""" + input_item = self.item_class(name='foo') + il = ItemLoader(item=input_item) + self.assertEqual(il.get_output_value('name'), ['foo']) + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(loaded_item, dict({'name': ['foo']})) + + def test_get_output_value_list(self): + """Getting output value must not remove value from item""" + input_item = self.item_class(name=['foo', 'bar']) + il = ItemLoader(item=input_item) + self.assertEqual(il.get_output_value('name'), ['foo', 'bar']) + loaded_item = il.load_item() + self.assertIsInstance(loaded_item, self.item_class) + self.assertEqual(loaded_item, dict({'name': ['foo', 'bar']})) + + def test_values_single(self): + """Values from initial item must be added to loader._values""" + input_item = self.item_class(name='foo') + il = ItemLoader(item=input_item) + self.assertEqual(il._values.get('name'), ['foo']) + + def test_values_list(self): + """Values from initial item must be added to loader._values""" + input_item = self.item_class(name=['foo', 'bar']) + il = ItemLoader(item=input_item) + self.assertEqual(il._values.get('name'), ['foo', 'bar']) + + +class InitializationFromDictTest(InitializationTestMixin, unittest.TestCase): + item_class = dict + + +class InitializationFromItemTest(InitializationTestMixin, unittest.TestCase): + item_class = NameItem + + +class BaseNoInputReprocessingLoader(ItemLoader): + title_in = MapCompose(str.upper) + title_out = TakeFirst() + + +class NoInputReprocessingDictLoader(BaseNoInputReprocessingLoader): + default_item_class = dict + + +class NoInputReprocessingFromDictTest(unittest.TestCase): + """ + Loaders initialized from loaded items must not reprocess fields (dict instances) + """ + def test_avoid_reprocessing_with_initial_values_single(self): + il = NoInputReprocessingDictLoader(item=dict(title='foo')) + il_loaded = il.load_item() + self.assertEqual(il_loaded, dict(title='foo')) + self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='foo')) + + def test_avoid_reprocessing_with_initial_values_list(self): + il = NoInputReprocessingDictLoader(item=dict(title=['foo', 'bar'])) + il_loaded = il.load_item() + self.assertEqual(il_loaded, dict(title='foo')) + self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='foo')) + + def test_avoid_reprocessing_without_initial_values_single(self): + il = NoInputReprocessingDictLoader() + il.add_value('title', 'foo') + il_loaded = il.load_item() + self.assertEqual(il_loaded, dict(title='FOO')) + self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='FOO')) + + def test_avoid_reprocessing_without_initial_values_list(self): + il = NoInputReprocessingDictLoader() + il.add_value('title', ['foo', 'bar']) + il_loaded = il.load_item() + self.assertEqual(il_loaded, dict(title='FOO')) + self.assertEqual(NoInputReprocessingDictLoader(item=il_loaded).load_item(), dict(title='FOO')) + + +class NoInputReprocessingItem(Item): + title = Field() + + +class NoInputReprocessingItemLoader(BaseNoInputReprocessingLoader): + default_item_class = NoInputReprocessingItem + + +class NoInputReprocessingFromItemTest(unittest.TestCase): + """ + Loaders initialized from loaded items must not reprocess fields (BaseItem instances) + """ + def test_avoid_reprocessing_with_initial_values_single(self): + il = NoInputReprocessingItemLoader(item=NoInputReprocessingItem(title='foo')) + il_loaded = il.load_item() + self.assertEqual(il_loaded, {'title': 'foo'}) + self.assertEqual(NoInputReprocessingItemLoader(item=il_loaded).load_item(), {'title': 'foo'}) + + def test_avoid_reprocessing_with_initial_values_list(self): + il = NoInputReprocessingItemLoader(item=NoInputReprocessingItem(title=['foo', 'bar'])) + il_loaded = il.load_item() + self.assertEqual(il_loaded, {'title': 'foo'}) + self.assertEqual(NoInputReprocessingItemLoader(item=il_loaded).load_item(), {'title': 'foo'}) + + def test_avoid_reprocessing_without_initial_values_single(self): + il = NoInputReprocessingItemLoader() + il.add_value('title', 'FOO') + il_loaded = il.load_item() + self.assertEqual(il_loaded, {'title': 'FOO'}) + self.assertEqual(NoInputReprocessingItemLoader(item=il_loaded).load_item(), {'title': 'FOO'}) + + def test_avoid_reprocessing_without_initial_values_list(self): + il = NoInputReprocessingItemLoader() + il.add_value('title', ['foo', 'bar']) + il_loaded = il.load_item() + self.assertEqual(il_loaded, {'title': 'FOO'}) + self.assertEqual(NoInputReprocessingItemLoader(item=il_loaded).load_item(), {'title': 'FOO'}) + + +class TestOutputProcessorDict(unittest.TestCase): + def test_output_processor(self): + + class TempDict(dict): + def __init__(self, *args, **kwargs): + super(TempDict, self).__init__(self, *args, **kwargs) + self.setdefault('temp', 0.3) + + class TempLoader(ItemLoader): + default_item_class = TempDict + default_input_processor = Identity() + default_output_processor = Compose(TakeFirst()) + + loader = TempLoader() + item = loader.load_item() + self.assertIsInstance(item, TempDict) + self.assertEqual(dict(item), {'temp': 0.3}) + + +class TestOutputProcessorItem(unittest.TestCase): + def test_output_processor(self): + + class TempItem(Item): + temp = Field() + + def __init__(self, *args, **kwargs): + super(TempItem, self).__init__(self, *args, **kwargs) + self.setdefault('temp', 0.3) + + class TempLoader(ItemLoader): + default_item_class = TempItem + default_input_processor = Identity() + default_output_processor = Compose(TakeFirst()) + + loader = TempLoader() + item = loader.load_item() + self.assertIsInstance(item, TempItem) + self.assertEqual(dict(item), {'temp': 0.3}) + + class ProcessorsTest(unittest.TestCase): def test_take_first(self): @@ -523,7 +702,8 @@ def test_compose(self): self.assertRaises(ValueError, proc, 'hello') def test_mapcompose(self): - filter_world = lambda x: None if x == 'world' else x + def filter_world(x): + return None if x == 'world' else x proc = MapCompose(filter_world, six.text_type.upper) self.assertEqual(proc([u'hello', u'world', u'this', u'is', u'scrapy']), [u'HELLO', u'THIS', u'IS', u'SCRAPY']) @@ -535,7 +715,6 @@ def test_mapcompose(self): self.assertRaises(ValueError, proc, 'hello') - class SelectortemLoaderTest(unittest.TestCase): response = HtmlResponse(url="", encoding='utf-8', body=b""" @@ -672,7 +851,7 @@ def test_get_css(self): self.assertEqual(l.get_css(['p::text', 'div::text']), [u'paragraph', 'marta']) self.assertEqual(l.get_css(['a::attr(href)', 'img::attr(src)']), - [u'http://www.scrapy.org', u'/images/logo.png']) + [u'http://www.scrapy.org', u'/images/logo.png']) def test_replace_css_multi_fields(self): l = TestItemLoader(response=self.response) @@ -720,7 +899,7 @@ def test_nested_xpath(self): self.assertEqual(l.get_output_value('name'), [u'marta']) self.assertEqual(l.get_output_value('name_div'), [u'
marta
']) - self.assertEqual(l.get_output_value('name_value'), [u'marta']) + self.assertEqual(l.get_output_value('name_value'), [u'marta']) self.assertEqual(l.get_output_value('name'), nl.get_output_value('name')) self.assertEqual(l.get_output_value('name_div'), nl.get_output_value('name_div')) @@ -735,7 +914,7 @@ def test_nested_css(self): self.assertEqual(l.get_output_value('name'), [u'marta']) self.assertEqual(l.get_output_value('name_div'), [u'
marta
']) - self.assertEqual(l.get_output_value('name_value'), [u'marta']) + self.assertEqual(l.get_output_value('name_value'), [u'marta']) self.assertEqual(l.get_output_value('name'), nl.get_output_value('name')) self.assertEqual(l.get_output_value('name_div'), nl.get_output_value('name_div')) @@ -791,28 +970,28 @@ def test_nested_load_item(self): class SelectJmesTestCase(unittest.TestCase): - test_list_equals = { - 'simple': ('foo.bar', {"foo": {"bar": "baz"}}, "baz"), - 'invalid': ('foo.bar.baz', {"foo": {"bar": "baz"}}, None), - 'top_level': ('foo', {"foo": {"bar": "baz"}}, {"bar": "baz"}), - 'double_vs_single_quote_string': ('foo.bar', {"foo": {"bar": "baz"}}, "baz"), - 'dict': ( - 'foo.bar[*].name', - {"foo": {"bar": [{"name": "one"}, {"name": "two"}]}}, - ['one', 'two'] - ), - 'list': ('[1]', [1, 2], 2) - } - - def test_output(self): - for l in self.test_list_equals: - expr, test_list, expected = self.test_list_equals[l] - test = SelectJmes(expr)(test_list) - self.assertEqual( - test, - expected, - msg='test "{}" got {} expected {}'.format(l, test, expected) - ) + test_list_equals = { + 'simple': ('foo.bar', {"foo": {"bar": "baz"}}, "baz"), + 'invalid': ('foo.bar.baz', {"foo": {"bar": "baz"}}, None), + 'top_level': ('foo', {"foo": {"bar": "baz"}}, {"bar": "baz"}), + 'double_vs_single_quote_string': ('foo.bar', {"foo": {"bar": "baz"}}, "baz"), + 'dict': ( + 'foo.bar[*].name', + {"foo": {"bar": [{"name": "one"}, {"name": "two"}]}}, + ['one', 'two'] + ), + 'list': ('[1]', [1, 2], 2) + } + + def test_output(self): + for l in self.test_list_equals: + expr, test_list, expected = self.test_list_equals[l] + test = SelectJmes(expr)(test_list) + self.assertEqual( + test, + expected, + msg='test "{}" got {} expected {}'.format(l, test, expected) + ) if __name__ == "__main__":