ItemLoader: improve handling of initial item (#4036)
elacuesta authored and Gallaecio committed Oct 28, 2019
1 parent bb91f9c commit 7731814
Showing 3 changed files with 272 additions and 88 deletions.
10 changes: 8 additions & 2 deletions docs/topics/loaders.rst
@@ -35,6 +35,12 @@ Then, you start collecting values into the Item Loader, typically using
 the same item field; the Item Loader will know how to "join" those values later
 using a proper processing function.
 
+.. note:: Collected data is internally stored as lists,
+   allowing to add several values to the same field.
+   If an ``item`` argument is passed when creating a loader,
+   each of the item's values will be stored as-is if it's already
+   an iterable, or wrapped with a list if it's a single value.
+
 Here is a typical Item Loader usage in a :ref:`Spider <topics-spiders>`, using
 the :ref:`Product item <topics-items-declaring>` declared in the :ref:`Items
 chapter <topics-items>`::
@@ -128,9 +134,9 @@ So what happens is:
 It's worth noticing that processors are just callable objects, which are called
 with the data to be parsed, and return a parsed value. So you can use any
 function as input or output processor. The only requirement is that they must
-accept one (and only one) positional argument, which will be an iterator.
+accept one (and only one) positional argument, which will be an iterable.
 
-.. note:: Both input and output processors must receive an iterator as their
+.. note:: Both input and output processors must receive an iterable as their
    first argument. The output of those functions can be anything. The result of
    input processors will be appended to an internal list (in the Loader)
    containing the collected values (for that field). The result of the output
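A minimal sketch of the requirement the wording fix above describes: any callable that accepts a single iterable of collected values can serve as an input or output processor. ``to_upper`` is a made-up function; ``TakeFirst`` is assumed from ``scrapy.loader.processors`` as shipped at the time of this commit::

    from scrapy.loader.processors import TakeFirst

    def to_upper(values):
        # a valid processor: one positional argument, an iterable of
        # collected values; the return value can be anything
        return [v.upper() for v in values]

    print(to_upper(['plasma', 'tv']))     # ['PLASMA', 'TV']
    print(TakeFirst()(['PLASMA', 'TV']))  # 'PLASMA'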
31 changes: 15 additions & 16 deletions scrapy/loader/__init__.py
@@ -1,19 +1,19 @@
"""Item Loader
"""
Item Loader
See documentation in docs/topics/loaders.rst
"""
from collections import defaultdict

import six

from scrapy.item import Item
from scrapy.loader.common import wrap_loader_context
from scrapy.loader.processors import Identity
from scrapy.selector import Selector
from scrapy.utils.misc import arg_to_iter, extract_regex
from scrapy.utils.python import flatten

from .common import wrap_loader_context
from .processors import Identity


class ItemLoader(object):

@@ -33,10 +33,9 @@ def __init__(self, item=None, selector=None, response=None, parent=None, **conte
         self.parent = parent
         self._local_item = context['item'] = item
         self._local_values = defaultdict(list)
-        # Preprocess values if item built from dict
-        # Values need to be added to item._values if added them from dict (not with add_values)
+        # values from initial item
         for field_name, value in item.items():
-            self._values[field_name] = self._process_input_value(field_name, value)
+            self._values[field_name] += arg_to_iter(value)
 
     @property
     def _values(self):
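The replacement line leans on ``scrapy.utils.misc.arg_to_iter``, which wraps a single value in a list and passes non-string, non-dict iterables through unchanged, matching the docs note added above. A sketch with made-up item contents::

    from collections import defaultdict

    from scrapy.utils.misc import arg_to_iter

    item = {'name': 'Plasma TV', 'tags': ['electronics', 'sale']}
    values = defaultdict(list)
    for field_name, value in item.items():
        values[field_name] += arg_to_iter(value)  # wrap scalars, keep iterables

    # values == {'name': ['Plasma TV'], 'tags': ['electronics', 'sale']}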
@@ -132,24 +131,24 @@ def get_output_value(self, field_name):
         try:
             return proc(self._values[field_name])
         except Exception as e:
-            raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" % \
-                (field_name, self._values[field_name], type(e).__name__, str(e)))
+            raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
+                             (field_name, self._values[field_name], type(e).__name__, str(e)))
 
     def get_collected_values(self, field_name):
         return self._values[field_name]
 
     def get_input_processor(self, field_name):
         proc = getattr(self, '%s_in' % field_name, None)
         if not proc:
-            proc = self._get_item_field_attr(field_name, 'input_processor', \
-                self.default_input_processor)
+            proc = self._get_item_field_attr(field_name, 'input_processor',
+                                             self.default_input_processor)
         return proc
 
     def get_output_processor(self, field_name):
         proc = getattr(self, '%s_out' % field_name, None)
         if not proc:
-            proc = self._get_item_field_attr(field_name, 'output_processor', \
-                self.default_output_processor)
+            proc = self._get_item_field_attr(field_name, 'output_processor',
+                                             self.default_output_processor)
         return proc
 
     def _process_input_value(self, field_name, value):
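The two getters reformatted above implement a three-level lookup that this commit leaves functionally unchanged: a loader attribute named ``<field>_in`` or ``<field>_out`` takes precedence over the field's ``input_processor``/``output_processor`` metadata, which in turn takes precedence over the loader-wide default. A hypothetical declaration exercising all three levels::

    import scrapy
    from scrapy.loader import ItemLoader
    from scrapy.loader.processors import Identity, MapCompose, TakeFirst

    class Product(scrapy.Item):
        # second priority: processor declared in the field metadata
        name = scrapy.Field(input_processor=MapCompose(str.strip))
        price = scrapy.Field()

    class ProductLoader(ItemLoader):
        default_item_class = Product
        name_in = MapCompose(str.title)       # first priority for 'name'
        default_input_processor = Identity()  # fallback, used for 'price'
        default_output_processor = TakeFirst()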
@@ -174,8 +173,8 @@ def _get_item_field_attr(self, field_name, key, default=None):
     def _check_selector_method(self):
         if self.selector is None:
             raise RuntimeError("To use XPath or CSS selectors, "
-                "%s must be instantiated with a selector "
-                "or a response" % self.__class__.__name__)
+                               "%s must be instantiated with a selector "
+                               "or a response" % self.__class__.__name__)
 
     def add_xpath(self, field_name, xpath, *processors, **kw):
         values = self._get_xpathvalues(xpath, **kw)
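In user terms, the ``__init__`` change means initial item values are now stored among the collected values as-is instead of being run through the input processors a second time. A hedged usage sketch (field names invented; behavior as described in the docs note above)::

    from scrapy.loader import ItemLoader

    loader = ItemLoader(item={'name': 'Plasma TV', 'tags': ['electronics']})
    loader.add_value('tags', 'sale')

    loader.get_collected_values('name')  # ['Plasma TV']
    loader.get_collected_values('tags')  # ['electronics', 'sale']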
