Add tests, fix canonicalize passing

scrapy · Feb 16, 2024 · 5e51417 · 5e51417
1 parent bccb4cf
commit 5e51417
Show file tree

Hide file tree

Showing 2 changed files with 113 additions and 1 deletion.
diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
@@ -153,7 +153,7 @@ def __init__(
             unique=unique,
             process=process_value,
             strip=strip,
-            canonicalized=canonicalize,
+            canonicalized=not canonicalize,
         )
         self.allow_res = [
             x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)

diff --git a/tests/test_linkextractors.py b/tests/test_linkextractors.py
@@ -745,6 +745,118 @@ def test_pickle_extractor(self):
             lx = self.extractor_cls()
             self.assertIsInstance(pickle.loads(pickle.dumps(lx)), self.extractor_cls)
 
+        def test_link_extractor_aggregation(self):
+            """When a parameter like restrict_css is used, the underlying
+            implementation calls its internal link extractor once per selector
+            matching the specified restrictions, and then aggregates the
+            extracted links.
+
+            Test that aggregation respects the unique and canonicalize
+            parameters.
+            """
+            # unique=True (default), canonicalize=False (default)
+            lx = self.extractor_cls(restrict_css=("div",))
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=True (default), canonicalize=True
+            lx = self.extractor_cls(restrict_css=("div",), canonicalize=True)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                ],
+            )
+
+            # unique=False, canonicalize=False (default)
+            lx = self.extractor_cls(restrict_css=("div",), unique=False)
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?b=2&a=1", text="b2"),
+                ],
+            )
+
+            # unique=False, canonicalize=True
+            lx = self.extractor_cls(
+                restrict_css=("div",), unique=False, canonicalize=True
+            )
+            response = HtmlResponse(
+                "https://example.com",
+                body=b"""
+                    <div>
+                        <a href="/a">a1</a>
+                        <a href="/b?a=1&b=2">b1</a>
+                    </div>
+                    <div>
+                        <a href="/a">a2</a>
+                        <a href="/b?b=2&a=1">b2</a>
+                    </div>
+                """,
+            )
+            actual = lx.extract_links(response)
+            self.assertEqual(
+                actual,
+                [
+                    Link(url="https://example.com/a", text="a1"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b1"),
+                    Link(url="https://example.com/a", text="a2"),
+                    Link(url="https://example.com/b?a=1&b=2", text="b2"),
+                ],
+            )
+
 
 class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
     extractor_cls = LxmlLinkExtractor