Skip to content

Commit

Permalink
Add tests, fix canonicalize passing
Browse files Browse the repository at this point in the history
  • Loading branch information
Gallaecio committed Feb 16, 2024
1 parent bccb4cf commit 5e51417
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 1 deletion.
2 changes: 1 addition & 1 deletion scrapy/linkextractors/lxmlhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def __init__(
unique=unique,
process=process_value,
strip=strip,
- canonicalized=canonicalize,
+ canonicalized=not canonicalize,
)
self.allow_res = [
x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
Expand Down
112 changes: 112 additions & 0 deletions tests/test_linkextractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,118 @@ def test_pickle_extractor(self):
lx = self.extractor_cls()
self.assertIsInstance(pickle.loads(pickle.dumps(lx)), self.extractor_cls)

def test_link_extractor_aggregation(self):
    """Check how links gathered from several restricted regions are merged.

    With a restriction such as restrict_css, the underlying implementation
    runs its internal link extractor once per selector matching the
    restriction and then aggregates the extracted links.

    The aggregated result must respect the unique and canonicalize
    parameters.
    """
    # Two <div> regions: "/a" is repeated verbatim, while the two "/b"
    # links differ only in the order of their query arguments.
    page = (
        b"\n"
        b"<div>\n"
        b'<a href="/a">a1</a>\n'
        b'<a href="/b?a=1&b=2">b1</a>\n'
        b"</div>\n"
        b"<div>\n"
        b'<a href="/a">a2</a>\n'
        b'<a href="/b?b=2&a=1">b2</a>\n'
        b"</div>\n"
    )

    # Each scenario pairs the extra extractor options with the expected
    # (url, text) pairs, in extraction order.
    scenarios = (
        # unique=True (default), canonicalize=False (default)
        (
            {},
            (
                ("https://example.com/a", "a1"),
                ("https://example.com/b?a=1&b=2", "b1"),
                ("https://example.com/b?b=2&a=1", "b2"),
            ),
        ),
        # unique=True (default), canonicalize=True
        (
            {"canonicalize": True},
            (
                ("https://example.com/a", "a1"),
                ("https://example.com/b?a=1&b=2", "b1"),
            ),
        ),
        # unique=False, canonicalize=False (default)
        (
            {"unique": False},
            (
                ("https://example.com/a", "a1"),
                ("https://example.com/b?a=1&b=2", "b1"),
                ("https://example.com/a", "a2"),
                ("https://example.com/b?b=2&a=1", "b2"),
            ),
        ),
        # unique=False, canonicalize=True
        (
            {"unique": False, "canonicalize": True},
            (
                ("https://example.com/a", "a1"),
                ("https://example.com/b?a=1&b=2", "b1"),
                ("https://example.com/a", "a2"),
                ("https://example.com/b?a=1&b=2", "b2"),
            ),
        ),
    )

    for options, wanted in scenarios:
        # A fresh extractor and a fresh response per scenario, so no state
        # is shared between the four checks.
        extractor = self.extractor_cls(restrict_css=("div",), **options)
        response = HtmlResponse("https://example.com", body=page)
        found = extractor.extract_links(response)
        self.assertEqual(
            found,
            [Link(url=url, text=text) for url, text in wanted],
        )


class LxmlLinkExtractorTestCase(Base.LinkExtractorTestCase):
extractor_cls = LxmlLinkExtractor
Expand Down

0 comments on commit 5e51417

Please sign in to comment.