From b2c5f11beb3a26c4059997246ddf4844383a6334 Mon Sep 17 00:00:00 2001 From: facelessuser Date: Fri, 10 Sep 2021 10:28:10 -0600 Subject: [PATCH] Allow `:is()`, `:has()`, and `:where()` to forgive empty slots Resolves #122 --- .gitignore | 2 + docs/src/markdown/about/changelog.md | 9 +++ mkdocs.yml | 4 +- requirements/docs.txt | 1 - soupsieve/__meta__.py | 2 +- soupsieve/css_match.py | 3 + soupsieve/css_parser.py | 111 +++++++++++++++++++-------- tests/test_level3/test_not.py | 21 +++++ tests/test_level4/test_has.py | 58 +++++++++----- tests/test_level4/test_is.py | 45 +++++++++-- 10 files changed, 194 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index 3df67274..06ddcc83 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.DS_Store + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md index 3e07a9e9..3d84535d 100644 --- a/docs/src/markdown/about/changelog.md +++ b/docs/src/markdown/about/changelog.md @@ -1,5 +1,14 @@ # Changelog +## 2.3 + +- **NEW**: `:has()`, `:is()`, and `:where()` now use a forgiving selector list. While not as forgiving of + syntax errors as CSS might be, it will forgive such things as empty sets and empty slots due to multiple consecutive + commas, leading commas, or trailing commas. Essentially, these pseudo-classes will match all non-empty selectors and + ignore empty ones. As the scraping environment is different than a browser environment, it was chosen not to + aggressively forgive bad syntax and invalid features to ensure the user is alerted that their program may not perform + as expected. + ## 2.2.1 - **FIX**: Fix an issue with namespaces when one of the keys is `self`. 
diff --git a/mkdocs.yml b/mkdocs.yml index 32101df6..e4a7f4b8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -105,6 +105,6 @@ plugins: - search: separator: '[:\s\-]+' - git-revision-date-localized - - minify: - minify_html: true + # - minify: + # minify_html: true - mkdocs_pymdownx_material_extras diff --git a/requirements/docs.txt b/requirements/docs.txt index d09c37ef..d664d245 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,4 +1,3 @@ mkdocs_pymdownx_material_extras==1.2.2 mkdocs-git-revision-date-localized-plugin -mkdocs-minify-plugin pyspelling diff --git a/soupsieve/__meta__.py b/soupsieve/__meta__.py index eb145789..1148846c 100644 --- a/soupsieve/__meta__.py +++ b/soupsieve/__meta__.py @@ -188,5 +188,5 @@ def parse_version(ver): return Version(major, minor, micro, release, pre, post, dev) -__version_info__ = Version(2, 2, 1, "final") +__version_info__ = Version(2, 3, 0, ".dev") __version__ = __version_info__._get_canonical() diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py index a9eeaad2..74c9ff4a 100644 --- a/soupsieve/css_match.py +++ b/soupsieve/css_match.py @@ -784,6 +784,9 @@ def match_relations(self, el, relation): found = False + if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None: + return found + if relation[0].rel_type.startswith(':'): found = self.match_future_relations(el, relation) else: diff --git a/soupsieve/css_parser.py b/soupsieve/css_parser.py index 462aa947..73475d98 100644 --- a/soupsieve/css_parser.py +++ b/soupsieve/css_parser.py @@ -196,6 +196,7 @@ FLG_IN_RANGE = 0x80 FLG_OUT_OF_RANGE = 0x100 FLG_PLACEHOLDER_SHOWN = 0x200 +FLG_FORGIVE = 0x400 # Maximum cached patterns to store _MAXCACHE = 500 @@ -715,11 +716,14 @@ def parse_pseudo_open(self, sel, name, has_selector, iselector, index): flags = FLG_PSEUDO | FLG_OPEN if name == ':not': flags |= FLG_NOT - if name == ':has': - flags |= FLG_RELATIVE + elif name == ':has': + flags |= FLG_RELATIVE | FLG_FORGIVE + elif name in (':where', 
':is'): + flags |= FLG_FORGIVE sel.selectors.append(self.parse_selectors(iselector, index, flags)) has_selector = True + return has_selector def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index): @@ -731,12 +735,9 @@ def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index) if combinator == COMMA_COMBINATOR: if not has_selector: # If we've not captured any selector parts, the comma is either at the beginning of the pattern - # or following another comma, both of which are unexpected. Commas must split selectors. - raise SelectorSyntaxError( - "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), - self.pattern, - index - ) + # or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class. + sel.no_match = True + sel.rel_type = rel_type selectors[-1].relations.append(sel) rel_type = ":" + WS_COMBINATOR @@ -757,41 +758,50 @@ def parse_has_combinator(self, sel, m, has_selector, selectors, rel_type, index) self.pattern, index ) + # Set the leading combinator for the next selector. 
rel_type = ':' + combinator - sel = _Selector() + sel = _Selector() has_selector = False return has_selector, sel, rel_type - def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, index): + def parse_combinator(self, sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index): """Parse combinator tokens.""" combinator = m.group('relation').strip() if not combinator: combinator = WS_COMBINATOR if not has_selector: - raise SelectorSyntaxError( - "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), - self.pattern, - index - ) + if not is_forgive or combinator != COMMA_COMBINATOR: + raise SelectorSyntaxError( + "The combinator '{}' at postion {}, must have a selector before it".format(combinator, index), + self.pattern, + index + ) - if combinator == COMMA_COMBINATOR: - if not sel.tag and not is_pseudo: - # Implied `*` - sel.tag = ct.SelectorTag('*', None) - sel.relations.extend(relations) - selectors.append(sel) - del relations[:] + # If we are in a forgiving pseudo class, just make the selector a "no match" + if combinator == COMMA_COMBINATOR: + sel.no_match = True + del relations[:] + selectors.append(sel) else: - sel.relations.extend(relations) - sel.rel_type = combinator - del relations[:] - relations.append(sel) - sel = _Selector() + if combinator == COMMA_COMBINATOR: + if not sel.tag and not is_pseudo: + # Implied `*` + sel.tag = ct.SelectorTag('*', None) + sel.relations.extend(relations) + selectors.append(sel) + del relations[:] + else: + sel.relations.extend(relations) + sel.rel_type = combinator + del relations[:] + relations.append(sel) + sel = _Selector() has_selector = False + return has_selector, sel def parse_class_id(self, sel, m, has_selector): @@ -862,12 +872,15 @@ def parse_pseudo_dir(self, sel, m, has_selector): def parse_selectors(self, iselector, index=0, flags=0): """Parse selectors.""" + # Initialize important variables sel = _Selector() selectors = [] 
has_selector = False closed = False relations = [] rel_type = ":" + WS_COMBINATOR + + # Setup various flags is_open = bool(flags & FLG_OPEN) is_pseudo = bool(flags & FLG_PSEUDO) is_relative = bool(flags & FLG_RELATIVE) @@ -878,7 +891,9 @@ def parse_selectors(self, iselector, index=0, flags=0): is_in_range = bool(flags & FLG_IN_RANGE) is_out_of_range = bool(flags & FLG_OUT_OF_RANGE) is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN) + is_forgive = bool(flags & FLG_FORGIVE) + # Print out useful debug stuff if self.debug: # pragma: no cover if is_pseudo: print(' is_pseudo: True') @@ -900,7 +915,10 @@ def parse_selectors(self, iselector, index=0, flags=0): print(' is_out_of_range: True') if is_placeholder_shown: print(' is_placeholder_shown: True') + if is_forgive: + print(' is_forgive: True') + # The algorithm for relative selectors require an initial selector in the selector list if is_relative: selectors.append(_Selector()) @@ -929,11 +947,13 @@ def parse_selectors(self, iselector, index=0, flags=0): is_html = True elif key == 'pseudo_close': if not has_selector: - raise SelectorSyntaxError( - "Expected a selector at postion {}".format(m.start(0)), - self.pattern, - m.start(0) - ) + if not is_forgive: + raise SelectorSyntaxError( + "Expected a selector at postion {}".format(m.start(0)), + self.pattern, + m.start(0) + ) + sel.no_match = True if is_open: closed = True break @@ -950,7 +970,7 @@ def parse_selectors(self, iselector, index=0, flags=0): ) else: has_selector, sel = self.parse_combinator( - sel, m, has_selector, selectors, relations, is_pseudo, index + sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index ) elif key == 'attribute': has_selector = self.parse_attribute_selector(sel, m, has_selector) @@ -969,6 +989,7 @@ def parse_selectors(self, iselector, index=0, flags=0): except StopIteration: pass + # Handle selectors that are not closed if is_open and not closed: raise SelectorSyntaxError( "Unclosed pseudo-class at position 
{}".format(index), @@ -976,6 +997,7 @@ def parse_selectors(self, iselector, index=0, flags=0): index ) + # Cleanup completed selector piece if has_selector: if not sel.tag and not is_pseudo: # Implied `*` @@ -987,8 +1009,28 @@ def parse_selectors(self, iselector, index=0, flags=0): sel.relations.extend(relations) del relations[:] selectors.append(sel) - else: + + # Forgive empty slots in pseudo-classes that have lists (and are forgiving) + elif is_forgive: + if is_relative: + # Handle relative selectors pseudo-classes with empty slots like `:has()` + if selectors and selectors[-1].rel_type is None and rel_type == ': ': + sel.rel_type = rel_type + sel.no_match = True + selectors[-1].relations.append(sel) + has_selector = True + else: + # Handle normal pseudo-classes with empty slots + if not selectors or not relations: + # Others like `:is()` etc. + sel.no_match = True + del relations[:] + selectors.append(sel) + has_selector = True + + if not has_selector: # We will always need to finish a selector when `:has()` is used as it leads with combining. + # May apply to others as well. raise SelectorSyntaxError( 'Expected a selector at position {}'.format(index), self.pattern, @@ -1009,6 +1051,7 @@ def parse_selectors(self, iselector, index=0, flags=0): if is_placeholder_shown: selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN + # Return selector list return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html) def selector_iter(self, pattern): diff --git a/tests/test_level3/test_not.py b/tests/test_level3/test_not.py index c3b3c989..943fc106 100644 --- a/tests/test_level3/test_not.py +++ b/tests/test_level3/test_not.py @@ -1,6 +1,7 @@ """Test not selectors.""" from .. 
import util from bs4 import BeautifulSoup as BS +from soupsieve import SelectorSyntaxError class TestNot(util.TestCase): @@ -55,3 +56,23 @@ def test_none_inputs(self): soup = BS('text', 'html.parser') soup.span['foo'] = None self.assertEqual(len(soup.select('span:not([foo])')), 0) + + def test_invalid_pseudo_empty(self): + """Test pseudo class group with empty set.""" + + self.assert_raises(':not()', SelectorSyntaxError) + + def test_invalid_pseudo_trailing_comma(self): + """Test pseudo class group with trailing comma.""" + + self.assert_raises(':not(.class,)', SelectorSyntaxError) + + def test_invalid_pseudo_leading_comma(self): + """Test pseudo class group with leading comma.""" + + self.assert_raises(':not(,.class)', SelectorSyntaxError) + + def test_invalid_pseudo_multi_comma(self): + """Test pseudo class group with multiple commas.""" + + self.assert_raises(':not(.this,,.that)', SelectorSyntaxError) diff --git a/tests/test_level4/test_has.py b/tests/test_level4/test_has.py index d1eccd0a..52c68d2a 100644 --- a/tests/test_level4/test_has.py +++ b/tests/test_level4/test_has.py @@ -129,20 +129,50 @@ def test_has_nested_pseudo(self): flags=util.HTML ) - def test_invalid_incomplete_has(self): - """Test `:has()` fails with just a combinator.""" + def test_has_empty(self): + """Test has with an empty selector list.""" - self.assert_raises(':has(>)', SelectorSyntaxError) + self.assert_selector( + self.MARKUP2, + 'div:has()', + [], + flags=util.HTML + ) - def test_invalid_has_empty(self): - """Test `:has()` fails with empty function parameters.""" + def test_has_multi_commas(self): + """Test has with empty slot due to multiple commas.""" - self.assert_raises(':has()', SelectorSyntaxError) + self.assert_selector( + self.MARKUP2, + 'div:has(> .bbbb, .ffff, , .jjjj)', + ['0', '4', '8'], + flags=util.HTML + ) - def test_invalid_has_double_comma(self): - """Test `:has()` fails with consecutive commas.""" + def test_has_leading_commas(self): + """Test has with 
empty slot due to leading commas.""" - self.assert_raises(':has(> has,, a)', SelectorSyntaxError) + self.assert_selector( + self.MARKUP2, + 'div:has(, > .bbbb, .ffff, .jjjj)', + ['0', '4', '8'], + flags=util.HTML + ) + + def test_has_trailing_commas(self): + """Test has with empty slot due to trailing commas.""" + + self.assert_selector( + self.MARKUP2, + 'div:has(> .bbbb, .ffff, .jjjj, )', + ['0', '4', '8'], + flags=util.HTML + ) + + def test_invalid_incomplete_has(self): + """Test `:has()` fails with just a combinator.""" + + self.assert_raises(':has(>)', SelectorSyntaxError) def test_invalid_has_double_combinator(self): """Test `:has()` fails with consecutive combinators.""" @@ -155,13 +185,3 @@ def test_invalid_has_trailing_combinator(self): """Test `:has()` fails with trailing combinator.""" self.assert_raises(':has(> has >)', SelectorSyntaxError) - - def test_invalid_has_trailing_comma(self): - """Test `:has()` fails with trailing comma.""" - - self.assert_raises(':has(> has,)', SelectorSyntaxError) - - def test_invalid_has_start_comma(self): - """Test `:has()` fails with trailing comma.""" - - self.assert_raises(':has(, p)', SelectorSyntaxError) diff --git a/tests/test_level4/test_is.py b/tests/test_level4/test_is.py index f4598fea..e630e388 100644 --- a/tests/test_level4/test_is.py +++ b/tests/test_level4/test_is.py @@ -24,6 +24,46 @@ def test_is(self): flags=util.HTML ) + def test_is_multi_comma(self): + """Test multiple selectors but with an empty slot due to multiple commas.""" + + self.assert_selector( + self.MARKUP, + ":is(span, , a)", + ["1", "2"], + flags=util.HTML + ) + + def test_is_leading_comma(self): + """Test multiple selectors but with an empty slot due to leading commas.""" + + self.assert_selector( + self.MARKUP, + ":is(, span, a)", + ["1", "2"], + flags=util.HTML + ) + + def test_is_trailing_comma(self): + """Test multiple selectors but with an empty slot due to trailing commas.""" + + self.assert_selector( + self.MARKUP, + ":is(span, a, 
)", + ["1", "2"], + flags=util.HTML + ) + + def test_is_empty(self): + """Test empty `:is()` selector list.""" + + self.assert_selector( + self.MARKUP, + ":is()", + [], + flags=util.HTML + ) + def test_nested_is(self): """Test multiple nested selectors.""" @@ -85,11 +125,6 @@ def test_invalid_pseudo_orphan_close(self): self.assert_raises('div)', SelectorSyntaxError) - def test_invalid_pseudo_dangling_comma(self): - """Test pseudo class group with trailing comma.""" - - self.assert_raises(':is(div,)', SelectorSyntaxError) - def test_invalid_pseudo_open(self): """Test invalid pseudo close."""