From 5fcf00da5ab1bfb64e1bbdd066f061c7daff7a7a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Thu, 1 Dec 2022 15:33:59 -0700 Subject: [PATCH] Tag Processor: Add bookmark system for tracking semantic locations in document It can be helpful to track a location in an HTML document while updates are being made to it such that we can instruct the Tag Processor to seek to the location of one of the bookmarks. In this patch we're introducing a bookmarks system to do just that. Bookmarks are referenced by name and handled internally by a tracking object which will follow all updates made to the document. It will be possible to rewind or jump around a document by setting a bookmark and then calling `seek( $bookmark_name )` to move there. Co-authored-by: Adam Zielinski Co-authored-by: Dennis Snell --- lib/experimental/html/class-wp-html-span.php | 52 +++ .../html/class-wp-html-tag-processor.php | 279 +++++++++++-- lib/experimental/html/index.php | 1 + .../wp-html-tag-processor-bookmark-test.php | 370 ++++++++++++++++++ phpunit/html/wp-html-tag-processor-test.php | 30 +- 5 files changed, 684 insertions(+), 48 deletions(-) create mode 100644 lib/experimental/html/class-wp-html-span.php create mode 100644 phpunit/html/wp-html-tag-processor-bookmark-test.php diff --git a/lib/experimental/html/class-wp-html-span.php b/lib/experimental/html/class-wp-html-span.php new file mode 100644 index 0000000000000..39e603662b17b --- /dev/null +++ b/lib/experimental/html/class-wp-html-span.php @@ -0,0 +1,52 @@ +start = $start; + $this->end = $end; + } +} diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php index 5dc5981ef15f7..affbb6fb27b5c 100644 --- a/lib/experimental/html/class-wp-html-tag-processor.php +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -180,6 +180,25 @@ * @since 6.2.0 */ class WP_HTML_Tag_Processor { + /** + * The maximum number of bookmarks allowed to exist at + * any given time. + * + * @see set_bookmark(); + * @since 6.2.0 + * @var int + */ + const MAX_BOOKMARKS = 10; + + /** + * Maximum number of times seek() can be called. + * Prevents accidental infinite loops. + * + * @see seek() + * @since 6.2.0 + * @var int + */ + const MAX_SEEK_OPS = 1000; /** * The HTML document to parse. @@ -349,11 +368,11 @@ class WP_HTML_Tag_Processor { * * Example: * - * // Add the `WP-block-group` class, remove the `WP-group` class. - * $class_changes = [ + * // Add the `wp-block-group` class, remove the `wp-group` class. + * $classname_updates = [ * // Indexed by a comparable class name - * 'wp-block-group' => new WP_Class_Name_Operation( 'WP-block-group', WP_Class_Name_Operation::ADD ), - * 'wp-group' => new WP_Class_Name_Operation( 'WP-group', WP_Class_Name_Operation::REMOVE ) + * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS, + * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS * ]; * * @@ -362,6 +381,15 @@ class WP_HTML_Tag_Processor { */ private $classname_updates = array(); + /** + * Tracks a semantic location in the original HTML which + * shifts with updates as they are applied to the document. + * + * @since 6.2.0 + * @var WP_HTML_Span[] + */ + private $bookmarks = array(); + const ADD_CLASS = true; const REMOVE_CLASS = false; const SKIP_CLASS = null; @@ -396,6 +424,16 @@ class WP_HTML_Tag_Processor { */ private $attribute_updates = array(); + /** + * Tracks how many times we've performed a `seek()` + * so that we can prevent accidental infinite loops. + * + * @see seek + * @since 6.2.0 + * @var int + */ + private $seek_count = 0; + /** * Constructor. * @@ -479,6 +517,123 @@ public function next_tag( $query = null ) { return true; } + + /** + * Sets a bookmark in the HTML document. + * + * Bookmarks represent specific places or tokens in the HTML + * document, such as a tag opener or closer. When applying + * edits to a document, such as setting an attribute, the + * text offsets of that token may shift; the bookmark is + * kept updated with those shifts and remains stable unless + * the entire span of text in which the token sits is removed. + * + * Release bookmarks when they are no longer needed. + * + * Example: + * ``` + *

Surprising fact you may not know!

+ * ^ ^ + * \-|-- this `H2` opener bookmark tracks the token + * + *

Surprising fact you may no… + * ^ ^ + * \-|-- it shifts with edits + * ``` + * + * Bookmarks provide the ability to seek to a previously-scanned + * place in the HTML document. This avoids the need to re-scan + * the entire thing. + * + * Example: + * ``` + *
  • One
  • Two
  • Three
+ * ^^^^ + * want to note this last item + * + * $p = new WP_HTML_Tag_Processor( $html ); + * $in_list = false; + * while ( $p->next_tag( [ 'tag_closers' => $in_list ? 'visit' : 'skip' ] ) ) { + * if ( 'UL' === $p->get_tag() ) { + * if ( $p->is_tag_closer() ) { + * $in_list = false; + * $p->set_bookmark( 'resume' ); + * if ( $p->seek( 'last-li' ) ) { + * $p->add_class( 'last-li' ); + * } + * $p->seek( 'resume' ); + * $p->release_bookmark( 'last-li' ); + * $p->release_bookmark( 'resume' ); + * } else { + * $in_list = true; + * } + * } + * + * if ( 'LI' === $p->get_tag() ) { + * $p->set_bookmark( 'last-li' ); + * } + * } + * ``` + * + * Because bookmarks maintain their position they don't + * expose any internal offsets for the HTML document + * and can't be used with normal string functions. + * + * Because bookmarks allocate memory and require processing + * for every applied update they are limited and require + * a name. They should not be created inside a loop. + * + * Bookmarks are a powerful tool to enable complicated behavior; + * consider double-checking that you need this tool if you are + * reaching for it, as inappropriate use could lead to broken + * HTML structure or unwanted processing overhead. + * + * @param string $name Identifies this particular bookmark. + * @return false|void + * @throws Exception Throws on invalid bookmark name if WP_DEBUG set. + */ + public function set_bookmark( $name ) { + if ( null === $this->tag_name_starts_at ) { + return false; + } + + if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) { + if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) { + throw new Exception( "Tried to jump to a non-existent HTML bookmark {$name}." ); + } + return false; + } + + $this->bookmarks[ $name ] = new WP_HTML_Span( + $this->tag_name_starts_at - 1, + $this->tag_ends_at + ); + + return true; + } + + + /** + * Removes a bookmark if you no longer need to use it. + * + * Releasing a bookmark frees up the small performance + * overhead they require, mainly in the form of compute + * costs when modifying the document. + * + * @param string $name Name of the bookmark to remove. + * @return bool + */ + public function release_bookmark( $name ) { + if ( ! array_key_exists( $name, $this->bookmarks ) ) { + return false; + } + + unset( $this->bookmarks[ $name ] ); + + return true; + } + + /** * Skips the contents of the title and textarea tags until an appropriate * tag closer is found. @@ -1104,9 +1259,77 @@ private function apply_attributes_updates() { $this->updated_bytes = $diff->end; } + foreach ( $this->bookmarks as $bookmark ) { + /** + * As we loop through $this->attribute_updates, we keep comparing + * $bookmark->start and $bookmark->end to $diff->start. We can't + * change it and still expect the correct result, so let's accumulate + * the deltas separately and apply them all at once after the loop. + */ + $head_delta = 0; + $tail_delta = 0; + + foreach ( $this->attribute_updates as $diff ) { + $update_head = $bookmark->start >= $diff->start; + $update_tail = $bookmark->end >= $diff->start; + + if ( ! $update_head && ! $update_tail ) { + break; + } + + $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); + + if ( $update_head ) { + $head_delta += $delta; + } + + if ( $update_tail ) { + $tail_delta += $delta; + } + } + + $bookmark->start += $head_delta; + $bookmark->end += $tail_delta; + } + $this->attribute_updates = array(); } + /** + * Move the current pointer in the Tag Processor to a given bookmark's location. + * + * In order to prevent accidental infinite loops, there's a + * maximum limit on the number of times seek() can be called. + * + * @param string $bookmark_name Jump to the place in the document identified by this bookmark name. + * @return bool + * @throws Exception Throws on invalid bookmark name if WP_DEBUG set. + */ + public function seek( $bookmark_name ) { + if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { + if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) { + throw new Exception( 'Invalid bookmark name' ); + } + return false; + } + + if ( ++$this->seek_count > self::MAX_SEEK_OPS ) { + if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) { + throw new Exception( 'Too many calls to seek() - this can lead to performance issues.' ); + } + return false; + } + + // Flush out any pending updates to the document. + $this->get_updated_html(); + + // Point this tag processor before the sought tag opener and consume it. + $this->parsed_bytes = $this->bookmarks[ $bookmark_name ]->start; + $this->updated_bytes = $this->parsed_bytes; + $this->updated_html = substr( $this->html, 0, $this->updated_bytes ); + return $this->next_tag(); + } + /** * Sort function to arrange objects with a start property in ascending order. * @@ -1411,47 +1634,31 @@ public function __toString() { * @return string The processed HTML. */ public function get_updated_html() { - // Short-circuit if there are no updates to apply. + // Short-circuit if there are no new updates to apply. if ( ! count( $this->classname_updates ) && ! count( $this->attribute_updates ) ) { return $this->updated_html . substr( $this->html, $this->updated_bytes ); } - /* - * Parsing is in progress – let's apply the attribute updates without moving on to the next tag. - * - * In practice: - * 1. Apply the attributes updates to the original HTML - * 2. Replace the original HTML with the updated HTML - * 3. Point this tag processor to the current tag name's end in that updated HTML - */ - - // Find tag name's end in the updated markup. - $markup_updated_up_to_a_tag_name_end = $this->updated_html . substr( $this->html, $this->updated_bytes, $this->tag_name_starts_at + $this->tag_name_length - $this->updated_bytes ); - $updated_tag_name_ends_at = strlen( $markup_updated_up_to_a_tag_name_end ); - $updated_tag_name_starts_at = $updated_tag_name_ends_at - $this->tag_name_length; + // Otherwise: apply the updates, rewind before the current tag, and parse it again. + $delta_between_updated_html_end_and_current_tag_end = substr( + $this->html, + $this->updated_bytes, + $this->tag_name_starts_at + $this->tag_name_length - $this->updated_bytes + ); + $updated_html_up_to_current_tag_name_end = $this->updated_html . $delta_between_updated_html_end_and_current_tag_end; - // Apply attributes updates. - $this->updated_html = $markup_updated_up_to_a_tag_name_end; - $this->updated_bytes = $this->tag_name_starts_at + $this->tag_name_length; + // 1. Apply the attributes updates to the original HTML $this->class_name_updates_to_attributes_updates(); $this->apply_attributes_updates(); - // Replace $this->html with the updated markup. - $this->html = $this->updated_html . substr( $this->html, $this->updated_bytes ); + // 2. Replace the original HTML with the updated HTML + $this->html = $this->updated_html . substr( $this->html, $this->updated_bytes ); + $this->updated_html = $updated_html_up_to_current_tag_name_end; + $this->updated_bytes = strlen( $this->updated_html ); - // Rewind this processor to the tag name's end. - $this->tag_name_starts_at = $updated_tag_name_starts_at; - $this->parsed_bytes = $updated_tag_name_ends_at; - - // Restore the previous version of the updated_html as we are not finished with the current_tag yet. - $this->updated_html = $markup_updated_up_to_a_tag_name_end; - $this->updated_bytes = $updated_tag_name_ends_at; - - // Parse the attributes in the updated markup. - $this->attributes = array(); - while ( $this->parse_next_attribute() ) { - continue; - } + // 3. Point this tag processor at the original tag opener and consume it + $this->parsed_bytes = strlen( $updated_html_up_to_current_tag_name_end ) - $this->tag_name_length - 2; + $this->next_tag(); return $this->html; } diff --git a/lib/experimental/html/index.php b/lib/experimental/html/index.php index e7d41f8cdf486..a31dbaf48c6b2 100644 --- a/lib/experimental/html/index.php +++ b/lib/experimental/html/index.php @@ -7,5 +7,6 @@ // All class files necessary for the HTML Tag Processor. require_once __DIR__ . '/class-wp-html-attribute-token.php'; +require_once __DIR__ . '/class-wp-html-span.php'; require_once __DIR__ . '/class-wp-html-text-replacement.php'; require_once __DIR__ . '/class-wp-html-tag-processor.php'; diff --git a/phpunit/html/wp-html-tag-processor-bookmark-test.php b/phpunit/html/wp-html-tag-processor-bookmark-test.php new file mode 100644 index 0000000000000..e1c4b005ce47c --- /dev/null +++ b/phpunit/html/wp-html-tag-processor-bookmark-test.php @@ -0,0 +1,370 @@ +
  • One
  • Two
  • Three
  • ' ); + $p->next_tag( 'li' ); + $this->assertTrue( $p->set_bookmark( 'first li' ), 'Could not allocate a "first li" bookmark.' ); + $p->next_tag( 'li' ); + $this->assertTrue( $p->set_bookmark( 'second li' ), 'Could not allocate a "second li" bookmark.' ); + $this->assertTrue( $p->set_bookmark( 'first li' ), 'Could not move the "first li" bookmark.' ); + } + + /** + * @ticket 56299 + * + * @covers release_bookmark + */ + public function test_release_bookmark() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + $this->assertFalse( $p->release_bookmark( 'first li' ), 'Released a non-existing bookmark.' ); + $p->set_bookmark( 'first li' ); + $this->assertTrue( $p->release_bookmark( 'first li' ), 'Could not release a bookmark.' ); + } + + /** + * @ticket 56299 + * + * @covers seek + * @covers set_bookmark + */ + public function test_seek() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + $p->set_bookmark( 'first li' ); + + $p->next_tag( 'li' ); + $p->set_attribute( 'foo-2', 'bar-2' ); + + $p->seek( 'first li' ); + $p->set_attribute( 'foo-1', 'bar-1' ); + + $this->assertEquals( + '
    • One
    • Two
    • Three
    ', + $p->get_updated_html() + ); + } + + /** + * WP_HTML_Tag_Processor used to test for the diffs affecting + * the adjusted bookmark position while simultaneously adjusting + * the bookmark in question. As a result, updating the bookmarks + * of a next tag while removing two subsequent attributes in + * a previous tag unfolded like this: + * + * 1. Check if the first removed attribute is before the bookmark: + * + * + * ^-------------------^ ^ + * diff applied here the bookmark is here + * + * (Yes it is) + * + * 2. Move the bookmark to the left by the attribute length: + * + * + * ^ + * the bookmark is here + * + * 3. Check if the second removed attribute is before the bookmark: + * + * + * ^ ^-----^ + * bookmark diff + * + * This time, it isn't! + * + * The fix in the WP_HTML_Tag_Processor involves doing all the checks + * before moving the bookmark. This test is here to guard us from + * the erroneous behavior accidentally returning one day. + * + * @ticket 56299 + * + * @covers seek + * @covers set_bookmark + * @covers apply_attributes_updates + */ + public function test_removing_long_attributes_doesnt_break_seek() { + $input = << +HTML; + $p = new WP_HTML_Tag_Processor( $input ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'first' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'second' ); + + $this->assertTrue( + $p->seek( 'first' ), + 'Seek() to the first button has failed' + ); + $p->remove_attribute( 'twenty_one_characters' ); + $p->remove_attribute( '7_chars' ); + + $this->assertTrue( + $p->seek( 'second' ), + 'Seek() to the second button has failed' + ); + } + + /** + * @ticket 56299 + * + * @covers seek + * @covers set_bookmark + */ + public function test_bookmarks_complex_use_case() { + $input = << +
    +
    +
    + + + + + + + +
    +
    +
    +HTML; + $expected_output = << +
    +
    +
    + + + + + + + +
    +
    +
    +HTML; + $p = new WP_HTML_Tag_Processor( $input ); + $p->next_tag( 'div' ); + $p->next_tag( 'div' ); + $p->next_tag( 'div' ); + $p->set_bookmark( 'first div' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'first button' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'second button' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'third button' ); + $p->next_tag( 'button' ); + $p->set_bookmark( 'fourth button' ); + + $p->seek( 'first button' ); + $p->set_attribute( 'type', 'submit' ); + + $this->assertTrue( + $p->seek( 'third button' ), + 'Seek() to the third button failed' + ); + $p->remove_attribute( 'class' ); + $p->remove_attribute( 'type' ); + $p->remove_attribute( 'aria-expanded' ); + $p->set_attribute( 'id', 'rebase-and-merge' ); + $p->remove_attribute( 'data-details-container' ); + + $this->assertTrue( + $p->seek( 'first div' ), + 'Seek() to the first div failed' + ); + $p->set_attribute( 'checked', false ); + + $this->assertTrue( + $p->seek( 'fourth button' ), + 'Seek() to the fourth button failed' + ); + $p->set_attribute( 'id', 'last-button' ); + $p->remove_attribute( 'class' ); + $p->remove_attribute( 'type' ); + $p->remove_attribute( 'checked' ); + $p->remove_attribute( 'aria-label' ); + $p->remove_attribute( 'disabled' ); + $p->remove_attribute( 'data-view-component' ); + + $this->assertTrue( + $p->seek( 'second button' ), + 'Seek() to the second button failed' + ); + $p->remove_attribute( 'type' ); + $p->set_attribute( 'class', 'hx_create-pr-button' ); + + $this->assertEquals( + $expected_output, + $p->get_updated_html() + ); + } + + /** + * @ticket 56299 + * + * @covers seek + * @covers set_bookmark + */ + public function test_updates_bookmark_for_additions_after_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->add_class( 'second' ); + + $p->seek( 'first' ); + $p->add_class( 'first' ); + + $this->assertEquals( + '
    First
    Second
    ', + $p->get_updated_html() + ); + } + + /** + * @ticket 56299 + * + * @covers seek + * @covers set_bookmark + */ + public function test_updates_bookmark_for_additions_before_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->set_bookmark( 'second' ); + + $p->seek( 'first' ); + $p->add_class( 'first' ); + + $p->seek( 'second' ); + $p->add_class( 'second' ); + + $this->assertEquals( + '
    First
    Second
    ', + $p->get_updated_html() + ); + } + + /** + * @ticket 56299 + * + * @covers seek + * @covers set_bookmark + */ + public function test_updates_bookmark_for_deletions_after_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->remove_attribute( 'disabled' ); + + $p->seek( 'first' ); + $p->set_attribute( 'untouched', true ); + + $this->assertEquals( + /** @TODO: we shouldn't have to assert the extra space after removing the attribute. */ + '
    First
    Second
    ', + $p->get_updated_html() + ); + } + + /** + * @ticket 56299 + * + * @covers seek + * @covers set_bookmark + */ + public function test_updates_bookmark_for_deletions_before_both_sides() { + $p = new WP_HTML_Tag_Processor( '
    First
    Second
    ' ); + $p->next_tag(); + $p->set_bookmark( 'first' ); + $p->next_tag(); + $p->set_bookmark( 'second' ); + + $p->seek( 'first' ); + $p->remove_attribute( 'disabled' ); + + $p->seek( 'second' ); + $p->set_attribute( 'safe', true ); + + $this->assertEquals( + /** @TODO: we shouldn't have to assert the extra space after removing the attribute. */ + '
    First
    Second
    ', + $p->get_updated_html() + ); + } + + /** + * @ticket 56299 + * + * @covers set_bookmark + */ + public function test_limits_the_number_of_bookmarks() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + + $this->expectException( Exception::class ); + + for ( $i = 0;$i < WP_HTML_Tag_Processor::MAX_BOOKMARKS;$i++ ) { + $this->assertTrue( $p->set_bookmark( "bookmark $i" ), "Could not allocate the bookmark #$i" ); + } + + $this->assertFalse( $p->set_bookmark( 'final bookmark' ), "Allocated $i bookmarks, which is one above the limit." ); + } + + /** + * @ticket 56299 + * + * @covers seek + */ + public function test_limits_the_number_of_seek_calls() { + $p = new WP_HTML_Tag_Processor( '
    • One
    • Two
    • Three
    ' ); + $p->next_tag( 'li' ); + $p->set_bookmark( 'bookmark' ); + + $this->expectException( Exception::class ); + + for ( $i = 0; $i < WP_HTML_Tag_Processor::MAX_SEEK_OPS; $i++ ) { + $this->assertTrue( $p->seek( 'bookmark' ), 'Could not seek to the "bookmark"' ); + } + $this->assertFalse( $p->seek( 'bookmark' ), "$i-th seek() to the bookmark succeeded, even though it should exceed the allowed limit." ); + } +} diff --git a/phpunit/html/wp-html-tag-processor-test.php b/phpunit/html/wp-html-tag-processor-test.php index 273cbddddea4b..6a850eb7a4edb 100644 --- a/phpunit/html/wp-html-tag-processor-test.php +++ b/phpunit/html/wp-html-tag-processor-test.php @@ -1296,18 +1296,24 @@ public function data_malformed_tag() { ); $examples['Multiple unclosed tags treated as a single tag'] = array( - '
    -test', - '
    -test', + << + test +HTML + , + << + test +HTML + , ); $examples['27'] = array(