From 7e7602cc5c4f898ba4ef6f9a1cfcf3dea4fd5bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 21 Feb 2023 18:13:07 +0100 Subject: [PATCH 01/42] Explore HTML parsing and Adoption Agency Algorithm --- .../html-api/class-wp-html-processor.php | 1376 +++++++++++++++++ .../html-api/class-wp-html-tag-processor.php | 113 +- src/wp-settings.php | 1 + .../tests/html-api/wpHtmlProcessor.php | 22 + 4 files changed, 1481 insertions(+), 31 deletions(-) create mode 100644 src/wp-includes/html-api/class-wp-html-processor.php create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessor.php diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php new file mode 100644 index 0000000000000..da0c95738b2a8 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -0,0 +1,1376 @@ +tag = $tag; + $this->attributes = $attributes; + $this->is_opener = $is_opener; + $this->is_closer = ! $is_opener; + } + + public function equivalent( WP_HTML_Element $other ) { + if ( $this->is_closer !== $other->is_closer ) { + return false; + } + + if ( $this->tag !== $other->tag ) { + return false; + } + + if ( count( $this->attributes ) !== count( $other->attributes ) ) { + return false; + } + + $attributes_match = true; + foreach ( $other->attributes as $name => $value ) { + if ( ! 
/**
 * Names of the tree-construction insertion modes used by WP_HTML_Processor.
 *
 * Each constant's value is its own name as a string, so a mode can be
 * compared with `===` and printed as-is while debugging.
 */
class WP_HTML_Insertion_Mode {

	// Document-level modes.
	const INITIAL     = 'INITIAL';
	const BEFORE_HEAD = 'BEFORE_HEAD';
	const IN_HEAD     = 'IN_HEAD';
	const IN_BODY     = 'IN_BODY';
	const IN_FRAMESET = 'IN_FRAMESET';
	const TEXT        = 'TEXT';

	// Table-related modes.
	const IN_TABLE        = 'IN_TABLE';
	const IN_CAPTION      = 'IN_CAPTION';
	const IN_COLUMN_GROUP = 'IN_COLUMN_GROUP';
	const IN_TABLE_BODY   = 'IN_TABLE_BODY';
	const IN_ROW          = 'IN_ROW';
	const IN_CELL         = 'IN_CELL';

	// Select-related modes.
	const IN_SELECT          = 'IN_SELECT';
	const IN_SELECT_IN_TABLE = 'IN_SELECT_IN_TABLE';

}
$this->next_tag_in_body_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_TABLE: + // $this->next_tag_in_table_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_TABLE_BODY: + // $this->next_tag_in_table_body_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_ROW: + // $this->next_tag_in_row_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_CELL: + // $this->next_tag_in_cell_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_SELECT: + // $this->next_tag_in_select_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE: + // $this->next_tag_in_select_in_table_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_CAPTION: + // $this->next_tag_in_caption_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_COLUMN_GROUP: + // $this->next_tag_in_column_group_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::IN_FRAMESET: + // $this->next_tag_in_frameset_insertion_mode(); + // break; + // case WP_HTML_Insertion_Mode::TEXT: + // $this->next_tag_in_text_insertion_mode(); + // break; + // } + } + + public function next_tag_in_body_insertion_mode() { + $token = $this->next_token(); + if ( $token->is_opener ) { + // Should we care? 
+ // if(self::is_rcdata_element($token->tag)) { + // $this->original_insertion_mode = $this->insertion_mode; + // $this->insertion_mode = WP_HTML_Insertion_Mode::TEXT; + // } + switch ( $token->tag ) { + case 'ADDRESS': + case 'ARTICLE': + case 'ASIDE': + case 'BLOCKQUOTE': + case 'CENTER': + case 'DETAILS': + case 'DIALOG': + case 'DIR': + case 'DIV': + case 'DL': + case 'FIELDSET': + case 'FIGCAPTION': + case 'FIGURE': + case 'FOOTER': + case 'HEADER': + case 'HGROUP': + case 'MAIN': + case 'MENU': + case 'NAV': + case 'OL': + case 'P': + case 'SECTION': + case 'SUMMARY': + case 'UL': + // Ignore special rules for 'PRE' and 'LISTING' + case 'PRE': + case 'LISTING': + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + break; + // A start tag whose tag name is "h1", "h2", "h3", "h4", "h5", or "h6" + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + if ( in_array( $this->current_node()->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + $this->pop_open_element(); + } + $this->insert_element( $token ); + break; + case 'FORM': + if ( $this->form_pointer ) { + $this->ignore_token( $token ); + return $this->next_tag(); + } + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->form_pointer = $token; + $this->insert_element( $token ); + break; + case 'LI': + $i = count( $this->open_elements ) - 1; + while ( true ) { + $node = $this->open_elements[ $i ]; + if ( $node->tag === 'LI' ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'LI' ), + ) + ); + $this->pop_until_tag_name( 'LI' ); + break; + } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + break; + } else { + --$i; + $node = $this->open_elements[ $i ]; + } + } + + if ( $this->is_element_in_button_scope( 'P' ) ) { + 
$this->close_p_element(); + } + $this->insert_element( $token ); + break; + case 'DD': + case 'DT': + $i = count( $this->open_elements ) - 1; + while ( true ) { + $node = $this->open_elements[ $i ]; + if ( $node->tag === 'DD' ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'DD' ), + ) + ); + $this->pop_until_tag_name( 'DD' ); + break; + } elseif ( $node->tag === 'DT' ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'DT' ), + ) + ); + $this->pop_until_tag_name( 'DT' ); + break; + } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + break; + } else { + --$i; + $node = $this->open_elements[ $i ]; + } + } + + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + break; + case 'PLAINTEXT': + throw new Exception( 'PLAINTEXT not implemented yet' ); + case 'BUTTON': + if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( 'BUTTON' ); + } + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + case 'A': + $active_a = null; + for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; --$i ) { + $elem = $this->active_formatting_elements[ $i ]; + if ( $elem->tag === 'A' ) { + $active_a = $elem; + break; + } elseif ( $elem->is_marker() ) { + break; + } + } + + if ( $active_a ) { + $this->parse_error(); + // @TODO: + // Run the adoption agency algorithm with the tag name "a". 
+ } + + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + case 'B': + case 'BIG': + case 'CODE': + case 'EM': + case 'FONT': + case 'I': + case 'S': + case 'SMALL': + case 'STRIKE': + case 'STRONG': + case 'TT': + case 'U': + $this->reconstruct_active_formatting_elements(); + $this->push_active_formatting_element( $token ); + $this->insert_element( $token ); + break; + case 'NOBR': + $this->reconstruct_active_formatting_elements(); + if ( $this->is_element_in_scope( 'NOBR' ) ) { + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + $this->reconstruct_active_formatting_elements(); + } + $this->insert_element( $token ); + $this->push_active_formatting_element( $token ); + break; + case 'APPLET': + case 'MARQUEE': + case 'OBJECT': + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + $this->active_formatting_elements[] = new WP_HTML_Element( WP_HTML_Element::MARKER ); + break; + case 'TABLE': + $this->insert_element( $token ); + $this->insertion_mode = WP_HTML_Insertion_Mode::IN_TABLE; + break; + case 'AREA': + case 'BR': + case 'EMBED': + case 'IMG': + case 'KEYGEN': + case 'WBR': + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + $this->pop_open_element(); + // @TODO: Acknowledge the token's self-closing flag, if it is set. + break; + case 'PARAM': + case 'SOURCE': + case 'TRACK': + $this->insert_element( $token ); + $this->pop_open_element(); + break; + case 'HR': + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + $this->pop_open_element(); + break; + case 'IMAGE': + $this->parse_error(); + // Change the tag name to "img" and reprocess the token. 
+ throw new Exception( 'IMAGE not implemented yet' ); + case 'TEXTAREA': + $this->insert_element( $token ); + $this->original_insertion_mode = $this->insertion_mode; + $this->insertion_mode = WP_HTML_Insertion_Mode::TEXT; + break; + + case 'XMP': + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->reconstruct_active_formatting_elements(); + // @TODO: Follow the generic raw text element parsing algorithm. + throw new Exception( 'XMP not implemented yet' ); + break; + case 'IFRAME': + case 'NOEMBED': + case 'NOSCRIPT': + // @TODO: Follow the generic raw text element parsing algorithm. + throw new Exception( $token->tag . ' not implemented yet' ); + case 'SELECT': + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + if ( in_array( + $this->insertion_mode, + array( + WP_HTML_Insertion_Mode::IN_TABLE, + WP_HTML_Insertion_Mode::IN_CAPTION, + WP_HTML_Insertion_Mode::IN_TABLE_BODY, + WP_HTML_Insertion_Mode::IN_ROW, + WP_HTML_Insertion_Mode::IN_CELL, + ) + ) ) { + $this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE; + } else { + $this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT; + } + break; + case 'OPTGROUP': + case 'OPTION': + if ( 'OPTION' === $token->tag ) { + $this->pop_open_element(); + } + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + case 'RB': + case 'RTC': + if ( $this->is_element_in_scope( 'RB' ) || $this->is_element_in_scope( 'RTC' ) ) { + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + $this->reconstruct_active_formatting_elements(); + } + $this->insert_element( $token ); + break; + case 'RP': + case 'RT': + if ( $this->is_element_in_scope( 'RP' ) || $this->is_element_in_scope( 'RT' ) ) { + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + $this->reconstruct_active_formatting_elements(); + } + $this->insert_element( $token ); + break; + case 'MATH': + throw new Exception( 
'MATH not implemented yet' ); + case 'SVG': + throw new Exception( 'SVG not implemented yet' ); + case 'CAPTION': + case 'COL': + case 'COLGROUP': + case 'FRAME': + case 'HEAD': + case 'TBODY': + case 'TD': + case 'TFOOT': + case 'TH': + case 'THEAD': + case 'TR': + $this->parse_error(); + // Ignore the token. + return; + default: + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + } + } else { + switch ( $token->tag ) { + case 'ADDRESS': + case 'ARTICLE': + case 'ASIDE': + case 'BLOCKQUOTE': + case 'CENTER': + case 'DETAILS': + case 'DIALOG': + case 'DIR': + case 'DIV': + case 'DL': + case 'FIELDSET': + case 'FIGCAPTION': + case 'FIGURE': + case 'FOOTER': + case 'HEADER': + case 'HGROUP': + case 'MAIN': + case 'MENU': + case 'NAV': + case 'OL': + case 'P': + case 'SECTION': + case 'SUMMARY': + case 'UL': + if ( $this->is_element_in_scope( $token->tag ) ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( $token->tag ); + break; + case 'FORM': + if ( $this->form_pointer ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } + if ( $this->is_element_in_scope( $this->form_pointer ) ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } + $this->generate_implied_end_tags(); + array_splice( $this->open_elements, array_search( $this->form_pointer, $this->open_elements ), 1 ); + $this->form_pointer = null; + break; + case 'P': + if ( ! $this->is_element_in_button_scope( 'P' ) ) { + // Parse error, insert an HTML element for a "p" start tag token with no attributes. 
+ $this->parse_error(); + $this->insert_element( new WP_HTML_Element( 'P', array() ) ); + } + $this->close_p_element(); + break; + case 'LI': + if ( $this->is_element_in_list_item_scope( 'LI' ) ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( 'LI' ); + break; + case 'DD': + case 'DT': + if ( $this->is_element_in_scope( $token->tag ) ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( $token->tag ); + break; + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + if ( $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ); + break; + case 'A': + case 'B': + case 'BIG': + case 'CODE': + case 'EM': + case 'FONT': + case 'I': + case 'S': + case 'SMALL': + case 'STRIKE': + case 'STRONG': + case 'TT': + case 'U': + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + break; + + case 'APPLET': + case 'MARQUEE': + case 'OBJECT': + if ( $this->is_element_in_scope( $token->tag ) ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } + $this->generate_implied_end_tags(); + if ( $this->current_node()->tag !== $token->tag ) { + $this->parse_error(); + } + $this->pop_until_tag_name( $token->tag ); + $this->clear_active_formatting_elements_up_to_last_marker(); + break; + case 'BR': + // This should never happen since Tag_Processor corrects that + default: + $i = count( $this->open_elements ) - 1; + while ( true ) { + $node = $this->open_elements[ $i ]; + if ( $node->tag === $token->tag ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 
$token->tag ), + ) + ); + $this->pop_until_node( $node ); + break; + } elseif ( $this->is_special_element( $node->tag ) ) { + $this->ignore_token( $token ); + $this->parse_error(); + return $this->next_tag(); + } else { + --$i; + } + } + break; + } + } + } + + private $element_bookmark_idx = 0; + private function next_token() { + if ( ! $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + return false; + } + + $consumed_node = new WP_HTML_Element( + $this->get_tag(), + array(), + ! $this->is_tag_closer() + ); + + $consumed_node->tag_processor_bookmark = $this->set_bookmark( + '__internal_' . ( $this->element_bookmark_idx++ ) + ); + + return $consumed_node; + } + + const ANY_OTHER_END_TAG = 1; + private function adoption_agency_algorithm( WP_HTML_Element $token ) { + $subject = $token->tag; + if ( + $this->current_node()->tag === $subject + && ! in_array( $subject, $this->active_formatting_elements, true ) + ) { + $this->pop_open_element(); + return; + } + + $outer_loop_counter = 0; + while ( ++$outer_loop_counter < 8 ) { + /* + * Let __formatting element__ be the last element in the list of active + * formatting elements that: + * - is between the end of the list and the last marker in the list, + * if any, or the start of the list otherwise, and + * - has the same tag name as the token. + */ + $formatting_element = null; + $formatting_element_idx = -1; + for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { + $candidate = $this->active_formatting_elements[ $i ]; + if ( $candidate->is_marker() ) { + break; + } + if ( $candidate->tag === $subject ) { + $formatting_element = $candidate; + $formatting_element_idx = $i; + break; + } + } + // If there is no such element, then abort these steps and instead act as + // described in the "any other end tag" entry below. 
+ if ( null === $formatting_element ) { + return self::ANY_OTHER_END_TAG; + } + + // If formatting element is not in the stack of open elements, then this is + // a parse error; remove the element from the list, and return. + if ( ! in_array( $formatting_element, $this->open_elements, true ) ) { + array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); + $this->parse_error(); + return; + } + + // If formatting element is not in scope, then this is a parse error; return + if ( ! $this->is_element_in_scope( $formatting_element->tag ) ) { + $this->parse_error(); + return; + } + + // If formatting element is not the current node, then this is a parse error. + // (But do not return.) + if ( $formatting_element !== $this->current_node() ) { + $this->parse_error(); + } + + /* + * Let furthest block be the topmost node in the stack of open elements that + * is lower in the stack than formatting element, and is an element in the + * special category. There might not be one. + */ + $furthest_block = null; + for ( $i = count( $this->open_elements ) - 1; $i >= 0; $i-- ) { + $node = $this->open_elements[ $i ]; + if ( $node === $formatting_element ) { + break; + } + if ( $this->is_special_element( $node->tag ) ) { + $furthest_block = $node; + break; + } + } + + // If there is no such node, then the UA must first pop all the nodes from + // the bottom of the stack of open elements, from the current node up to + // and including formatting element, then remove formatting element from + // the list of active formatting elements, and finally abort these steps. + if ( null === $furthest_block ) { + $this->pop_until_node( $formatting_element ); + array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); + return; + } + + // Let common ancestor be the element immediately above formatting element + // in the stack of open elements. 
+ $formatting_elem_stack_index = array_search( $formatting_element, $this->open_elements, true ); + $common_ancestor = $this->open_elements[ $formatting_elem_stack_index - 1 ]; + + // Let a bookmark note the position of formatting element in the list of + // active formatting elements relative to the elements on either side of it + // in the list. + $bookmark = $formatting_element_idx; + + // Let node and last node be furthest block. + $node = $last_node = $furthest_block; + $node_open_elements_index = array_search( $node, $this->open_elements, true ); + + $prev_node_open_elements_index = -1; + $inner_loop_counter = 0; + while ( true ) { + $inner_loop_counter++; + + /** + * Let node be the element immediately above node in the stack of open elements, + * or if node is no longer in the stack of open elements (e.g. because it got + * removed by this algorithm), the element that was immediately above node in + * the stack of open elements before node was removed. + */ + $node_open_elements_index = array_search( $node, $this->open_elements, true ); + if ( false === $node_open_elements_index ) { + $node_open_elements_index = $prev_node_open_elements_index; + return; + } + --$node_open_elements_index; + $node = $this->open_elements[ $node_open_elements_index ]; + $prev_node_open_elements_index = $node_open_elements_index; + + // If node is formatting element, then break. + if ( $node === $formatting_element ) { + break; + } + + /* + * If inner loop counter is greater than 3 and node is in the list + * of active formatting elements, then remove node from the list of + * active formatting elements. 
+ */ + if ( $inner_loop_counter > 3 && in_array( $node, $this->active_formatting_elements, true ) ) { + $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); + array_splice( $this->active_formatting_elements, $node_formatting_idx, 1 ); + } + + /* + * If node is not in the list of active formatting elements, then remove + * node from the stack of open elements and continue. + */ + if ( ! in_array( $node, $this->active_formatting_elements, true ) ) { + array_splice( $this->open_elements, $node_open_elements_index, 1 ); + continue; + } + + /* + * Create an element for the token for which the element node was created, + * in the HTML namespace, with common ancestor as the intended parent. + * + * Replace the entry for node in the list of active formatting elements with an entry + * for the new element. + * + * Replace the entry for node in the stack of open elements with an entry for + * the new element. + * + * Let node be the new element. + */ + $new_node = new WP_HTML_Element( $node->tag, array() ); + $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); + $this->active_formatting_elements[ $node_formatting_idx ] = $new_node; + + $node_open_elements_index = array_search( $node, $this->open_elements, true ); + $this->open_elements[ $node_open_elements_index ] = $new_node; + $node = $new_node; + + /* + * If last node is furthest block, then move the aforementioned bookmark to be + * immediately after the new node in the list of active formatting elements. + */ + if ( $last_node === $furthest_block ) { + $bookmark = $node_formatting_idx + 1; + } + + // Append last node to node. + // @TODO + + // Set last node to node. + $last_node = $node; + } + + // Insert whatever last node ended up being in the previous step at the appropriate place + // for inserting a node, but using common ancestor as the override target. 
+ // @TODO + + // Create an element for the token for which formatting element was created, in the HTML + // namespace, with furthest block as the intended parent. + $new_element = new WP_HTML_Element( $formatting_element->tag, array() ); + + // Take all of the child nodes of furthest block and append them to the element created in + // the last step. + // @TODO + + // Append that new element to furthest block. + // @TODO + + // Remove formatting element from the list of active formatting elements, and insert the new + // element into the list of active formatting elements at the position of the aforementioned + // bookmark. + $formatting_element_idx = array_search( $formatting_element, $this->active_formatting_elements, true ); + array_splice( $this->active_formatting_elements, $formatting_element_idx, 1, array( $new_element ) ); + array_splice( $this->active_formatting_elements, $bookmark, 0, array( $new_element ) ); + + // Remove formatting element from the stack of open elements, and insert the new element into + // the stack of open elements immediately below the position of furthest block in that stack. + $formatting_element_idx = array_search( $formatting_element, $this->active_formatting_elements, true ); + array_splice( $this->active_formatting_elements, $formatting_element_idx, 1, array( $new_element ) ); + + $furthest_block_idx = array_search( $furthest_block, $this->open_elements, true ); + array_splice( $this->open_elements, $furthest_block_idx + 1, 0, array( $new_element ) ); + } + } + + /* + @TODO Implement https://html.spec.whatwg.org/multipage/parsing.html#insert-a-foreign-element + + Let the adjusted insertion location be the appropriate place for inserting a node. + + Let element be the result of creating an element for the token in the given namespace, with the intended parent being the element in which the adjusted insertion location finds itself. 
+ + If it is possible to insert element at the adjusted insertion location, then: + + If the parser was not created as part of the HTML fragment parsing algorithm, then push a new element queue onto element's relevant agent's custom element reactions stack. + + Insert element at the adjusted insertion location. + + If the parser was not created as part of the HTML fragment parsing algorithm, then pop the element queue from element's relevant agent's custom element reactions stack, and invoke custom element reactions in that queue. + + If the adjusted insertion location cannot accept more elements, e.g. because it's a Document that already has an element child, then element is dropped on the floor. + + Push element onto the stack of open elements so that it is the new current node. + + Return element. + + */ + private function insert_html_element( $node ) { + if ( ! $node->is_closer ) { + $this->insert_element( $node ); + } + $this->inserted_tokens[] = $node; + } + + private function ignore_token( $token ) { + if ( $token->tag_processor_bookmark ) { + $this->release_bookmark( $token->tag_processor_bookmark ); + $token->tag_processor_bookmark = null; + } + return; + } + + private function insert_element( $node ) { + $this->open_elements[] = $node; + } + + private function parse_error() { + // Noop for now + } + + private function pop_until_tag_name( $tags ) { + if ( ! is_array( $tags ) ) { + $tags = array( $tags ); + } + while ( ! 
in_array( $this->current_node()->tag, $tags ) ) { + $this->pop_open_element(); + } + } + + private function pop_until_node( $node ) { + do { + $popped = $this->pop_open_element(); + } while ( $popped !== $node ); + } + + private function pop_open_element() { + $popped = array_pop( $this->open_elements ); + if ( $popped->tag_processor_bookmark ) { + $this->release_bookmark( $popped->tag_processor_bookmark ); + $popped->tag_processor_bookmark = null; + } + return $popped; + } + + private function generate_implied_end_tags( $options = null ) { + while ( $this->should_generate_implied_end_tags( $options ) ) { + yield $this->pop_open_element(); + } + } + + private function current_node() { + return end( $this->open_elements ); + } + + private function close_p_element() { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'P' ), + ) + ); + // If the current node is not a p element, then this is a parse error. + if ( $this->current_node()->tag !== 'P' ) { + $this->parse_error(); + } + $this->pop_until_tag_name( 'P' ); + } + + private function should_generate_implied_end_tags( $options = null ) { + $current_tag_name = $this->current_node()->tag; + if ( null !== $options && isset( $options['except_for'] ) && in_array( $current_tag_name, $options['except_for'] ) ) { + return false; + } + switch ( $current_tag_name ) { + case 'DD': + case 'DT': + case 'LI': + case 'OPTION': + case 'OPTGROUP': + case 'P': + case 'RB': + case 'RP': + case 'RT': + case 'RTC': + return true; + } + + $thoroughly = null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly']; + if ( $thoroughly ) { + switch ( $current_tag_name ) { + case 'TBODY': + case 'TFOOT': + case 'THEAD': + case 'TD': + case 'TH': + case 'TR': + return true; + } + } + + return false; + } + + /** + * https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements + */ + private function push_active_formatting_element( $node ) { + $count = 0; + for ( $i = count( 
$this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { + $formatting_element = $this->active_formatting_elements[ $i ]; + if ( $formatting_element->is_marker() ) { + break; + } + if ( ! $node->equivalent( $node ) ) { + continue; + } + $count++; + if ( $count === 3 ) { + array_splice( $this->active_formatting_elements, $i, 1 ); + break; + } + } + $this->active_formatting_elements[] = $node; + } + + private function reconstruct_active_formatting_elements() { + if ( empty( $this->active_formatting_elements ) ) { + return; + } + $i = count( $this->active_formatting_elements ) - 1; + $last_entry = $this->active_formatting_elements[ $i ]; + if ( $last_entry->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { + return; + } + $entry = $last_entry; + while ( true ) { + if ( $i <= 0 ) { + break; + } + --$i; + $entry = $this->active_formatting_elements[ $i ]; + if ( $entry->is_marker() || in_array( $entry, $this->open_elements, true ) ) { + break; + } + } + while ( true ) { + ++$i; + $entry = $this->active_formatting_elements[ $i ]; + if ( $entry === $last_entry ) { + break; + } + + // @TODO: + // Create: Insert an HTML element for the token for which the element entry + // was created, to obtain new element. + $new_element = new WP_HTML_Element( $entry->tag, $entry->attributes ); + + // Replace the entry for entry in the list with an entry for new element. + $index = array_search( $entry, $this->active_formatting_elements, true ); + + $this->active_formatting_elements[ $index ] = $new_element; + if ( $index === count( $this->active_formatting_elements ) - 1 ) { + break; + } + } + } + + private function clear_active_formatting_elements_up_to_last_marker() { + while ( ! 
empty( $this->active_formatting_elements ) ) { + $entry = array_pop( $this->active_formatting_elements ); + if ( $entry->is_marker() ) { + break; + } + } + } + + private function is_element_in_select_scope( $target_node ) { + return $this->is_element_in_specific_scope( + $target_node, + array( + 'optgroup', + 'option', + ), + array( + 'negative_match' => 'true', + ) + ); + } + + private function is_element_in_table_scope( $target_node ) { + return $this->is_element_in_specific_scope( + $target_node, + array( + 'html', + 'table', + 'template', + ) + ); + } + + private function is_element_in_button_scope( $target_node ) { + return $this->is_element_in_scope( + $target_node, + array( + 'button', + ) + ); + } + + private function is_element_in_list_item_scope( $target_node ) { + return $this->is_element_in_scope( + $target_node, + array( + 'li', + 'dd', + 'dt', + ) + ); + } + + private function is_element_in_scope( $target_node, $additional_elements = array() ) { + return $this->is_element_in_specific_scope( + $target_node, + array_merge( + array( + 'applet', + 'caption', + 'html', + 'table', + 'td', + 'th', + 'marquee', + 'object', + 'template', + ), + $additional_elements + ) + ); + } + + /** + * https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements + */ + private function is_element_in_specific_scope( $target_node, $element_types_list, $options = array() ) { + $negative_match = isset( $options['negative_match'] ) ? $options['negative_match'] : false; + $i = count( $this->open_elements ) - 1; + while ( true ) { + $node = $this->open_elements[ $i ]; + + if ( $node === $target_node ) { + return true; + } + + $is_in_the_list = in_array( $node->tag, $element_types_list, true ); + $failure = $negative_match ? $is_in_the_list : ! 
$is_in_the_list; + if ( $failure ) { + return false; + } + } + } + + /** + * https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately + */ + private function reset_insertion_mode() { + $last = false; + $node = end( $this->open_elements ); + + while ( true ) { + if ( count( $this->open_elements ) === 1 && $node === reset( $this->open_elements ) ) { + $last = true; + $node = $this->context_node; + } + + if ( $node->tag === 'select' ) { + if ( $last ) { + break; + } + + $ancestor = $node; + while ( true ) { + if ( $ancestor === $this->open_elements[0] ) { + break; + } + + $index = array_search( $ancestor, $this->open_elements ); + $ancestor = $this->open_elements[ $index - 1 ]; + if ( $ancestor->tag === 'template' ) { + break; + } + + if ( $ancestor->tag === 'table' ) { + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE; + return; + } + } + + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_SELECT; + return; + } + + switch ( $node->tag ) { + case 'TD': + case 'TH': + if ( ! $last ) { + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_CELL; + return; + } + break; + case 'TR': + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_ROW; + return; + case 'TBODY': + case 'THEAD': + case 'TFOOT': + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_TABLE_BODY; + return; + case 'CAPTION': + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_CAPTION; + return; + case 'COLGROUP': + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_COLUMN_GROUP; + return; + case 'TABLE': + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_TABLE; + return; + case 'TEMPLATE': + // TODO: implement the current template insertion mode + $this->insertion_mode = 0; + return; + case 'HEAD': + if ( ! 
$last ) { + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_HEAD; + return; + } + break; + case 'BODY': + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; + return; + case 'FRAMESET': + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_FRAMESET; + return; + case 'HTML': + // TODO: implement the head element pointer + $this->insertion_mode = WP_HTML_Insertion_Mode::BEFORE_HEAD; + return; + default: + if ( $last ) { + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; + return; + } + } + + $index = array_search( $node, $this->open_elements ); + $node = $this->open_elements[ $index - 1 ]; + } + + $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; + } + + + private static function is_special_element( $tag_name, $except = null ) { + if ( null !== $except && in_array( $tag_name, $except, true ) ) { + return false; + } + + switch ( $tag_name ) { + case 'ADDRESS': + case 'APPLET': + case 'AREA': + case 'ARTICLE': + case 'ASIDE': + case 'BASE': + case 'BASEFONT': + case 'BGSOUND': + case 'BLOCKQUOTE': + case 'BODY': + case 'BR': + case 'BUTTON': + case 'CAPTION': + case 'CENTER': + case 'COL': + case 'COLGROUP': + case 'DD': + case 'DETAILS': + case 'DIR': + case 'DIV': + case 'DL': + case 'DT': + case 'EMBED': + case 'FIELDSET': + case 'FIGCAPTION': + case 'FIGURE': + case 'FOOTER': + case 'FORM': + case 'FRAME': + case 'FRAMESET': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + case 'HEAD': + case 'HEADER': + case 'HGROUP': + case 'HR': + case 'HTML': + case 'IFRAME': + case 'IMG': + case 'INPUT': + case 'ISINDEX': + case 'LI': + case 'LINK': + case 'LISTING': + case 'MAIN': + case 'MARQUEE': + case 'MENU': + case 'MENUITEM': + case 'META': + case 'NAV': + case 'NOEMBED': + case 'NOFRAMES': + case 'NOSCRIPT': + case 'OBJECT': + case 'OL': + case 'P': + case 'PARAM': + case 'PLAINTEXT': + case 'PRE': + case 'SCRIPT': + case 'SECTION': + case 'SELECT': + case 'SOURCE': + case 'STYLE': + case 'SUMMARY': + case 'TABLE': 
+ case 'TBODY': + case 'TD': + case 'TEMPLATE': + case 'TEXTAREA': + case 'TFOOT': + case 'TH': + case 'THEAD': + case 'TITLE': + case 'TR': + case 'TRACK': + case 'UL': + case 'WBR': + case 'XMP': + return true; + default: + return false; + } + } + + private static function is_rcdata_element( $tag_name ) { + switch ( $tag_name ) { + case 'TITLE': + case 'TEXTAREA': + case 'STYLE': + case 'XMP': + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + case 'NOSCRIPT': + return true; + default: + return false; + } + } + + private static function is_formatting_element( $tag_name ) { + switch ( strtoupper( $tag_name ) ) { + case 'A': + case 'B': + case 'BIG': + case 'CODE': + case 'EM': + case 'FONT': + case 'I': + case 'NOBR': + case 'S': + case 'SMALL': + case 'STRIKE': + case 'STRONG': + case 'TT': + case 'U': + return true; + default: + return false; + } + } + +} + + +$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); +// The controller's schema is hardcoded, so tests would not be meaningful. +$p->parse_next(); + +// $this->tag_processor->next_tag( +// array( +// 'tag_closers' => 'visit', +// ) +// ); +// var_dump( $this->tag_processor->get_tag() ); +// var_dump( $this->tag_processor->is_tag_closer() ); +// $last_parent = end( $this->open_elements ); diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 31db41a3c86ad..68f6d213155d4 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -528,6 +528,18 @@ class WP_HTML_Tag_Processor { */ protected $lexical_updates = array(); + /** + * Attribute replacements to apply to input HTML document. + * + * Unlike more generic lexical updates, attribute updates are stored + * in an associative array, where the keys are (lowercase-normalized) + * attribute names, in order to avoid duplication. + * + * @since 6.3.0 + * @var WP_HTML_Text_Replacement[] + */ + private $attribute_updates = array(); + /** * Tracks and limits `seek()` calls to prevent accidental infinite loops. * @@ -1237,15 +1249,16 @@ private function skip_whitespace() { } /** - * Applies attribute updates and cleans up once a tag is fully parsed. + * Applies lexical updates and cleans up once a tag is fully parsed. 
* * @since 6.2.0 * * @return void */ private function after_tag() { - $this->class_name_updates_to_attributes_updates(); - $this->apply_attributes_updates(); + $this->class_name_updates_to_attribute_updates(); + $this->attribute_updates_to_lexical_updates(); + $this->apply_lexical_updates(); $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->tag_ends_at = null; @@ -1254,17 +1267,17 @@ private function after_tag() { } /** - * Converts class name updates into tag attributes updates + * Converts class name updates into tag attribute updates * (they are accumulated in different data formats for performance). * - * @see $lexical_updates + * @see $attribute_updates * @see $classname_updates * * @since 6.2.0 * * @return void */ - private function class_name_updates_to_attributes_updates() { + private function class_name_updates_to_attribute_updates() { if ( count( $this->classname_updates ) === 0 ) { return; } @@ -1398,13 +1411,33 @@ private function class_name_updates_to_attributes_updates() { } /** - * Applies attribute updates to HTML document. + * Converts attribute updates into lexical updates. + * + * This method is only meant to run right before the attribute updates are applied. + * The behavior in all other cases is undefined. + * + * @return void + * @since 6.3.0 + * + * @see $attribute_updates + * @see $lexical_updates + */ + private function attribute_updates_to_lexical_updates() { + foreach ( $this->attribute_updates as $update ) { + $this->lexical_updates[] = $update; + } + $this->attribute_updates = array(); + } + + /** + * Applies lexical updates to HTML document. * * @since 6.2.0 + * @since 6.3.0 Invalidate any bookmarks whose targets are overwritten. * * @return void */ - private function apply_attributes_updates() { + private function apply_lexical_updates() { if ( ! 
count( $this->lexical_updates ) ) { return; } @@ -1431,7 +1464,7 @@ private function apply_attributes_updates() { * Adjust bookmark locations to account for how the text * replacements adjust offsets in the input document. */ - foreach ( $this->bookmarks as $bookmark ) { + foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { /* * Each lexical update which appears before the bookmark's endpoints * might shift the offsets for those endpoints. Loop through each change @@ -1442,20 +1475,22 @@ private function apply_attributes_updates() { $tail_delta = 0; foreach ( $this->lexical_updates as $diff ) { - $update_head = $bookmark->start >= $diff->start; - $update_tail = $bookmark->end >= $diff->start; - - if ( ! $update_head && ! $update_tail ) { + if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) { break; } + if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) { + $this->release_bookmark( $bookmark_name ); + continue 2; + } + $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); - if ( $update_head ) { + if ( $bookmark->start >= $diff->start ) { $head_delta += $delta; } - if ( $update_tail ) { + if ( $bookmark->end >= $diff->end ) { $tail_delta += $delta; } } @@ -1467,6 +1502,18 @@ private function apply_attributes_updates() { $this->lexical_updates = array(); } + /** + * Checks whether a bookmark with the given name exists. + * + * @since 6.3.0 + * + * @param string $bookmark_name Name to identify a bookmark that potentially exists. + * @return bool Whether that bookmark exists. + */ + public function has_bookmark( $bookmark_name ) { + return array_key_exists( $bookmark_name, $this->bookmarks ); + } + /** * Move the internal cursor in the Tag Processor to a given bookmark's location. * @@ -1512,8 +1559,8 @@ public function seek( $bookmark_name ) { * * @since 6.2.0 * - * @param WP_HTML_Text_Replacement $a First attribute update. - * @param WP_HTML_Text_Replacement $b Second attribute update. 
+ * @param WP_HTML_Text_Replacement $a First lexical update. + * @param WP_HTML_Text_Replacement $b Second lexical update. * @return int Comparison value for string order. */ private static function sort_start_ascending( $a, $b ) { @@ -1549,11 +1596,11 @@ private static function sort_start_ascending( $a, $b ) { * @return string|boolean|null Value of enqueued update if present, otherwise false. */ private function get_enqueued_attribute_value( $comparable_name ) { - if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { + if ( ! isset( $this->attribute_updates[ $comparable_name ] ) ) { return false; } - $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; + $enqueued_text = $this->attribute_updates[ $comparable_name ]->text; // Removed attributes erase the entire span. if ( '' === $enqueued_text ) { @@ -1626,7 +1673,7 @@ public function get_attribute( $name ) { /* * For every attribute other than `class` it's possible to perform a quick check if - * there's an enqueued lexical update whose value takes priority over what's found in + * there's an enqueued attribute update whose value takes priority over what's found in * the input document. * * The `class` attribute is special though because of the exposed helpers `add_class` @@ -1636,7 +1683,7 @@ public function get_attribute( $name ) { * into an attribute value update. */ if ( 'class' === $name ) { - $this->class_name_updates_to_attributes_updates(); + $this->class_name_updates_to_attribute_updates(); } // Return any enqueued attribute value updates if they exist. @@ -1864,8 +1911,8 @@ public function set_attribute( $name, $value ) { * * Result:
*/ - $existing_attribute = $this->attributes[ $comparable_name ]; - $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( + $existing_attribute = $this->attributes[ $comparable_name ]; + $this->attribute_updates[ $name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, $existing_attribute->end, $updated_attribute @@ -1882,7 +1929,7 @@ public function set_attribute( $name, $value ) { * * Result:
*/ - $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( + $this->attribute_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, $this->tag_name_starts_at + $this->tag_name_length, ' ' . $updated_attribute @@ -1940,8 +1987,8 @@ public function remove_attribute( $name ) { * and when that attribute wasn't originally present. */ if ( ! isset( $this->attributes[ $name ] ) ) { - if ( isset( $this->lexical_updates[ $name ] ) ) { - unset( $this->lexical_updates[ $name ] ); + if ( isset( $this->attribute_updates[ $name ] ) ) { + unset( $this->attribute_updates[ $name ] ); } return false; } @@ -1957,7 +2004,7 @@ public function remove_attribute( $name ) { * * Result:
*/ - $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( + $this->attribute_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, $this->attributes[ $name ]->end, '' @@ -2026,7 +2073,10 @@ public function __toString() { * @return string The processed HTML. */ public function get_updated_html() { - $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates ); + $requires_no_updating = + 0 === count( $this->classname_updates ) && + 0 === count( $this->attribute_updates ) && + 0 === count( $this->lexical_updates ); /* * When there is nothing more to update and nothing has already been @@ -2057,8 +2107,9 @@ public function get_updated_html() { * * Note: `apply_attributes_updates()` modifies `$this->output_buffer`. */ - $this->class_name_updates_to_attributes_updates(); - $this->apply_attributes_updates(); + $this->class_name_updates_to_attribute_updates(); + $this->attribute_updates_to_lexical_updates(); + $this->apply_lexical_updates(); /* * 2. Replace the original HTML with the now-updated HTML so that it's possible to @@ -2261,4 +2312,4 @@ private function matches() { return true; } -} +} \ No newline at end of file diff --git a/src/wp-settings.php b/src/wp-settings.php index a11b07ca28d07..ef5c6abc4355e 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -238,6 +238,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-span.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . 
'/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php new file mode 100644 index 0000000000000..1f1bf02237b39 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -0,0 +1,22 @@ +LoremIpsum

Dolor
Sit' ); + // The controller's schema is hardcoded, so tests would not be meaningful. + $p->next_tag_in_body_insertion_mode(); + } + +} From 0bdd4f6994429b1733558f92af60330d6a4b0f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 10:48:04 +0100 Subject: [PATCH 02/42] Emit text tokens --- .../html-api/class-wp-html-processor.php | 189 ++++++++++++++---- .../html-api/class-wp-html-tag-processor.php | 8 +- 2 files changed, 150 insertions(+), 47 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index da0c95738b2a8..5a8a89588a7f0 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,22 +11,68 @@ function esc_attr( $text ) { } } -// Could be just WP_HTML_Node actually -class WP_HTML_Element { - const MARKER = -1; +class WP_HTML_Token { + const MARKER = 1; + const TAG = 2; + const TEXT = 3; + + public $type; + + // For tag tokens public $tag; public $attributes; public $is_closer; public $is_opener; - public $tag_processor_bookmark; - public function __construct( $tag, $attributes = null, $is_opener = true ) { - $this->tag = $tag; - $this->attributes = $attributes; - $this->is_opener = $is_opener; - $this->is_closer = ! $is_opener; + public $bookmark; + + // For text tokens + public $value; + + static public function marker() { + return new WP_HTML_Token( self::MARKER ); + } + + static public function tag( $tag, $attributes = null, $is_opener = true, $bookmark = null ) { + $token = new WP_HTML_Token( self::TAG, $tag ); + $token->tag = $tag; + $token->attributes = $attributes; + $token->is_opener = $is_opener; + $token->is_closer = ! 
$is_opener; + $token->bookmark = $bookmark; + return $token; + } + + static public function text( $text ) { + $token = new WP_HTML_Token( self::TEXT ); + $token->value = $text; + return $token; + } + + public function __construct( $type ) { + $this->type = $type; } - public function equivalent( WP_HTML_Element $other ) { + public function __toString() { + switch ( $this->type ) { + case self::MARKER: + return 'MARKER'; + case self::TAG: + return sprintf( + '<%s%s%s>', + $this->is_closer ? '/' : '', + $this->tag, + $this->attributes ? ' ' . implode( ' ', $this->attributes ) : '' + ); + case self::TEXT: + return $this->value; + } + } + + public function equivalent( WP_HTML_Token $other ) { + if ( ! $this->tag || ! $other->tag ) { + throw new Exception( 'Cannot compare non-tag tokens' ); + } + if ( $this->is_closer !== $other->is_closer ) { return false; } @@ -50,7 +96,15 @@ public function equivalent( WP_HTML_Element $other ) { } public function is_marker() { - return self::MARKER === $this->tag; + return self::MARKER === $this->type; + } + + public function is_tag() { + return self::TAG === $this->type; + } + + public function is_text() { + return self::TEXT === $this->type; } } @@ -78,13 +132,12 @@ class WP_HTML_Insertion_Mode { */ class WP_HTML_Processor extends WP_HTML_Tag_Processor { - private $tag_processor; /** - * @var WP_HTML_Element[] + * @var WP_HTML_Token[] */ private $open_elements = array(); /** - * @var WP_HTML_Element[] + * @var WP_HTML_Token[] */ private $active_formatting_elements = array(); private $root_node = null; @@ -92,6 +145,20 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $original_insertion_mode = null; private $insertion_mode = null; + /* + * WP_HTML_Tag_Processor skips over text nodes and only + * processes tags. + * + * WP_HTML_Processor needs to process text nodes as well. 
+ * + * Whenever the tag processor skips over text to move to + * the next tag, the next_token() method emits that text + * as a token and stores the tag in $buffered_tag to be + * returned the next time. + */ + private $buffered_tag = null; + + private $last_token = null; private $inserted_tokens = array(); private $head_pointer; @@ -99,14 +166,22 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function __construct( $html ) { parent::__construct( $html ); - $this->root_node = new WP_HTML_Element( 'HTML' ); - $this->context_node = new WP_HTML_Element( 'DOCUMENT' ); + $this->root_node = WP_HTML_Token::tag( 'HTML' ); + $this->context_node = WP_HTML_Token::tag( 'DOCUMENT' ); $this->open_elements = array( $this->root_node ); $this->reset_insertion_mode(); } - public function parse_next() { - return $this->next_tag_in_body_insertion_mode(); + public function main() { + for ($i = 0; $i < 10; $i++) { + $token = $this->next_token(); + if(!$token) { + break; + } + echo "TOKEN: $token\n"; + $processed_token = $this->process_in_body_insertion_mode($token); + $this->last_token = $processed_token; + } // @TODO: // switch($this->insertion_mode) { // case WP_HTML_Insertion_Mode::INITIAL: @@ -154,9 +229,11 @@ public function parse_next() { // } } - public function next_tag_in_body_insertion_mode() { - $token = $this->next_token(); - if ( $token->is_opener ) { + public function process_in_body_insertion_mode(WP_HTML_Token $token) { + if ( $token->is_text() ) { + // ? + } + else if ( $token->is_opener ) { // Should we care? // if(self::is_rcdata_element($token->tag)) { // $this->original_insertion_mode = $this->insertion_mode; @@ -304,8 +381,7 @@ public function next_tag_in_body_insertion_mode() { if ( $active_a ) { $this->parse_error(); - // @TODO: - // Run the adoption agency algorithm with the tag name "a". 
+ $this->adoption_agency_algorithm( $token ); } $this->reconstruct_active_formatting_elements(); @@ -342,7 +418,7 @@ public function next_tag_in_body_insertion_mode() { case 'OBJECT': $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); - $this->active_formatting_elements[] = new WP_HTML_Element( WP_HTML_Element::MARKER ); + $this->active_formatting_elements[] = WP_HTML_Token::marker(); break; case 'TABLE': $this->insert_element( $token ); @@ -515,7 +591,7 @@ public function next_tag_in_body_insertion_mode() { if ( ! $this->is_element_in_button_scope( 'P' ) ) { // Parse error, insert an HTML element for a "p" start tag token with no attributes. $this->parse_error(); - $this->insert_element( new WP_HTML_Element( 'P', array() ) ); + $this->insert_element( WP_HTML_Token::tag( 'P' ) ); } $this->close_p_element(); break; @@ -609,29 +685,56 @@ public function next_tag_in_body_insertion_mode() { break; } } + return $token; } private $element_bookmark_idx = 0; private function next_token() { + if($this->buffered_tag){ + $next_tag = $this->buffered_tag; + $this->buffered_tag = null; + return $next_tag; + } + if ( ! $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { return false; } - $consumed_node = new WP_HTML_Element( + $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $next_tag = WP_HTML_Token::tag( $this->get_tag(), array(), - ! $this->is_tag_closer() + ! $this->is_tag_closer(), + $bookmark ); - $consumed_node->tag_processor_bookmark = $this->set_bookmark( - '__internal_' . ( $this->element_bookmark_idx++ ) - ); + /* + * If any text was found between the last tag and this one, + * save the next tag for later and return the text token. 
+ */ + $last = $this->last_token; + if ( + $last + && $last->is_tag() + && $last->bookmark + && $this->has_bookmark($last->bookmark) + ) { + $this->buffered_tag = $next_tag; + + $text_start = $this->bookmarks[$last->bookmark]->end + 1; + $text_end = $this->bookmarks[$bookmark]->start; + if ($text_start < $text_end) { + $text = substr($this->html, $text_start, $text_end - $text_start); + return WP_HTML_Token::text($text); + } + } - return $consumed_node; + return $next_tag; } const ANY_OTHER_END_TAG = 1; - private function adoption_agency_algorithm( WP_HTML_Element $token ) { + private function adoption_agency_algorithm( WP_HTML_Token $token ) { $subject = $token->tag; if ( $this->current_node()->tag === $subject @@ -786,7 +889,7 @@ private function adoption_agency_algorithm( WP_HTML_Element $token ) { * * Let node be the new element. */ - $new_node = new WP_HTML_Element( $node->tag, array() ); + $new_node = WP_HTML_Token::tag( $node->tag ); $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); $this->active_formatting_elements[ $node_formatting_idx ] = $new_node; @@ -815,7 +918,7 @@ private function adoption_agency_algorithm( WP_HTML_Element $token ) { // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. - $new_element = new WP_HTML_Element( $formatting_element->tag, array() ); + $new_element = WP_HTML_Token::tag( $formatting_element->tag ); // Take all of the child nodes of furthest block and append them to the element created in // the last step. 
@@ -871,9 +974,9 @@ private function insert_html_element( $node ) { } private function ignore_token( $token ) { - if ( $token->tag_processor_bookmark ) { - $this->release_bookmark( $token->tag_processor_bookmark ); - $token->tag_processor_bookmark = null; + if ( $token->bookmark ) { + $this->release_bookmark( $token->bookmark ); + $token->bookmark = null; } return; } @@ -903,9 +1006,9 @@ private function pop_until_node( $node ) { private function pop_open_element() { $popped = array_pop( $this->open_elements ); - if ( $popped->tag_processor_bookmark ) { - $this->release_bookmark( $popped->tag_processor_bookmark ); - $popped->tag_processor_bookmark = null; + if ( $popped->bookmark ) { + $this->release_bookmark( $popped->bookmark ); + $popped->bookmark = null; } return $popped; } @@ -1020,7 +1123,7 @@ private function reconstruct_active_formatting_elements() { // @TODO: // Create: Insert an HTML element for the token for which the element entry // was created, to obtain new element. - $new_element = new WP_HTML_Element( $entry->tag, $entry->attributes ); + $new_element = WP_HTML_Token::tag( $entry->tag, $entry->attributes ); // Replace the entry for entry in the list with an entry for new element. $index = array_search( $entry, $this->active_formatting_elements, true ); @@ -1362,9 +1465,9 @@ private static function is_formatting_element( $tag_name ) { } -$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); +$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); // The controller's schema is hardcoded, so tests would not be meaningful. -$p->parse_next(); +$p->main(); // $this->tag_processor->next_tag( // array( diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 68f6d213155d4..94a06fea79072 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -274,7 +274,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var string */ - private $html; + public $html; /** * The last query passed to next_tag(). @@ -343,7 +343,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var int */ - private $bytes_already_parsed = 0; + protected $bytes_already_parsed = 0; /** * How many bytes from the input HTML document have already been @@ -406,7 +406,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var int|null */ - private $tag_ends_at; + protected $tag_ends_at; /** * Whether the current tag is an opening tag, e.g.
, or a closing tag, e.g.
. @@ -734,7 +734,7 @@ public function set_bookmark( $name ) { } $this->bookmarks[ $name ] = new WP_HTML_Span( - $this->tag_name_starts_at - 1, + $this->tag_name_starts_at - ($this->is_closing_tag ? 2 : 1), $this->tag_ends_at ); From 9026d1c3f78a532a8b04e42349e7fea3f70ad369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 10:49:18 +0100 Subject: [PATCH 03/42] WP_HTML_Tag_Processor: Use the correct starting position when bookmarking a tag closer This commit marks the start of a bookmark one byte before the tag name start for tag openers, and two bytes before the tag name for tag closers. Setting a bookmark on a tag should set its "start" position before the opening "<", e.g.: ```
Testing a Bookmark ----------------^ ``` The current calculation assumes this is always one byte to the left from $tag_name_starts_at. However, in tag closers that index points to a solidus symbol "/": ```
Testing a Bookmark ----------------------------^ ``` The bookmark should therefore start two bytes before the tag name: ```
Testing a Bookmark ---------------------------^ ``` --- .../html-api/class-wp-html-tag-processor.php | 4 ++-- .../html-api/wpHtmlTagProcessor-bookmark.php | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 31db41a3c86ad..aa52dcb37b283 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -722,7 +722,7 @@ public function set_bookmark( $name ) { } $this->bookmarks[ $name ] = new WP_HTML_Span( - $this->tag_name_starts_at - 1, + $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ), $this->tag_ends_at ); @@ -1504,7 +1504,7 @@ public function seek( $bookmark_name ) { $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; $this->bytes_already_copied = $this->bytes_already_parsed; $this->output_buffer = substr( $this->html, 0, $this->bytes_already_copied ); - return $this->next_tag(); + return $this->next_tag( array( 'tag_closers' => 'visit' ) ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php index 04a6ae590cd7d..69a9695d1fd59 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php @@ -63,6 +63,28 @@ public function test_seek() { ); } + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_seeks_to_tag_closer_bookmark() { + $p = new WP_HTML_Tag_Processor( '
First
Second' ); + $p->next_tag( array( 'tag_closers' => 'visit' ) ); + $p->set_bookmark( 'first' ); + $p->next_tag( array( 'tag_closers' => 'visit' ) ); + $p->set_bookmark( 'second' ); + + $p->seek( 'first' ); + $p->seek( 'second' ); + + $this->assertSame( + 'DIV', + $p->get_tag(), + 'Did not seek to the intended bookmark location' + ); + } + /** * WP_HTML_Tag_Processor used to test for the diffs affecting * the adjusted bookmark position while simultaneously adjusting From ff9505b3a00f88b9430d99da5f070011ee7f2b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 11:27:45 +0100 Subject: [PATCH 04/42] Consume HTML text nodes as tokens --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 5a8a89588a7f0..8616d9e4736fe 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -179,8 +179,9 @@ public function main() { break; } echo "TOKEN: $token\n"; - $processed_token = $this->process_in_body_insertion_mode($token); - $this->last_token = $processed_token; + $this->last_token = $token; + // $processed_token = $this->process_in_body_insertion_mode($token); + // $this->last_token = $processed_token; } // @TODO: // switch($this->insertion_mode) { @@ -720,11 +721,10 @@ private function next_token() { && $last->bookmark && $this->has_bookmark($last->bookmark) ) { - $this->buffered_tag = $next_tag; - $text_start = $this->bookmarks[$last->bookmark]->end + 1; $text_end = $this->bookmarks[$bookmark]->start; if ($text_start < $text_end) { + $this->buffered_tag = $next_tag; $text = substr($this->html, $text_start, $text_end - $text_start); return WP_HTML_Token::text($text); } From afbfdc5ac251fe4cdc119c76a935fae29081618b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: 
Wed, 22 Feb 2023 14:03:56 +0100 Subject: [PATCH 05/42] Implement DOM insertion --- .../html-api/class-wp-html-processor.php | 543 ++++++++++++------ 1 file changed, 357 insertions(+), 186 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8616d9e4736fe..d2ac45a14ccb4 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,6 +11,15 @@ function esc_attr( $text ) { } } +function dbg( $message, $indent = 0 ) { + $show_debug = true; + // $show_debug = false; + if( $show_debug ) { + $indent = str_repeat( ' ', $indent * 2 ); + echo $indent . $message . "\n"; + } +} + class WP_HTML_Token { const MARKER = 1; const TAG = 2; @@ -33,7 +42,7 @@ static public function marker() { } static public function tag( $tag, $attributes = null, $is_opener = true, $bookmark = null ) { - $token = new WP_HTML_Token( self::TAG, $tag ); + $token = new WP_HTML_Token( self::TAG ); $token->tag = $tag; $token->attributes = $attributes; $token->is_opener = $is_opener; @@ -108,6 +117,46 @@ public function is_text() { } } +class WP_HTML_Node { + public $parent; + public $children = array(); + public $token; + public $depth = 1; + + // For the adoption agency algorithm: + public $intended_parent = null; + + public function __construct( WP_HTML_Token $token ) { + $this->token = $token; + } + + public function append_child( WP_HTML_Node $node ) { + if($node->parent) { + $node->parent->remove($node); + } + $node->parent = $this; + $this->children[] = $node; + $node->depth = $this->depth + 1; + } + + public function remove( WP_HTML_Node $node ) { + $index = array_search( $node, $this->children, true ); + if ( false !== $index ) { + unset( $this->children[ $index ] ); + } + } + + public function __toString() { + $out = ''; + $indent = str_repeat( ' ', $this->depth ); + $out .= $indent . $this->token . 
"\n"; + foreach ( $this->children as $child ) { + $out .= $child; + } + return $out; + } +} + class WP_HTML_Insertion_Mode { const INITIAL = 'INITIAL'; @@ -133,11 +182,11 @@ class WP_HTML_Insertion_Mode { class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** - * @var WP_HTML_Token[] + * @var WP_HTML_Node[] */ private $open_elements = array(); /** - * @var WP_HTML_Token[] + * @var WP_HTML_Node[] */ private $active_formatting_elements = array(); private $root_node = null; @@ -166,23 +215,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function __construct( $html ) { parent::__construct( $html ); - $this->root_node = WP_HTML_Token::tag( 'HTML' ); - $this->context_node = WP_HTML_Token::tag( 'DOCUMENT' ); + $this->root_node = new WP_HTML_Node(WP_HTML_Token::tag( 'HTML' )); + $this->context_node = new WP_HTML_Node(WP_HTML_Token::tag( 'DOCUMENT' )); $this->open_elements = array( $this->root_node ); $this->reset_insertion_mode(); } - public function main() { - for ($i = 0; $i < 10; $i++) { - $token = $this->next_token(); - if(!$token) { - break; - } - echo "TOKEN: $token\n"; + public function parse() { + echo("HTML before main loop:\n"); + echo($this->html); + echo("\n\n"); + while ($token = $this->next_token()) { $this->last_token = $token; - // $processed_token = $this->process_in_body_insertion_mode($token); - // $this->last_token = $processed_token; + $processed_token = $this->process_in_body_insertion_mode($token); + $this->last_token = $processed_token; } + echo("\n"); + echo("DOM after main loop:\n"); + echo($this->root_node.''); // @TODO: // switch($this->insertion_mode) { // case WP_HTML_Insertion_Mode::INITIAL: @@ -232,7 +282,10 @@ public function main() { public function process_in_body_insertion_mode(WP_HTML_Token $token) { if ( $token->is_text() ) { - // ? + dbg( "Found text node '$token'" ); + dbg( "Inserting text to current node " . 
$this->current_node()->token->tag, 1 ); + $this->reconstruct_active_formatting_elements(); + $this->insert_text( $token ); } else if ( $token->is_opener ) { // Should we care? @@ -268,6 +321,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { // Ignore special rules for 'PRE' and 'LISTING' case 'PRE': case 'LISTING': + dbg( "Found {$token->tag} tag opener" ); if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } @@ -283,7 +337,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - if ( in_array( $this->current_node()->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + if ( in_array( $this->current_node()->token->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->pop_open_element(); } $this->insert_element( $token ); @@ -303,7 +357,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $i = count( $this->open_elements ) - 1; while ( true ) { $node = $this->open_elements[ $i ]; - if ( $node->tag === 'LI' ) { + if ( $node->token->tag === 'LI' ) { $this->generate_implied_end_tags( array( 'except_for' => array( 'LI' ), @@ -311,7 +365,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); $this->pop_until_tag_name( 'LI' ); break; - } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + } elseif ( self::is_special_element( $node->token->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; } else { --$i; @@ -329,7 +383,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $i = count( $this->open_elements ) - 1; while ( true ) { $node = $this->open_elements[ $i ]; - if ( $node->tag === 'DD' ) { + if ( $node->token->tag === 'DD' ) { $this->generate_implied_end_tags( array( 'except_for' => array( 'DD' ), @@ -337,7 +391,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); 
$this->pop_until_tag_name( 'DD' ); break; - } elseif ( $node->tag === 'DT' ) { + } elseif ( $node->token->tag === 'DT' ) { $this->generate_implied_end_tags( array( 'except_for' => array( 'DT' ), @@ -345,7 +399,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); $this->pop_until_tag_name( 'DT' ); break; - } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + } elseif ( self::is_special_element( $node->token->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; } else { --$i; @@ -371,11 +425,11 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'A': $active_a = null; for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; --$i ) { - $elem = $this->active_formatting_elements[ $i ]; - if ( $elem->tag === 'A' ) { - $active_a = $elem; + $node = $this->active_formatting_elements[ $i ]; + if ( $node->token->tag === 'A' ) { + $active_a = $node; break; - } elseif ( $elem->is_marker() ) { + } elseif ( $node->token->is_marker() ) { break; } } @@ -400,9 +454,10 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'STRONG': case 'TT': case 'U': + dbg( "Found {$token->tag} tag opener" ); $this->reconstruct_active_formatting_elements(); - $this->push_active_formatting_element( $token ); - $this->insert_element( $token ); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); break; case 'NOBR': $this->reconstruct_active_formatting_elements(); @@ -411,8 +466,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->adoption_agency_algorithm( $token ); $this->reconstruct_active_formatting_elements(); } - $this->insert_element( $token ); - $this->push_active_formatting_element( $token ); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); break; case 'APPLET': case 'MARQUEE': @@ -466,7 +521,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token 
$token) { $this->reconstruct_active_formatting_elements(); // @TODO: Follow the generic raw text element parsing algorithm. throw new Exception( 'XMP not implemented yet' ); - break; case 'IFRAME': case 'NOEMBED': case 'NOSCRIPT': @@ -561,7 +615,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'MENU': case 'NAV': case 'OL': - case 'P': + case 'PRE': case 'SECTION': case 'SUMMARY': case 'UL': @@ -589,6 +643,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->form_pointer = null; break; case 'P': + dbg( "Found {$token->tag} tag closer" ); if ( ! $this->is_element_in_button_scope( 'P' ) ) { // Parse error, insert an HTML element for a "p" start tag token with no attributes. $this->parse_error(); @@ -642,7 +697,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'STRONG': case 'TT': case 'U': - $this->parse_error(); + dbg( "Found {$token->tag} tag closer" ); $this->adoption_agency_algorithm( $token ); break; @@ -655,7 +710,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { return $this->next_tag(); } $this->generate_implied_end_tags(); - if ( $this->current_node()->tag !== $token->tag ) { + if ( $this->current_node()->token->tag !== $token->tag ) { $this->parse_error(); } $this->pop_until_tag_name( $token->tag ); @@ -667,7 +722,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $i = count( $this->open_elements ) - 1; while ( true ) { $node = $this->open_elements[ $i ]; - if ( $node->tag === $token->tag ) { + if ( $node->token->tag === $token->tag ) { $this->generate_implied_end_tags( array( 'except_for' => array( $token->tag ), @@ -675,7 +730,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); $this->pop_until_node( $node ); break; - } elseif ( $this->is_special_element( $node->tag ) ) { + } elseif ( $this->is_special_element( $node->token->tag ) ) { $this->ignore_token( $token ); $this->parse_error(); 
return $this->next_tag(); @@ -697,19 +752,21 @@ private function next_token() { return $next_tag; } - if ( ! $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { - return false; + $next_tag = false; + if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $next_tag = WP_HTML_Token::tag( + $this->get_tag(), + array(), + ! $this->is_tag_closer(), + $bookmark + ); + $text_end = $this->bookmarks[$bookmark]->start; + } else { + $text_end = strlen($this->html); } - $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); - $this->set_bookmark($bookmark); - $next_tag = WP_HTML_Token::tag( - $this->get_tag(), - array(), - ! $this->is_tag_closer(), - $bookmark - ); - /* * If any text was found between the last tag and this one, * save the next tag for later and return the text token. @@ -722,7 +779,6 @@ private function next_token() { && $this->has_bookmark($last->bookmark) ) { $text_start = $this->bookmarks[$last->bookmark]->end + 1; - $text_end = $this->bookmarks[$bookmark]->start; if ($text_start < $text_end) { $this->buffered_tag = $next_tag; $text = substr($this->html, $text_start, $text_end - $text_start); @@ -735,12 +791,15 @@ private function next_token() { const ANY_OTHER_END_TAG = 1; private function adoption_agency_algorithm( WP_HTML_Token $token ) { + dbg("Adoption Agency Algorithm", 1); $subject = $token->tag; + $current_node = $this->current_node(); if ( - $this->current_node()->tag === $subject - && ! in_array( $subject, $this->active_formatting_elements, true ) + $current_node->token->tag === $subject + && ! 
in_array( $current_node, $this->active_formatting_elements, true ) ) { $this->pop_open_element(); + dbg("Skipping AAA: current node is \$subject ($subject) and is not AFE", 2); return; } @@ -757,18 +816,21 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { $formatting_element_idx = -1; for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { $candidate = $this->active_formatting_elements[ $i ]; - if ( $candidate->is_marker() ) { + if ( $candidate->token->is_marker() ) { break; } - if ( $candidate->tag === $subject ) { + if ( $candidate->token->tag === $subject ) { $formatting_element = $candidate; $formatting_element_idx = $i; break; } } + dbg("AAA: Formatting element = {$formatting_element->token->tag}", 2); + // If there is no such element, then abort these steps and instead act as // described in the "any other end tag" entry below. if ( null === $formatting_element ) { + dbg("Skipping AAA: no formatting element found", 2); return self::ANY_OTHER_END_TAG; } @@ -777,12 +839,15 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { if ( ! in_array( $formatting_element, $this->open_elements, true ) ) { array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); $this->parse_error(); + dbg("Skipping AAA: formatting element is not in the stack of open elements", 2); return; } // If formatting element is not in scope, then this is a parse error; return - if ( ! $this->is_element_in_scope( $formatting_element->tag ) ) { + if ( ! 
$this->is_element_in_scope( $formatting_element ) ) { $this->parse_error(); + dbg("Skipping AAA: formatting element {$formatting_element->token->tag} is not in scope", 2); + $this->print_open_elements('Open elements: ', 2); return; } @@ -803,9 +868,8 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { if ( $node === $formatting_element ) { break; } - if ( $this->is_special_element( $node->tag ) ) { + if ( $this->is_special_element( $node->token->tag ) ) { $furthest_block = $node; - break; } } @@ -816,14 +880,22 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { if ( null === $furthest_block ) { $this->pop_until_node( $formatting_element ); array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); + dbg("Skipping AAA: no furthest block found", 2); return; } + dbg("AAA: Furthest block = {$furthest_block->token->tag}", 2); + // Let common ancestor be the element immediately above formatting element // in the stack of open elements. $formatting_elem_stack_index = array_search( $formatting_element, $this->open_elements, true ); $common_ancestor = $this->open_elements[ $formatting_elem_stack_index - 1 ]; + dbg("AAA: Common ancestor = {$common_ancestor->token->tag}", 2); + + $this->print_open_elements('AAA: Open elements: ', 2); + $this->print_rafe_formats('AAA: Formatting elements: ', 2); + // Let a bookmark note the position of formatting element in the list of // active formatting elements relative to the elements on either side of it // in the list. 
@@ -833,8 +905,8 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { $node = $last_node = $furthest_block; $node_open_elements_index = array_search( $node, $this->open_elements, true ); - $prev_node_open_elements_index = -1; - $inner_loop_counter = 0; + $prev_open_element_index = false; + $inner_loop_counter = 0; while ( true ) { $inner_loop_counter++; @@ -846,15 +918,21 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { */ $node_open_elements_index = array_search( $node, $this->open_elements, true ); if ( false === $node_open_elements_index ) { - $node_open_elements_index = $prev_node_open_elements_index; - return; + if ( false === $prev_open_element_index ) { + throw new Exception( 'Unexpected error in AAA algorithm – cannot find node.' ); + } + $node_open_elements_index = $prev_open_element_index; } --$node_open_elements_index; - $node = $this->open_elements[ $node_open_elements_index ]; - $prev_node_open_elements_index = $node_open_elements_index; + if( $node_open_elements_index < 0 ) { + throw new Exception( 'Unexpected error in AAA algorithm – node is not in the stack of open elements.' ); + } + $node = $this->open_elements[ $node_open_elements_index ]; + $prev_open_element_index = $node_open_elements_index; // If node is formatting element, then break. if ( $node === $formatting_element ) { + dbg("AAA: Inner loop break – node is formatting element", 3); break; } @@ -873,28 +951,34 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { * node from the stack of open elements and continue. */ if ( ! in_array( $node, $this->active_formatting_elements, true ) ) { + dbg("AAA: Inner loop – removing node from the stack of open elements", 3); array_splice( $this->open_elements, $node_open_elements_index, 1 ); - continue; } /* * Create an element for the token for which the element node was created, * in the HTML namespace, with common ancestor as the intended parent. 
- * + */ + $new_node = $this->create_element_for_token( $node->token ); + $new_node->intended_parent = $common_ancestor; + + /* * Replace the entry for node in the list of active formatting elements with an entry * for the new element. - * - * Replace the entry for node in the stack of open elements with an entry for - * the new element. - * - * Let node be the new element. */ - $new_node = WP_HTML_Token::tag( $node->tag ); $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); $this->active_formatting_elements[ $node_formatting_idx ] = $new_node; - $node_open_elements_index = array_search( $node, $this->open_elements, true ); - $this->open_elements[ $node_open_elements_index ] = $new_node; + /* + * Replace the entry for node in the stack of open elements with an entry for + * the new element. + */ + $idx = array_search( $node, $this->open_elements, true ); + $this->open_elements[ $idx ] = $new_node; + + /* + * Let node be the new element. + */ $node = $new_node; /* @@ -906,7 +990,8 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { } // Append last node to node. - // @TODO + dbg("AAA: Appending {$last_node->token->tag} to {$node->token->tag}", 3); + $node->append_child( $last_node ); // Set last node to node. $last_node = $node; @@ -914,63 +999,77 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { // Insert whatever last node ended up being in the previous step at the appropriate place // for inserting a node, but using common ancestor as the override target. - // @TODO + $this->insert_element( $last_node, $common_ancestor ); // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. 
- $new_element = WP_HTML_Token::tag( $formatting_element->tag ); + $new_element = $this->create_element_for_token( $formatting_element->token ); + $new_element->intended_parent = $furthest_block; // Take all of the child nodes of furthest block and append them to the element created in // the last step. - // @TODO + foreach ($furthest_block->children as $child) { + $new_element->append_child( $child ); + } // Append that new element to furthest block. - // @TODO + $furthest_block->append_child( $new_element ); - // Remove formatting element from the list of active formatting elements, and insert the new - // element into the list of active formatting elements at the position of the aforementioned - // bookmark. - $formatting_element_idx = array_search( $formatting_element, $this->active_formatting_elements, true ); - array_splice( $this->active_formatting_elements, $formatting_element_idx, 1, array( $new_element ) ); + // Remove formatting element from the list of active formatting elements + $idx = array_search( $formatting_element, $this->active_formatting_elements, true ); + array_splice( $this->active_formatting_elements, $idx, 1 ); + + // Insert the new element into the list of active formatting elements at the + // position of the aforementioned bookmark. array_splice( $this->active_formatting_elements, $bookmark, 0, array( $new_element ) ); - // Remove formatting element from the stack of open elements, and insert the new element into - // the stack of open elements immediately below the position of furthest block in that stack. 
- $formatting_element_idx = array_search( $formatting_element, $this->active_formatting_elements, true ); - array_splice( $this->active_formatting_elements, $formatting_element_idx, 1, array( $new_element ) ); - - $furthest_block_idx = array_search( $furthest_block, $this->open_elements, true ); - array_splice( $this->open_elements, $furthest_block_idx + 1, 0, array( $new_element ) ); + // Remove formatting element from the stack of open elements + $idx = array_search( $formatting_element, $this->open_elements, true ); + array_splice( $this->open_elements, $idx, 1 ); + + // Insert the new element into the stack of open elements immediately below the + // position of furthest block in that stack. + $idx = array_search( $furthest_block, $this->open_elements, true ); + array_splice( $this->open_elements, $idx + 1, 0, array( $new_element ) ); } } - /* - @TODO Implement https://html.spec.whatwg.org/multipage/parsing.html#insert-a-foreign-element - - Let the adjusted insertion location be the appropriate place for inserting a node. - - Let element be the result of creating an element for the token in the given namespace, with the intended parent being the element in which the adjusted insertion location finds itself. - - If it is possible to insert element at the adjusted insertion location, then: - - If the parser was not created as part of the HTML fragment parsing algorithm, then push a new element queue onto element's relevant agent's custom element reactions stack. - - Insert element at the adjusted insertion location. - - If the parser was not created as part of the HTML fragment parsing algorithm, then pop the element queue from element's relevant agent's custom element reactions stack, and invoke custom element reactions in that queue. 
+ private function insert_element( $token_or_node, $override_target = null ) { + // Create element for a token + // Skip reset algorithm for now + // Skip form-association for now + if($token_or_node instanceof WP_HTML_Token) { + $node = $this->create_element_for_token($token_or_node); + } else { + $node = $token_or_node; + } - If the adjusted insertion location cannot accept more elements, e.g. because it's a Document that already has an element child, then element is dropped on the floor. + $target = $override_target ?: $this->current_node(); - Push element onto the stack of open elements so that it is the new current node. + // Appropriate place for inserting a node: + // For now skip foster parenting and always use the + // location after the last child of the target + $target->append_child($node); + array_push($this->open_elements, $node); + dbg("inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); + return $node; + } - Return element. + private function create_element_for_token( WP_HTML_Token $token ) { + $node = new WP_HTML_Node($token); + return $node; + } - */ - private function insert_html_element( $node ) { - if ( ! $node->is_closer ) { - $this->insert_element( $node ); + private function insert_text( WP_HTML_Token $token ) { + $target = $this->current_node(); + if(count($target->children)){ + $last_child = end($target->children); + if ( $last_child && $last_child->token->is_text() ) { + $last_child->token->value .= $token->value; + return; + } } - $this->inserted_tokens[] = $node; + $target->append_child(new WP_HTML_Node($token)); } private function ignore_token( $token ) { @@ -981,10 +1080,6 @@ private function ignore_token( $token ) { return; } - private function insert_element( $node ) { - $this->open_elements[] = $node; - } - private function parse_error() { // Noop for now } @@ -993,9 +1088,12 @@ private function pop_until_tag_name( $tags ) { if ( ! is_array( $tags ) ) { $tags = array( $tags ); } - while ( ! 
in_array( $this->current_node()->tag, $tags ) ) { + dbg( "Popping until tag names: " . implode(', ', $tags), 1 ); + $this->print_open_elements( "Open elements before: " ); + while ( ! in_array( $this->current_node()->token->tag, $tags ) ) { $this->pop_open_element(); } + $this->print_open_elements( "Open elements after: " ); } private function pop_until_node( $node ) { @@ -1006,9 +1104,9 @@ private function pop_until_node( $node ) { private function pop_open_element() { $popped = array_pop( $this->open_elements ); - if ( $popped->bookmark ) { - $this->release_bookmark( $popped->bookmark ); - $popped->bookmark = null; + if ( $popped->token->bookmark ) { + $this->release_bookmark( $popped->token->bookmark ); + $popped->token->bookmark = null; } return $popped; } @@ -1024,20 +1122,21 @@ private function current_node() { } private function close_p_element() { + dbg( "close_p_element" ); $this->generate_implied_end_tags( array( 'except_for' => array( 'P' ), ) ); // If the current node is not a p element, then this is a parse error. 
- if ( $this->current_node()->tag !== 'P' ) { + if ( $this->current_node()->token->tag !== 'P' ) { $this->parse_error(); } $this->pop_until_tag_name( 'P' ); } private function should_generate_implied_end_tags( $options = null ) { - $current_tag_name = $this->current_node()->tag; + $current_tag_name = $this->current_node()->token->tag; if ( null !== $options && isset( $options['except_for'] ) && in_array( $current_tag_name, $options['except_for'] ) ) { return false; } @@ -1074,14 +1173,14 @@ private function should_generate_implied_end_tags( $options = null ) { /** * https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements */ - private function push_active_formatting_element( $node ) { + private function push_active_formatting_element( WP_HTML_Node $node ) { $count = 0; for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { $formatting_element = $this->active_formatting_elements[ $i ]; - if ( $formatting_element->is_marker() ) { + if ( $formatting_element->token->is_marker() ) { break; } - if ( ! $node->equivalent( $node ) ) { + if ( ! $formatting_element->token->equivalent( $node->token ) ) { continue; } $count++; @@ -1093,63 +1192,100 @@ private function push_active_formatting_element( $node ) { $this->active_formatting_elements[] = $node; } + private function print_rafe_formats($msg, $indent=1) { + $formats = array_map( function( $node ) { + return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); + }, $this->active_formatting_elements); + dbg( "$msg " . implode(', ', $formats), $indent ); + } + + private function print_open_elements($msg, $indent=1) { + $elems = array_map(function ($node) { + return $node->token->tag; + }, $this->open_elements); + dbg( "$msg " . 
implode(', ', $elems), $indent ); + } + private function reconstruct_active_formatting_elements() { + $this->print_rafe_formats('RAFE: before'); if ( empty( $this->active_formatting_elements ) ) { + dbg( "Skipping RAFE: empty list", 1 ); return; } - $i = count( $this->active_formatting_elements ) - 1; - $last_entry = $this->active_formatting_elements[ $i ]; - if ( $last_entry->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { + $entry_idx = count( $this->active_formatting_elements ) - 1; + $last_entry = $this->active_formatting_elements[ $entry_idx ]; + if ( $last_entry->token->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { + dbg( "Skipping RAFE: marker or open element", 1 ); return; } + + // Let entry be the last (most recently added) element in the list of active formatting elements. $entry = $last_entry; + + $is_rewinding = true; while ( true ) { - if ( $i <= 0 ) { - break; - } - --$i; - $entry = $this->active_formatting_elements[ $i ]; - if ( $entry->is_marker() || in_array( $entry, $this->open_elements, true ) ) { - break; - } - } - while ( true ) { - ++$i; - $entry = $this->active_formatting_elements[ $i ]; - if ( $entry === $last_entry ) { - break; + if ( $is_rewinding ) { + // Rewind: + /* + * If there are no entries before entry in the list of active formatting elements, + * then jump to the step labeled create. + */ + if ( $entry_idx === 0 ) { + $is_rewinding = false; + } else { + // Let entry be the entry one earlier than entry in the list of active formatting elements. + $entry = $this->active_formatting_elements[ --$entry_idx ]; + + // If entry is neither a marker nor an element that is also in the stack of open elements, + // go to the step labeled rewind. + if ( ! $entry->token->is_marker() && ! in_array( $entry, $this->open_elements, true ) ) { + continue; + } + } + } else { + // Advance: + // Let entry be the element one later than entry in the list of active formatting elements. 
+ $entry = $this->active_formatting_elements[ ++$entry_idx ]; } - // @TODO: - // Create: Insert an HTML element for the token for which the element entry - // was created, to obtain new element. - $new_element = WP_HTML_Token::tag( $entry->tag, $entry->attributes ); + // Create: Insert an HTML element for the token for which the element entry was created, + // to obtain new element. + $new_element = $this->insert_element( $entry->token ); // Replace the entry for entry in the list with an entry for new element. - $index = array_search( $entry, $this->active_formatting_elements, true ); + $this->active_formatting_elements[ $entry_idx ] = $new_element; - $this->active_formatting_elements[ $index ] = $new_element; - if ( $index === count( $this->active_formatting_elements ) - 1 ) { + // If the entry for new element in the list of active formatting elements is not the last entry + // in the list, return to the step labeled advance. + if ( $entry_idx === count( $this->active_formatting_elements ) - 1 ) { break; } } + $this->print_rafe_formats('RAFE: after'); } private function clear_active_formatting_elements_up_to_last_marker() { while ( ! 
empty( $this->active_formatting_elements ) ) { $entry = array_pop( $this->active_formatting_elements ); - if ( $entry->is_marker() ) { + if ( $entry->token->is_marker() ) { break; } } } + /** + * The stack of open elements is said to have a particular element in + * select scope when it has that element in the specific scope consisting + * of all element types except the following: + * * optgroup + * * option + */ private function is_element_in_select_scope( $target_node ) { return $this->is_element_in_specific_scope( $target_node, array( - 'optgroup', - 'option', + 'OPTGROUP', + 'OPTION', ), array( 'negative_match' => 'true', @@ -1161,9 +1297,9 @@ private function is_element_in_table_scope( $target_node ) { return $this->is_element_in_specific_scope( $target_node, array( - 'html', - 'table', - 'template', + 'HTML', + 'TABLE', + 'TEMPLATE', ) ); } @@ -1172,7 +1308,7 @@ private function is_element_in_button_scope( $target_node ) { return $this->is_element_in_scope( $target_node, array( - 'button', + 'BUTTON', ) ); } @@ -1181,9 +1317,9 @@ private function is_element_in_list_item_scope( $target_node ) { return $this->is_element_in_scope( $target_node, array( - 'li', - 'dd', - 'dt', + 'LI', + 'DD', + 'DT', ) ); } @@ -1193,39 +1329,60 @@ private function is_element_in_scope( $target_node, $additional_elements = array $target_node, array_merge( array( - 'applet', - 'caption', - 'html', - 'table', - 'td', - 'th', - 'marquee', - 'object', - 'template', + 'APPLET', + 'CAPTION', + 'HTML', + 'TABLE', + 'TD', + 'TH', + 'MARQUEE', + 'OBJECT', + 'TEMPLATE', ), $additional_elements ) ); } - /** + /* * https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements */ private function is_element_in_specific_scope( $target_node, $element_types_list, $options = array() ) { $negative_match = isset( $options['negative_match'] ) ? 
$options['negative_match'] : false; - $i = count( $this->open_elements ) - 1; - while ( true ) { - $node = $this->open_elements[ $i ]; - if ( $node === $target_node ) { + /** + * The stack of open elements is said to have an element target node in a + * specific scope consisting of a list of element types list when the following + * algorithm terminates in a match state: + */ + $i = count( $this->open_elements ) - 1; + // 1. Initialize node to be the current node (the bottommost node of the stack). + $node = $this->open_elements[ $i ]; + + while ( true ) { + // 2. If node is the target node, terminate in a match state. + if ( $node === $target_node || $node->token->tag === $target_node ) { return true; } - $is_in_the_list = in_array( $node->tag, $element_types_list, true ); - $failure = $negative_match ? $is_in_the_list : ! $is_in_the_list; + // 3. Otherwise, if node is one of the element types in list, terminate in a failure state. + $failure = in_array( $node->token->tag, $element_types_list, true ); + + // Some elements say: + // > If has that element in the specific scope consisting of all element types + // > except the following + // So we need to invert the result. + if($negative_match) { + $failure = ! $failure; + } if ( $failure ) { return false; } + + // Otherwise, set node to the previous entry in the stack of open elements and + // return to step 2. (This will never fail, since the loop will always terminate + // in the previous step if the top of the stack — an html element — is reached.) + $node = $this->open_elements[ --$i ]; } } @@ -1242,7 +1399,7 @@ private function reset_insertion_mode() { $node = $this->context_node; } - if ( $node->tag === 'select' ) { + if ( $node->token->tag === 'select' ) { if ( $last ) { break; } @@ -1269,7 +1426,7 @@ private function reset_insertion_mode() { return; } - switch ( $node->tag ) { + switch ( $node->token->tag ) { case 'TD': case 'TH': if ( ! 
$last ) { @@ -1465,15 +1622,29 @@ private static function is_formatting_element( $tag_name ) { } -$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); -// The controller's schema is hardcoded, so tests would not be meaningful. -$p->main(); - -// $this->tag_processor->next_tag( -// array( -// 'tag_closers' => 'visit', -// ) -// ); -// var_dump( $this->tag_processor->get_tag() ); -// var_dump( $this->tag_processor->is_tag_closer() ); -// $last_parent = end( $this->open_elements ); +// $p = new WP_HTML_Processor( '

12345

' ); +// $p->parse(); +/* +Should output: + p + ├─ #text: 1 + ├─ b + │ ├─ #text: 2 + │ └─ i + │ └─ #text: 3 + ├─ i + │ └─ #text: 4 + └─ #text: 5 +*/ + +$p = new WP_HTML_Processor( '1

23

' ); +$p->parse(); +/* +Should output: +b +└─ #text: 1 +p +├─ b +│ └─ #text: 2 +└─ #text: 3 +*/ From 9d31cb7fdca4d9d40cf74755ac04dc6eee67977b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 17:11:58 +0100 Subject: [PATCH 06/42] Fix a bug in the adoption agency algorithm --- .../html-api/class-wp-html-processor.php | 100 +++++++++++------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index d2ac45a14ccb4..68fc9028b18a8 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -12,8 +12,8 @@ function esc_attr( $text ) { } function dbg( $message, $indent = 0 ) { - $show_debug = true; - // $show_debug = false; + // $show_debug = true; + $show_debug = false; if( $show_debug ) { $indent = str_repeat( ' ', $indent * 2 ); echo $indent . $message . "\n"; @@ -21,9 +21,9 @@ function dbg( $message, $indent = 0 ) { } class WP_HTML_Token { - const MARKER = 1; - const TAG = 2; - const TEXT = 3; + const MARKER = 'MARKER'; + const TAG = 'TAG'; + const TEXT = 'TEXT'; public $type; @@ -67,13 +67,13 @@ public function __toString() { return 'MARKER'; case self::TAG: return sprintf( - '<%s%s%s>', + '%s%s%s', $this->is_closer ? '/' : '', $this->tag, $this->attributes ? ' ' . implode( ' ', $this->attributes ) : '' ); case self::TEXT: - return $this->value; + return '#text: ' . 
trim($this->value); } } @@ -118,16 +118,32 @@ public function is_text() { } class WP_HTML_Node { + /** + * @var WP_HTML_Node + */ public $parent; + /** + * @var WP_HTML_Node[] + */ public $children = array(); + /** + * @var string + */ public $token; public $depth = 1; // For the adoption agency algorithm: public $intended_parent = null; + private $type; + private $value; + private $tag; public function __construct( WP_HTML_Token $token ) { $this->token = $token; + // Just for debugging convenience – remove eventually + $this->type = $token->type; + $this->value = $token->value; + $this->tag = $token->tag; } public function append_child( WP_HTML_Node $node ) { @@ -147,16 +163,26 @@ public function remove( WP_HTML_Node $node ) { } public function __toString() { - $out = ''; - $indent = str_repeat( ' ', $this->depth ); - $out .= $indent . $this->token . "\n"; - foreach ( $this->children as $child ) { - $out .= $child; - } - return $out; + return wp_html_node_to_ascii_tree( $this ); } } + +function wp_html_node_to_ascii_tree( WP_HTML_Node $node, $prefix = '', $is_last = false ) { + $ascii_tree = $prefix . ( $node->parent ? ($is_last ? '└─ ' : '├─ ') : ' ' ) . $node->token . "\n"; + + // Recursively process the children of the current node + $children = array_values($node->children); + $num_children = count( $children ); + for ( $i = 0; $i < $num_children; $i++ ) { + $child_prefix = $prefix . ( $i == $num_children - 1 ? 
' ' : ' ' ); + $is_last_child = ( $i == $num_children - 1 ); + $ascii_tree .= wp_html_node_to_ascii_tree( $children[ $i ], $child_prefix, $is_last_child ); + } + + return $ascii_tree; +} + class WP_HTML_Insertion_Mode { const INITIAL = 'INITIAL'; @@ -224,7 +250,7 @@ public function __construct( $html ) { public function parse() { echo("HTML before main loop:\n"); echo($this->html); - echo("\n\n"); + echo("\n"); while ($token = $this->next_token()) { $this->last_token = $token; $processed_token = $this->process_in_body_insertion_mode($token); @@ -233,6 +259,7 @@ public function parse() { echo("\n"); echo("DOM after main loop:\n"); echo($this->root_node.''); + echo "\n\n"; // @TODO: // switch($this->insertion_mode) { // case WP_HTML_Insertion_Mode::INITIAL: @@ -999,7 +1026,7 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { // Insert whatever last node ended up being in the previous step at the appropriate place // for inserting a node, but using common ancestor as the override target. - $this->insert_element( $last_node, $common_ancestor ); + $this->insert_node( $last_node, $common_ancestor ); // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. 
@@ -1034,30 +1061,28 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { } } - private function insert_element( $token_or_node, $override_target = null ) { + private function insert_element( WP_HTML_Token $token, $override_target = null ) { // Create element for a token // Skip reset algorithm for now // Skip form-association for now - if($token_or_node instanceof WP_HTML_Token) { - $node = $this->create_element_for_token($token_or_node); - } else { - $node = $token_or_node; - } + $node = $this->create_element_for_token($token); + $this->insert_node($node, $override_target); + array_push($this->open_elements, $node); + return $node; + } + private function insert_node( WP_HTML_Node $node, $override_target = null ) { $target = $override_target ?: $this->current_node(); // Appropriate place for inserting a node: // For now skip foster parenting and always use the // location after the last child of the target $target->append_child($node); - array_push($this->open_elements, $node); - dbg("inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); - return $node; + dbg("Inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); } private function create_element_for_token( WP_HTML_Token $token ) { - $node = new WP_HTML_Node($token); - return $node; + return new WP_HTML_Node($token); } private function insert_text( WP_HTML_Token $token ) { @@ -1622,10 +1647,10 @@ private static function is_formatting_element( $tag_name ) { } -// $p = new WP_HTML_Processor( '

12345

' ); -// $p->parse(); +$p = new WP_HTML_Processor( '

12345

' ); +$p->parse(); /* -Should output: +Outputs: p ├─ #text: 1 ├─ b @@ -1640,11 +1665,12 @@ private static function is_formatting_element( $tag_name ) { $p = new WP_HTML_Processor( '1

23

' ); $p->parse(); /* -Should output: -b -└─ #text: 1 -p -├─ b -│ └─ #text: 2 -└─ #text: 3 +Outputs the correct result: + HTML + ├─ B + └─ #text: 1 + └─ P + ├─ B + └─ #text: 2 + └─ #text: 3 */ From eeea95ae21c74ad6c97c07997a112ef0a3838c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 17:30:23 +0100 Subject: [PATCH 07/42] Correctly cose the p tags --- .../html-api/class-wp-html-processor.php | 93 +++++++++++++++---- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 68fc9028b18a8..28f27d0d9040d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -66,11 +66,17 @@ public function __toString() { case self::MARKER: return 'MARKER'; case self::TAG: + $attributes = ''; + if($this->attributes) { + foreach( $this->attributes as $name => $value ) { + $attributes .= ' ' . $name . '="' . esc_attr( $value ) . '"'; + } + } return sprintf( '%s%s%s', $this->is_closer ? '/' : '', $this->tag, - $this->attributes ? ' ' . implode( ' ', $this->attributes ) : '' + $attributes ); case self::TEXT: return '#text: ' . trim($this->value); @@ -127,7 +133,7 @@ class WP_HTML_Node { */ public $children = array(); /** - * @var string + * @var WP_HTML_Token */ public $token; public $depth = 1; @@ -315,6 +321,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->insert_text( $token ); } else if ( $token->is_opener ) { + dbg( "Found {$token->tag} tag opener" ); // Should we care? 
// if(self::is_rcdata_element($token->tag)) { // $this->original_insertion_mode = $this->insertion_mode; @@ -348,7 +355,10 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { // Ignore special rules for 'PRE' and 'LISTING' case 'PRE': case 'LISTING': - dbg( "Found {$token->tag} tag opener" ); + /* + * If the stack of open elements has a p element in button scope, + * then close a p element. + */ if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } @@ -481,7 +491,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'STRONG': case 'TT': case 'U': - dbg( "Found {$token->tag} tag opener" ); $this->reconstruct_active_formatting_elements(); $node = $this->insert_element( $token ); $this->push_active_formatting_element( $node ); @@ -621,6 +630,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { break; } } else { + dbg( "Found {$token->tag} tag closer" ); switch ( $token->tag ) { case 'ADDRESS': case 'ARTICLE': @@ -670,12 +680,16 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->form_pointer = null; break; case 'P': - dbg( "Found {$token->tag} tag closer" ); + /* + * If the stack of open elements does not have a p element in button scope, + * then this is a parse error; insert an HTML element for a "p" start tag + * token with no attributes. + */ if ( ! $this->is_element_in_button_scope( 'P' ) ) { - // Parse error, insert an HTML element for a "p" start tag token with no attributes. $this->parse_error(); $this->insert_element( WP_HTML_Token::tag( 'P' ) ); } + // Close a p element. $this->close_p_element(); break; case 'LI': @@ -783,9 +797,16 @@ private function next_token() { if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); $this->set_bookmark($bookmark); + $attributes = array(); + $attrs = $this->get_attribute_names_with_prefix(''); + if ($attrs) { + foreach ($attrs as $name) { + $attributes[$name] = $this->get_attribute($name); + } + } $next_tag = WP_HTML_Token::tag( $this->get_tag(), - array(), + $attributes, ! $this->is_tag_closer(), $bookmark ); @@ -852,7 +873,6 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { break; } } - dbg("AAA: Formatting element = {$formatting_element->token->tag}", 2); // If there is no such element, then abort these steps and instead act as // described in the "any other end tag" entry below. @@ -860,6 +880,7 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { dbg("Skipping AAA: no formatting element found", 2); return self::ANY_OTHER_END_TAG; } + dbg("AAA: Formatting element = {$formatting_element->token->tag}", 2); // If formatting element is not in the stack of open elements, then this is // a parse error; remove the element from the list, and return. @@ -921,7 +942,7 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { dbg("AAA: Common ancestor = {$common_ancestor->token->tag}", 2); $this->print_open_elements('AAA: Open elements: ', 2); - $this->print_rafe_formats('AAA: Formatting elements: ', 2); + $this->print_active_formatting_elements('AAA: Formatting elements: ', 2); // Let a bookmark note the position of formatting element in the list of // active formatting elements relative to the elements on either side of it @@ -1115,9 +1136,9 @@ private function pop_until_tag_name( $tags ) { } dbg( "Popping until tag names: " . implode(', ', $tags), 1 ); $this->print_open_elements( "Open elements before: " ); - while ( ! 
in_array( $this->current_node()->token->tag, $tags ) ) { - $this->pop_open_element(); - } + do { + $popped = $this->pop_open_element(); + } while (!in_array($popped->token->tag, $tags)); $this->print_open_elements( "Open elements after: " ); } @@ -1217,7 +1238,7 @@ private function push_active_formatting_element( WP_HTML_Node $node ) { $this->active_formatting_elements[] = $node; } - private function print_rafe_formats($msg, $indent=1) { + private function print_active_formatting_elements($msg, $indent=1) { $formats = array_map( function( $node ) { return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); }, $this->active_formatting_elements); @@ -1232,15 +1253,15 @@ private function print_open_elements($msg, $indent=1) { } private function reconstruct_active_formatting_elements() { - $this->print_rafe_formats('RAFE: before'); + $this->print_active_formatting_elements('AFE: before'); if ( empty( $this->active_formatting_elements ) ) { - dbg( "Skipping RAFE: empty list", 1 ); + dbg( "Skipping AFE: empty list", 1 ); return; } $entry_idx = count( $this->active_formatting_elements ) - 1; $last_entry = $this->active_formatting_elements[ $entry_idx ]; if ( $last_entry->token->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { - dbg( "Skipping RAFE: marker or open element", 1 ); + dbg( "Skipping AFE: marker or open element", 1 ); return; } @@ -1286,7 +1307,7 @@ private function reconstruct_active_formatting_elements() { break; } } - $this->print_rafe_formats('RAFE: after'); + $this->print_active_formatting_elements('AFE: after'); } private function clear_active_formatting_elements_up_to_last_marker() { @@ -1674,3 +1695,41 @@ private static function is_formatting_element( $tag_name ) { └─ #text: 2 └─ #text: 3 */ + + +$p = new WP_HTML_Processor( '

X +

X +

X +

X' ); +$p->parse(); +/* +DOM after main loop: + HTML + ├─ P + └─ B class="x" + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ #text: X + ├─ P + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ #text: X + ├─ P + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ B + └─ B class="x" + └─ B + └─ #text: X + └─ P + └─ #text: X +*/ From ddf2c7311218318b03bbc7df1382ea02538ed9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:45:54 +0100 Subject: [PATCH 08/42] Simplify HTML Processor --- .../html-api/class-wp-html-processor.php | 392 ++++++------------ .../html-api/class-wp-html-tag-processor.php | 2 +- 2 files changed, 124 insertions(+), 270 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 28f27d0d9040d..c24eee4a430f2 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,10 +11,9 @@ function esc_attr( $text ) { } } +define('HTML_DEBUG_MODE', false); function dbg( $message, $indent = 0 ) { - // $show_debug = true; - $show_debug = false; - if( $show_debug ) { + if( HTML_DEBUG_MODE ) { $indent = str_repeat( ' ', $indent * 2 ); echo $indent . $message . 
"\n"; } @@ -138,8 +137,6 @@ class WP_HTML_Node { public $token; public $depth = 1; - // For the adoption agency algorithm: - public $intended_parent = null; private $type; private $value; private $tag; @@ -223,8 +220,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $active_formatting_elements = array(); private $root_node = null; private $context_node = null; - private $original_insertion_mode = null; - private $insertion_mode = null; /* * WP_HTML_Tag_Processor skips over text nodes and only @@ -242,78 +237,57 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $last_token = null; private $inserted_tokens = array(); - private $head_pointer; - private $form_pointer; + const MAX_BOOKMARKS = 1000000; public function __construct( $html ) { parent::__construct( $html ); $this->root_node = new WP_HTML_Node(WP_HTML_Token::tag( 'HTML' )); $this->context_node = new WP_HTML_Node(WP_HTML_Token::tag( 'DOCUMENT' )); $this->open_elements = array( $this->root_node ); - $this->reset_insertion_mode(); } public function parse() { echo("HTML before main loop:\n"); echo($this->html); echo("\n"); - while ($token = $this->next_token()) { - $this->last_token = $token; - $processed_token = $this->process_in_body_insertion_mode($token); - $this->last_token = $processed_token; + while ($this->process_next_token()) { + // ... twiddle thumbs ... 
} echo("\n"); echo("DOM after main loop:\n"); echo($this->root_node.''); echo "\n\n"; - // @TODO: - // switch($this->insertion_mode) { - // case WP_HTML_Insertion_Mode::INITIAL: - // $this->next_tag_in_initial_mode(); - // break; - // case WP_HTML_Insertion_Mode::BEFORE_HEAD: - // $this->next_tag_in_before_head_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_HEAD: - // $this->next_tag_in_head_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_BODY: - // $this->next_tag_in_body_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_TABLE: - // $this->next_tag_in_table_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_TABLE_BODY: - // $this->next_tag_in_table_body_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_ROW: - // $this->next_tag_in_row_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_CELL: - // $this->next_tag_in_cell_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_SELECT: - // $this->next_tag_in_select_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE: - // $this->next_tag_in_select_in_table_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_CAPTION: - // $this->next_tag_in_caption_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_COLUMN_GROUP: - // $this->next_tag_in_column_group_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_FRAMESET: - // $this->next_tag_in_frameset_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::TEXT: - // $this->next_tag_in_text_insertion_mode(); - // break; + + echo "Mem peak usage:" . memory_get_peak_usage(true) . 
"\n"; + } + + private function process_next_token() { + $token = $this->next_token(); + if(!$token){ + return false; + } + $this->last_token = $token; + $processed_token = $this->process_token($token); + $this->last_token = $processed_token; + return $processed_token; + } + + private function ignore_token( $ignored_token ) { + // if ( $ignored_token->bookmark ) { + // // $this->release_bookmark( $ignored_token->bookmark ); + // // $ignored_token->bookmark = null; // } + + $token = $this->next_token(); + if(!$token){ + return false; + } + $processed_token = $this->process_token($token); + $this->last_token = $processed_token; + return $processed_token; } - public function process_in_body_insertion_mode(WP_HTML_Token $token) { + public function process_token(WP_HTML_Token $token) { if ( $token->is_text() ) { dbg( "Found text node '$token'" ); dbg( "Inserting text to current node " . $this->current_node()->token->tag, 1 ); @@ -322,11 +296,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { } else if ( $token->is_opener ) { dbg( "Found {$token->tag} tag opener" ); - // Should we care? 
- // if(self::is_rcdata_element($token->tag)) { - // $this->original_insertion_mode = $this->insertion_mode; - // $this->insertion_mode = WP_HTML_Insertion_Mode::TEXT; - // } switch ( $token->tag ) { case 'ADDRESS': case 'ARTICLE': @@ -380,14 +349,9 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->insert_element( $token ); break; case 'FORM': - if ( $this->form_pointer ) { - $this->ignore_token( $token ); - return $this->next_tag(); - } if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - $this->form_pointer = $token; $this->insert_element( $token ); break; case 'LI': @@ -449,8 +413,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { } $this->insert_element( $token ); break; - case 'PLAINTEXT': - throw new Exception( 'PLAINTEXT not implemented yet' ); case 'BUTTON': if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { $this->generate_implied_end_tags(); @@ -514,7 +476,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { break; case 'TABLE': $this->insert_element( $token ); - $this->insertion_mode = WP_HTML_Insertion_Mode::IN_TABLE; break; case 'AREA': case 'BR': @@ -525,7 +486,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); $this->pop_open_element(); - // @TODO: Acknowledge the token's self-closing flag, if it is set. break; case 'PARAM': case 'SOURCE': @@ -540,45 +500,12 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->insert_element( $token ); $this->pop_open_element(); break; - case 'IMAGE': - $this->parse_error(); - // Change the tag name to "img" and reprocess the token. 
- throw new Exception( 'IMAGE not implemented yet' ); case 'TEXTAREA': $this->insert_element( $token ); - $this->original_insertion_mode = $this->insertion_mode; - $this->insertion_mode = WP_HTML_Insertion_Mode::TEXT; break; - - case 'XMP': - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } - $this->reconstruct_active_formatting_elements(); - // @TODO: Follow the generic raw text element parsing algorithm. - throw new Exception( 'XMP not implemented yet' ); - case 'IFRAME': - case 'NOEMBED': - case 'NOSCRIPT': - // @TODO: Follow the generic raw text element parsing algorithm. - throw new Exception( $token->tag . ' not implemented yet' ); case 'SELECT': $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); - if ( in_array( - $this->insertion_mode, - array( - WP_HTML_Insertion_Mode::IN_TABLE, - WP_HTML_Insertion_Mode::IN_CAPTION, - WP_HTML_Insertion_Mode::IN_TABLE_BODY, - WP_HTML_Insertion_Mode::IN_ROW, - WP_HTML_Insertion_Mode::IN_CELL, - ) - ) ) { - $this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE; - } else { - $this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT; - } break; case 'OPTGROUP': case 'OPTION': @@ -606,24 +533,17 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { } $this->insert_element( $token ); break; - case 'MATH': - throw new Exception( 'MATH not implemented yet' ); - case 'SVG': - throw new Exception( 'SVG not implemented yet' ); - case 'CAPTION': - case 'COL': - case 'COLGROUP': - case 'FRAME': - case 'HEAD': - case 'TBODY': - case 'TD': - case 'TFOOT': - case 'TH': - case 'THEAD': - case 'TR': - $this->parse_error(); - // Ignore the token. - return; + + // case 'XMP': + // case 'IFRAME': + // case 'NOEMBED': + // case 'MATH': + // case 'SVG': + // case 'NOSCRIPT': + // case 'PLAINTEXT': + // case 'IMAGE': + // throw new Exception( $token->tag . 
' not implemented yet' ); + default: $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); @@ -656,28 +576,16 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'SECTION': case 'SUMMARY': case 'UL': - if ( $this->is_element_in_scope( $token->tag ) ) { - $this->ignore_token( $token ); + if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( $token->tag ); break; case 'FORM': - if ( $this->form_pointer ) { - $this->ignore_token( $token ); - $this->parse_error(); - return $this->next_tag(); - } - if ( $this->is_element_in_scope( $this->form_pointer ) ) { - $this->ignore_token( $token ); - $this->parse_error(); - return $this->next_tag(); - } $this->generate_implied_end_tags(); - array_splice( $this->open_elements, array_search( $this->form_pointer, $this->open_elements ), 1 ); - $this->form_pointer = null; + $this->pop_until_tag_name( $token->tag ); break; case 'P': /* @@ -694,9 +602,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { break; case 'LI': if ( $this->is_element_in_list_item_scope( 'LI' ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( 'LI' ); @@ -704,9 +611,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'DD': case 'DT': if ( $this->is_element_in_scope( $token->tag ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( $token->tag ); @@ -718,9 +624,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'H5': case 'H6': if ( $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 
'H6' ) ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ); @@ -746,9 +651,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'MARQUEE': case 'OBJECT': if ( $this->is_element_in_scope( $token->tag ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); if ( $this->current_node()->token->tag !== $token->tag ) { @@ -772,9 +676,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->pop_until_node( $node ); break; } elseif ( $this->is_special_element( $node->token->tag ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } else { --$i; } @@ -821,7 +724,7 @@ private function next_token() { */ $last = $this->last_token; if ( - $last + $last && $last->is_tag() && $last->bookmark && $this->has_bookmark($last->bookmark) @@ -1008,7 +911,6 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { * in the HTML namespace, with common ancestor as the intended parent. */ $new_node = $this->create_element_for_token( $node->token ); - $new_node->intended_parent = $common_ancestor; /* * Replace the entry for node in the list of active formatting elements with an entry @@ -1052,7 +954,6 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. $new_element = $this->create_element_for_token( $formatting_element->token ); - $new_element->intended_parent = $furthest_block; // Take all of the child nodes of furthest block and append them to the element created in // the last step. 
@@ -1118,14 +1019,6 @@ private function insert_text( WP_HTML_Token $token ) { $target->append_child(new WP_HTML_Node($token)); } - private function ignore_token( $token ) { - if ( $token->bookmark ) { - $this->release_bookmark( $token->bookmark ); - $token->bookmark = null; - } - return; - } - private function parse_error() { // Noop for now } @@ -1239,17 +1132,21 @@ private function push_active_formatting_element( WP_HTML_Node $node ) { } private function print_active_formatting_elements($msg, $indent=1) { - $formats = array_map( function( $node ) { - return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); - }, $this->active_formatting_elements); - dbg( "$msg " . implode(', ', $formats), $indent ); + if (HTML_DEBUG_MODE) { + $formats = array_map(function ($node) { + return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); + }, $this->active_formatting_elements); + dbg("$msg " . implode(', ', $formats), $indent); + } } private function print_open_elements($msg, $indent=1) { - $elems = array_map(function ($node) { - return $node->token->tag; - }, $this->open_elements); - dbg( "$msg " . implode(', ', $elems), $indent ); + if (HTML_DEBUG_MODE) { + $elems = array_map(function ($node) { + return $node->token->tag; + }, $this->open_elements); + dbg("$msg " . implode(', ', $elems), $indent); + } } private function reconstruct_active_formatting_elements() { @@ -1407,7 +1304,11 @@ private function is_element_in_specific_scope( $target_node, $element_types_list while ( true ) { // 2. If node is the target node, terminate in a match state. 
- if ( $node === $target_node || $node->token->tag === $target_node ) { + if ( is_string( $target_node ) ) { + if ( $node->token->tag === $target_node ) { + return true; + } + } else if ( $node === $target_node ) { return true; } @@ -1432,106 +1333,6 @@ private function is_element_in_specific_scope( $target_node, $element_types_list } } - /** - * https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately - */ - private function reset_insertion_mode() { - $last = false; - $node = end( $this->open_elements ); - - while ( true ) { - if ( count( $this->open_elements ) === 1 && $node === reset( $this->open_elements ) ) { - $last = true; - $node = $this->context_node; - } - - if ( $node->token->tag === 'select' ) { - if ( $last ) { - break; - } - - $ancestor = $node; - while ( true ) { - if ( $ancestor === $this->open_elements[0] ) { - break; - } - - $index = array_search( $ancestor, $this->open_elements ); - $ancestor = $this->open_elements[ $index - 1 ]; - if ( $ancestor->tag === 'template' ) { - break; - } - - if ( $ancestor->tag === 'table' ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE; - return; - } - } - - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_SELECT; - return; - } - - switch ( $node->token->tag ) { - case 'TD': - case 'TH': - if ( ! 
$last ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_CELL; - return; - } - break; - case 'TR': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_ROW; - return; - case 'TBODY': - case 'THEAD': - case 'TFOOT': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_TABLE_BODY; - return; - case 'CAPTION': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_CAPTION; - return; - case 'COLGROUP': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_COLUMN_GROUP; - return; - case 'TABLE': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_TABLE; - return; - case 'TEMPLATE': - // TODO: implement the current template insertion mode - $this->insertion_mode = 0; - return; - case 'HEAD': - if ( ! $last ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_HEAD; - return; - } - break; - case 'BODY': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; - return; - case 'FRAMESET': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_FRAMESET; - return; - case 'HTML': - // TODO: implement the head element pointer - $this->insertion_mode = WP_HTML_Insertion_Mode::BEFORE_HEAD; - return; - default: - if ( $last ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; - return; - } - } - - $index = array_search( $node, $this->open_elements ); - $node = $this->open_elements[ $index - 1 ]; - } - - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; - } - - private static function is_special_element( $tag_name, $except = null ) { if ( null !== $except && in_array( $tag_name, $except, true ) ) { return false; @@ -1667,6 +1468,59 @@ private static function is_formatting_element( $tag_name ) { } +// $dir = realpath( __DIR__ . '/../../../index.html' ); + +// $htmlspec = file_get_contents( $dir ); +// $p = new WP_HTML_Processor( $htmlspec ); +// $p->parse(); + +// die(); + +$p = new WP_HTML_Processor( '

  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
Sit
Amet' ); +$p->parse(); +/* +Outputs: + +DOM after main loop: + HTML + ├─ UL + ├─ LI + └─ #text: 1 + ├─ LI + └─ #text: 2 + ├─ LI + └─ #text: 3 + ├─ LI + ├─ #text: Lorem + └─ B + └─ #text: Ipsum + └─ LI + └─ B + └─ #text: Dolor + └─ B + ├─ #text: Sit + └─ DIV + └─ #text: Amet +*/ + +die(); + +$p = new WP_HTML_Processor( '
12
34' ); +$p->parse(); +/* +Outputs: + p + ├─ #text: 1 + ├─ b + │ ├─ #text: 2 + │ └─ i + │ └─ #text: 3 + ├─ i + │ └─ #text: 4 + └─ #text: 5 +*/ + +die(); $p = new WP_HTML_Processor( '

12345

' ); $p->parse(); diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 958e02cca7cfa..9aca0d6f28b85 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -724,7 +724,7 @@ public function set_bookmark( $name ) { return false; } - if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) { + if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) { _doing_it_wrong( __METHOD__, __( 'Too many bookmarks: cannot create any more.' ), From db40a948624fe7cc167b9757f48c1ff3678c2a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:48:14 +0100 Subject: [PATCH 09/42] Correct the is_element_in_scope checks --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c24eee4a430f2..93b3b93fbdb93 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -601,7 +601,7 @@ public function process_token(WP_HTML_Token $token) { $this->close_p_element(); break; case 'LI': - if ( $this->is_element_in_list_item_scope( 'LI' ) ) { + if ( ! $this->is_element_in_list_item_scope( 'LI' ) ) { $this->parse_error(); return $this->ignore_token( $token ); } @@ -610,7 +610,7 @@ public function process_token(WP_HTML_Token $token) { break; case 'DD': case 'DT': - if ( $this->is_element_in_scope( $token->tag ) ) { + if ( ! 
$this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); return $this->ignore_token( $token ); } @@ -623,7 +623,7 @@ public function process_token(WP_HTML_Token $token) { case 'H4': case 'H5': case 'H6': - if ( $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + if ( ! $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->parse_error(); return $this->ignore_token( $token ); } @@ -650,7 +650,7 @@ public function process_token(WP_HTML_Token $token) { case 'APPLET': case 'MARQUEE': case 'OBJECT': - if ( $this->is_element_in_scope( $token->tag ) ) { + if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); return $this->ignore_token( $token ); } From ea4f392f574fa165da5d34fb225b9b3d1e559ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:49:20 +0100 Subject: [PATCH 10/42] Uncomment some test inputs --- src/wp-includes/html-api/class-wp-html-processor.php | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 93b3b93fbdb93..521a924ee846f 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1503,8 +1503,6 @@ private static function is_formatting_element( $tag_name ) { └─ #text: Amet */ -die(); - $p = new WP_HTML_Processor( '
12
34' ); $p->parse(); /* @@ -1520,8 +1518,6 @@ private static function is_formatting_element( $tag_name ) { └─ #text: 5 */ -die(); - $p = new WP_HTML_Processor( '

12345

' ); $p->parse(); /* From 66fd636c47f7125d3ea84070eba169dc81a0da9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:53:06 +0100 Subject: [PATCH 11/42] Document insert_node --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 521a924ee846f..ffe10bc0f2be8 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -995,10 +995,10 @@ private function insert_element( WP_HTML_Token $token, $override_target = null ) private function insert_node( WP_HTML_Node $node, $override_target = null ) { $target = $override_target ?: $this->current_node(); - - // Appropriate place for inserting a node: - // For now skip foster parenting and always use the - // location after the last child of the target + /** + * Appropriate place for inserting a node is always the end of the + * target's children thanks to the assumptions this parser makes. 
+ */ $target->append_child($node); dbg("Inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); } From 93fea6ccd499ed861ae7e4f0aa05c87a6f7ff0ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 23 Feb 2023 10:57:01 +0100 Subject: [PATCH 12/42] Simplify ignore_token() --- .../html-api/class-wp-html-processor.php | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ffe10bc0f2be8..11b1a5b52d237 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -149,7 +149,7 @@ public function __construct( WP_HTML_Token $token ) { $this->tag = $token->tag; } - public function append_child( WP_HTML_Node $node ) { + public function append_child( WP_HTML_Node $node ) { if($node->parent) { $node->parent->remove($node); } @@ -278,13 +278,8 @@ private function ignore_token( $ignored_token ) { // // $ignored_token->bookmark = null; // } - $token = $this->next_token(); - if(!$token){ - return false; - } - $processed_token = $this->process_token($token); - $this->last_token = $processed_token; - return $processed_token; + $this->last_token = $ignored_token; + return $this->process_next_token(); } public function process_token(WP_HTML_Token $token) { @@ -1476,7 +1471,7 @@ private static function is_formatting_element( $tag_name ) { // die(); -$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
Sit
Amet' ); +$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); $p->parse(); /* Outputs: @@ -1498,9 +1493,13 @@ private static function is_formatting_element( $tag_name ) { └─ B └─ #text: Dolor └─ B - ├─ #text: Sit - └─ DIV - └─ #text: Amet + └─ SPAN + ├─ #text: Sit + └─ SPAN + ├─ #text: Sit + └─ SPAN + └─ DIV + └─ #text: Amet */ $p = new WP_HTML_Processor( '
12
34' ); From fd2ddcfa086d6d0b3748155ae69294cb48ff45cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 23 Feb 2023 13:01:53 +0100 Subject: [PATCH 13/42] Start exploring a text-based API --- .../html-api/class-wp-html-processor.php | 88 ++++++++++++------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 11b1a5b52d237..c9f213b900494 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,7 +11,7 @@ function esc_attr( $text ) { } } -define('HTML_DEBUG_MODE', false); +define('HTML_DEBUG_MODE', true); function dbg( $message, $indent = 0 ) { if( HTML_DEBUG_MODE ) { $indent = str_repeat( ' ', $indent * 2 ); @@ -237,6 +237,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $last_token = null; private $inserted_tokens = array(); + public $reconstructed_html = ''; + const MAX_BOOKMARKS = 1000000; public function __construct( $html ) { @@ -253,6 +255,11 @@ public function parse() { while ($this->process_next_token()) { // ... twiddle thumbs ... 
} + + while ( count($this->open_elements) > 1 ) { + $this->pop_open_element(); + } + echo("\n"); echo("DOM after main loop:\n"); echo($this->root_node.''); @@ -979,6 +986,11 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { } private function insert_element( WP_HTML_Token $token, $override_target = null ) { + // Text API: + $this->reconstructed_html .= '<'.$token->tag.'>'; + + // Object-oriented API: + // Create element for a token // Skip reset algorithm for now // Skip form-association for now @@ -1003,6 +1015,10 @@ private function create_element_for_token( WP_HTML_Token $token ) { } private function insert_text( WP_HTML_Token $token ) { + // Text API: + $this->reconstructed_html .= $token->value; + + // Object-oriented API: $target = $this->current_node(); if(count($target->children)){ $last_child = end($target->children); @@ -1038,6 +1054,11 @@ private function pop_until_node( $node ) { private function pop_open_element() { $popped = array_pop( $this->open_elements ); + + // Text API: + $this->reconstructed_html .= 'token->tag.'>'; + + // Object-oriented API: if ( $popped->token->bookmark ) { $this->release_bookmark( $popped->token->bookmark ); $popped->token->bookmark = null; @@ -1471,8 +1492,36 @@ private static function is_formatting_element( $tag_name ) { // die(); -$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); -$p->parse(); +// $p = new WP_HTML_Processor( '

12345

' ); +// $p->parse(); +/* +Outputs: + p + ├─ #text: 1 + ├─ b + │ ├─ #text: 2 + │ └─ i + │ └─ #text: 3 + ├─ i + │ └─ #text: 4 + └─ #text: 5 +*/ +// die(); + +// $p = new WP_HTML_Processor( '
12
34' ); +// $p->parse(); +/* +DOM after main loop: + HTML + ├─ DIV + ├─ #text: 1 + └─ SPAN + └─ #text: 2 + └─ #text: 34 +*/ + +// $p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); +// $p->parse(); /* Outputs: @@ -1502,35 +1551,6 @@ private static function is_formatting_element( $tag_name ) { └─ #text: Amet */ -$p = new WP_HTML_Processor( '
12
34' ); -$p->parse(); -/* -Outputs: - p - ├─ #text: 1 - ├─ b - │ ├─ #text: 2 - │ └─ i - │ └─ #text: 3 - ├─ i - │ └─ #text: 4 - └─ #text: 5 -*/ - -$p = new WP_HTML_Processor( '

12345

' ); -$p->parse(); -/* -Outputs: - p - ├─ #text: 1 - ├─ b - │ ├─ #text: 2 - │ └─ i - │ └─ #text: 3 - ├─ i - │ └─ #text: 4 - └─ #text: 5 -*/ $p = new WP_HTML_Processor( '1

23

' ); $p->parse(); @@ -1544,7 +1564,9 @@ private static function is_formatting_element( $tag_name ) { └─ #text: 2 └─ #text: 3 */ - +echo "\n\n"; +echo $p->reconstructed_html; +die(); $p = new WP_HTML_Processor( '

X

X From faf724e56dde1aa7d6e0c033e5b8e04f76d3ade3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 23 Feb 2023 15:31:25 +0100 Subject: [PATCH 14/42] Doodling more --- .../html-api/class-wp-html-processor.php | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c9f213b900494..707c82d0560eb 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,7 +11,7 @@ function esc_attr( $text ) { } } -define('HTML_DEBUG_MODE', true); +define('HTML_DEBUG_MODE', false); function dbg( $message, $indent = 0 ) { if( HTML_DEBUG_MODE ) { $indent = str_repeat( ' ', $indent * 2 ); @@ -441,7 +441,8 @@ public function process_token(WP_HTML_Token $token) { } $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); break; case 'B': case 'BIG': @@ -949,6 +950,10 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { $last_node = $node; } + // $this->reconstructed_html .= ''; + // $this->reconstructed_html .= '<'.$common_ancestor->token->tag.'>'; + // $this->reconstructed_html .= '<'.$last_node->token->tag.'>'; + // Insert whatever last node ended up being in the previous step at the appropriate place // for inserting a node, but using common ancestor as the override target. $this->insert_node( $last_node, $common_ancestor ); @@ -1551,24 +1556,29 @@ private static function is_formatting_element( $tag_name ) { └─ #text: Amet */ - -$p = new WP_HTML_Processor( '1

23

' ); +$p = new WP_HTML_Processor( ' +
+
+
+
+
' ); $p->parse(); -/* -Outputs the correct result: - HTML - ├─ B - └─ #text: 1 - └─ P - ├─ B - └─ #text: 2 - └─ #text: 3 -*/ +// $p = new WP_HTML_Processor( '1

23

' ); +// $p->parse(); +// /* +// Outputs the correct result: +// B +// └─ #text: 1 +// P +// ├─ B +// └─ #text: 2 +// └─ #text: 3 +// */ echo "\n\n"; echo $p->reconstructed_html; die(); -$p = new WP_HTML_Processor( '

X +$p = new WP_HTML_Processor( '

X

X

X

X' ); From 0565b6ba03d1dbec02815e006ea7ad26903642a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 27 Feb 2023 13:45:01 +0100 Subject: [PATCH 15/42] Simplify the adoption agency algorithm --- .../html-api/class-wp-html-processor.php | 181 ++---------------- 1 file changed, 15 insertions(+), 166 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 707c82d0560eb..4b40bb571464a 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -838,159 +838,14 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { return; } - dbg("AAA: Furthest block = {$furthest_block->token->tag}", 2); - - // Let common ancestor be the element immediately above formatting element - // in the stack of open elements. - $formatting_elem_stack_index = array_search( $formatting_element, $this->open_elements, true ); - $common_ancestor = $this->open_elements[ $formatting_elem_stack_index - 1 ]; - - dbg("AAA: Common ancestor = {$common_ancestor->token->tag}", 2); - - $this->print_open_elements('AAA: Open elements: ', 2); - $this->print_active_formatting_elements('AAA: Formatting elements: ', 2); - - // Let a bookmark note the position of formatting element in the list of - // active formatting elements relative to the elements on either side of it - // in the list. - $bookmark = $formatting_element_idx; - - // Let node and last node be furthest block. - $node = $last_node = $furthest_block; - $node_open_elements_index = array_search( $node, $this->open_elements, true ); - - $prev_open_element_index = false; - $inner_loop_counter = 0; - while ( true ) { - $inner_loop_counter++; - - /** - * Let node be the element immediately above node in the stack of open elements, - * or if node is no longer in the stack of open elements (e.g. 
because it got - * removed by this algorithm), the element that was immediately above node in - * the stack of open elements before node was removed. - */ - $node_open_elements_index = array_search( $node, $this->open_elements, true ); - if ( false === $node_open_elements_index ) { - if ( false === $prev_open_element_index ) { - throw new Exception( 'Unexpected error in AAA algorithm – cannot find node.' ); - } - $node_open_elements_index = $prev_open_element_index; - } - --$node_open_elements_index; - if( $node_open_elements_index < 0 ) { - throw new Exception( 'Unexpected error in AAA algorithm – node is not in the stack of open elements.' ); - } - $node = $this->open_elements[ $node_open_elements_index ]; - $prev_open_element_index = $node_open_elements_index; - - // If node is formatting element, then break. - if ( $node === $formatting_element ) { - dbg("AAA: Inner loop break – node is formatting element", 3); - break; - } - - /* - * If inner loop counter is greater than 3 and node is in the list - * of active formatting elements, then remove node from the list of - * active formatting elements. - */ - if ( $inner_loop_counter > 3 && in_array( $node, $this->active_formatting_elements, true ) ) { - $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); - array_splice( $this->active_formatting_elements, $node_formatting_idx, 1 ); - } - - /* - * If node is not in the list of active formatting elements, then remove - * node from the stack of open elements and continue. - */ - if ( ! in_array( $node, $this->active_formatting_elements, true ) ) { - dbg("AAA: Inner loop – removing node from the stack of open elements", 3); - array_splice( $this->open_elements, $node_open_elements_index, 1 ); - } - - /* - * Create an element for the token for which the element node was created, - * in the HTML namespace, with common ancestor as the intended parent. 
- */ - $new_node = $this->create_element_for_token( $node->token ); - - /* - * Replace the entry for node in the list of active formatting elements with an entry - * for the new element. - */ - $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); - $this->active_formatting_elements[ $node_formatting_idx ] = $new_node; - - /* - * Replace the entry for node in the stack of open elements with an entry for - * the new element. - */ - $idx = array_search( $node, $this->open_elements, true ); - $this->open_elements[ $idx ] = $new_node; - - /* - * Let node be the new element. - */ - $node = $new_node; - - /* - * If last node is furthest block, then move the aforementioned bookmark to be - * immediately after the new node in the list of active formatting elements. - */ - if ( $last_node === $furthest_block ) { - $bookmark = $node_formatting_idx + 1; - } - - // Append last node to node. - dbg("AAA: Appending {$last_node->token->tag} to {$node->token->tag}", 3); - $node->append_child( $last_node ); - - // Set last node to node. - $last_node = $node; - } - - // $this->reconstructed_html .= ''; - // $this->reconstructed_html .= '<'.$common_ancestor->token->tag.'>'; - // $this->reconstructed_html .= '<'.$last_node->token->tag.'>'; - - // Insert whatever last node ended up being in the previous step at the appropriate place - // for inserting a node, but using common ancestor as the override target. - $this->insert_node( $last_node, $common_ancestor ); - - // Create an element for the token for which formatting element was created, in the HTML - // namespace, with furthest block as the intended parent. - $new_element = $this->create_element_for_token( $formatting_element->token ); - - // Take all of the child nodes of furthest block and append them to the element created in - // the last step. - foreach ($furthest_block->children as $child) { - $new_element->append_child( $child ); - } - - // Append that new element to furthest block. 
- $furthest_block->append_child( $new_element ); - - // Remove formatting element from the list of active formatting elements - $idx = array_search( $formatting_element, $this->active_formatting_elements, true ); - array_splice( $this->active_formatting_elements, $idx, 1 ); - - // Insert the new element into the list of active formatting elements at the - // position of the aforementioned bookmark. - array_splice( $this->active_formatting_elements, $bookmark, 0, array( $new_element ) ); - - // Remove formatting element from the stack of open elements - $idx = array_search( $formatting_element, $this->open_elements, true ); - array_splice( $this->open_elements, $idx, 1 ); - - // Insert the new element into the stack of open elements immediately below the - // position of furthest block in that stack. - $idx = array_search( $furthest_block, $this->open_elements, true ); - array_splice( $this->open_elements, $idx + 1, 0, array( $new_element ) ); + // We didn't bale out so far, but the algorithm is not implemented. + // Let's error out. + break; } + throw new Exception('Adoption Agency Algorithm not supported.'); } - private function insert_element( WP_HTML_Token $token, $override_target = null ) { + private function insert_element( WP_HTML_Token $token ) { // Text API: $this->reconstructed_html .= '<'.$token->tag.'>'; @@ -999,24 +854,16 @@ private function insert_element( WP_HTML_Token $token, $override_target = null ) // Create element for a token // Skip reset algorithm for now // Skip form-association for now - $node = $this->create_element_for_token($token); - $this->insert_node($node, $override_target); - array_push($this->open_elements, $node); - return $node; - } - - private function insert_node( WP_HTML_Node $node, $override_target = null ) { - $target = $override_target ?: $this->current_node(); /** * Appropriate place for inserting a node is always the end of the * target's children thanks to the assumptions this parser makes. 
*/ - $target->append_child($node); - dbg("Inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); - } + $node = new WP_HTML_Node($token); + $this->current_node()->append_child($node); + dbg("Inserted element: {$node->token->tag} to parent {$this->current_node()->token->tag}", 2); - private function create_element_for_token( WP_HTML_Token $token ) { - return new WP_HTML_Node($token); + array_push($this->open_elements, $node); + return $node; } private function insert_text( WP_HTML_Token $token ) { @@ -1497,8 +1344,8 @@ private static function is_formatting_element( $tag_name ) { // die(); -// $p = new WP_HTML_Processor( '

12345

' ); -// $p->parse(); +$p = new WP_HTML_Processor( '

12345

' ); +$p->parse(); /* Outputs: p @@ -1511,7 +1358,9 @@ private static function is_formatting_element( $tag_name ) { │ └─ #text: 4 └─ #text: 5 */ -// die(); +echo "\n\n"; +echo $p->reconstructed_html; +die(); // $p = new WP_HTML_Processor( '
12
34' ); // $p->parse(); From a2879999c4b4d068e8df185b9e0d2ed691780a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 27 Feb 2023 14:15:10 +0100 Subject: [PATCH 16/42] Get rid of next_token() logic --- .../html-api/class-wp-html-processor.php | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 4b40bb571464a..e276f10c750bf 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -19,6 +19,19 @@ function dbg( $message, $indent = 0 ) { } } +class WP_HTML_Tag_Token { + + public $tag; + + public $bookmark; + + public function __construct( $tag, $bookmark = null ) { + $this->tag = $tag; + $this->bookmark = $bookmark; + } + +} + class WP_HTML_Token { const MARKER = 'MARKER'; const TAG = 'TAG'; From 74300673eec0a0cc795cfd14d5861613b997a847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 27 Feb 2023 14:24:37 +0100 Subject: [PATCH 17/42] Remove Object-oriented logic --- .../html-api/class-wp-html-text-processor.php | 1251 +++++++++++++++++ 1 file changed, 1251 insertions(+) create mode 100644 src/wp-includes/html-api/class-wp-html-text-processor.php diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php new file mode 100644 index 0000000000000..3ffd03e499ca4 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -0,0 +1,1251 @@ +tag = $tag; + $this->bookmark = $bookmark; + } + +} + +/** + * + */ +class WP_HTML_Processor extends WP_HTML_Tag_Processor { + + private $MARKER; + + /** + * @var WP_HTML_Tag_Token[] + */ + private $open_elements = array(); + /** + * @var WP_HTML_Tag_Token[] + */ + private $active_formatting_elements = array(); + private $root_node = null; + private $context_node = null; + + /* + * 
WP_HTML_Tag_Processor skips over text nodes and only + * processes tags. + * + * WP_HTML_Processor needs to process text nodes as well. + * + * Whenever the tag processor skips over text to move to + * the next tag, the next_token() method emits that text + * as a token and stores the tag in $buffered_tag to be + * returned the next time. + */ + private $buffered_tag = null; + + private $last_token = null; + private $inserted_tokens = array(); + + public $reconstructed_html = ''; + + const MAX_BOOKMARKS = 1000000; + + public function __construct( $html ) { + parent::__construct( $html ); + $this->MARKER = new WP_HTML_Tag_Token(null); + $this->root_node = new WP_HTML_Tag_Token( 'HTML' ); + $this->context_node = new WP_HTML_Tag_Token( 'DOCUMENT' ); + $this->open_elements = array( $this->root_node ); + } + + public function parse() { + echo("HTML before main loop:\n"); + echo($this->html); + echo("\n"); + while ($this->next_node()) { + // ... twiddle thumbs ... + } + + while ( count($this->open_elements) > 1 ) { + $this->pop_open_element(); + } + + echo("\n"); + echo("HTML after main loop:\n"); + echo($this->reconstructed_html.''); + echo "\n\n"; + + echo "Mem peak usage:" . memory_get_peak_usage(true) . "\n"; + } + + public function ignore_token() { + // @TODO: remove the current tag from $this->html instead of + // not appending it to $this->reconstructed_html + return $this->next_node(); + } + + public function next_node() { + $text_start = $this->tag_ends_at + 1; + + $next_tag = false; + if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $next_tag = new WP_HTML_Tag_Token( + $this->get_tag(), + $bookmark + ); + $text_end = $this->bookmarks[$bookmark]->start; + } else { + $text_end = strlen($this->html); + } + + if ($text_start < $text_end) { + $text = substr($this->html, $text_start, $text_end - $text_start); + dbg( "Found text node '$text'" ); + dbg( "Appending text to reconstructed HTML", 1 ); + $this->reconstruct_active_formatting_elements(); + // @TODO don't append stuff to $this->reconstructed_html + // instead, skip over the text in $this->html + $this->reconstructed_html .= $text; + } + + if ( ! $next_tag ) { + return false; + } + + $token = $next_tag; + if ( ! $this->is_tag_closer() ) { + dbg( "Found {$token->tag} tag opener" ); + switch ( $token->tag ) { + case 'ADDRESS': + case 'ARTICLE': + case 'ASIDE': + case 'BLOCKQUOTE': + case 'CENTER': + case 'DETAILS': + case 'DIALOG': + case 'DIR': + case 'DIV': + case 'DL': + case 'FIELDSET': + case 'FIGCAPTION': + case 'FIGURE': + case 'FOOTER': + case 'HEADER': + case 'HGROUP': + case 'MAIN': + case 'MENU': + case 'NAV': + case 'OL': + case 'P': + case 'SECTION': + case 'SUMMARY': + case 'UL': + // Ignore special rules for 'PRE' and 'LISTING' + case 'PRE': + case 'LISTING': + /* + * If the stack of open elements has a p element in button scope, + * then close a p element. 
+ */ + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + break; + // A start tag whose tag name is "h1", "h2", "h3", "h4", "h5", or "h6" + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + if ( in_array( $this->current_node()->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + $this->pop_open_element(); + } + $this->insert_element( $token ); + break; + case 'FORM': + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + break; + case 'LI': + $i = count( $this->open_elements ) - 1; + while ( true ) { + $node = $this->open_elements[ $i ]; + if ( $node->tag === 'LI' ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'LI' ), + ) + ); + $this->pop_until_tag_name( 'LI' ); + break; + } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + break; + } else { + --$i; + $node = $this->open_elements[ $i ]; + } + } + + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + break; + case 'DD': + case 'DT': + $i = count( $this->open_elements ) - 1; + while ( true ) { + $node = $this->open_elements[ $i ]; + if ( $node->tag === 'DD' ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'DD' ), + ) + ); + $this->pop_until_tag_name( 'DD' ); + break; + } elseif ( $node->tag === 'DT' ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'DT' ), + ) + ); + $this->pop_until_tag_name( 'DT' ); + break; + } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + break; + } else { + --$i; + $node = $this->open_elements[ $i ]; + } + } + + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + 
break; + case 'BUTTON': + if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( 'BUTTON' ); + } + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + case 'A': + $active_a = null; + for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; --$i ) { + $node = $this->active_formatting_elements[ $i ]; + if ( $node->tag === 'A' ) { + $active_a = $node; + break; + } elseif ( $this->MARKER !== $node ) { + break; + } + } + + if ( $active_a ) { + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + } + + $this->reconstruct_active_formatting_elements(); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); + break; + case 'B': + case 'BIG': + case 'CODE': + case 'EM': + case 'FONT': + case 'I': + case 'S': + case 'SMALL': + case 'STRIKE': + case 'STRONG': + case 'TT': + case 'U': + $this->reconstruct_active_formatting_elements(); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); + break; + case 'NOBR': + $this->reconstruct_active_formatting_elements(); + if ( $this->is_element_in_scope( 'NOBR' ) ) { + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + $this->reconstruct_active_formatting_elements(); + } + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); + break; + case 'APPLET': + case 'MARQUEE': + case 'OBJECT': + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + $this->active_formatting_elements[] = $this->MARKER; + break; + case 'TABLE': + $this->insert_element( $token ); + break; + case 'AREA': + case 'BR': + case 'EMBED': + case 'IMG': + case 'KEYGEN': + case 'WBR': + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + $this->pop_open_element(); + break; + case 'PARAM': + case 'SOURCE': + case 'TRACK': + 
$this->insert_element( $token ); + $this->pop_open_element(); + break; + case 'HR': + if ( $this->is_element_in_button_scope( 'P' ) ) { + $this->close_p_element(); + } + $this->insert_element( $token ); + $this->pop_open_element(); + break; + case 'TEXTAREA': + $this->insert_element( $token ); + break; + case 'SELECT': + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + case 'OPTGROUP': + case 'OPTION': + if ( 'OPTION' === $token->tag ) { + $this->pop_open_element(); + } + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + case 'RB': + case 'RTC': + if ( $this->is_element_in_scope( 'RB' ) || $this->is_element_in_scope( 'RTC' ) ) { + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + $this->reconstruct_active_formatting_elements(); + } + $this->insert_element( $token ); + break; + case 'RP': + case 'RT': + if ( $this->is_element_in_scope( 'RP' ) || $this->is_element_in_scope( 'RT' ) ) { + $this->parse_error(); + $this->adoption_agency_algorithm( $token ); + $this->reconstruct_active_formatting_elements(); + } + $this->insert_element( $token ); + break; + + // case 'XMP': + // case 'IFRAME': + // case 'NOEMBED': + // case 'MATH': + // case 'SVG': + // case 'NOSCRIPT': + // case 'PLAINTEXT': + // case 'IMAGE': + // throw new Exception( $token->tag . ' not implemented yet' ); + + default: + $this->reconstruct_active_formatting_elements(); + $this->insert_element( $token ); + break; + } + } else { + dbg( "Found {$token->tag} tag closer" ); + switch ( $token->tag ) { + case 'ADDRESS': + case 'ARTICLE': + case 'ASIDE': + case 'BLOCKQUOTE': + case 'CENTER': + case 'DETAILS': + case 'DIALOG': + case 'DIR': + case 'DIV': + case 'DL': + case 'FIELDSET': + case 'FIGCAPTION': + case 'FIGURE': + case 'FOOTER': + case 'HEADER': + case 'HGROUP': + case 'MAIN': + case 'MENU': + case 'NAV': + case 'OL': + case 'PRE': + case 'SECTION': + case 'SUMMARY': + case 'UL': + if ( ! 
$this->is_element_in_scope( $token->tag ) ) { + $this->parse_error(); + return $this->ignore_token(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( $token->tag ); + break; + case 'FORM': + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( $token->tag ); + break; + case 'P': + /* + * If the stack of open elements does not have a p element in button scope, + * then this is a parse error; insert an HTML element for a "p" start tag + * token with no attributes. + */ + if ( ! $this->is_element_in_button_scope( 'P' ) ) { + $this->parse_error(); + $this->insert_element( new WP_HTML_Tag_Token( 'P' ) ); + } + // Close a p element. + $this->close_p_element(); + break; + case 'LI': + if ( ! $this->is_element_in_list_item_scope( 'LI' ) ) { + $this->parse_error(); + return $this->ignore_token(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( 'LI' ); + break; + case 'DD': + case 'DT': + if ( ! $this->is_element_in_scope( $token->tag ) ) { + $this->parse_error(); + return $this->ignore_token(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( $token->tag ); + break; + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + if ( ! $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + $this->parse_error(); + return $this->ignore_token(); + } + $this->generate_implied_end_tags(); + $this->pop_until_tag_name( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ); + break; + case 'A': + case 'B': + case 'BIG': + case 'CODE': + case 'EM': + case 'FONT': + case 'I': + case 'S': + case 'SMALL': + case 'STRIKE': + case 'STRONG': + case 'TT': + case 'U': + dbg( "Found {$token->tag} tag closer" ); + $this->adoption_agency_algorithm( $token ); + break; + + case 'APPLET': + case 'MARQUEE': + case 'OBJECT': + if ( ! 
$this->is_element_in_scope( $token->tag ) ) { + $this->parse_error(); + return $this->ignore_token(); + } + $this->generate_implied_end_tags(); + if ( $this->current_node()->tag !== $token->tag ) { + $this->parse_error(); + } + $this->pop_until_tag_name( $token->tag ); + $this->clear_active_formatting_elements_up_to_last_marker(); + break; + case 'BR': + // This should never happen since Tag_Processor corrects that + default: + $i = count( $this->open_elements ) - 1; + while ( true ) { + $node = $this->open_elements[ $i ]; + if ( $node->tag === $token->tag ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( $token->tag ), + ) + ); + $this->pop_until_node( $node ); + break; + } elseif ( $this->is_special_element( $node->tag ) ) { + $this->parse_error(); + return $this->ignore_token(); + } else { + --$i; + } + } + break; + } + } + return $token; + } + + private $element_bookmark_idx = 0; + private function next_token() { + if($this->buffered_tag){ + $next_tag = $this->buffered_tag; + $this->buffered_tag = null; + return $next_tag; + } + + $next_tag = false; + if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $attributes = array(); + $attrs = $this->get_attribute_names_with_prefix(''); + if ($attrs) { + foreach ($attrs as $name) { + $attributes[$name] = $this->get_attribute($name); + } + } + $next_tag = new WP_HTML_Tag_Token( + $this->get_tag(), + $bookmark + ); + $text_end = $this->bookmarks[$bookmark]->start; + } else { + $text_end = strlen($this->html); + } + + /* + * If any text was found between the last tag and this one, + * save the next tag for later and return the text token. 
+ */ + $last = $this->last_token; + if ( + $last + && $last->bookmark + && $this->has_bookmark($last->bookmark) + ) { + $text_start = $this->bookmarks[$last->bookmark]->end + 1; + if ($text_start < $text_end) { + $this->buffered_tag = $next_tag; + $text = substr($this->html, $text_start, $text_end - $text_start); + return $text; + } + } + + return $next_tag; + } + + const ANY_OTHER_END_TAG = 1; + private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { + dbg("Adoption Agency Algorithm", 1); + $subject = $token->tag; + $current_node = $this->current_node(); + if ( + $current_node->tag === $subject + && ! in_array( $current_node, $this->active_formatting_elements, true ) + ) { + $this->pop_open_element(); + dbg("Skipping AAA: current node is \$subject ($subject) and is not AFE", 2); + return; + } + + $outer_loop_counter = 0; + while ( ++$outer_loop_counter < 8 ) { + /* + * Let __formatting element__ be the last element in the list of active + * formatting elements that: + * - is between the end of the list and the last marker in the list, + * if any, or the start of the list otherwise, and + * - has the same tag name as the token. + */ + $formatting_element = null; + $formatting_element_idx = -1; + for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { + $candidate = $this->active_formatting_elements[ $i ]; + if ( $this->MARKER === $candidate ) { + break; + } + if ( $candidate->tag === $subject ) { + $formatting_element = $candidate; + $formatting_element_idx = $i; + break; + } + } + + // If there is no such element, then abort these steps and instead act as + // described in the "any other end tag" entry below. 
+ if ( null === $formatting_element ) { + dbg("Skipping AAA: no formatting element found", 2); + return self::ANY_OTHER_END_TAG; + } + dbg("AAA: Formatting element = {$formatting_element->tag}", 2); + + // If formatting element is not in the stack of open elements, then this is + // a parse error; remove the element from the list, and return. + if ( ! in_array( $formatting_element, $this->open_elements, true ) ) { + array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); + $this->parse_error(); + dbg("Skipping AAA: formatting element is not in the stack of open elements", 2); + return; + } + + // If formatting element is not in scope, then this is a parse error; return + if ( ! $this->is_element_in_scope( $formatting_element ) ) { + $this->parse_error(); + dbg("Skipping AAA: formatting element {$formatting_element->tag} is not in scope", 2); + $this->print_open_elements('Open elements: ', 2); + return; + } + + // If formatting element is not the current node, then this is a parse error. + // (But do not return.) + if ( $formatting_element !== $this->current_node() ) { + $this->parse_error(); + } + + /* + * Let furthest block be the topmost node in the stack of open elements that + * is lower in the stack than formatting element, and is an element in the + * special category. There might not be one. + */ + $furthest_block = null; + for ( $i = count( $this->open_elements ) - 1; $i >= 0; $i-- ) { + $node = $this->open_elements[ $i ]; + if ( $node === $formatting_element ) { + break; + } + if ( $this->is_special_element( $node->tag ) ) { + $furthest_block = $node; + } + } + + // If there is no such node, then the UA must first pop all the nodes from + // the bottom of the stack of open elements, from the current node up to + // and including formatting element, then remove formatting element from + // the list of active formatting elements, and finally abort these steps. 
+ if ( null === $furthest_block ) { + $this->pop_until_node( $formatting_element ); + array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); + dbg("Skipping AAA: no furthest block found", 2); + return; + } + + // We didn't bale out so far, but the algorithm is not implemented. + // Let's error out. + break; + } + throw new Exception('Adoption Agency Algorithm not supported.'); + } + + private function insert_element( WP_HTML_Tag_Token $token ) { + // Text API: + // @TODO: do nothing if $token is already in $this->html + // instead of building $this->reconstructed_html + // from scratch + // @TODO attrs + $this->reconstructed_html .= '<'.$token->tag.'>'; + array_push($this->open_elements, $token); + return $token; + } + + private function parse_error() { + // Noop for now + } + + private function pop_until_tag_name( $tags ) { + if ( ! is_array( $tags ) ) { + $tags = array( $tags ); + } + dbg( "Popping until tag names: " . implode(', ', $tags), 1 ); + $this->print_open_elements( "Open elements before: " ); + do { + $popped = $this->pop_open_element(); + } while (!in_array($popped->tag, $tags)); + $this->print_open_elements( "Open elements after: " ); + } + + private function pop_until_node( $node ) { + do { + $popped = $this->pop_open_element(); + } while ( $popped !== $node ); + } + + private function pop_open_element() { + $popped = array_pop( $this->open_elements ); + + // Text API: + $this->reconstructed_html .= 'tag.'>'; + + // Object-oriented API: + if ( $popped->bookmark ) { + $this->release_bookmark( $popped->bookmark ); + } + return $popped; + } + + private function generate_implied_end_tags( $options = null ) { + while ( $this->should_generate_implied_end_tags( $options ) ) { + yield $this->pop_open_element(); + } + } + + private function current_node() { + return end( $this->open_elements ); + } + + private function close_p_element() { + dbg( "close_p_element" ); + $this->generate_implied_end_tags( + array( + 'except_for' => array( 
'P' ), + ) + ); + // If the current node is not a p element, then this is a parse error. + if ( $this->get_tag() !== 'P' ) { + $this->parse_error(); + } + $this->pop_until_tag_name( 'P' ); + } + + private function should_generate_implied_end_tags( $options = null ) { + $current_tag_name = $this->get_tag(); + if ( null !== $options && isset( $options['except_for'] ) && in_array( $current_tag_name, $options['except_for'] ) ) { + return false; + } + switch ( $current_tag_name ) { + case 'DD': + case 'DT': + case 'LI': + case 'OPTION': + case 'OPTGROUP': + case 'P': + case 'RB': + case 'RP': + case 'RT': + case 'RTC': + return true; + } + + $thoroughly = null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly']; + if ( $thoroughly ) { + switch ( $current_tag_name ) { + case 'TBODY': + case 'TFOOT': + case 'THEAD': + case 'TD': + case 'TH': + case 'TR': + return true; + } + } + + return false; + } + + /** + * https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements + */ + private function push_active_formatting_element( WP_HTML_Tag_Token $node ) { + $count = 0; + for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { + $formatting_element = $this->active_formatting_elements[ $i ]; + if ( $this->MARKER !== $formatting_element ) { + break; + } + if ( $formatting_element !== $node ) { + continue; + } + $count++; + if ( $count === 3 ) { + array_splice( $this->active_formatting_elements, $i, 1 ); + break; + } + } + $this->active_formatting_elements[] = $node; + } + + private function print_active_formatting_elements($msg, $indent=1) { + if (HTML_DEBUG_MODE) { + $formats = array_map(function ($node) { + return $this->MARKER === $node ? 'M' : ($node->tag ?: 'ERROR'); + }, $this->active_formatting_elements); + dbg("$msg " . 
implode(', ', $formats), $indent); + } + } + + private function print_open_elements($msg, $indent=1) { + if (HTML_DEBUG_MODE) { + $elems = array_map(function ($node) { + return $node->tag; + }, $this->open_elements); + dbg("$msg " . implode(', ', $elems), $indent); + } + } + + private function reconstruct_active_formatting_elements() { + $this->print_active_formatting_elements('AFE: before'); + if ( empty( $this->active_formatting_elements ) ) { + dbg( "Skipping AFE: empty list", 1 ); + return; + } + $entry_idx = count( $this->active_formatting_elements ) - 1; + $last_entry = $this->active_formatting_elements[ $entry_idx ]; + if ( $this->MARKER === $last_entry || in_array( $last_entry, $this->open_elements, true ) ) { + dbg( "Skipping AFE: marker or open element", 1 ); + return; + } + + // Let entry be the last (most recently added) element in the list of active formatting elements. + $entry = $last_entry; + + $is_rewinding = true; + while ( true ) { + if ( $is_rewinding ) { + // Rewind: + /* + * If there are no entries before entry in the list of active formatting elements, + * then jump to the step labeled create. + */ + if ( $entry_idx === 0 ) { + $is_rewinding = false; + } else { + // Let entry be the entry one earlier than entry in the list of active formatting elements. + $entry = $this->active_formatting_elements[ --$entry_idx ]; + + // If entry is neither a marker nor an element that is also in the stack of open elements, + // go to the step labeled rewind. + if ( $this->MARKER !== $entry && ! in_array( $entry, $this->open_elements, true ) ) { + continue; + } + } + } else { + // Advance: + // Let entry be the element one later than entry in the list of active formatting elements. + $entry = $this->active_formatting_elements[ ++$entry_idx ]; + } + + // Create: Insert an HTML element for the token for which the element entry was created, + // to obtain new element. 
+ $new_element = $this->insert_element( $entry ); + + // Replace the entry for entry in the list with an entry for new element. + $this->active_formatting_elements[ $entry_idx ] = $new_element; + + // If the entry for new element in the list of active formatting elements is not the last entry + // in the list, return to the step labeled advance. + if ( $entry_idx === count( $this->active_formatting_elements ) - 1 ) { + break; + } + } + $this->print_active_formatting_elements('AFE: after'); + } + + private function clear_active_formatting_elements_up_to_last_marker() { + while ( ! empty( $this->active_formatting_elements ) ) { + $entry = array_pop( $this->active_formatting_elements ); + if ( $this->MARKER === $entry ) { + break; + } + } + } + + /** + * The stack of open elements is said to have a particular element in + * select scope when it has that element in the specific scope consisting + * of all element types except the following: + * * optgroup + * * option + */ + private function is_element_in_select_scope( $target_node ) { + return $this->is_element_in_specific_scope( + $target_node, + array( + 'OPTGROUP', + 'OPTION', + ), + array( + 'negative_match' => 'true', + ) + ); + } + + private function is_element_in_table_scope( $target_node ) { + return $this->is_element_in_specific_scope( + $target_node, + array( + 'HTML', + 'TABLE', + 'TEMPLATE', + ) + ); + } + + private function is_element_in_button_scope( $target_node ) { + return $this->is_element_in_scope( + $target_node, + array( + 'BUTTON', + ) + ); + } + + private function is_element_in_list_item_scope( $target_node ) { + return $this->is_element_in_scope( + $target_node, + array( + 'LI', + 'DD', + 'DT', + ) + ); + } + + private function is_element_in_scope( $target_node, $additional_elements = array() ) { + return $this->is_element_in_specific_scope( + $target_node, + array_merge( + array( + 'APPLET', + 'CAPTION', + 'HTML', + 'TABLE', + 'TD', + 'TH', + 'MARQUEE', + 'OBJECT', + 'TEMPLATE', + ), + 
$additional_elements + ) + ); + } + + /* + * https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements + */ + private function is_element_in_specific_scope( $target_node, $element_types_list, $options = array() ) { + $negative_match = isset( $options['negative_match'] ) ? $options['negative_match'] : false; + + /** + * The stack of open elements is said to have an element target node in a + * specific scope consisting of a list of element types list when the following + * algorithm terminates in a match state: + */ + $i = count( $this->open_elements ) - 1; + // 1. Initialize node to be the current node (the bottommost node of the stack). + $node = $this->open_elements[ $i ]; + + while ( true ) { + // 2. If node is the target node, terminate in a match state. + if ( is_string( $target_node ) ) { + if ( $node->tag === $target_node ) { + return true; + } + } else if ( $node === $target_node ) { + return true; + } + + // 3. Otherwise, if node is one of the element types in list, terminate in a failure state. + $failure = in_array( $node->tag, $element_types_list, true ); + + // Some elements say: + // > If has that element in the specific scope consisting of all element types + // > except the following + // So we need to invert the result. + if($negative_match) { + $failure = ! $failure; + } + if ( $failure ) { + return false; + } + + // Otherwise, set node to the previous entry in the stack of open elements and + // return to step 2. (This will never fail, since the loop will always terminate + // in the previous step if the top of the stack — an html element — is reached.) 
+ $node = $this->open_elements[ --$i ]; + } + } + + private static function is_special_element( $tag_name, $except = null ) { + if ( null !== $except && in_array( $tag_name, $except, true ) ) { + return false; + } + + switch ( $tag_name ) { + case 'ADDRESS': + case 'APPLET': + case 'AREA': + case 'ARTICLE': + case 'ASIDE': + case 'BASE': + case 'BASEFONT': + case 'BGSOUND': + case 'BLOCKQUOTE': + case 'BODY': + case 'BR': + case 'BUTTON': + case 'CAPTION': + case 'CENTER': + case 'COL': + case 'COLGROUP': + case 'DD': + case 'DETAILS': + case 'DIR': + case 'DIV': + case 'DL': + case 'DT': + case 'EMBED': + case 'FIELDSET': + case 'FIGCAPTION': + case 'FIGURE': + case 'FOOTER': + case 'FORM': + case 'FRAME': + case 'FRAMESET': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + case 'HEAD': + case 'HEADER': + case 'HGROUP': + case 'HR': + case 'HTML': + case 'IFRAME': + case 'IMG': + case 'INPUT': + case 'ISINDEX': + case 'LI': + case 'LINK': + case 'LISTING': + case 'MAIN': + case 'MARQUEE': + case 'MENU': + case 'MENUITEM': + case 'META': + case 'NAV': + case 'NOEMBED': + case 'NOFRAMES': + case 'NOSCRIPT': + case 'OBJECT': + case 'OL': + case 'P': + case 'PARAM': + case 'PLAINTEXT': + case 'PRE': + case 'SCRIPT': + case 'SECTION': + case 'SELECT': + case 'SOURCE': + case 'STYLE': + case 'SUMMARY': + case 'TABLE': + case 'TBODY': + case 'TD': + case 'TEMPLATE': + case 'TEXTAREA': + case 'TFOOT': + case 'TH': + case 'THEAD': + case 'TITLE': + case 'TR': + case 'TRACK': + case 'UL': + case 'WBR': + case 'XMP': + return true; + default: + return false; + } + } + + private static function is_rcdata_element( $tag_name ) { + switch ( $tag_name ) { + case 'TITLE': + case 'TEXTAREA': + case 'STYLE': + case 'XMP': + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + case 'NOSCRIPT': + return true; + default: + return false; + } + } + + private static function is_formatting_element( $tag_name ) { + switch ( strtoupper( $tag_name ) ) { + case 
'A': + case 'B': + case 'BIG': + case 'CODE': + case 'EM': + case 'FONT': + case 'I': + case 'NOBR': + case 'S': + case 'SMALL': + case 'STRIKE': + case 'STRONG': + case 'TT': + case 'U': + return true; + default: + return false; + } + } + +} + +// $dir = realpath( __DIR__ . '/../../../index.html' ); + +// $htmlspec = file_get_contents( $dir ); +// $p = new WP_HTML_Processor( $htmlspec ); +// $p->parse(); + +// die(); + +$p = new WP_HTML_Processor( '

12345

' ); +$p->parse(); + +$p = new WP_HTML_Processor( '
12
34' ); +$p->parse(); + +$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); +$p->parse(); +die(); +/* +Outputs: + +DOM after main loop: + HTML + ├─ UL + ├─ LI + └─ #text: 1 + ├─ LI + └─ #text: 2 + ├─ LI + └─ #text: 3 + ├─ LI + ├─ #text: Lorem + └─ B + └─ #text: Ipsum + └─ LI + └─ B + └─ #text: Dolor + └─ B + └─ SPAN + ├─ #text: Sit + └─ SPAN + ├─ #text: Sit + └─ SPAN + └─ DIV + └─ #text: Amet +*/ + +$p = new WP_HTML_Processor( ' +
+
+
+
+
' ); +$p->parse(); +// $p = new WP_HTML_Processor( '1

23

' ); +// $p->parse(); +// /* +// Outputs the correct result: +// B +// └─ #text: 1 +// P +// ├─ B +// └─ #text: 2 +// └─ #text: 3 +// */ +echo "\n\n"; +echo $p->reconstructed_html; +die(); + +$p = new WP_HTML_Processor( '

X +

X +

X +

X' ); +$p->parse(); +/* +DOM after main loop: + HTML + ├─ P + └─ B class="x" + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ #text: X + ├─ P + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ #text: X + ├─ P + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ B + └─ B class="x" + └─ B + └─ #text: X + └─ P + └─ #text: X +*/ From 4f6ec24f28531b5670336fe4ce08d49bf57db973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 27 Feb 2023 16:19:39 +0100 Subject: [PATCH 18/42] More advanced diff-based approach --- .../html-api/class-wp-html-text-processor.php | 247 ++++++++---------- 1 file changed, 105 insertions(+), 142 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 3ffd03e499ca4..898acd2cea3d7 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -80,7 +80,7 @@ public function __construct( $html ) { public function parse() { echo("HTML before main loop:\n"); - echo($this->html); + // echo($this->html); echo("\n"); while ($this->next_node()) { // ... twiddle thumbs ... @@ -91,23 +91,35 @@ public function parse() { } echo("\n"); - echo("HTML after main loop:\n"); - echo($this->reconstructed_html.''); + echo("Reconstructed HTML after main loop:\n"); + // echo($this->reconstructed_html.''); + echo "\n\n"; + echo("\$this->HTML after main loop:\n"); + // echo($this->get_updated_html().''); echo "\n\n"; - echo "Mem peak usage:" . memory_get_peak_usage(true) . "\n"; + echo "Mem peak usage:" . (memory_get_peak_usage(true) / 1024 / 1024) . 
"MB\n"; + echo("\n---------------\n\n"); } - public function ignore_token() { + public function ignore_current_tag_token() { // @TODO: remove the current tag from $this->html instead of // not appending it to $this->reconstructed_html - return $this->next_node(); + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $this->current_token_start, + $this->current_token_end, + '' + ); + return true; } + private $current_token; + private $current_token_start; + private $current_token_end; public function next_node() { $text_start = $this->tag_ends_at + 1; + $this->current_token_start = $text_start; - $next_tag = false; if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); $this->set_bookmark($bookmark); @@ -117,11 +129,15 @@ public function next_node() { ); $text_end = $this->bookmarks[$bookmark]->start; } else { + $next_tag = null; + $this->current_token_start = strlen($this->html); $text_end = strlen($this->html); } + $this->current_token_end = $text_end; if ($text_start < $text_end) { $text = substr($this->html, $text_start, $text_end - $text_start); + $this->current_token = $text; dbg( "Found text node '$text'" ); dbg( "Appending text to reconstructed HTML", 1 ); $this->reconstruct_active_formatting_elements(); @@ -130,11 +146,14 @@ public function next_node() { $this->reconstructed_html .= $text; } - if ( ! $next_tag ) { + $this->current_token = $next_tag; + if ( ! $this->current_token ) { return false; } + $this->current_token_start = $this->bookmarks[$this->current_token->bookmark]->start; + $this->current_token_end = $this->bookmarks[$this->current_token->bookmark]->end + 1; - $token = $next_tag; + $token = $this->current_token; if ( ! 
$this->is_tag_closer() ) { dbg( "Found {$token->tag} tag opener" ); switch ( $token->tag ) { @@ -205,7 +224,7 @@ public function next_node() { 'except_for' => array( 'LI' ), ) ); - $this->pop_until_tag_name( 'LI' ); + $this->pop_until_node_or_tag( 'LI' ); break; } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; @@ -231,7 +250,7 @@ public function next_node() { 'except_for' => array( 'DD' ), ) ); - $this->pop_until_tag_name( 'DD' ); + $this->pop_until_node_or_tag( 'DD' ); break; } elseif ( $node->tag === 'DT' ) { $this->generate_implied_end_tags( @@ -239,7 +258,7 @@ public function next_node() { 'except_for' => array( 'DT' ), ) ); - $this->pop_until_tag_name( 'DT' ); + $this->pop_until_node_or_tag( 'DT' ); break; } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; @@ -257,7 +276,7 @@ public function next_node() { case 'BUTTON': if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { $this->generate_implied_end_tags(); - $this->pop_until_tag_name( 'BUTTON' ); + $this->pop_until_node_or_tag( 'BUTTON' ); } $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); @@ -327,20 +346,20 @@ public function next_node() { case 'WBR': $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); - $this->pop_open_element(); + $this->pop_open_element( false ); break; case 'PARAM': case 'SOURCE': case 'TRACK': $this->insert_element( $token ); - $this->pop_open_element(); + $this->pop_open_element( false ); break; case 'HR': if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } $this->insert_element( $token ); - $this->pop_open_element(); + $this->pop_open_element( false ); break; case 'TEXTAREA': $this->insert_element( $token ); @@ -349,11 +368,9 @@ public function next_node() { $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); break; - case 'OPTGROUP': case 'OPTION': - if ( 'OPTION' === $token->tag 
) { - $this->pop_open_element(); - } + $this->pop_open_element(false); + case 'OPTGROUP': $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); break; @@ -420,14 +437,14 @@ public function next_node() { case 'UL': if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->ignore_token(); + return $this->ignore_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_tag_name( $token->tag ); + $this->pop_until_node_or_tag( $token->tag, false ); break; case 'FORM': $this->generate_implied_end_tags(); - $this->pop_until_tag_name( $token->tag ); + $this->pop_until_node_or_tag( $token->tag, false ); break; case 'P': /* @@ -440,24 +457,24 @@ public function next_node() { $this->insert_element( new WP_HTML_Tag_Token( 'P' ) ); } // Close a p element. - $this->close_p_element(); + $this->close_p_element(false); break; case 'LI': if ( ! $this->is_element_in_list_item_scope( 'LI' ) ) { $this->parse_error(); - return $this->ignore_token(); + return $this->ignore_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_tag_name( 'LI' ); + $this->pop_until_node_or_tag( 'LI', false ); break; case 'DD': case 'DT': if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->ignore_token(); + return $this->ignore_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_tag_name( $token->tag ); + $this->pop_until_node_or_tag( $token->tag, false ); break; case 'H1': case 'H2': @@ -467,10 +484,10 @@ public function next_node() { case 'H6': if ( ! 
$this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->parse_error(); - return $this->ignore_token(); + return $this->ignore_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_tag_name( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ); + $this->pop_until_node_or_tag( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), false ); break; case 'A': case 'B': @@ -494,13 +511,13 @@ public function next_node() { case 'OBJECT': if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->ignore_token(); + return $this->ignore_current_tag_token(); } $this->generate_implied_end_tags(); if ( $this->current_node()->tag !== $token->tag ) { $this->parse_error(); } - $this->pop_until_tag_name( $token->tag ); + $this->pop_until_node_or_tag( $token->tag, false ); $this->clear_active_formatting_elements_up_to_last_marker(); break; case 'BR': @@ -515,11 +532,11 @@ public function next_node() { 'except_for' => array( $token->tag ), ) ); - $this->pop_until_node( $node ); + $this->pop_until_node_or_tag( $node ); break; } elseif ( $this->is_special_element( $node->tag ) ) { $this->parse_error(); - return $this->ignore_token(); + return $this->ignore_current_tag_token(); } else { --$i; } @@ -668,7 +685,7 @@ private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { // and including formatting element, then remove formatting element from // the list of active formatting elements, and finally abort these steps. 
if ( null === $furthest_block ) { - $this->pop_until_node( $formatting_element ); + $this->pop_until_node_or_tag( $formatting_element, false ); array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); dbg("Skipping AAA: no furthest block found", 2); return; @@ -688,48 +705,66 @@ private function insert_element( WP_HTML_Tag_Token $token ) { // from scratch // @TODO attrs $this->reconstructed_html .= '<'.$token->tag.'>'; + if($token !== $this->current_token) { + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $this->current_token_start, + $this->current_token_start, + "<{$token->tag}>" + ); + } array_push($this->open_elements, $token); return $token; } + private function insert_tag_closer_before_current_token( $tag ) { + $this->lexical_updates[] = new WP_HTML_Text_Replacement( + $this->current_token_start, + $this->current_token_start, + "" + ); + } + private function parse_error() { // Noop for now } - private function pop_until_tag_name( $tags ) { - if ( ! is_array( $tags ) ) { - $tags = array( $tags ); + private function pop_until_node_or_tag( $node_or_element, $tag_closer_for_last_element = true ) { + while( true ) { + $popped = $this->pop_open_element( false ); + if ($tag_closer_for_last_element) { + $this->insert_tag_closer_before_current_token($popped->tag); + } + if(is_string($node_or_element)) { + if($popped->tag === $node_or_element) { + break; + } + } else if(is_array($node_or_element)) { + if(in_array($popped->tag, $node_or_element)) { + break; + } + } else { + if($popped === $node_or_element) { + break; + } + } + if(!$tag_closer_for_last_element) { + $this->insert_tag_closer_before_current_token($popped->tag); + } } - dbg( "Popping until tag names: " . 
implode(', ', $tags), 1 ); - $this->print_open_elements( "Open elements before: " ); - do { - $popped = $this->pop_open_element(); - } while (!in_array($popped->tag, $tags)); - $this->print_open_elements( "Open elements after: " ); - } - - private function pop_until_node( $node ) { - do { - $popped = $this->pop_open_element(); - } while ( $popped !== $node ); } - private function pop_open_element() { + private function pop_open_element($add_close_tag = true) { $popped = array_pop( $this->open_elements ); - - // Text API: $this->reconstructed_html .= 'tag.'>'; - - // Object-oriented API: - if ( $popped->bookmark ) { - $this->release_bookmark( $popped->bookmark ); + if ( $add_close_tag ) { + $this->insert_tag_closer_before_current_token( $popped->tag ); } return $popped; } private function generate_implied_end_tags( $options = null ) { - while ( $this->should_generate_implied_end_tags( $options ) ) { - yield $this->pop_open_element(); + while( $this->should_generate_implied_end_tags( $options ) ) { + $this->pop_open_element( true ); } } @@ -737,7 +772,7 @@ private function current_node() { return end( $this->open_elements ); } - private function close_p_element() { + private function close_p_element($closer_for_last_elem = true) { dbg( "close_p_element" ); $this->generate_implied_end_tags( array( @@ -748,7 +783,7 @@ private function close_p_element() { if ( $this->get_tag() !== 'P' ) { $this->parse_error(); } - $this->pop_until_tag_name( 'P' ); + $this->pop_until_node_or_tag( 'P', $closer_for_last_elem ); } private function should_generate_implied_end_tags( $options = null ) { @@ -1161,91 +1196,19 @@ private static function is_formatting_element( $tag_name ) { $p = new WP_HTML_Processor( '

  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); $p->parse(); -die(); -/* -Outputs: - -DOM after main loop: - HTML - ├─ UL - ├─ LI - └─ #text: 1 - ├─ LI - └─ #text: 2 - ├─ LI - └─ #text: 3 - ├─ LI - ├─ #text: Lorem - └─ B - └─ #text: Ipsum - └─ LI - └─ B - └─ #text: Dolor - └─ B - └─ SPAN - ├─ #text: Sit - └─ SPAN - ├─ #text: Sit - └─ SPAN - └─ DIV - └─ #text: Amet -*/ - -$p = new WP_HTML_Processor( ' -
-
-
-
-
' ); -$p->parse(); -// $p = new WP_HTML_Processor( '1

23

' ); + + +// $p = new WP_HTML_Processor( ' +//
+//
+//
+//
+// ' ); // $p->parse(); -// /* -// Outputs the correct result: -// B -// └─ #text: 1 -// P -// ├─ B -// └─ #text: 2 -// └─ #text: 3 -// */ -echo "\n\n"; -echo $p->reconstructed_html; -die(); + $p = new WP_HTML_Processor( '

X

X

X

X' ); $p->parse(); -/* -DOM after main loop: - HTML - ├─ P - └─ B class="x" - └─ B class="x" - └─ B - └─ B class="x" - └─ B class="x" - └─ B - └─ #text: X - ├─ P - └─ B class="x" - └─ B - └─ B class="x" - └─ B class="x" - └─ B - └─ #text: X - ├─ P - └─ B class="x" - └─ B - └─ B class="x" - └─ B class="x" - └─ B - └─ B - └─ B class="x" - └─ B - └─ #text: X - └─ P - └─ #text: X -*/ From 3a0ed5fb0f4ab5ae6c12a8b2f15938f4c4fda779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 27 Feb 2023 16:21:53 +0100 Subject: [PATCH 19/42] Remove class-wp-html-processor.php --- .../html-api/class-wp-html-processor.php | 1478 ----------------- .../html-api/class-wp-html-tag-processor.php | 2 +- 2 files changed, 1 insertion(+), 1479 deletions(-) delete mode 100644 src/wp-includes/html-api/class-wp-html-processor.php diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php deleted file mode 100644 index e276f10c750bf..0000000000000 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ /dev/null @@ -1,1478 +0,0 @@ -tag = $tag; - $this->bookmark = $bookmark; - } - -} - -class WP_HTML_Token { - const MARKER = 'MARKER'; - const TAG = 'TAG'; - const TEXT = 'TEXT'; - - public $type; - - // For tag tokens - public $tag; - public $attributes; - public $is_closer; - public $is_opener; - public $bookmark; - - // For text tokens - public $value; - - static public function marker() { - return new WP_HTML_Token( self::MARKER ); - } - - static public function tag( $tag, $attributes = null, $is_opener = true, $bookmark = null ) { - $token = new WP_HTML_Token( self::TAG ); - $token->tag = $tag; - $token->attributes = $attributes; - $token->is_opener = $is_opener; - $token->is_closer = ! 
$is_opener; - $token->bookmark = $bookmark; - return $token; - } - - static public function text( $text ) { - $token = new WP_HTML_Token( self::TEXT ); - $token->value = $text; - return $token; - } - - public function __construct( $type ) { - $this->type = $type; - } - - public function __toString() { - switch ( $this->type ) { - case self::MARKER: - return 'MARKER'; - case self::TAG: - $attributes = ''; - if($this->attributes) { - foreach( $this->attributes as $name => $value ) { - $attributes .= ' ' . $name . '="' . esc_attr( $value ) . '"'; - } - } - return sprintf( - '%s%s%s', - $this->is_closer ? '/' : '', - $this->tag, - $attributes - ); - case self::TEXT: - return '#text: ' . trim($this->value); - } - } - - public function equivalent( WP_HTML_Token $other ) { - if ( ! $this->tag || ! $other->tag ) { - throw new Exception( 'Cannot compare non-tag tokens' ); - } - - if ( $this->is_closer !== $other->is_closer ) { - return false; - } - - if ( $this->tag !== $other->tag ) { - return false; - } - - if ( count( $this->attributes ) !== count( $other->attributes ) ) { - return false; - } - - $attributes_match = true; - foreach ( $other->attributes as $name => $value ) { - if ( ! 
isset( $this->attributes[ $name ] ) || $this->attributes[ $name ] !== $value ) { - $attributes_match = false; - break; - } - } - return $attributes_match; - } - - public function is_marker() { - return self::MARKER === $this->type; - } - - public function is_tag() { - return self::TAG === $this->type; - } - - public function is_text() { - return self::TEXT === $this->type; - } -} - -class WP_HTML_Node { - /** - * @var WP_HTML_Node - */ - public $parent; - /** - * @var WP_HTML_Node[] - */ - public $children = array(); - /** - * @var WP_HTML_Token - */ - public $token; - public $depth = 1; - - private $type; - private $value; - private $tag; - - public function __construct( WP_HTML_Token $token ) { - $this->token = $token; - // Just for debugging convenience – remove eventually - $this->type = $token->type; - $this->value = $token->value; - $this->tag = $token->tag; - } - - public function append_child( WP_HTML_Node $node ) { - if($node->parent) { - $node->parent->remove($node); - } - $node->parent = $this; - $this->children[] = $node; - $node->depth = $this->depth + 1; - } - - public function remove( WP_HTML_Node $node ) { - $index = array_search( $node, $this->children, true ); - if ( false !== $index ) { - unset( $this->children[ $index ] ); - } - } - - public function __toString() { - return wp_html_node_to_ascii_tree( $this ); - } -} - - -function wp_html_node_to_ascii_tree( WP_HTML_Node $node, $prefix = '', $is_last = false ) { - $ascii_tree = $prefix . ( $node->parent ? ($is_last ? '└─ ' : '├─ ') : ' ' ) . $node->token . "\n"; - - // Recursively process the children of the current node - $children = array_values($node->children); - $num_children = count( $children ); - for ( $i = 0; $i < $num_children; $i++ ) { - $child_prefix = $prefix . ( $i == $num_children - 1 ? 
' ' : ' ' ); - $is_last_child = ( $i == $num_children - 1 ); - $ascii_tree .= wp_html_node_to_ascii_tree( $children[ $i ], $child_prefix, $is_last_child ); - } - - return $ascii_tree; -} - -class WP_HTML_Insertion_Mode { - - const INITIAL = 'INITIAL'; - const IN_SELECT = 'IN_SELECT'; - const IN_SELECT_IN_TABLE = 'IN_SELECT_IN_TABLE'; - const IN_CELL = 'IN_CELL'; - const IN_ROW = 'IN_ROW'; - const IN_TABLE_BODY = 'IN_TABLE_BODY'; - const IN_CAPTION = 'IN_CAPTION'; - const IN_COLUMN_GROUP = 'IN_COLUMN_GROUP'; - const IN_TABLE = 'IN_TABLE'; - const IN_HEAD = 'IN_HEAD'; - const IN_BODY = 'IN_BODY'; - const IN_FRAMESET = 'IN_FRAMESET'; - const BEFORE_HEAD = 'BEFORE_HEAD'; - const TEXT = 'TEXT'; - -} - -/** - * - */ -class WP_HTML_Processor extends WP_HTML_Tag_Processor { - - /** - * @var WP_HTML_Node[] - */ - private $open_elements = array(); - /** - * @var WP_HTML_Node[] - */ - private $active_formatting_elements = array(); - private $root_node = null; - private $context_node = null; - - /* - * WP_HTML_Tag_Processor skips over text nodes and only - * processes tags. - * - * WP_HTML_Processor needs to process text nodes as well. - * - * Whenever the tag processor skips over text to move to - * the next tag, the next_token() method emits that text - * as a token and stores the tag in $buffered_tag to be - * returned the next time. - */ - private $buffered_tag = null; - - private $last_token = null; - private $inserted_tokens = array(); - - public $reconstructed_html = ''; - - const MAX_BOOKMARKS = 1000000; - - public function __construct( $html ) { - parent::__construct( $html ); - $this->root_node = new WP_HTML_Node(WP_HTML_Token::tag( 'HTML' )); - $this->context_node = new WP_HTML_Node(WP_HTML_Token::tag( 'DOCUMENT' )); - $this->open_elements = array( $this->root_node ); - } - - public function parse() { - echo("HTML before main loop:\n"); - echo($this->html); - echo("\n"); - while ($this->process_next_token()) { - // ... twiddle thumbs ... 
- } - - while ( count($this->open_elements) > 1 ) { - $this->pop_open_element(); - } - - echo("\n"); - echo("DOM after main loop:\n"); - echo($this->root_node.''); - echo "\n\n"; - - echo "Mem peak usage:" . memory_get_peak_usage(true) . "\n"; - } - - private function process_next_token() { - $token = $this->next_token(); - if(!$token){ - return false; - } - $this->last_token = $token; - $processed_token = $this->process_token($token); - $this->last_token = $processed_token; - return $processed_token; - } - - private function ignore_token( $ignored_token ) { - // if ( $ignored_token->bookmark ) { - // // $this->release_bookmark( $ignored_token->bookmark ); - // // $ignored_token->bookmark = null; - // } - - $this->last_token = $ignored_token; - return $this->process_next_token(); - } - - public function process_token(WP_HTML_Token $token) { - if ( $token->is_text() ) { - dbg( "Found text node '$token'" ); - dbg( "Inserting text to current node " . $this->current_node()->token->tag, 1 ); - $this->reconstruct_active_formatting_elements(); - $this->insert_text( $token ); - } - else if ( $token->is_opener ) { - dbg( "Found {$token->tag} tag opener" ); - switch ( $token->tag ) { - case 'ADDRESS': - case 'ARTICLE': - case 'ASIDE': - case 'BLOCKQUOTE': - case 'CENTER': - case 'DETAILS': - case 'DIALOG': - case 'DIR': - case 'DIV': - case 'DL': - case 'FIELDSET': - case 'FIGCAPTION': - case 'FIGURE': - case 'FOOTER': - case 'HEADER': - case 'HGROUP': - case 'MAIN': - case 'MENU': - case 'NAV': - case 'OL': - case 'P': - case 'SECTION': - case 'SUMMARY': - case 'UL': - // Ignore special rules for 'PRE' and 'LISTING' - case 'PRE': - case 'LISTING': - /* - * If the stack of open elements has a p element in button scope, - * then close a p element. 
- */ - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } - $this->insert_element( $token ); - break; - // A start tag whose tag name is "h1", "h2", "h3", "h4", "h5", or "h6" - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } - if ( in_array( $this->current_node()->token->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { - $this->pop_open_element(); - } - $this->insert_element( $token ); - break; - case 'FORM': - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } - $this->insert_element( $token ); - break; - case 'LI': - $i = count( $this->open_elements ) - 1; - while ( true ) { - $node = $this->open_elements[ $i ]; - if ( $node->token->tag === 'LI' ) { - $this->generate_implied_end_tags( - array( - 'except_for' => array( 'LI' ), - ) - ); - $this->pop_until_tag_name( 'LI' ); - break; - } elseif ( self::is_special_element( $node->token->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { - break; - } else { - --$i; - $node = $this->open_elements[ $i ]; - } - } - - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } - $this->insert_element( $token ); - break; - case 'DD': - case 'DT': - $i = count( $this->open_elements ) - 1; - while ( true ) { - $node = $this->open_elements[ $i ]; - if ( $node->token->tag === 'DD' ) { - $this->generate_implied_end_tags( - array( - 'except_for' => array( 'DD' ), - ) - ); - $this->pop_until_tag_name( 'DD' ); - break; - } elseif ( $node->token->tag === 'DT' ) { - $this->generate_implied_end_tags( - array( - 'except_for' => array( 'DT' ), - ) - ); - $this->pop_until_tag_name( 'DT' ); - break; - } elseif ( self::is_special_element( $node->token->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { - break; - } else { - --$i; - $node = $this->open_elements[ $i ]; - } - } - - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } 
- $this->insert_element( $token ); - break; - case 'BUTTON': - if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { - $this->generate_implied_end_tags(); - $this->pop_until_tag_name( 'BUTTON' ); - } - $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); - break; - case 'A': - $active_a = null; - for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; --$i ) { - $node = $this->active_formatting_elements[ $i ]; - if ( $node->token->tag === 'A' ) { - $active_a = $node; - break; - } elseif ( $node->token->is_marker() ) { - break; - } - } - - if ( $active_a ) { - $this->parse_error(); - $this->adoption_agency_algorithm( $token ); - } - - $this->reconstruct_active_formatting_elements(); - $node = $this->insert_element( $token ); - $this->push_active_formatting_element( $node ); - break; - case 'B': - case 'BIG': - case 'CODE': - case 'EM': - case 'FONT': - case 'I': - case 'S': - case 'SMALL': - case 'STRIKE': - case 'STRONG': - case 'TT': - case 'U': - $this->reconstruct_active_formatting_elements(); - $node = $this->insert_element( $token ); - $this->push_active_formatting_element( $node ); - break; - case 'NOBR': - $this->reconstruct_active_formatting_elements(); - if ( $this->is_element_in_scope( 'NOBR' ) ) { - $this->parse_error(); - $this->adoption_agency_algorithm( $token ); - $this->reconstruct_active_formatting_elements(); - } - $node = $this->insert_element( $token ); - $this->push_active_formatting_element( $node ); - break; - case 'APPLET': - case 'MARQUEE': - case 'OBJECT': - $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); - $this->active_formatting_elements[] = WP_HTML_Token::marker(); - break; - case 'TABLE': - $this->insert_element( $token ); - break; - case 'AREA': - case 'BR': - case 'EMBED': - case 'IMG': - case 'KEYGEN': - case 'WBR': - $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); - $this->pop_open_element(); - break; - case 'PARAM': 
- case 'SOURCE': - case 'TRACK': - $this->insert_element( $token ); - $this->pop_open_element(); - break; - case 'HR': - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } - $this->insert_element( $token ); - $this->pop_open_element(); - break; - case 'TEXTAREA': - $this->insert_element( $token ); - break; - case 'SELECT': - $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); - break; - case 'OPTGROUP': - case 'OPTION': - if ( 'OPTION' === $token->tag ) { - $this->pop_open_element(); - } - $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); - break; - case 'RB': - case 'RTC': - if ( $this->is_element_in_scope( 'RB' ) || $this->is_element_in_scope( 'RTC' ) ) { - $this->parse_error(); - $this->adoption_agency_algorithm( $token ); - $this->reconstruct_active_formatting_elements(); - } - $this->insert_element( $token ); - break; - case 'RP': - case 'RT': - if ( $this->is_element_in_scope( 'RP' ) || $this->is_element_in_scope( 'RT' ) ) { - $this->parse_error(); - $this->adoption_agency_algorithm( $token ); - $this->reconstruct_active_formatting_elements(); - } - $this->insert_element( $token ); - break; - - // case 'XMP': - // case 'IFRAME': - // case 'NOEMBED': - // case 'MATH': - // case 'SVG': - // case 'NOSCRIPT': - // case 'PLAINTEXT': - // case 'IMAGE': - // throw new Exception( $token->tag . 
' not implemented yet' ); - - default: - $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); - break; - } - } else { - dbg( "Found {$token->tag} tag closer" ); - switch ( $token->tag ) { - case 'ADDRESS': - case 'ARTICLE': - case 'ASIDE': - case 'BLOCKQUOTE': - case 'CENTER': - case 'DETAILS': - case 'DIALOG': - case 'DIR': - case 'DIV': - case 'DL': - case 'FIELDSET': - case 'FIGCAPTION': - case 'FIGURE': - case 'FOOTER': - case 'HEADER': - case 'HGROUP': - case 'MAIN': - case 'MENU': - case 'NAV': - case 'OL': - case 'PRE': - case 'SECTION': - case 'SUMMARY': - case 'UL': - if ( ! $this->is_element_in_scope( $token->tag ) ) { - $this->parse_error(); - return $this->ignore_token( $token ); - } - $this->generate_implied_end_tags(); - $this->pop_until_tag_name( $token->tag ); - break; - case 'FORM': - $this->generate_implied_end_tags(); - $this->pop_until_tag_name( $token->tag ); - break; - case 'P': - /* - * If the stack of open elements does not have a p element in button scope, - * then this is a parse error; insert an HTML element for a "p" start tag - * token with no attributes. - */ - if ( ! $this->is_element_in_button_scope( 'P' ) ) { - $this->parse_error(); - $this->insert_element( WP_HTML_Token::tag( 'P' ) ); - } - // Close a p element. - $this->close_p_element(); - break; - case 'LI': - if ( ! $this->is_element_in_list_item_scope( 'LI' ) ) { - $this->parse_error(); - return $this->ignore_token( $token ); - } - $this->generate_implied_end_tags(); - $this->pop_until_tag_name( 'LI' ); - break; - case 'DD': - case 'DT': - if ( ! $this->is_element_in_scope( $token->tag ) ) { - $this->parse_error(); - return $this->ignore_token( $token ); - } - $this->generate_implied_end_tags(); - $this->pop_until_tag_name( $token->tag ); - break; - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - if ( ! 
$this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { - $this->parse_error(); - return $this->ignore_token( $token ); - } - $this->generate_implied_end_tags(); - $this->pop_until_tag_name( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ); - break; - case 'A': - case 'B': - case 'BIG': - case 'CODE': - case 'EM': - case 'FONT': - case 'I': - case 'S': - case 'SMALL': - case 'STRIKE': - case 'STRONG': - case 'TT': - case 'U': - dbg( "Found {$token->tag} tag closer" ); - $this->adoption_agency_algorithm( $token ); - break; - - case 'APPLET': - case 'MARQUEE': - case 'OBJECT': - if ( ! $this->is_element_in_scope( $token->tag ) ) { - $this->parse_error(); - return $this->ignore_token( $token ); - } - $this->generate_implied_end_tags(); - if ( $this->current_node()->token->tag !== $token->tag ) { - $this->parse_error(); - } - $this->pop_until_tag_name( $token->tag ); - $this->clear_active_formatting_elements_up_to_last_marker(); - break; - case 'BR': - // This should never happen since Tag_Processor corrects that - default: - $i = count( $this->open_elements ) - 1; - while ( true ) { - $node = $this->open_elements[ $i ]; - if ( $node->token->tag === $token->tag ) { - $this->generate_implied_end_tags( - array( - 'except_for' => array( $token->tag ), - ) - ); - $this->pop_until_node( $node ); - break; - } elseif ( $this->is_special_element( $node->token->tag ) ) { - $this->parse_error(); - return $this->ignore_token( $token ); - } else { - --$i; - } - } - break; - } - } - return $token; - } - - private $element_bookmark_idx = 0; - private function next_token() { - if($this->buffered_tag){ - $next_tag = $this->buffered_tag; - $this->buffered_tag = null; - return $next_tag; - } - - $next_tag = false; - if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { - $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); - $this->set_bookmark($bookmark); - $attributes = array(); - $attrs = $this->get_attribute_names_with_prefix(''); - if ($attrs) { - foreach ($attrs as $name) { - $attributes[$name] = $this->get_attribute($name); - } - } - $next_tag = WP_HTML_Token::tag( - $this->get_tag(), - $attributes, - ! $this->is_tag_closer(), - $bookmark - ); - $text_end = $this->bookmarks[$bookmark]->start; - } else { - $text_end = strlen($this->html); - } - - /* - * If any text was found between the last tag and this one, - * save the next tag for later and return the text token. - */ - $last = $this->last_token; - if ( - $last - && $last->is_tag() - && $last->bookmark - && $this->has_bookmark($last->bookmark) - ) { - $text_start = $this->bookmarks[$last->bookmark]->end + 1; - if ($text_start < $text_end) { - $this->buffered_tag = $next_tag; - $text = substr($this->html, $text_start, $text_end - $text_start); - return WP_HTML_Token::text($text); - } - } - - return $next_tag; - } - - const ANY_OTHER_END_TAG = 1; - private function adoption_agency_algorithm( WP_HTML_Token $token ) { - dbg("Adoption Agency Algorithm", 1); - $subject = $token->tag; - $current_node = $this->current_node(); - if ( - $current_node->token->tag === $subject - && ! in_array( $current_node, $this->active_formatting_elements, true ) - ) { - $this->pop_open_element(); - dbg("Skipping AAA: current node is \$subject ($subject) and is not AFE", 2); - return; - } - - $outer_loop_counter = 0; - while ( ++$outer_loop_counter < 8 ) { - /* - * Let __formatting element__ be the last element in the list of active - * formatting elements that: - * - is between the end of the list and the last marker in the list, - * if any, or the start of the list otherwise, and - * - has the same tag name as the token. 
- */ - $formatting_element = null; - $formatting_element_idx = -1; - for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { - $candidate = $this->active_formatting_elements[ $i ]; - if ( $candidate->token->is_marker() ) { - break; - } - if ( $candidate->token->tag === $subject ) { - $formatting_element = $candidate; - $formatting_element_idx = $i; - break; - } - } - - // If there is no such element, then abort these steps and instead act as - // described in the "any other end tag" entry below. - if ( null === $formatting_element ) { - dbg("Skipping AAA: no formatting element found", 2); - return self::ANY_OTHER_END_TAG; - } - dbg("AAA: Formatting element = {$formatting_element->token->tag}", 2); - - // If formatting element is not in the stack of open elements, then this is - // a parse error; remove the element from the list, and return. - if ( ! in_array( $formatting_element, $this->open_elements, true ) ) { - array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); - $this->parse_error(); - dbg("Skipping AAA: formatting element is not in the stack of open elements", 2); - return; - } - - // If formatting element is not in scope, then this is a parse error; return - if ( ! $this->is_element_in_scope( $formatting_element ) ) { - $this->parse_error(); - dbg("Skipping AAA: formatting element {$formatting_element->token->tag} is not in scope", 2); - $this->print_open_elements('Open elements: ', 2); - return; - } - - // If formatting element is not the current node, then this is a parse error. - // (But do not return.) - if ( $formatting_element !== $this->current_node() ) { - $this->parse_error(); - } - - /* - * Let furthest block be the topmost node in the stack of open elements that - * is lower in the stack than formatting element, and is an element in the - * special category. There might not be one. 
- */ - $furthest_block = null; - for ( $i = count( $this->open_elements ) - 1; $i >= 0; $i-- ) { - $node = $this->open_elements[ $i ]; - if ( $node === $formatting_element ) { - break; - } - if ( $this->is_special_element( $node->token->tag ) ) { - $furthest_block = $node; - } - } - - // If there is no such node, then the UA must first pop all the nodes from - // the bottom of the stack of open elements, from the current node up to - // and including formatting element, then remove formatting element from - // the list of active formatting elements, and finally abort these steps. - if ( null === $furthest_block ) { - $this->pop_until_node( $formatting_element ); - array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); - dbg("Skipping AAA: no furthest block found", 2); - return; - } - - // We didn't bale out so far, but the algorithm is not implemented. - // Let's error out. - break; - } - throw new Exception('Adoption Agency Algorithm not supported.'); - } - - private function insert_element( WP_HTML_Token $token ) { - // Text API: - $this->reconstructed_html .= '<'.$token->tag.'>'; - - // Object-oriented API: - - // Create element for a token - // Skip reset algorithm for now - // Skip form-association for now - /** - * Appropriate place for inserting a node is always the end of the - * target's children thanks to the assumptions this parser makes. 
- */ - $node = new WP_HTML_Node($token); - $this->current_node()->append_child($node); - dbg("Inserted element: {$node->token->tag} to parent {$this->current_node()->token->tag}", 2); - - array_push($this->open_elements, $node); - return $node; - } - - private function insert_text( WP_HTML_Token $token ) { - // Text API: - $this->reconstructed_html .= $token->value; - - // Object-oriented API: - $target = $this->current_node(); - if(count($target->children)){ - $last_child = end($target->children); - if ( $last_child && $last_child->token->is_text() ) { - $last_child->token->value .= $token->value; - return; - } - } - $target->append_child(new WP_HTML_Node($token)); - } - - private function parse_error() { - // Noop for now - } - - private function pop_until_tag_name( $tags ) { - if ( ! is_array( $tags ) ) { - $tags = array( $tags ); - } - dbg( "Popping until tag names: " . implode(', ', $tags), 1 ); - $this->print_open_elements( "Open elements before: " ); - do { - $popped = $this->pop_open_element(); - } while (!in_array($popped->token->tag, $tags)); - $this->print_open_elements( "Open elements after: " ); - } - - private function pop_until_node( $node ) { - do { - $popped = $this->pop_open_element(); - } while ( $popped !== $node ); - } - - private function pop_open_element() { - $popped = array_pop( $this->open_elements ); - - // Text API: - $this->reconstructed_html .= 'token->tag.'>'; - - // Object-oriented API: - if ( $popped->token->bookmark ) { - $this->release_bookmark( $popped->token->bookmark ); - $popped->token->bookmark = null; - } - return $popped; - } - - private function generate_implied_end_tags( $options = null ) { - while ( $this->should_generate_implied_end_tags( $options ) ) { - yield $this->pop_open_element(); - } - } - - private function current_node() { - return end( $this->open_elements ); - } - - private function close_p_element() { - dbg( "close_p_element" ); - $this->generate_implied_end_tags( - array( - 'except_for' => array( 'P' ), - 
) - ); - // If the current node is not a p element, then this is a parse error. - if ( $this->current_node()->token->tag !== 'P' ) { - $this->parse_error(); - } - $this->pop_until_tag_name( 'P' ); - } - - private function should_generate_implied_end_tags( $options = null ) { - $current_tag_name = $this->current_node()->token->tag; - if ( null !== $options && isset( $options['except_for'] ) && in_array( $current_tag_name, $options['except_for'] ) ) { - return false; - } - switch ( $current_tag_name ) { - case 'DD': - case 'DT': - case 'LI': - case 'OPTION': - case 'OPTGROUP': - case 'P': - case 'RB': - case 'RP': - case 'RT': - case 'RTC': - return true; - } - - $thoroughly = null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly']; - if ( $thoroughly ) { - switch ( $current_tag_name ) { - case 'TBODY': - case 'TFOOT': - case 'THEAD': - case 'TD': - case 'TH': - case 'TR': - return true; - } - } - - return false; - } - - /** - * https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements - */ - private function push_active_formatting_element( WP_HTML_Node $node ) { - $count = 0; - for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { - $formatting_element = $this->active_formatting_elements[ $i ]; - if ( $formatting_element->token->is_marker() ) { - break; - } - if ( ! $formatting_element->token->equivalent( $node->token ) ) { - continue; - } - $count++; - if ( $count === 3 ) { - array_splice( $this->active_formatting_elements, $i, 1 ); - break; - } - } - $this->active_formatting_elements[] = $node; - } - - private function print_active_formatting_elements($msg, $indent=1) { - if (HTML_DEBUG_MODE) { - $formats = array_map(function ($node) { - return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); - }, $this->active_formatting_elements); - dbg("$msg " . 
implode(', ', $formats), $indent); - } - } - - private function print_open_elements($msg, $indent=1) { - if (HTML_DEBUG_MODE) { - $elems = array_map(function ($node) { - return $node->token->tag; - }, $this->open_elements); - dbg("$msg " . implode(', ', $elems), $indent); - } - } - - private function reconstruct_active_formatting_elements() { - $this->print_active_formatting_elements('AFE: before'); - if ( empty( $this->active_formatting_elements ) ) { - dbg( "Skipping AFE: empty list", 1 ); - return; - } - $entry_idx = count( $this->active_formatting_elements ) - 1; - $last_entry = $this->active_formatting_elements[ $entry_idx ]; - if ( $last_entry->token->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { - dbg( "Skipping AFE: marker or open element", 1 ); - return; - } - - // Let entry be the last (most recently added) element in the list of active formatting elements. - $entry = $last_entry; - - $is_rewinding = true; - while ( true ) { - if ( $is_rewinding ) { - // Rewind: - /* - * If there are no entries before entry in the list of active formatting elements, - * then jump to the step labeled create. - */ - if ( $entry_idx === 0 ) { - $is_rewinding = false; - } else { - // Let entry be the entry one earlier than entry in the list of active formatting elements. - $entry = $this->active_formatting_elements[ --$entry_idx ]; - - // If entry is neither a marker nor an element that is also in the stack of open elements, - // go to the step labeled rewind. - if ( ! $entry->token->is_marker() && ! in_array( $entry, $this->open_elements, true ) ) { - continue; - } - } - } else { - // Advance: - // Let entry be the element one later than entry in the list of active formatting elements. - $entry = $this->active_formatting_elements[ ++$entry_idx ]; - } - - // Create: Insert an HTML element for the token for which the element entry was created, - // to obtain new element. 
- $new_element = $this->insert_element( $entry->token ); - - // Replace the entry for entry in the list with an entry for new element. - $this->active_formatting_elements[ $entry_idx ] = $new_element; - - // If the entry for new element in the list of active formatting elements is not the last entry - // in the list, return to the step labeled advance. - if ( $entry_idx === count( $this->active_formatting_elements ) - 1 ) { - break; - } - } - $this->print_active_formatting_elements('AFE: after'); - } - - private function clear_active_formatting_elements_up_to_last_marker() { - while ( ! empty( $this->active_formatting_elements ) ) { - $entry = array_pop( $this->active_formatting_elements ); - if ( $entry->token->is_marker() ) { - break; - } - } - } - - /** - * The stack of open elements is said to have a particular element in - * select scope when it has that element in the specific scope consisting - * of all element types except the following: - * * optgroup - * * option - */ - private function is_element_in_select_scope( $target_node ) { - return $this->is_element_in_specific_scope( - $target_node, - array( - 'OPTGROUP', - 'OPTION', - ), - array( - 'negative_match' => 'true', - ) - ); - } - - private function is_element_in_table_scope( $target_node ) { - return $this->is_element_in_specific_scope( - $target_node, - array( - 'HTML', - 'TABLE', - 'TEMPLATE', - ) - ); - } - - private function is_element_in_button_scope( $target_node ) { - return $this->is_element_in_scope( - $target_node, - array( - 'BUTTON', - ) - ); - } - - private function is_element_in_list_item_scope( $target_node ) { - return $this->is_element_in_scope( - $target_node, - array( - 'LI', - 'DD', - 'DT', - ) - ); - } - - private function is_element_in_scope( $target_node, $additional_elements = array() ) { - return $this->is_element_in_specific_scope( - $target_node, - array_merge( - array( - 'APPLET', - 'CAPTION', - 'HTML', - 'TABLE', - 'TD', - 'TH', - 'MARQUEE', - 'OBJECT', - 'TEMPLATE', - ), 
- $additional_elements - ) - ); - } - - /* - * https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements - */ - private function is_element_in_specific_scope( $target_node, $element_types_list, $options = array() ) { - $negative_match = isset( $options['negative_match'] ) ? $options['negative_match'] : false; - - /** - * The stack of open elements is said to have an element target node in a - * specific scope consisting of a list of element types list when the following - * algorithm terminates in a match state: - */ - $i = count( $this->open_elements ) - 1; - // 1. Initialize node to be the current node (the bottommost node of the stack). - $node = $this->open_elements[ $i ]; - - while ( true ) { - // 2. If node is the target node, terminate in a match state. - if ( is_string( $target_node ) ) { - if ( $node->token->tag === $target_node ) { - return true; - } - } else if ( $node === $target_node ) { - return true; - } - - // 3. Otherwise, if node is one of the element types in list, terminate in a failure state. - $failure = in_array( $node->token->tag, $element_types_list, true ); - - // Some elements say: - // > If has that element in the specific scope consisting of all element types - // > except the following - // So we need to invert the result. - if($negative_match) { - $failure = ! $failure; - } - if ( $failure ) { - return false; - } - - // Otherwise, set node to the previous entry in the stack of open elements and - // return to step 2. (This will never fail, since the loop will always terminate - // in the previous step if the top of the stack — an html element — is reached.) 
- $node = $this->open_elements[ --$i ]; - } - } - - private static function is_special_element( $tag_name, $except = null ) { - if ( null !== $except && in_array( $tag_name, $except, true ) ) { - return false; - } - - switch ( $tag_name ) { - case 'ADDRESS': - case 'APPLET': - case 'AREA': - case 'ARTICLE': - case 'ASIDE': - case 'BASE': - case 'BASEFONT': - case 'BGSOUND': - case 'BLOCKQUOTE': - case 'BODY': - case 'BR': - case 'BUTTON': - case 'CAPTION': - case 'CENTER': - case 'COL': - case 'COLGROUP': - case 'DD': - case 'DETAILS': - case 'DIR': - case 'DIV': - case 'DL': - case 'DT': - case 'EMBED': - case 'FIELDSET': - case 'FIGCAPTION': - case 'FIGURE': - case 'FOOTER': - case 'FORM': - case 'FRAME': - case 'FRAMESET': - case 'H1': - case 'H2': - case 'H3': - case 'H4': - case 'H5': - case 'H6': - case 'HEAD': - case 'HEADER': - case 'HGROUP': - case 'HR': - case 'HTML': - case 'IFRAME': - case 'IMG': - case 'INPUT': - case 'ISINDEX': - case 'LI': - case 'LINK': - case 'LISTING': - case 'MAIN': - case 'MARQUEE': - case 'MENU': - case 'MENUITEM': - case 'META': - case 'NAV': - case 'NOEMBED': - case 'NOFRAMES': - case 'NOSCRIPT': - case 'OBJECT': - case 'OL': - case 'P': - case 'PARAM': - case 'PLAINTEXT': - case 'PRE': - case 'SCRIPT': - case 'SECTION': - case 'SELECT': - case 'SOURCE': - case 'STYLE': - case 'SUMMARY': - case 'TABLE': - case 'TBODY': - case 'TD': - case 'TEMPLATE': - case 'TEXTAREA': - case 'TFOOT': - case 'TH': - case 'THEAD': - case 'TITLE': - case 'TR': - case 'TRACK': - case 'UL': - case 'WBR': - case 'XMP': - return true; - default: - return false; - } - } - - private static function is_rcdata_element( $tag_name ) { - switch ( $tag_name ) { - case 'TITLE': - case 'TEXTAREA': - case 'STYLE': - case 'XMP': - case 'IFRAME': - case 'NOEMBED': - case 'NOFRAMES': - case 'NOSCRIPT': - return true; - default: - return false; - } - } - - private static function is_formatting_element( $tag_name ) { - switch ( strtoupper( $tag_name ) ) { - case 
'A': - case 'B': - case 'BIG': - case 'CODE': - case 'EM': - case 'FONT': - case 'I': - case 'NOBR': - case 'S': - case 'SMALL': - case 'STRIKE': - case 'STRONG': - case 'TT': - case 'U': - return true; - default: - return false; - } - } - -} - -// $dir = realpath( __DIR__ . '/../../../index.html' ); - -// $htmlspec = file_get_contents( $dir ); -// $p = new WP_HTML_Processor( $htmlspec ); -// $p->parse(); - -// die(); - -$p = new WP_HTML_Processor( '

12345

' ); -$p->parse(); -/* -Outputs: - p - ├─ #text: 1 - ├─ b - │ ├─ #text: 2 - │ └─ i - │ └─ #text: 3 - ├─ i - │ └─ #text: 4 - └─ #text: 5 -*/ -echo "\n\n"; -echo $p->reconstructed_html; -die(); - -// $p = new WP_HTML_Processor( '
12
34' ); -// $p->parse(); -/* -DOM after main loop: - HTML - ├─ DIV - ├─ #text: 1 - └─ SPAN - └─ #text: 2 - └─ #text: 34 -*/ - -// $p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); -// $p->parse(); -/* -Outputs: - -DOM after main loop: - HTML - ├─ UL - ├─ LI - └─ #text: 1 - ├─ LI - └─ #text: 2 - ├─ LI - └─ #text: 3 - ├─ LI - ├─ #text: Lorem - └─ B - └─ #text: Ipsum - └─ LI - └─ B - └─ #text: Dolor - └─ B - └─ SPAN - ├─ #text: Sit - └─ SPAN - ├─ #text: Sit - └─ SPAN - └─ DIV - └─ #text: Amet -*/ - -$p = new WP_HTML_Processor( ' -
-
-
-
-
' ); -$p->parse(); -// $p = new WP_HTML_Processor( '1

23

' ); -// $p->parse(); -// /* -// Outputs the correct result: -// B -// └─ #text: 1 -// P -// ├─ B -// └─ #text: 2 -// └─ #text: 3 -// */ -echo "\n\n"; -echo $p->reconstructed_html; -die(); - -$p = new WP_HTML_Processor( '

X -

X -

X -

X' ); -$p->parse(); -/* -DOM after main loop: - HTML - ├─ P - └─ B class="x" - └─ B class="x" - └─ B - └─ B class="x" - └─ B class="x" - └─ B - └─ #text: X - ├─ P - └─ B class="x" - └─ B - └─ B class="x" - └─ B class="x" - └─ B - └─ #text: X - ├─ P - └─ B class="x" - └─ B - └─ B class="x" - └─ B class="x" - └─ B - └─ B - └─ B class="x" - └─ B - └─ #text: X - └─ P - └─ #text: X -*/ diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 9aca0d6f28b85..5818843523e2c 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -413,7 +413,7 @@ class WP_HTML_Tag_Processor { * * @var bool */ - private $is_closing_tag; + protected $is_closing_tag; /** * Lazily-built index of attributes found within an HTML tag, keyed by the attribute name. From a7d76e7cb8e6cb8c5f4a056b0b0fbaf1c1587add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 13:58:29 +0100 Subject: [PATCH 20/42] Close the tags in a correct order --- .../html-api/class-wp-html-tag-processor.php | 23 +++- .../html-api/class-wp-html-text-processor.php | 125 +++++++++++------- .../class-wp-html-text-replacement.php | 11 +- 3 files changed, 107 insertions(+), 52 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 5818843523e2c..1e8d5c00b7d1e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1424,7 +1424,7 @@ private function class_name_updates_to_attribute_updates() { */ private function attribute_updates_to_lexical_updates() { foreach ( $this->attribute_updates as $update ) { - $this->lexical_updates[] = $update; + $this->add_lexical_update( $update ); } $this->attribute_updates = array(); } @@ -1502,6 +1502,22 @@ private function 
apply_lexical_updates() { $this->lexical_updates = array(); } + /** + * WP_HTML_Processor often needs to insert a few tag closers + * at the same offset in a very specific order. + * + * However, the usort implemented in `apply_lexical_updates` + * used to reorder them alphabetically based on the text to be + * inserted. + * + * This method enables retaining the order in which the updates + * were enqueued. + */ + protected function add_lexical_update( WP_HTML_Text_Replacement $update ) { + $update->order = count($this->lexical_updates); + $this->lexical_updates[] = $update; + } + /** * Checks whether a bookmark with the given name exists. * @@ -1569,6 +1585,11 @@ private static function sort_start_ascending( $a, $b ) { return $by_start; } + $by_order = $a->order - $b->order; + if ( 0 !== $by_order ) { + return $by_order; + } + $by_text = isset( $a->text, $b->text ) ? strcmp( $a->text, $b->text ) : 0; if ( 0 !== $by_text ) { return $by_text; diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 898acd2cea3d7..160b452dc68a0 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -80,35 +80,36 @@ public function __construct( $html ) { public function parse() { echo("HTML before main loop:\n"); - // echo($this->html); + echo($this->html); echo("\n"); while ($this->next_node()) { // ... twiddle thumbs ... } - while ( count($this->open_elements) > 1 ) { $this->pop_open_element(); } echo("\n"); echo("Reconstructed HTML after main loop:\n"); - // echo($this->reconstructed_html.''); + echo($this->reconstructed_html.''); echo "\n\n"; echo("\$this->HTML after main loop:\n"); - // echo($this->get_updated_html().''); + echo($this->get_updated_html().''); echo "\n\n"; echo "Mem peak usage:" . (memory_get_peak_usage(true) / 1024 / 1024) . 
"MB\n"; echo("\n---------------\n\n"); } - public function ignore_current_tag_token() { + public function drop_current_tag_token() { // @TODO: remove the current tag from $this->html instead of // not appending it to $this->reconstructed_html - $this->lexical_updates[] = new WP_HTML_Text_Replacement( - $this->current_token_start, - $this->current_token_end, - '' + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $this->current_token_start, + $this->current_token_end, + '' + ) ); return true; } @@ -119,7 +120,6 @@ public function ignore_current_tag_token() { public function next_node() { $text_start = $this->tag_ends_at + 1; $this->current_token_start = $text_start; - if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); $this->set_bookmark($bookmark); @@ -437,7 +437,7 @@ public function next_node() { case 'UL': if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->ignore_current_tag_token(); + return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); $this->pop_until_node_or_tag( $token->tag, false ); @@ -462,7 +462,7 @@ public function next_node() { case 'LI': if ( ! $this->is_element_in_list_item_scope( 'LI' ) ) { $this->parse_error(); - return $this->ignore_current_tag_token(); + return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); $this->pop_until_node_or_tag( 'LI', false ); @@ -471,7 +471,7 @@ public function next_node() { case 'DT': if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->ignore_current_tag_token(); + return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); $this->pop_until_node_or_tag( $token->tag, false ); @@ -484,7 +484,7 @@ public function next_node() { case 'H6': if ( ! 
$this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->parse_error(); - return $this->ignore_current_tag_token(); + return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); $this->pop_until_node_or_tag( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), false ); @@ -511,7 +511,7 @@ public function next_node() { case 'OBJECT': if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->ignore_current_tag_token(); + return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); if ( $this->current_node()->tag !== $token->tag ) { @@ -523,30 +523,39 @@ public function next_node() { case 'BR': // This should never happen since Tag_Processor corrects that default: - $i = count( $this->open_elements ) - 1; - while ( true ) { - $node = $this->open_elements[ $i ]; - if ( $node->tag === $token->tag ) { - $this->generate_implied_end_tags( - array( - 'except_for' => array( $token->tag ), - ) - ); - $this->pop_until_node_or_tag( $node ); - break; - } elseif ( $this->is_special_element( $node->tag ) ) { - $this->parse_error(); - return $this->ignore_current_tag_token(); - } else { - --$i; - } - } + $this->process_any_other_end_tag( $token ); break; } } return $token; } + private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) { + $node = $this->current_node(); + $tag = $token->tag; + $i = count( $this->open_elements ) - 1; + while ( true ) { + if ( $node->tag === $tag ) { + $this->generate_implied_end_tags( + array( + 'except_for' => array( $tag ), + ) + ); + if ( $node->tag !== $tag ) { + $this->parse_error(); + } + $this->pop_until_node_or_tag( $node ); + break; + } elseif ( $this->is_special_element( $node->tag ) ) { + $this->parse_error(); + return $this->drop_current_tag_token(); + } else { + --$i; + $node = $this->open_elements[ $i ]; + } + } + } + private $element_bookmark_idx = 0; private function next_token() { if($this->buffered_tag){ @@ -637,7 +646,7 @@ private 
function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { // described in the "any other end tag" entry below. if ( null === $formatting_element ) { dbg("Skipping AAA: no formatting element found", 2); - return self::ANY_OTHER_END_TAG; + return $this->process_any_other_end_tag( $token ); } dbg("AAA: Formatting element = {$formatting_element->tag}", 2); @@ -647,6 +656,19 @@ private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); $this->parse_error(); dbg("Skipping AAA: formatting element is not in the stack of open elements", 2); + + /** + * This is not in the spec, but it's necessary. + * + * If we were building a DOM, moving on without + * creating a Node would be the same as dropping + * the unexpected token. + * + * We're processing a text stream, though, so simply + * moving on would leave that token in place. Instead, + * we need to drop it explicitly. + */ + $this->drop_current_tag_token(); return; } @@ -654,7 +676,12 @@ private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { if ( ! $this->is_element_in_scope( $formatting_element ) ) { $this->parse_error(); dbg("Skipping AAA: formatting element {$formatting_element->tag} is not in scope", 2); - $this->print_open_elements('Open elements: ', 2); + + /** + * This is not in the spec, but it's necessary. + * See the previous "if" statement for details. 
+ */ + $this->drop_current_tag_token(); return; } @@ -699,17 +726,14 @@ private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { } private function insert_element( WP_HTML_Tag_Token $token ) { - // Text API: - // @TODO: do nothing if $token is already in $this->html - // instead of building $this->reconstructed_html - // from scratch - // @TODO attrs $this->reconstructed_html .= '<'.$token->tag.'>'; if($token !== $this->current_token) { - $this->lexical_updates[] = new WP_HTML_Text_Replacement( - $this->current_token_start, - $this->current_token_start, - "<{$token->tag}>" + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $this->current_token_start, + $this->current_token_start, + "<{$token->tag}>" + ) ); } array_push($this->open_elements, $token); @@ -717,10 +741,12 @@ private function insert_element( WP_HTML_Tag_Token $token ) { } private function insert_tag_closer_before_current_token( $tag ) { - $this->lexical_updates[] = new WP_HTML_Text_Replacement( - $this->current_token_start, - $this->current_token_start, - "" + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $this->current_token_start, + $this->current_token_start, + "" + ) ); } @@ -1197,7 +1223,6 @@ private static function is_formatting_element( $tag_name ) { $p = new WP_HTML_Processor( '

  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); $p->parse(); - // $p = new WP_HTML_Processor( ' //
//
@@ -1210,5 +1235,5 @@ private static function is_formatting_element( $tag_name ) { $p = new WP_HTML_Processor( '

X

X

X -

X' ); +

Xy' ); $p->parse(); diff --git a/src/wp-includes/html-api/class-wp-html-text-replacement.php b/src/wp-includes/html-api/class-wp-html-text-replacement.php index 912b4a56a5eb4..e76f3fcfb5a3d 100644 --- a/src/wp-includes/html-api/class-wp-html-text-replacement.php +++ b/src/wp-includes/html-api/class-wp-html-text-replacement.php @@ -42,6 +42,13 @@ class WP_HTML_Text_Replacement { */ public $text; + /** + * Order in which the replacement was enqueued. + * + * @var mixed + */ + public $order; + /** * Constructor. * @@ -50,10 +57,12 @@ class WP_HTML_Text_Replacement { * @param int $start Byte offset into document where replacement span begins. * @param int $end Byte offset into document where replacement span ends. * @param string $text Span of text to insert in document to replace existing content from start to end. + * @param string $order Order in which the replacement was enqueued. */ - public function __construct( $start, $end, $text ) { + public function __construct( $start, $end, $text, $order = 0 ) { $this->start = $start; $this->end = $end; $this->text = $text; + $this->order = $order; } } From 0663a48427f594546da89f1e8efe4983c7b7d213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 14:25:17 +0100 Subject: [PATCH 21/42] Reconstruct the active formatting elements in their correct location --- .../html-api/class-wp-html-text-processor.php | 147 ++++++++++-------- 1 file changed, 85 insertions(+), 62 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 160b452dc68a0..7745246852fbb 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -32,6 +32,14 @@ public function __construct( $tag, $bookmark = null ) { } +class WP_HTML_Text_Token { + public $bookmark; + + public function __construct( $bookmark ) { + $this->bookmark = $bookmark; + } +} + /** * */ 
@@ -82,7 +90,7 @@ public function parse() { echo("HTML before main loop:\n"); echo($this->html); echo("\n"); - while ($this->next_node()) { + while ($this->next_element_node()) { // ... twiddle thumbs ... } while ( count($this->open_elements) > 1 ) { @@ -107,56 +115,71 @@ public function drop_current_tag_token() { $this->add_lexical_update( new WP_HTML_Text_Replacement( $this->current_token_start, - $this->current_token_end, + $this->current_token_end + 1, '' ) ); return true; } - private $current_token; - private $current_token_start; - private $current_token_end; - public function next_node() { + private $previous_token; + private function next_tag_token() { + if( + $this->current_token && + $this->has_bookmark($this->current_token->bookmark) + ) { + $this->previous_token = $this->current_token; + } + + $tag_token = null; $text_start = $this->tag_ends_at + 1; - $this->current_token_start = $text_start; - if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + if ($this->next_tag(array('tag_closers' => 'visit'))) { $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); $this->set_bookmark($bookmark); - $next_tag = new WP_HTML_Tag_Token( + $tag_token = new WP_HTML_Tag_Token( $this->get_tag(), $bookmark ); $text_end = $this->bookmarks[$bookmark]->start; } else { - $next_tag = null; - $this->current_token_start = strlen($this->html); $text_end = strlen($this->html); } - $this->current_token_end = $text_end; if ($text_start < $text_end) { - $text = substr($this->html, $text_start, $text_end - $text_start); - $this->current_token = $text; - dbg( "Found text node '$text'" ); + $this->current_token = substr($this->html, $text_start, $text_end - $text_start); + $this->current_token_start = $text_start; + $this->current_token_end = $text_end; + dbg( "Found text node '$this->current_token'" ); dbg( "Appending text to reconstructed HTML", 1 ); $this->reconstruct_active_formatting_elements(); // @TODO don't append stuff to $this->reconstructed_html // instead, skip over the text in $this->html - $this->reconstructed_html .= $text; + $this->reconstructed_html .= $this->current_token; } - $this->current_token = $next_tag; - if ( ! $this->current_token ) { + if ( ! $tag_token ) { + $this->current_token = null; + $this->current_token_start = strlen($this->html); + $this->current_token_end = strlen($this->html); return false; } - $this->current_token_start = $this->bookmarks[$this->current_token->bookmark]->start; - $this->current_token_end = $this->bookmarks[$this->current_token->bookmark]->end + 1; - $token = $this->current_token; + $this->current_token = $tag_token; + $this->current_token_start = $this->bookmarks[$tag_token->bookmark]->start; + $this->current_token_end = $this->bookmarks[$tag_token->bookmark]->end; + return true; + } + + private $current_token; + private $current_token_start; + private $current_token_end; + public function next_element_node() { + if ( ! $this->next_tag_token() ) { + return false; + } if ( ! 
$this->is_tag_closer() ) { - dbg( "Found {$token->tag} tag opener" ); - switch ( $token->tag ) { + dbg( "Found {$this->current_token->tag} tag opener" ); + switch ( $this->current_token->tag ) { case 'ADDRESS': case 'ARTICLE': case 'ASIDE': @@ -191,7 +214,7 @@ public function next_node() { if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; // A start tag whose tag name is "h1", "h2", "h3", "h4", "h5", or "h6" case 'H1': @@ -206,13 +229,13 @@ public function next_node() { if ( in_array( $this->current_node()->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->pop_open_element(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'FORM': if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'LI': $i = count( $this->open_elements ) - 1; @@ -237,7 +260,7 @@ public function next_node() { if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'DD': case 'DT': @@ -271,7 +294,7 @@ public function next_node() { if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'BUTTON': if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { @@ -279,7 +302,7 @@ public function next_node() { $this->pop_until_node_or_tag( 'BUTTON' ); } $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'A': $active_a = null; @@ -295,11 +318,11 @@ public function next_node() { if ( $active_a ) { $this->parse_error(); - $this->adoption_agency_algorithm( $token ); + 
$this->adoption_agency_algorithm( $this->current_token ); } $this->reconstruct_active_formatting_elements(); - $node = $this->insert_element( $token ); + $node = $this->insert_element( $this->current_token ); $this->push_active_formatting_element( $node ); break; case 'B': @@ -315,28 +338,28 @@ public function next_node() { case 'TT': case 'U': $this->reconstruct_active_formatting_elements(); - $node = $this->insert_element( $token ); + $node = $this->insert_element( $this->current_token ); $this->push_active_formatting_element( $node ); break; case 'NOBR': $this->reconstruct_active_formatting_elements(); if ( $this->is_element_in_scope( 'NOBR' ) ) { $this->parse_error(); - $this->adoption_agency_algorithm( $token ); + $this->adoption_agency_algorithm( $this->current_token ); $this->reconstruct_active_formatting_elements(); } - $node = $this->insert_element( $token ); + $node = $this->insert_element( $this->current_token ); $this->push_active_formatting_element( $node ); break; case 'APPLET': case 'MARQUEE': case 'OBJECT': $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); $this->active_formatting_elements[] = $this->MARKER; break; case 'TABLE': - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'AREA': case 'BR': @@ -345,52 +368,52 @@ public function next_node() { case 'KEYGEN': case 'WBR': $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); $this->pop_open_element( false ); break; case 'PARAM': case 'SOURCE': case 'TRACK': - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); $this->pop_open_element( false ); break; case 'HR': if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); $this->pop_open_element( false ); break; 
case 'TEXTAREA': - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'SELECT': $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'OPTION': $this->pop_open_element(false); case 'OPTGROUP': $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'RB': case 'RTC': if ( $this->is_element_in_scope( 'RB' ) || $this->is_element_in_scope( 'RTC' ) ) { $this->parse_error(); - $this->adoption_agency_algorithm( $token ); + $this->adoption_agency_algorithm( $this->current_token ); $this->reconstruct_active_formatting_elements(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; case 'RP': case 'RT': if ( $this->is_element_in_scope( 'RP' ) || $this->is_element_in_scope( 'RT' ) ) { $this->parse_error(); - $this->adoption_agency_algorithm( $token ); + $this->adoption_agency_algorithm( $this->current_token ); $this->reconstruct_active_formatting_elements(); } - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; // case 'XMP': @@ -401,16 +424,16 @@ public function next_node() { // case 'NOSCRIPT': // case 'PLAINTEXT': // case 'IMAGE': - // throw new Exception( $token->tag . ' not implemented yet' ); + // throw new Exception( $this->current_token->tag . ' not implemented yet' ); default: $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $this->insert_element( $this->current_token ); break; } } else { - dbg( "Found {$token->tag} tag closer" ); - switch ( $token->tag ) { + dbg( "Found {$this->current_token->tag} tag closer" ); + switch ( $this->current_token->tag ) { case 'ADDRESS': case 'ARTICLE': case 'ASIDE': @@ -435,16 +458,16 @@ public function next_node() { case 'SECTION': case 'SUMMARY': case 'UL': - if ( ! 
$this->is_element_in_scope( $token->tag ) ) { + if ( ! $this->is_element_in_scope( $this->current_token->tag ) ) { $this->parse_error(); return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( $token->tag, false ); + $this->pop_until_node_or_tag( $this->current_token->tag, false ); break; case 'FORM': $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( $token->tag, false ); + $this->pop_until_node_or_tag( $this->current_token->tag, false ); break; case 'P': /* @@ -469,12 +492,12 @@ public function next_node() { break; case 'DD': case 'DT': - if ( ! $this->is_element_in_scope( $token->tag ) ) { + if ( ! $this->is_element_in_scope( $this->current_token->tag ) ) { $this->parse_error(); return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( $token->tag, false ); + $this->pop_until_node_or_tag( $this->current_token->tag, false ); break; case 'H1': case 'H2': @@ -502,32 +525,32 @@ public function next_node() { case 'STRONG': case 'TT': case 'U': - dbg( "Found {$token->tag} tag closer" ); - $this->adoption_agency_algorithm( $token ); + dbg( "Found {$this->current_token->tag} tag closer" ); + $this->adoption_agency_algorithm( $this->current_token ); break; case 'APPLET': case 'MARQUEE': case 'OBJECT': - if ( ! $this->is_element_in_scope( $token->tag ) ) { + if ( ! 
$this->is_element_in_scope( $this->current_token->tag ) ) { $this->parse_error(); return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); - if ( $this->current_node()->tag !== $token->tag ) { + if ( $this->current_node()->tag !== $this->current_token->tag ) { $this->parse_error(); } - $this->pop_until_node_or_tag( $token->tag, false ); + $this->pop_until_node_or_tag( $this->current_token->tag, false ); $this->clear_active_formatting_elements_up_to_last_marker(); break; case 'BR': // This should never happen since Tag_Processor corrects that default: - $this->process_any_other_end_tag( $token ); + $this->process_any_other_end_tag( $this->current_token ); break; } } - return $token; + return $this->current_token; } private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) { From 7887401315481d0b4e94188fa1a4572d1eb46462 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 14:29:55 +0100 Subject: [PATCH 22/42] =?UTF-8?q?Remove=20$reconstructed=5Fhtml=20?= =?UTF-8?q?=E2=80=93=20always=20operate=20on=20the=20tag=20processor=20str?= =?UTF-8?q?eam?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html-api/class-wp-html-text-processor.php | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 7745246852fbb..83d1431050490 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -74,8 +74,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $last_token = null; private $inserted_tokens = array(); - public $reconstructed_html = ''; - const MAX_BOOKMARKS = 1000000; public function __construct( $html ) { @@ -98,9 +96,6 @@ public function parse() { } echo("\n"); - echo("Reconstructed HTML after main loop:\n"); - 
echo($this->reconstructed_html.''); - echo "\n\n"; echo("\$this->HTML after main loop:\n"); echo($this->get_updated_html().''); echo "\n\n"; @@ -110,8 +105,6 @@ public function parse() { } public function drop_current_tag_token() { - // @TODO: remove the current tag from $this->html instead of - // not appending it to $this->reconstructed_html $this->add_lexical_update( new WP_HTML_Text_Replacement( $this->current_token_start, @@ -152,9 +145,6 @@ private function next_tag_token() { dbg( "Found text node '$this->current_token'" ); dbg( "Appending text to reconstructed HTML", 1 ); $this->reconstruct_active_formatting_elements(); - // @TODO don't append stuff to $this->reconstructed_html - // instead, skip over the text in $this->html - $this->reconstructed_html .= $this->current_token; } if ( ! $tag_token ) { @@ -749,13 +739,16 @@ private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { } private function insert_element( WP_HTML_Tag_Token $token ) { - $this->reconstructed_html .= '<'.$token->tag.'>'; if($token !== $this->current_token) { + // Aesthetic choice for now. + // @TODO: discuss it with the team + $tag = strtolower($token->tag); + $this->add_lexical_update( new WP_HTML_Text_Replacement( $this->current_token_start, $this->current_token_start, - "<{$token->tag}>" + "<{$tag}>" ) ); } @@ -764,6 +757,9 @@ private function insert_element( WP_HTML_Tag_Token $token ) { } private function insert_tag_closer_before_current_token( $tag ) { + // Aesthetic choice for now. 
+ // @TODO: consider preserving the case of the opening tag + $tag = strtolower($tag); $this->add_lexical_update( new WP_HTML_Text_Replacement( $this->current_token_start, @@ -804,7 +800,6 @@ private function pop_until_node_or_tag( $node_or_element, $tag_closer_for_last_e private function pop_open_element($add_close_tag = true) { $popped = array_pop( $this->open_elements ); - $this->reconstructed_html .= 'tag.'>'; if ( $add_close_tag ) { $this->insert_tag_closer_before_current_token( $popped->tag ); } From 481fce59281105f36fd669137c4caa4125b93cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 14:34:05 +0100 Subject: [PATCH 23/42] Clean up the API --- .../html-api/class-wp-html-text-processor.php | 283 +++++++----------- 1 file changed, 104 insertions(+), 179 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 83d1431050490..498e63eb2ad6a 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -32,14 +32,6 @@ public function __construct( $tag, $bookmark = null ) { } -class WP_HTML_Text_Token { - public $bookmark; - - public function __construct( $bookmark ) { - $this->bookmark = $bookmark; - } -} - /** * */ @@ -58,6 +50,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $root_node = null; private $context_node = null; + private $element_bookmark_idx = 0; + /* * WP_HTML_Tag_Processor skips over text nodes and only * processes tags. 
@@ -104,62 +98,6 @@ public function parse() { echo("\n---------------\n\n"); } - public function drop_current_tag_token() { - $this->add_lexical_update( - new WP_HTML_Text_Replacement( - $this->current_token_start, - $this->current_token_end + 1, - '' - ) - ); - return true; - } - - private $previous_token; - private function next_tag_token() { - if( - $this->current_token && - $this->has_bookmark($this->current_token->bookmark) - ) { - $this->previous_token = $this->current_token; - } - - $tag_token = null; - $text_start = $this->tag_ends_at + 1; - if ($this->next_tag(array('tag_closers' => 'visit'))) { - $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); - $this->set_bookmark($bookmark); - $tag_token = new WP_HTML_Tag_Token( - $this->get_tag(), - $bookmark - ); - $text_end = $this->bookmarks[$bookmark]->start; - } else { - $text_end = strlen($this->html); - } - - if ($text_start < $text_end) { - $this->current_token = substr($this->html, $text_start, $text_end - $text_start); - $this->current_token_start = $text_start; - $this->current_token_end = $text_end; - dbg( "Found text node '$this->current_token'" ); - dbg( "Appending text to reconstructed HTML", 1 ); - $this->reconstruct_active_formatting_elements(); - } - - if ( ! 
$tag_token ) { - $this->current_token = null; - $this->current_token_start = strlen($this->html); - $this->current_token_end = strlen($this->html); - return false; - } - - $this->current_token = $tag_token; - $this->current_token_start = $this->bookmarks[$tag_token->bookmark]->start; - $this->current_token_end = $this->bookmarks[$tag_token->bookmark]->end; - return true; - } - private $current_token; private $current_token_start; private $current_token_end; @@ -237,7 +175,7 @@ public function next_element_node() { 'except_for' => array( 'LI' ), ) ); - $this->pop_until_node_or_tag( 'LI' ); + $this->pop_until_tag( 'LI' ); break; } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; @@ -263,7 +201,7 @@ public function next_element_node() { 'except_for' => array( 'DD' ), ) ); - $this->pop_until_node_or_tag( 'DD' ); + $this->pop_until_tag( 'DD' ); break; } elseif ( $node->tag === 'DT' ) { $this->generate_implied_end_tags( @@ -271,7 +209,7 @@ public function next_element_node() { 'except_for' => array( 'DT' ), ) ); - $this->pop_until_node_or_tag( 'DT' ); + $this->pop_until_tag( 'DT' ); break; } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; @@ -289,7 +227,7 @@ public function next_element_node() { case 'BUTTON': if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( 'BUTTON' ); + $this->pop_until_tag( 'BUTTON' ); } $this->reconstruct_active_formatting_elements(); $this->insert_element( $this->current_token ); @@ -453,11 +391,11 @@ public function next_element_node() { return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( $this->current_token->tag, false ); + $this->pop_until_tag( $this->current_token->tag, false ); break; case 'FORM': $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( $this->current_token->tag, false ); + $this->pop_until_tag( 
$this->current_token->tag, false ); break; case 'P': /* @@ -478,7 +416,7 @@ public function next_element_node() { return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( 'LI', false ); + $this->pop_until_tag( 'LI', false ); break; case 'DD': case 'DT': @@ -487,7 +425,7 @@ public function next_element_node() { return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( $this->current_token->tag, false ); + $this->pop_until_tag( $this->current_token->tag, false ); break; case 'H1': case 'H2': @@ -500,7 +438,7 @@ public function next_element_node() { return $this->drop_current_tag_token(); } $this->generate_implied_end_tags(); - $this->pop_until_node_or_tag( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), false ); + $this->pop_until_tag( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), false ); break; case 'A': case 'B': @@ -530,7 +468,7 @@ public function next_element_node() { if ( $this->current_node()->tag !== $this->current_token->tag ) { $this->parse_error(); } - $this->pop_until_node_or_tag( $this->current_token->tag, false ); + $this->pop_until_tag( $this->current_token->tag, false ); $this->clear_active_formatting_elements_up_to_last_marker(); break; case 'BR': @@ -543,6 +481,44 @@ public function next_element_node() { return $this->current_token; } + private function next_tag_token() { + $tag_token = null; + $text_start = $this->tag_ends_at + 1; + if ($this->next_tag(array('tag_closers' => 'visit'))) { + // @TODO don't create a bookmark for every single tag + $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $tag_token = new WP_HTML_Tag_Token( + $this->get_tag(), + $bookmark + ); + $text_end = $this->bookmarks[$bookmark]->start; + } else { + $text_end = strlen($this->html); + } + + if ($text_start < $text_end) { + $this->current_token = substr($this->html, $text_start, $text_end - $text_start); + $this->current_token_start = $text_start; + $this->current_token_end = $text_end; + dbg( "Found text node '$this->current_token'" ); + dbg( "Appending text to reconstructed HTML", 1 ); + $this->reconstruct_active_formatting_elements(); + } + + if ( ! $tag_token ) { + $this->current_token = null; + $this->current_token_start = strlen($this->html); + $this->current_token_end = strlen($this->html); + return false; + } + + $this->current_token = $tag_token; + $this->current_token_start = $this->bookmarks[$tag_token->bookmark]->start; + $this->current_token_end = $this->bookmarks[$tag_token->bookmark]->end; + return true; + } + private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) { $node = $this->current_node(); $tag = $token->tag; @@ -557,7 +533,7 @@ private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) { if ( $node->tag !== $tag ) { $this->parse_error(); } - $this->pop_until_node_or_tag( $node ); + $this->pop_until_node( $node ); break; } elseif ( $this->is_special_element( $node->tag ) ) { $this->parse_error(); @@ -569,56 +545,6 @@ private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) { } } - private $element_bookmark_idx = 0; - private function next_token() { - if($this->buffered_tag){ - $next_tag = $this->buffered_tag; - $this->buffered_tag = null; - return $next_tag; - } - - $next_tag = false; - if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { - $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); - $this->set_bookmark($bookmark); - $attributes = array(); - $attrs = $this->get_attribute_names_with_prefix(''); - if ($attrs) { - foreach ($attrs as $name) { - $attributes[$name] = $this->get_attribute($name); - } - } - $next_tag = new WP_HTML_Tag_Token( - $this->get_tag(), - $bookmark - ); - $text_end = $this->bookmarks[$bookmark]->start; - } else { - $text_end = strlen($this->html); - } - - /* - * If any text was found between the last tag and this one, - * save the next tag for later and return the text token. - */ - $last = $this->last_token; - if ( - $last - && $last->bookmark - && $this->has_bookmark($last->bookmark) - ) { - $text_start = $this->bookmarks[$last->bookmark]->end + 1; - if ($text_start < $text_end) { - $this->buffered_tag = $next_tag; - $text = substr($this->html, $text_start, $text_end - $text_start); - return $text; - } - } - - return $next_tag; - } - - const ANY_OTHER_END_TAG = 1; private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { dbg("Adoption Agency Algorithm", 1); $subject = $token->tag; @@ -725,7 +651,7 @@ private function adoption_agency_algorithm( WP_HTML_Tag_Token $token ) { // and including formatting element, then remove formatting element from // the list of active formatting elements, and finally abort these steps. if ( null === $furthest_block ) { - $this->pop_until_node_or_tag( $formatting_element, false ); + $this->pop_until_node( $formatting_element, false ); array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); dbg("Skipping AAA: no furthest block found", 2); return; @@ -756,45 +682,37 @@ private function insert_element( WP_HTML_Tag_Token $token ) { return $token; } - private function insert_tag_closer_before_current_token( $tag ) { - // Aesthetic choice for now. 
- // @TODO: consider preserving the case of the opening tag - $tag = strtolower($tag); - $this->add_lexical_update( - new WP_HTML_Text_Replacement( - $this->current_token_start, - $this->current_token_start, - "" - ) - ); - } - private function parse_error() { // Noop for now } - private function pop_until_node_or_tag( $node_or_element, $tag_closer_for_last_element = true ) { + private function pop_until_tag( $tag_names, $insert_tag_closer_for_last_popped_element = true ) { + // @TODO split this into two methods + if(!is_array($tag_names)) { + $tag_names = array($tag_names); + } while( true ) { $popped = $this->pop_open_element( false ); - if ($tag_closer_for_last_element) { - $this->insert_tag_closer_before_current_token($popped->tag); - } - if(is_string($node_or_element)) { - if($popped->tag === $node_or_element) { - break; - } - } else if(is_array($node_or_element)) { - if(in_array($popped->tag, $node_or_element)) { - break; - } - } else { - if($popped === $node_or_element) { - break; - } + if(in_array($popped->tag, $tag_names, true)) { + break; } - if(!$tag_closer_for_last_element) { - $this->insert_tag_closer_before_current_token($popped->tag); + $this->insert_tag_closer_before_current_token($popped->tag); + } + if($insert_tag_closer_for_last_popped_element) { + $this->insert_tag_closer_before_current_token($popped->tag); + } + } + + private function pop_until_node( WP_HTML_Tag_Token $target, $insert_tag_closer_for_last_popped_element = true ) { + while( true ) { + $popped = $this->pop_open_element( false ); + if($popped === $target) { + break; } + $this->insert_tag_closer_before_current_token($popped->tag); + } + if($insert_tag_closer_for_last_popped_element) { + $this->insert_tag_closer_before_current_token($popped->tag); } } @@ -806,6 +724,30 @@ private function pop_open_element($add_close_tag = true) { return $popped; } + public function drop_current_tag_token() { + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $this->current_token_start, + 
$this->current_token_end + 1, + '' + ) + ); + return true; + } + + private function insert_tag_closer_before_current_token( $tag ) { + // Aesthetic choice for now. + // @TODO: consider preserving the case of the opening tag + $tag = strtolower($tag); + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $this->current_token_start, + $this->current_token_start, + "" + ) + ); + } + private function generate_implied_end_tags( $options = null ) { while( $this->should_generate_implied_end_tags( $options ) ) { $this->pop_open_element( true ); @@ -816,7 +758,7 @@ private function current_node() { return end( $this->open_elements ); } - private function close_p_element($closer_for_last_elem = true) { + private function close_p_element($insert_p_tag_closer = true) { dbg( "close_p_element" ); $this->generate_implied_end_tags( array( @@ -827,7 +769,10 @@ private function close_p_element($closer_for_last_elem = true) { if ( $this->get_tag() !== 'P' ) { $this->parse_error(); } - $this->pop_until_node_or_tag( 'P', $closer_for_last_elem ); + $this->pop_until_tag( 'P', false ); + if($insert_p_tag_closer) { + $this->insert_tag_closer_before_current_token( 'P' ); + } } private function should_generate_implied_end_tags( $options = null ) { @@ -887,26 +832,7 @@ private function push_active_formatting_element( WP_HTML_Tag_Token $node ) { $this->active_formatting_elements[] = $node; } - private function print_active_formatting_elements($msg, $indent=1) { - if (HTML_DEBUG_MODE) { - $formats = array_map(function ($node) { - return $this->MARKER === $node ? 'M' : ($node->tag ?: 'ERROR'); - }, $this->active_formatting_elements); - dbg("$msg " . implode(', ', $formats), $indent); - } - } - - private function print_open_elements($msg, $indent=1) { - if (HTML_DEBUG_MODE) { - $elems = array_map(function ($node) { - return $node->tag; - }, $this->open_elements); - dbg("$msg " . 
implode(', ', $elems), $indent); - } - } - private function reconstruct_active_formatting_elements() { - $this->print_active_formatting_elements('AFE: before'); if ( empty( $this->active_formatting_elements ) ) { dbg( "Skipping AFE: empty list", 1 ); return; @@ -960,7 +886,6 @@ private function reconstruct_active_formatting_elements() { break; } } - $this->print_active_formatting_elements('AFE: after'); } private function clear_active_formatting_elements_up_to_last_marker() { From 140459e84a275e71ca2d28bbb3a51395137b0d37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 15:01:13 +0100 Subject: [PATCH 24/42] Cleanup the API --- .../html-api/class-wp-html-text-processor.php | 44 ++++++------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 498e63eb2ad6a..0cd3f4bc08f37 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -47,35 +47,20 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * @var WP_HTML_Tag_Token[] */ private $active_formatting_elements = array(); - private $root_node = null; - private $context_node = null; private $element_bookmark_idx = 0; - - /* - * WP_HTML_Tag_Processor skips over text nodes and only - * processes tags. - * - * WP_HTML_Processor needs to process text nodes as well. - * - * Whenever the tag processor skips over text to move to - * the next tag, the next_token() method emits that text - * as a token and stores the tag in $buffered_tag to be - * returned the next time. 
- */ - private $buffered_tag = null; - - private $last_token = null; - private $inserted_tokens = array(); + private $current_token; + private $current_token_start; + private $current_token_end; const MAX_BOOKMARKS = 1000000; public function __construct( $html ) { parent::__construct( $html ); $this->MARKER = new WP_HTML_Tag_Token(null); - $this->root_node = new WP_HTML_Tag_Token( 'HTML' ); - $this->context_node = new WP_HTML_Tag_Token( 'DOCUMENT' ); - $this->open_elements = array( $this->root_node ); + $this->open_elements = array( + new WP_HTML_Tag_Token( 'HTML' ) + ); } public function parse() { @@ -98,9 +83,6 @@ public function parse() { echo("\n---------------\n\n"); } - private $current_token; - private $current_token_start; - private $current_token_end; public function next_element_node() { if ( ! $this->next_tag_token() ) { return false; @@ -192,9 +174,9 @@ public function next_element_node() { break; case 'DD': case 'DT': - $i = count( $this->open_elements ) - 1; - while ( true ) { - $node = $this->open_elements[ $i ]; + $i = count( $this->open_elements ); + while ( $i > 0 ) { + $node = $this->open_elements[ --$i ]; if ( $node->tag === 'DD' ) { $this->generate_implied_end_tags( array( @@ -213,9 +195,6 @@ public function next_element_node() { break; } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; - } else { - --$i; - $node = $this->open_elements[ $i ]; } } @@ -776,7 +755,7 @@ private function close_p_element($insert_p_tag_closer = true) { } private function should_generate_implied_end_tags( $options = null ) { - $current_tag_name = $this->get_tag(); + $current_tag_name = $this->current_node()->tag; if ( null !== $options && isset( $options['except_for'] ) && in_array( $current_tag_name, $options['except_for'] ) ) { return false; } @@ -1157,6 +1136,9 @@ private static function is_formatting_element( $tag_name ) { // die(); +$p = new WP_HTML_Processor( '

' ); +$p->parse(); +die(); $p = new WP_HTML_Processor( '

12345

' ); $p->parse(); From 26c6f21305bb50e7d9407e28a7af9c7ab41ff162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 15:31:51 +0100 Subject: [PATCH 25/42] Don't skip over RCData and Script tag closers --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 1e8d5c00b7d1e..3370feedbd24e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -787,6 +787,7 @@ private function skip_rcdata( $tag_name ) { return false; } + $closer_potentially_starts_at = $at; $at += 2; /* @@ -830,7 +831,7 @@ private function skip_rcdata( $tag_name ) { } if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { - ++$this->bytes_already_parsed; + $this->bytes_already_parsed = $closer_potentially_starts_at; return true; } } @@ -899,6 +900,7 @@ private function skip_script_data() { } if ( '/' === $html[ $at ] ) { + $closer_potentially_starts_at = $at - 1; $is_closing = true; ++$at; } else { @@ -960,7 +962,7 @@ private function skip_script_data() { } if ( '>' === $html[ $this->bytes_already_parsed ] ) { - ++$this->bytes_already_parsed; + $this->bytes_already_parsed = $closer_potentially_starts_at; return true; } } From 37659fbd736d3de6361f89adac3c8eb2421aeb11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 15:46:49 +0100 Subject: [PATCH 26/42] MVP parser capable of parsing the entire HTML spec --- .../html-api/class-wp-html-text-processor.php | 124 ++++++++++++------ 1 file changed, 84 insertions(+), 40 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 0cd3f4bc08f37..fce09e50a7c97 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php 
+++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -19,15 +19,13 @@ function dbg( $message, $indent = 0 ) { } } +// It's an object because sometimes the identity matters class WP_HTML_Tag_Token { public $tag; - public $bookmark; - - public function __construct( $tag, $bookmark = null ) { + public function __construct( $tag ) { $this->tag = $tag; - $this->bookmark = $bookmark; } } @@ -65,10 +63,19 @@ public function __construct( $html ) { public function parse() { echo("HTML before main loop:\n"); - echo($this->html); + // echo($this->html); echo("\n"); + $i = 0; while ($this->next_element_node()) { // ... twiddle thumbs ... + if(++$i % 10000 === 0) + { + echo $this->get_tag()." oe: " . count($this->open_elements) . " "; + echo "afe: " . count($this->active_formatting_elements) . " \n"; + echo "Peak mem:" . round(memory_get_peak_usage(true) / 1024 / 1024, 2) . "MB\n"; + // print_r($this->open_elements); + // die(); + } } while ( count($this->open_elements) > 1 ) { $this->pop_open_element(); @@ -76,11 +83,12 @@ public function parse() { echo("\n"); echo("\$this->HTML after main loop:\n"); - echo($this->get_updated_html().''); + // echo($this->get_updated_html().''); echo "\n\n"; echo "Mem peak usage:" . (memory_get_peak_usage(true) / 1024 / 1024) . "MB\n"; echo("\n---------------\n\n"); + return $this->get_updated_html(); } public function next_element_node() { @@ -90,6 +98,9 @@ public function next_element_node() { if ( ! $this->is_tag_closer() ) { dbg( "Found {$this->current_token->tag} tag opener" ); switch ( $this->current_token->tag ) { + case 'HTML': + $this->drop_current_tag_token(); + break; case 'ADDRESS': case 'ARTICLE': case 'ASIDE': @@ -268,6 +279,9 @@ public function next_element_node() { case 'TABLE': $this->insert_element( $this->current_token ); break; + + // Void elements. + // Some require reconstructing the active formatting elements. 
case 'AREA': case 'BR': case 'EMBED': @@ -275,9 +289,13 @@ public function next_element_node() { case 'KEYGEN': case 'WBR': $this->reconstruct_active_formatting_elements(); - $this->insert_element( $this->current_token ); - $this->pop_open_element( false ); - break; + // But others don't. + case 'META': + case 'LINK': + case 'BASE': + case 'COL': + case 'FRAME': + case 'INPUT': case 'PARAM': case 'SOURCE': case 'TRACK': @@ -450,6 +468,22 @@ public function next_element_node() { $this->pop_until_tag( $this->current_token->tag, false ); $this->clear_active_formatting_elements_up_to_last_marker(); break; + + /* + * @divergence from spec: + * Close all the open tags when a table-related + * tag closer is encountered + */ + case 'TBODY': + case 'TFOOT': + case 'THEAD': + case 'TD': + case 'TH': + case 'TR': + case 'TABLE': + $this->pop_until_tag( $this->current_token->tag, false ); + break; + case 'BR': // This should never happen since Tag_Processor corrects that default: @@ -462,20 +496,33 @@ public function next_element_node() { private function next_tag_token() { $tag_token = null; + $bookmark = null; $text_start = $this->tag_ends_at + 1; - if ($this->next_tag(array('tag_closers' => 'visit'))) { - // @TODO don't create a bookmark for every single tag - $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); - $this->set_bookmark($bookmark); - $tag_token = new WP_HTML_Tag_Token( - $this->get_tag(), - $bookmark - ); - $text_end = $this->bookmarks[$bookmark]->start; - } else { - $text_end = strlen($this->html); + if (!$this->next_tag(array('tag_closers' => 'visit'))) { + $this->process_text($text_start, strlen($this->html)); + $this->current_token = null; + $this->current_token_start = strlen($this->html); + $this->current_token_end = strlen($this->html); + return false; } + // @TODO don't create a bookmark for every single tag + $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $tag_token = new WP_HTML_Tag_Token($this->get_tag()); + $text_end = $this->bookmarks[$bookmark]->start; + + $this->process_text($text_start, $text_end); + + $this->current_token = $tag_token; + $this->current_token_start = $this->bookmarks[$bookmark]->start; + $this->current_token_end = $this->bookmarks[$bookmark]->end; + $this->release_bookmark($bookmark); + + return true; + } + + private function process_text($text_start, $text_end) { if ($text_start < $text_end) { $this->current_token = substr($this->html, $text_start, $text_end - $text_start); $this->current_token_start = $text_start; @@ -484,18 +531,6 @@ private function next_tag_token() { dbg( "Appending text to reconstructed HTML", 1 ); $this->reconstruct_active_formatting_elements(); } - - if ( ! $tag_token ) { - $this->current_token = null; - $this->current_token_start = strlen($this->html); - $this->current_token_end = strlen($this->html); - return false; - } - - $this->current_token = $tag_token; - $this->current_token_start = $this->bookmarks[$tag_token->bookmark]->start; - $this->current_token_end = $this->bookmarks[$tag_token->bookmark]->end; - return true; } private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) { @@ -745,7 +780,7 @@ private function close_p_element($insert_p_tag_closer = true) { ) ); // If the current node is not a p element, then this is a parse error. 
- if ( $this->get_tag() !== 'P' ) { + if ( $this->current_node()->tag !== 'P' ) { $this->parse_error(); } $this->pop_until_tag( 'P', false ); @@ -773,7 +808,7 @@ private function should_generate_implied_end_tags( $options = null ) { return true; } - $thoroughly = null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly']; + $thoroughly = true; //null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly']; if ( $thoroughly ) { switch ( $current_tag_name ) { case 'TBODY': @@ -1128,17 +1163,26 @@ private static function is_formatting_element( $tag_name ) { } -// $dir = realpath( __DIR__ . '/../../../index.html' ); +$dir = realpath( __DIR__ . '/../../../index.html' ); -// $htmlspec = file_get_contents( $dir ); -// $p = new WP_HTML_Processor( $htmlspec ); -// $p->parse(); +$htmlspec = file_get_contents( $dir ); +$p = new WP_HTML_Processor( $htmlspec ); +$p->parse(); + +die(); +// $p = new WP_HTML_Processor( '
' ); +// $p->parse(); // die(); +// $p = new WP_HTML_Processor( '

1HTML Standard345

' ); +// $p->parse(); +$p = new WP_HTML_Processor( '

1
HTMLStandard

test
' ); +echo $p->parse(); +die(); -$p = new WP_HTML_Processor( '
' ); +$p = new WP_HTML_Processor( '

1345

' ); $p->parse(); -die(); + $p = new WP_HTML_Processor( '

12345

' ); $p->parse(); From 956ad3bf2a6d5aff33f1f1962c0211077142cc19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 16:04:24 +0100 Subject: [PATCH 27/42] First stab at traversal API --- .../html-api/class-wp-html-text-processor.php | 108 +++++++++++++++++- 1 file changed, 105 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index fce09e50a7c97..a1d81cad0e5ae 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -66,7 +66,7 @@ public function parse() { // echo($this->html); echo("\n"); $i = 0; - while ($this->next_element_node()) { + while ($this->process_next_tag_token()) { // ... twiddle thumbs ... if(++$i % 10000 === 0) { @@ -91,7 +91,97 @@ public function parse() { return $this->get_updated_html(); } - public function next_element_node() { + public function depth() { + return count($this->open_elements); + } + + public function first_child() + { + return $this->nth_child(1); + } + + public function nth_child($n=1) { + if ( 0 === $this->bytes_already_parsed ){ + return $this->next_node(); + } + if ( ! $this->set_bookmark('internal_nth_child') ) { + return false; + } + $depth = $this->depth(); + $matched = 0; + try { + do { + if (!$this->next_node()) { + return false; + } + + if ($this->is_tag_closer()) { + continue; + } + + if ($this->depth() <= $depth) { + $this->seek('internal_nth_child'); + return false; + } + + ++$matched; + } while ($matched < $n); + return true; + } finally { + $this->release_bookmark('internal_nth_child'); + } + } + + public function next_sibling() + { + return $this->nth_sibling(1); + } + + public function nth_sibling($n = 1) + { + if ( 0 === $this->bytes_already_parsed ){ + return $this->next_node(); + } + if ( ! 
$this->set_bookmark('internal_nth_sibling') ) { + return false; + } + $depth = $this->depth(); + $matched = 0; + try { + do { + if (!$this->next_node()) { + return false; + } + + if ($this->is_tag_closer()) { + return false; + } + + if ($this->depth() < $depth) { + $this->seek('internal_nth_sibling'); + return false; + } else if ($this->depth() > $depth) { + continue; + } + + ++$matched; + } while ($matched < $n); + return true; + } finally { + $this->release_bookmark('internal_nth_sibling'); + } + } + + private function next_node() { + while ($this->process_next_tag_token()) { + if (!$this->is_tag_closer()) { + return true; + } + } + return false; + } + + private function process_next_tag_token() { if ( ! $this->next_tag_token() ) { return false; } @@ -738,7 +828,7 @@ private function pop_open_element($add_close_tag = true) { return $popped; } - public function drop_current_tag_token() { + private function drop_current_tag_token() { $this->add_lexical_update( new WP_HTML_Text_Replacement( $this->current_token_start, @@ -1163,6 +1253,18 @@ private static function is_formatting_element( $tag_name ) { } + +$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); +$p->first_child(); +var_dump($p->get_tag()); +$p->first_child(); +var_dump($p->get_tag()); +$p->next_sibling(); +var_dump($p->get_tag()); +$p->next_sibling(); +var_dump($p->get_tag()); +die(); + $dir = realpath( __DIR__ . '/../../../index.html' ); $htmlspec = file_get_contents( $dir ); From 9889d4d561a15db4f4176fb5a437bbdd0b9e3f4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 16:53:03 +0100 Subject: [PATCH 28/42] Avoid allocating a bookmark for each parsed tag --- .../html-api/class-wp-html-tag-processor.php | 2 +- .../html-api/class-wp-html-text-processor.php | 51 ++++++++----------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 3370feedbd24e..47f5c721257ec 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -375,7 +375,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var int|null */ - private $tag_name_starts_at; + protected $tag_name_starts_at; /** * Byte length of current tag name. diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index a1d81cad0e5ae..db3f8244dd254 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -182,9 +182,28 @@ private function next_node() { } private function process_next_tag_token() { - if ( ! $this->next_tag_token() ) { + /** + * Go to the next tag and process any text was found along the way. 
+ */ + $text_start = $this->tag_ends_at + 1; + if (!$this->next_tag(array('tag_closers' => 'visit'))) { + $this->process_text($text_start, strlen($this->html)); + $this->current_token = null; + $this->current_token_start = strlen($this->html); + $this->current_token_end = strlen($this->html); return false; } + + /** + * We found a tag! Let's process any text we may have found along the way. + */ + $current_tag_start = $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ); + $this->process_text($text_start, $current_tag_start); + + $this->current_token = new WP_HTML_Tag_Token($this->get_tag()); + $this->current_token_start = $current_tag_start; + $this->current_token_end = $this->tag_ends_at; + if ( ! $this->is_tag_closer() ) { dbg( "Found {$this->current_token->tag} tag opener" ); switch ( $this->current_token->tag ) { @@ -584,34 +603,6 @@ private function process_next_tag_token() { return $this->current_token; } - private function next_tag_token() { - $tag_token = null; - $bookmark = null; - $text_start = $this->tag_ends_at + 1; - if (!$this->next_tag(array('tag_closers' => 'visit'))) { - $this->process_text($text_start, strlen($this->html)); - $this->current_token = null; - $this->current_token_start = strlen($this->html); - $this->current_token_end = strlen($this->html); - return false; - } - - // @TODO don't create a bookmark for every single tag - $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); - $this->set_bookmark($bookmark); - $tag_token = new WP_HTML_Tag_Token($this->get_tag()); - $text_end = $this->bookmarks[$bookmark]->start; - - $this->process_text($text_start, $text_end); - - $this->current_token = $tag_token; - $this->current_token_start = $this->bookmarks[$bookmark]->start; - $this->current_token_end = $this->bookmarks[$bookmark]->end; - $this->release_bookmark($bookmark); - - return true; - } - private function process_text($text_start, $text_end) { if ($text_start < $text_end) { $this->current_token = substr($this->html, $text_start, $text_end - $text_start); @@ -1263,7 +1254,7 @@ private static function is_formatting_element( $tag_name ) { var_dump($p->get_tag()); $p->next_sibling(); var_dump($p->get_tag()); -die(); +// die(); $dir = realpath( __DIR__ . '/../../../index.html' ); From 612cc831a75f9c2677aa222c6f3d4f0ccfc06c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 17:08:09 +0100 Subject: [PATCH 29/42] Close open tags at the end of the document --- .../html-api/class-wp-html-text-processor.php | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index db3f8244dd254..8be1960f775fb 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -77,9 +77,6 @@ public function parse() { // die(); } } - while ( count($this->open_elements) > 1 ) { - $this->pop_open_element(); - } echo("\n"); echo("\$this->HTML after main loop:\n"); @@ -92,7 +89,8 @@ public function parse() { } public function depth() { - return count($this->open_elements); + // -1 because the root HTML element is not counted + return count($this->open_elements) - 1; } public function first_child() @@ -191,6 +189,17 @@ private function process_next_tag_token() { 
$this->current_token = null; $this->current_token_start = strlen($this->html); $this->current_token_end = strlen($this->html); + + // Some tags were left open, let's close and process them. + if(count($this->open_elements) > 1) + { + while ( count($this->open_elements) > 1 ) { + $this->pop_open_element(); + } + // Flush lexical updates + $this->get_updated_html(); + } + return false; } @@ -1245,7 +1254,12 @@ private static function is_formatting_element( $tag_name ) { } +$p = new WP_HTML_Processor( '

134' ); + $p = new WP_HTML_Processor( '

  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); +echo $p->parse(); + +die(); $p->first_child(); var_dump($p->get_tag()); $p->first_child(); @@ -1273,8 +1287,6 @@ private static function is_formatting_element( $tag_name ) { echo $p->parse(); die(); -$p = new WP_HTML_Processor( '

1345

' ); -$p->parse(); $p = new WP_HTML_Processor( '

12345

' ); $p->parse(); From 4970159af2eaf4324f4cbea0e78e0e3a106104c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 18:35:49 +0100 Subject: [PATCH 30/42] MVP get_inner_html and get_outer_html --- .../html-api/class-wp-html-text-processor.php | 267 +++++++++++++++--- 1 file changed, 230 insertions(+), 37 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index 8be1960f775fb..ed6a1f666048d 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -88,6 +88,36 @@ public function parse() { return $this->get_updated_html(); } + private $parser_bookmarks = array(); + public function set_bookmark( $name ) { + if ( ! parent::set_bookmark($name) ) { + return false; + } + $this->parser_bookmarks[$name] = array( + 'open_elements' => $this->open_elements, + 'active_formatting_elements' => $this->active_formatting_elements, + ); + return true; + } + + public function release_bookmark( $bookmark ) { + if ( ! parent::release_bookmark($bookmark) ) { + return false; + } + unset($this->parser_bookmarks[$bookmark]); + return true; + } + + public function seek($bookmark) { + if ( ! parent::seek($bookmark) ) { + return false; + } + $bookmark = $this->parser_bookmarks[$bookmark]; + $this->open_elements = $bookmark['open_elements']; + $this->active_formatting_elements = $bookmark['active_formatting_elements']; + return true; + } + public function depth() { // -1 because the root HTML element is not counted return count($this->open_elements) - 1; @@ -99,7 +129,7 @@ public function first_child() } public function nth_child($n=1) { - if ( 0 === $this->bytes_already_parsed ){ + if ( null === $this->tag_name_starts_at ) { return $this->next_node(); } if ( ! 
$this->set_bookmark('internal_nth_child') ) { @@ -137,7 +167,7 @@ public function next_sibling() public function nth_sibling($n = 1) { - if ( 0 === $this->bytes_already_parsed ){ + if ( null === $this->tag_name_starts_at ) { return $this->next_node(); } if ( ! $this->set_bookmark('internal_nth_sibling') ) { @@ -179,28 +209,174 @@ private function next_node() { return false; } + public function inner_html($html=null) { + if ( null === $this->tag_name_starts_at ) { + return null; + } + + if(!$this->set_bookmark('internal_inner_html')) { + return false; + } + + try { + if(!$this->balancing_closer()) { + return false; + } + $tag_closer_starts_at = $this->tag_name_starts_at - 2; + + // Return to the initial cursor position + $this->seek('internal_inner_html'); + + $content_starts_at = $this->tag_ends_at + 1; + if(null === $html) { + // Get the inner HTML + return substr($this->html, $content_starts_at, $tag_closer_starts_at - $content_starts_at); + } else { + // Set the inner HTML + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $content_starts_at, + $tag_closer_starts_at, + $html + ) + ); + // Flush lexical updates + $this->get_updated_html(); + $this->seek('internal_inner_html'); + return true; + } + } finally { + $this->release_bookmark('internal_inner_html'); + } + } + + public function outer_html($html=null) { + if ( null === $this->tag_name_starts_at ) { + return null; + } + + if(!$this->set_bookmark('internal_outer_html')) { + return false; + } + + try { + if(!$this->balancing_closer()) { + return false; + } + $tag_closer_ends_at = $this->tag_ends_at; + + // Return to the initial cursor position + $this->seek('internal_outer_html'); + $tag_starts_at = $this->tag_name_starts_at - 1; + + if(null === $html) { + // Get the inner HTML + return substr($this->html, $tag_starts_at, $tag_closer_ends_at + 1 - $tag_starts_at); + } else { + // Set the inner HTML + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $tag_starts_at, + $tag_closer_ends_at 
+ 1, // @todo why +1 is needed? + $html + ) + ); + // Flush lexical updates + $this->get_updated_html(); + return true; + } + } finally { + $this->release_bookmark('internal_outer_html'); + } + } + + + public function balancing_closer() { + if($this->is_tag_closer()) { + return false; + } + if(!$this->set_bookmark('internal_balancing_closer')) { + return false; + } + try { + $depth = $this->depth(); + $token = $this->current_token; + while($this->process_next_tag_token()) { + if( + // Current element popped off the stack + $this->depth() < $depth + // Stack is the same size, but the current element was popped + || ($this->depth() === $depth && end($this->open_elements) !== $token) + ) { + /** + * The entire tag contents have been parsed, + * let's seek to the opener and read the inner + * HTML with missing tag closers added back in + */ + break; + } + } + + $this->seek('internal_balancing_closer'); + + while($this->process_next_tag_token()) { + if( + // Current element popped off the stack + $this->depth() < $depth + // Stack is the same size, but the current element was popped + || ($this->depth() === $depth && end($this->open_elements) !== $token) + ) { + if ($this->is_tag_closer()) { + return true; + } + break; + } + } + + // Should never ever happen + throw new Exception('Critical parser error: no matching closer found'); + } finally { + $this->release_bookmark('internal_balancing_closer'); + } + } + + private $is_closing_open_tags = false; private function process_next_tag_token() { + /* + * We're done with the document but some tags + * are still open. Let's close them one at a time. + */ + if ( $this->is_closing_open_tags ) { + // If only the root element is open, we're done. 
+ if(count($this->open_elements) <= 1) + { + return false; + } + + // Otherwise close the next open tag on the stack + $this->current_token = null; + $this->current_token_start = strlen($this->html); + $this->current_token_end = strlen($this->html); + + $this->pop_open_element(); + $this->get_updated_html(); + + $this->next_tag(array('tag_closers' => 'visit')); + $this->current_token = new WP_HTML_Tag_Token($this->get_tag()); + $this->current_token_start = $this->tag_name_starts_at - 2; + $this->current_token_end = $this->tag_ends_at; + return true; + } + /** * Go to the next tag and process any text was found along the way. */ $text_start = $this->tag_ends_at + 1; if (!$this->next_tag(array('tag_closers' => 'visit'))) { $this->process_text($text_start, strlen($this->html)); - $this->current_token = null; - $this->current_token_start = strlen($this->html); - $this->current_token_end = strlen($this->html); - - // Some tags were left open, let's close and process them. - if(count($this->open_elements) > 1) - { - while ( count($this->open_elements) > 1 ) { - $this->pop_open_element(); - } - // Flush lexical updates - $this->get_updated_html(); - } - return false; + $this->is_closing_open_tags = true; + return $this->process_next_tag_token(); } /** @@ -503,7 +679,8 @@ private function process_next_tag_token() { case 'UL': if ( ! $this->is_element_in_scope( $this->current_token->tag ) ) { $this->parse_error(); - return $this->drop_current_tag_token(); + $this->drop_current_tag_token(); + return true; } $this->generate_implied_end_tags(); $this->pop_until_tag( $this->current_token->tag, false ); @@ -528,16 +705,22 @@ private function process_next_tag_token() { case 'LI': if ( ! 
$this->is_element_in_list_item_scope( 'LI' ) ) { $this->parse_error(); - return $this->drop_current_tag_token(); + $this->drop_current_tag_token(); + return true; } - $this->generate_implied_end_tags(); + $this->generate_implied_end_tags( + array( + 'except_for' => array( 'LI' ), + ) + ); $this->pop_until_tag( 'LI', false ); break; case 'DD': case 'DT': if ( ! $this->is_element_in_scope( $this->current_token->tag ) ) { $this->parse_error(); - return $this->drop_current_tag_token(); + $this->drop_current_tag_token(); + return true; } $this->generate_implied_end_tags(); $this->pop_until_tag( $this->current_token->tag, false ); @@ -550,7 +733,8 @@ private function process_next_tag_token() { case 'H6': if ( ! $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->parse_error(); - return $this->drop_current_tag_token(); + $this->drop_current_tag_token(); + return true; } $this->generate_implied_end_tags(); $this->pop_until_tag( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), false ); @@ -577,7 +761,8 @@ private function process_next_tag_token() { case 'OBJECT': if ( ! $this->is_element_in_scope( $this->current_token->tag ) ) { $this->parse_error(); - return $this->drop_current_tag_token(); + $this->drop_current_tag_token(); + return true; } $this->generate_implied_end_tags(); if ( $this->current_node()->tag !== $this->current_token->tag ) { @@ -609,7 +794,7 @@ private function process_next_tag_token() { break; } } - return $this->current_token; + return true; } private function process_text($text_start, $text_end) { @@ -1254,30 +1439,38 @@ private static function is_formatting_element( $tag_name ) { } -$p = new WP_HTML_Processor( '

134' ); +// $p = new WP_HTML_Processor( '

134' ); +// echo $p->parse(); -$p = new WP_HTML_Processor( '

  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); -echo $p->parse(); +$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); +// echo $p->parse(); -die(); +// die(); $p->first_child(); var_dump($p->get_tag()); $p->first_child(); var_dump($p->get_tag()); -$p->next_sibling(); -var_dump($p->get_tag()); -$p->next_sibling(); -var_dump($p->get_tag()); -// die(); +// $p->next_sibling(); +// var_dump($p->get_tag()); +// $p->next_sibling(); +var_dump($p->inner_html()); +$p->inner_html('Hello'); +var_dump($p->get_updated_html()); -$dir = realpath( __DIR__ . '/../../../index.html' ); - -$htmlspec = file_get_contents( $dir ); -$p = new WP_HTML_Processor( $htmlspec ); -$p->parse(); +// var_dump($p->outer_html()); +// $p->outer_html('
Hello
'); +// var_dump($p->get_updated_html()); die(); +// $dir = realpath( __DIR__ . '/../../../index.html' ); + +// $htmlspec = file_get_contents( $dir ); +// $p = new WP_HTML_Processor( $htmlspec ); +// $p->parse(); + +// die(); + // $p = new WP_HTML_Processor( '
' ); // $p->parse(); // die(); From 3dfccc595bb3bca6e40daf1ad070cf6b72ce617f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 28 Feb 2023 21:33:14 +0100 Subject: [PATCH 31/42] Fix cursor position confusion during inner_html and outer_html --- .../html-api/class-wp-html-text-processor.php | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index ed6a1f666048d..c696e788f8151 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -115,6 +115,9 @@ public function seek($bookmark) { $bookmark = $this->parser_bookmarks[$bookmark]; $this->open_elements = $bookmark['open_elements']; $this->active_formatting_elements = $bookmark['active_formatting_elements']; + $this->current_token = end($bookmark['open_elements']); + $this->current_token_start = $this->tag_name_starts_at - ($this->is_tag_closer() ? 
2 : 1); + $this->current_token_end = $this->tag_ends_at; return true; } @@ -241,7 +244,6 @@ public function inner_html($html=null) { ) ); // Flush lexical updates - $this->get_updated_html(); $this->seek('internal_inner_html'); return true; } @@ -258,7 +260,6 @@ public function outer_html($html=null) { if(!$this->set_bookmark('internal_outer_html')) { return false; } - try { if(!$this->balancing_closer()) { return false; @@ -273,16 +274,30 @@ public function outer_html($html=null) { // Get the inner HTML return substr($this->html, $tag_starts_at, $tag_closer_ends_at + 1 - $tag_starts_at); } else { + // Hack to prevent invalidating the bookmark upon replacing the outer html + --$this->bookmarks['internal_outer_html']->start; + $this->bookmarks['internal_outer_html']->end = $this->bookmarks['internal_outer_html']->start; + $last_open_element = array_pop($this->parser_bookmarks['internal_outer_html']['open_elements']); + if(end($this->parser_bookmarks['internal_outer_html']['active_formatting_elements']) === $last_open_element) { + array_pop($this->parser_bookmarks['internal_outer_html']['active_formatting_elements']); + } + // Set the inner HTML $this->add_lexical_update( new WP_HTML_Text_Replacement( $tag_starts_at, - $tag_closer_ends_at + 1, // @todo why +1 is needed? + $tag_closer_ends_at + 1, $html ) ); // Flush lexical updates $this->get_updated_html(); + + // Hack to prevent invalidating the bookmark upon replacing the outer html + ++$this->bookmarks['internal_outer_html']->start; + $this->bookmarks['internal_outer_html']->end = $this->bookmarks['internal_outer_html']->start; + + $this->seek('internal_outer_html'); return true; } } finally { @@ -1442,24 +1457,23 @@ private static function is_formatting_element( $tag_name ) { // $p = new WP_HTML_Processor( '

134' ); // echo $p->parse(); -$p = new WP_HTML_Processor( '

  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); +$p = new WP_HTML_Processor( '
  • 1
' ); // echo $p->parse(); -// die(); $p->first_child(); -var_dump($p->get_tag()); +var_dump($p->get_tag()); // UL + $p->first_child(); -var_dump($p->get_tag()); -// $p->next_sibling(); -// var_dump($p->get_tag()); -// $p->next_sibling(); -var_dump($p->inner_html()); +var_dump($p->get_tag()); // LI + +var_dump($p->inner_html()); // 1 + $p->inner_html('Hello'); -var_dump($p->get_updated_html()); +var_dump($p->get_updated_html()); //
  • Hello
-// var_dump($p->outer_html()); -// $p->outer_html('
Hello
'); -// var_dump($p->get_updated_html()); +var_dump($p->outer_html()); +$p->outer_html('
Hello
'); +var_dump($p->get_updated_html()); die(); From 79f90ce5c7ac08ec9454326e30d847095a2b99d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 14:27:53 +0100 Subject: [PATCH 32/42] Adjust HTML diffing to make inner_html() and outer_html() work --- .../html-api/class-wp-html-tag-processor.php | 47 ++--- .../html-api/class-wp-html-text-processor.php | 171 ++++++++++++------ 2 files changed, 135 insertions(+), 83 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 47f5c721257ec..52f5e57fafb17 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1462,6 +1462,11 @@ private function apply_lexical_updates() { $this->bytes_already_copied = $diff->end; } + if ( $diff->end < $this->bytes_already_parsed ) { + $this->output_buffer .= substr( $this->html, $diff->end, $this->bytes_already_parsed - $diff->end ); + $this->bytes_already_copied = $this->bytes_already_parsed; + } + /* * Adjust bookmark locations to account for how the text * replacements adjust offsets in the input document. @@ -2118,13 +2123,21 @@ public function get_updated_html() { return $this->output_buffer . substr( $this->html, $this->bytes_already_copied ); } - // Apply the updates, rewind to before the current tag, and reparse the attributes. - $content_up_to_opened_tag_name = $this->output_buffer . substr( - $this->html, - $this->bytes_already_copied, - $this->tag_name_starts_at + $this->tag_name_length - $this->bytes_already_copied - ); + try { + $this->release_bookmark('internal_get_updated_html'); + if(!$this->set_bookmark('internal_get_updated_html')) { + return false; + } + $this->flush_updates(); + $this->seek('internal_get_updated_html'); + } finally { + $this->release_bookmark('internal_get_updated_html'); + } + + return $this->html; + } + protected function flush_updates() { /* * 1. 
Apply the edits by flushing them to the output buffer and updating the copied byte count. * @@ -2138,27 +2151,7 @@ public function get_updated_html() { * 2. Replace the original HTML with the now-updated HTML so that it's possible to * seek to a previous location and have a consistent view of the updated document. */ - $this->html = $this->output_buffer . substr( $this->html, $this->bytes_already_copied ); - $this->output_buffer = $content_up_to_opened_tag_name; - $this->bytes_already_copied = strlen( $this->output_buffer ); - - /* - * 3. Point this tag processor at the original tag opener and consume it - * - * At this point the internal cursor points to the end of the tag name. - * Rewind before the tag name starts so that it's as if the cursor didn't - * move; a call to `next_tag()` will reparse the recently-updated attributes - * and additional calls to modify the attributes will apply at this same - * location. - * - *

Previous HTMLMore HTML

- * ^ | back up by the length of the tag name plus the opening < - * \<-/ back up by strlen("em") + 1 ==> 3 - */ - $this->bytes_already_parsed = strlen( $content_up_to_opened_tag_name ) - $this->tag_name_length - 1; - $this->next_tag(); - - return $this->html; + $this->html = $this->output_buffer . substr( $this->html, $this->bytes_already_copied ); } /** diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-text-processor.php index c696e788f8151..e1ec053b4e668 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-text-processor.php @@ -44,7 +44,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** * @var WP_HTML_Tag_Token[] */ - private $active_formatting_elements = array(); + public $active_formatting_elements = array(); private $element_bookmark_idx = 0; private $current_token; @@ -89,27 +89,52 @@ public function parse() { } private $parser_bookmarks = array(); - public function set_bookmark( $name ) { + /** + * Sets a bookmark for the parser + * + * @TODO: make $protected purely internal + * @see WP_HTML_Tag_Processor::set_bookmark() + * @param mixed $name Name of the bookmark + * @param mixed $protected Protects a bookmark from being released by release_bookmark() + * Useful for outer_html(). + * @return bool Whether the bookmark was set + */ + public function set_bookmark( $name, $protected = false ) { if ( ! parent::set_bookmark($name) ) { + unset($this->parser_bookmarks[$name]); return false; } $this->parser_bookmarks[$name] = array( + 'protected' => $protected, 'open_elements' => $this->open_elements, 'active_formatting_elements' => $this->active_formatting_elements, ); return true; } - public function release_bookmark( $bookmark ) { - if ( ! 
parent::release_bookmark($bookmark) ) { + /** + * Releases a bookmark for the parser + * + * @TODO: make $force purely internal + * @see WP_HTML_Tag_Processor::set_bookmark() + * @param mixed $name Name of the bookmark + * @param mixed $force Whether to release the bookmark even if it's protected + * @return bool Whether the bookmark was set + */ + public function release_bookmark( $bookmark, $force = false ) { + if ( !isset($this->parser_bookmarks[$bookmark]) ){ + return false; + } + if( !$force && $this->parser_bookmarks[$bookmark]['protected']) { return false; } unset($this->parser_bookmarks[$bookmark]); - return true; + return parent::release_bookmark($bookmark); } public function seek($bookmark) { if ( ! parent::seek($bookmark) ) { + unset($this->parser_bookmarks[$bookmark]); return false; } $bookmark = $this->parser_bookmarks[$bookmark]; @@ -123,7 +148,9 @@ public function seek($bookmark) { public function depth() { // -1 because the root HTML element is not counted - return count($this->open_elements) - 1; + return count($this->open_elements) - 1 + ( + $this->is_tag_closer() ? 
1 : 0 + ); } public function first_child() @@ -155,6 +182,10 @@ public function nth_child($n=1) { return false; } + if ($this->depth() !== $depth + 1) { + continue; + } + ++$matched; } while ($matched < $n); return true; @@ -189,7 +220,9 @@ public function nth_sibling($n = 1) } if ($this->depth() < $depth) { - $this->seek('internal_nth_sibling'); + if(!$this->seek('internal_nth_sibling')) { + throw new Exception('Failed to seek to internal_nth_sibling'); + } return false; } else if ($this->depth() > $depth) { continue; @@ -220,7 +253,6 @@ public function inner_html($html=null) { if(!$this->set_bookmark('internal_inner_html')) { return false; } - try { if(!$this->balancing_closer()) { return false; @@ -228,25 +260,33 @@ public function inner_html($html=null) { $tag_closer_starts_at = $this->tag_name_starts_at - 2; // Return to the initial cursor position - $this->seek('internal_inner_html'); + // @TODO: Don't seek if balancing_closer didn't update + // the HTML + if(!$this->seek('internal_inner_html')) { + throw new Exception('Failed to seek to internal_inner_html bookmark'); + } $content_starts_at = $this->tag_ends_at + 1; if(null === $html) { // Get the inner HTML return substr($this->html, $content_starts_at, $tag_closer_starts_at - $content_starts_at); - } else { - // Set the inner HTML - $this->add_lexical_update( - new WP_HTML_Text_Replacement( - $content_starts_at, - $tag_closer_starts_at, - $html - ) - ); - // Flush lexical updates - $this->seek('internal_inner_html'); - return true; } + + // Set the inner HTML + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $content_starts_at, + $tag_closer_starts_at, + $html + ) + ); + $this->flush_updates(); + + // Flush lexical updates + if(!$this->seek('internal_inner_html')) { + throw new Exception('Failed to seek to internal_inner_html bookmark'); + } + return true; } finally { $this->release_bookmark('internal_inner_html'); } @@ -257,7 +297,7 @@ public function outer_html($html=null) { return null; } - 
if(!$this->set_bookmark('internal_outer_html')) { + if(!$this->set_bookmark('internal_outer_html', true)) { return false; } try { @@ -267,39 +307,39 @@ public function outer_html($html=null) { $tag_closer_ends_at = $this->tag_ends_at; // Return to the initial cursor position - $this->seek('internal_outer_html'); + // @TODO: Don't seek if balancing_closer didn't update + // the HTML + if(!$this->seek('internal_outer_html')) { + throw new Exception('Failed to seek to internal_outer_html bookmark'); + } $tag_starts_at = $this->tag_name_starts_at - 1; if(null === $html) { // Get the inner HTML return substr($this->html, $tag_starts_at, $tag_closer_ends_at + 1 - $tag_starts_at); - } else { - // Hack to prevent invalidating the bookmark upon replacing the outer html - --$this->bookmarks['internal_outer_html']->start; - $this->bookmarks['internal_outer_html']->end = $this->bookmarks['internal_outer_html']->start; - $last_open_element = array_pop($this->parser_bookmarks['internal_outer_html']['open_elements']); - if(end($this->parser_bookmarks['internal_outer_html']['active_formatting_elements']) === $last_open_element) { - array_pop($this->parser_bookmarks['internal_outer_html']['active_formatting_elements']); - } + } - // Set the inner HTML - $this->add_lexical_update( - new WP_HTML_Text_Replacement( - $tag_starts_at, - $tag_closer_ends_at + 1, - $html - ) - ); - // Flush lexical updates - $this->get_updated_html(); + // Set the inner HTML + $this->add_lexical_update( + new WP_HTML_Text_Replacement( + $tag_starts_at, + $tag_closer_ends_at + 1, + $html + ) + ); + $this->flush_updates(); - // Hack to prevent invalidating the bookmark upon replacing the outer html - ++$this->bookmarks['internal_outer_html']->start; - $this->bookmarks['internal_outer_html']->end = $this->bookmarks['internal_outer_html']->start; + if(!$this->seek('internal_outer_html')) { + throw new Exception('Failed to seek to internal_outer_html bookmark'); + } - $this->seek('internal_outer_html'); - 
return true; + // Adjust open elements and active formatting elements + $last_open_element = array_pop($this->open_elements); + if(end($this->active_formatting_elements) === $last_open_element) { + array_pop($this->active_formatting_elements); } + + return true; } finally { $this->release_bookmark('internal_outer_html'); } @@ -319,9 +359,8 @@ public function balancing_closer() { while($this->process_next_tag_token()) { if( // Current element popped off the stack - $this->depth() < $depth - // Stack is the same size, but the current element was popped - || ($this->depth() === $depth && end($this->open_elements) !== $token) + $this->depth() <= $depth + && end($this->open_elements) !== $token ) { /** * The entire tag contents have been parsed, @@ -332,7 +371,9 @@ public function balancing_closer() { } } - $this->seek('internal_balancing_closer'); + if(!$this->seek('internal_balancing_closer')){ + throw new Exception('Failed to seek to internal_balancing_closer bookmark'); + } while($this->process_next_tag_token()) { if( @@ -1042,14 +1083,17 @@ private function drop_current_tag_token() { private function insert_tag_closer_before_current_token( $tag ) { // Aesthetic choice for now. // @TODO: consider preserving the case of the opening tag - $tag = strtolower($tag); $this->add_lexical_update( new WP_HTML_Text_Replacement( $this->current_token_start, $this->current_token_start, - "" + "" ) ); + $last_afe = end($this->active_formatting_elements); + if($last_afe && $tag === $last_afe->tag) { + array_pop($this->active_formatting_elements); + } } private function generate_implied_end_tags( $options = null ) { @@ -1454,26 +1498,41 @@ private static function is_formatting_element( $tag_name ) { } -// $p = new WP_HTML_Processor( '

134' ); +// $p = new WP_HTML_Processor( '

4' ); +// $p->next_tag(); +// $p->set_attribute('a', 'b'); +// echo $p . "\n"; +// $p->next_tag(); +// echo $p . ''; + +// die(); // echo $p->parse(); -$p = new WP_HTML_Processor( '

  • 1
' ); +$p = new WP_HTML_Processor( '
  • 1
' ); // echo $p->parse(); $p->first_child(); var_dump($p->get_tag()); // UL -$p->first_child(); +$p->nth_child(2); var_dump($p->get_tag()); // LI +var_dump($p->get_updated_html()); +var_dump($p->get_updated_html()); var_dump($p->inner_html()); // 1 $p->inner_html('Hello'); var_dump($p->get_updated_html()); //
  • Hello
+// var_dump($p->outer_html()); var_dump($p->outer_html()); +var_dump($p->get_attribute_names_with_prefix('')); $p->outer_html('
Hello
'); -var_dump($p->get_updated_html()); +// var_dump($p->get_attribute_names_with_prefix('')); +// var_dump($p->get_tag()); +// var_dump($p->outer_html()); +// var_dump($p->get_tag()); +// var_dump($p->get_updated_html()); die(); From 4efef0b5005cf89e6416d75f672f01e11a2847e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 16:35:12 +0100 Subject: [PATCH 33/42] Adjust bookmarks setting to suit outer_html better --- ...cessor.php => class-wp-html-processor.php} | 181 ++++++------------ .../html-api/class-wp-html-tag-processor.php | 11 +- .../tests/html-api/wpHtmlProcessor.php | 115 ++++++++++- 3 files changed, 178 insertions(+), 129 deletions(-) rename src/wp-includes/html-api/{class-wp-html-text-processor.php => class-wp-html-processor.php} (91%) diff --git a/src/wp-includes/html-api/class-wp-html-text-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php similarity index 91% rename from src/wp-includes/html-api/class-wp-html-text-processor.php rename to src/wp-includes/html-api/class-wp-html-processor.php index e1ec053b4e668..a48b1e4d40f78 100644 --- a/src/wp-includes/html-api/class-wp-html-text-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -66,7 +66,7 @@ public function parse() { // echo($this->html); echo("\n"); $i = 0; - while ($this->process_next_tag_token()) { + while ($this->next_tag()) { // ... twiddle thumbs ... if(++$i % 10000 === 0) { @@ -104,10 +104,27 @@ public function set_bookmark( $name, $protected = false ) { unset($this->parser_bookmarks[$name]); return false; } + + /** + * seek() will rewing before the current tag + * and consume it again. We need to remove the + * top element from element stacks to avoid + * to duplicates. 
+ */ + $open_elements = $this->open_elements; + if(end($open_elements) === $this->current_token) { + array_pop($open_elements); + } + + $active_formatting_elements = $this->active_formatting_elements; + if(end($active_formatting_elements) === $this->current_token) { + array_pop($active_formatting_elements); + } + $this->parser_bookmarks[$name] = array( 'protected' => $protected, - 'open_elements' => $this->open_elements, - 'active_formatting_elements' => $this->active_formatting_elements, + 'open_elements' => $open_elements, + 'active_formatting_elements' => $active_formatting_elements, ); return true; } @@ -132,18 +149,17 @@ public function release_bookmark( $bookmark, $force = false ) { return parent::release_bookmark($bookmark); } - public function seek($bookmark) { - if ( ! parent::seek($bookmark) ) { - unset($this->parser_bookmarks[$bookmark]); + public function seek($bookmark_name) { + if(!$this->seek_without_consuming($bookmark_name)) { return false; } - $bookmark = $this->parser_bookmarks[$bookmark]; - $this->open_elements = $bookmark['open_elements']; - $this->active_formatting_elements = $bookmark['active_formatting_elements']; - $this->current_token = end($bookmark['open_elements']); - $this->current_token_start = $this->tag_name_starts_at - ($this->is_tag_closer() ? 
2 : 1); - $this->current_token_end = $this->tag_ends_at; - return true; + + $b = $this->parser_bookmarks[$bookmark_name]; + // $this->tag_ends_at = $this->bytes_already_parsed - 1; + $this->open_elements = $b['open_elements']; + $this->active_formatting_elements = $b['active_formatting_elements']; + + return $this->next_tag(); } public function depth() { @@ -216,7 +232,11 @@ public function nth_sibling($n = 1) } if ($this->is_tag_closer()) { - return false; + continue; + } + + if ($this->depth() > $depth) { + continue; } if ($this->depth() < $depth) { @@ -224,8 +244,6 @@ public function nth_sibling($n = 1) throw new Exception('Failed to seek to internal_nth_sibling'); } return false; - } else if ($this->depth() > $depth) { - continue; } ++$matched; @@ -236,15 +254,6 @@ public function nth_sibling($n = 1) } } - private function next_node() { - while ($this->process_next_tag_token()) { - if (!$this->is_tag_closer()) { - return true; - } - } - return false; - } - public function inner_html($html=null) { if ( null === $this->tag_name_starts_at ) { return null; @@ -297,6 +306,7 @@ public function outer_html($html=null) { return null; } + $this->get_updated_html(); if(!$this->set_bookmark('internal_outer_html', true)) { return false; } @@ -332,13 +342,7 @@ public function outer_html($html=null) { if(!$this->seek('internal_outer_html')) { throw new Exception('Failed to seek to internal_outer_html bookmark'); } - - // Adjust open elements and active formatting elements - $last_open_element = array_pop($this->open_elements); - if(end($this->active_formatting_elements) === $last_open_element) { - array_pop($this->active_formatting_elements); - } - + return true; } finally { $this->release_bookmark('internal_outer_html'); @@ -350,13 +354,18 @@ public function balancing_closer() { if($this->is_tag_closer()) { return false; } + /* + * There might be tag closers buffered for insertion, + * let's flush any updates we might have at this point. 
+ */ + $this->get_updated_html(); if(!$this->set_bookmark('internal_balancing_closer')) { return false; } try { $depth = $this->depth(); $token = $this->current_token; - while($this->process_next_tag_token()) { + while($this->next_tag()) { if( // Current element popped off the stack $this->depth() <= $depth @@ -375,7 +384,7 @@ public function balancing_closer() { throw new Exception('Failed to seek to internal_balancing_closer bookmark'); } - while($this->process_next_tag_token()) { + while($this->next_tag()) { if( // Current element popped off the stack $this->depth() < $depth @@ -396,8 +405,19 @@ public function balancing_closer() { } } + public function next_node() { + while ($this->next_tag()) { + // is_tag_closer can be NULL if `next_tag` + // didn't find a tag closer + if (false === $this->is_tag_closer()) { + return true; + } + } + return false; + } + private $is_closing_open_tags = false; - private function process_next_tag_token() { + public function next_tag($query = null) { /* * We're done with the document but some tags * are still open. Let's close them one at a time. @@ -417,7 +437,7 @@ private function process_next_tag_token() { $this->pop_open_element(); $this->get_updated_html(); - $this->next_tag(array('tag_closers' => 'visit')); + parent::next_tag(array('tag_closers' => 'visit')); $this->current_token = new WP_HTML_Tag_Token($this->get_tag()); $this->current_token_start = $this->tag_name_starts_at - 2; $this->current_token_end = $this->tag_ends_at; @@ -428,18 +448,18 @@ private function process_next_tag_token() { * Go to the next tag and process any text was found along the way. 
*/ $text_start = $this->tag_ends_at + 1; - if (!$this->next_tag(array('tag_closers' => 'visit'))) { - $this->process_text($text_start, strlen($this->html)); + if (!parent::next_tag(array('tag_closers' => 'visit'))) { + // $this->process_text($text_start, strlen($this->html)); $this->is_closing_open_tags = true; - return $this->process_next_tag_token(); + return $this->next_tag(); } /** * We found a tag! Let's process any text we may have found along the way. */ $current_tag_start = $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ); - $this->process_text($text_start, $current_tag_start); + // $this->process_text($text_start, $current_tag_start); $this->current_token = new WP_HTML_Tag_Token($this->get_tag()); $this->current_token_start = $current_tag_start; @@ -1496,84 +1516,3 @@ private static function is_formatting_element( $tag_name ) { } } - - -// $p = new WP_HTML_Processor( '

4' ); -// $p->next_tag(); -// $p->set_attribute('a', 'b'); -// echo $p . "\n"; -// $p->next_tag(); -// echo $p . ''; - -// die(); -// echo $p->parse(); - -$p = new WP_HTML_Processor( '

  • 1
' ); -// echo $p->parse(); - -$p->first_child(); -var_dump($p->get_tag()); // UL - -$p->nth_child(2); -var_dump($p->get_tag()); // LI -var_dump($p->get_updated_html()); -var_dump($p->get_updated_html()); - -var_dump($p->inner_html()); // 1 - -$p->inner_html('Hello'); -var_dump($p->get_updated_html()); //
  • Hello
- -// var_dump($p->outer_html()); -var_dump($p->outer_html()); -var_dump($p->get_attribute_names_with_prefix('')); -$p->outer_html('
Hello
'); -// var_dump($p->get_attribute_names_with_prefix('')); -// var_dump($p->get_tag()); -// var_dump($p->outer_html()); -// var_dump($p->get_tag()); -// var_dump($p->get_updated_html()); - -die(); - -// $dir = realpath( __DIR__ . '/../../../index.html' ); - -// $htmlspec = file_get_contents( $dir ); -// $p = new WP_HTML_Processor( $htmlspec ); -// $p->parse(); - -// die(); - -// $p = new WP_HTML_Processor( '
' ); -// $p->parse(); -// die(); -// $p = new WP_HTML_Processor( '

1HTML Standard345

' ); -// $p->parse(); -$p = new WP_HTML_Processor( '

1
HTMLStandard

test
' ); -echo $p->parse(); -die(); - - -$p = new WP_HTML_Processor( '

12345

' ); -$p->parse(); - -$p = new WP_HTML_Processor( '
12
34' ); -$p->parse(); - -$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); -$p->parse(); - -// $p = new WP_HTML_Processor( ' -//
-//
-//
-//
-//
' ); -// $p->parse(); - - -$p = new WP_HTML_Processor( '

X -

X -

X -

Xy' ); -$p->parse(); diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 52f5e57fafb17..34045794a5c49 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1486,7 +1486,7 @@ private function apply_lexical_updates() { break; } - if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) { + if ( $bookmark->start > $diff->start && $bookmark->end < $diff->end ) { $this->release_bookmark( $bookmark_name ); continue 2; } @@ -1549,6 +1549,13 @@ public function has_bookmark( $bookmark_name ) { * @return bool Whether the internal cursor was successfully moved to the bookmark's location. */ public function seek( $bookmark_name ) { + if(!$this->seek_without_consuming($bookmark_name)) { + return false; + } + return $this->next_tag( array( 'tag_closers' => 'visit' ) ); + } + + protected function seek_without_consuming($bookmark_name) { if ( ! array_key_exists( $bookmark_name, $this->bookmarks ) ) { _doing_it_wrong( __METHOD__, @@ -1574,7 +1581,7 @@ public function seek( $bookmark_name ) { $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; $this->bytes_already_copied = $this->bytes_already_parsed; $this->output_buffer = substr( $this->html, 0, $this->bytes_already_copied ); - return $this->next_tag( array( 'tag_closers' => 'visit' ) ); + return true; } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 1f1bf02237b39..d65b528c14662 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -11,12 +11,115 @@ * @covers WP_HTML_Processor */ - class Tests_HtmlApi_wpHtmlProcessor extends WP_UnitTestCase { +class Tests_HtmlApi_wpHtmlProcessor extends WP_UnitTestCase +{ - public function test_starts() { - $p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); - // The controller's schema is hardcoded, so tests would not be meaningful. - $p->next_tag_in_body_insertion_mode(); + public function test_starts() + { + $p = new WP_HTML_Processor('

Lorem Ipsum Dolor Sit Amet

'); + $this->assertEquals( + '

Lorem Ipsum Dolor Sit Amet

', + $p->get_updated_html() + ); } -} + // public function test_next_tag_throws() + // { + // $this->expectException(LogicException::class); + // $p = new WP_HTML_Processor('

Lorem Ipsum Dolor Sit Amet

'); + // $p->next_tag(); + // } + + public function test_next_node() + { + $p = new WP_HTML_Processor('

Lorem Ipsum

'); + $this->assertTrue($p->next_node()); + $this->assertEquals( 'P', $p->get_tag() ); + + $this->assertTrue($p->next_node()); + $this->assertEquals( 'B', $p->get_tag() ); + + $this->assertTrue($p->next_node()); + $this->assertEquals( 'DIV', $p->get_tag() ); + + $this->assertFalse($p->next_node()); + } + + public function test_next_sibling_normative_markup() + { + $p = new WP_HTML_Processor('

Lorem Ipsum

'); + $this->assertTrue($p->next_node()); + $this->assertEquals( 'P', $p->get_tag() ); + + $this->assertTrue($p->next_sibling()); + $this->assertEquals( 'DIV', $p->get_tag() ); + + $this->assertFalse($p->next_sibling()); + } + + public function test_next_sibling_non_normative_markup() + { + $p = new WP_HTML_Processor('
  • 1
  • 2
'); + $p->next_node(); + $p->next_node(); + $this->assertEquals( 'LI', $p->get_tag() ); + + $this->assertTrue($p->next_sibling()); + $this->assertEquals( 'LI', $p->get_tag() ); + + $this->assertFalse($p->next_sibling()); + } + + public function test_nth_child() + { + $p = new WP_HTML_Processor('
  • 1
  • 2
'); + $p->next_node(); + $p->nth_child(2); + $this->assertEquals( 'LI', $p->get_tag() ); + $this->assertEquals( 'last', $p->get_attribute('class') ); + } + + public function test_get_inner_html() + { + $p = new WP_HTML_Processor('
  • 1
  • 2
  • 3
'); + $p->next_node(); + $p->nth_child(2); + $this->assertEquals( '2', $p->inner_html() ); + // We're supposed to get the same result twice + // Confirm the processor has rewinded the pointer: + $this->assertEquals( '2', $p->inner_html() ); + } + + public function test_get_outer_html() + { + $p = new WP_HTML_Processor('
  • 1
  • 2
  • 3
'); + $p->next_node(); + $p->nth_child(2); + $this->assertEquals( '
  • 2
  • ', $p->outer_html() ); + // We're supposed to get the same result twice + // Confirm the processor has rewinded the pointer: + $this->assertEquals( '
  • 2
  • ', $p->outer_html() ); + } + + public function test_set_inner_html() + { + $p = new WP_HTML_Processor('
    • 1
    • 2
    • 3
    '); + $p->next_node(); + $p->nth_child(2); + $p->inner_html('

    99

    '); + $this->assertEquals( '

    99

    ', $p->inner_html() ); + } + + public function test_set_outer_html() + { + $p = new WP_HTML_Processor('
    • 1
    • 2
    • 3
    '); + $p->next_node(); + $p->nth_child(2); + $p->outer_html('

    99

    '); + $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); + $this->assertEquals( '

    99

    ', $p->outer_html() ); + $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); + $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); + } + +} \ No newline at end of file From 0e3799bde6b88a8c396c49c327b0b8cafa1b1793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 16:51:33 +0100 Subject: [PATCH 34/42] Correctly process outer_html() using pinned bookmarks --- .../html-api/class-wp-html-processor.php | 48 +++++++++++++++---- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index a48b1e4d40f78..e6a15dd3ef3f5 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -92,14 +92,14 @@ public function parse() { /** * Sets a bookmark for the parser * - * @TODO: make $protected purely internal + * @TODO: make $pinned purely internal * @see WP_HTML_Tag_Processor::set_bookmark() * @param mixed $name Name of the bookmark - * @param mixed $protected Protects a bookmark from being released by release_bookmark() + * @param mixed $pinned Protects a bookmark from being released by release_bookmark() * Useful for outer_html(). * @return bool Whether the bookmark was set */ - public function set_bookmark( $name, $protected = false ) { + public function set_bookmark( $name, $pinned = false ) { if ( ! parent::set_bookmark($name) ) { unset($this->parser_bookmarks[$name]); return false; @@ -122,9 +122,14 @@ public function set_bookmark( $name, $protected = false ) { } $this->parser_bookmarks[$name] = array( - 'protected' => $protected, 'open_elements' => $open_elements, 'active_formatting_elements' => $active_formatting_elements, + + // Pinned bookmarks are protected from release_bookmark() + // Also, their position won't change. 
+ 'pinned' => $pinned, + 'start' => $this->bookmarks[$name]->start, + 'end' => $this->bookmarks[$name]->end, ); return true; } @@ -142,7 +147,8 @@ public function release_bookmark( $bookmark, $force = false ) { if ( !isset($this->parser_bookmarks[$bookmark]) ){ return false; } - if( !$force && $this->parser_bookmarks[$bookmark]['protected']) { + // Pinned bookmarks are protected from release_bookmark() + if( !$force && $this->parser_bookmarks[$bookmark]['pinned']) { return false; } unset($this->parser_bookmarks[$bookmark]); @@ -150,18 +156,41 @@ public function release_bookmark( $bookmark, $force = false ) { } public function seek($bookmark_name) { + if ( !isset($this->parser_bookmarks[$bookmark_name]) ){ + return false; + } + // Pinned bookmarks position won't change when applying + // lexical updates + if($this->parser_bookmarks[$bookmark_name]['pinned']) { + $this->bookmarks[$bookmark_name]->start = $this->parser_bookmarks[$bookmark_name]['start']; + $this->bookmarks[$bookmark_name]->end = $this->parser_bookmarks[$bookmark_name]['end']; + } if(!$this->seek_without_consuming($bookmark_name)) { return false; } $b = $this->parser_bookmarks[$bookmark_name]; - // $this->tag_ends_at = $this->bytes_already_parsed - 1; + $this->current_token = null; $this->open_elements = $b['open_elements']; $this->active_formatting_elements = $b['active_formatting_elements']; - return $this->next_tag(); } + private function print_open_elements() { + echo "Open elements: "; + foreach($this->open_elements as $oe) { + echo $oe->tag . " > "; + } + echo "\n"; + } + private function print_active_formatting_elements() { + echo "AFE: "; + foreach($this->active_formatting_elements as $afe) { + echo $afe->tag . 
" > "; + } + echo "\n"; + } + public function depth() { // -1 because the root HTML element is not counted return count($this->open_elements) - 1 + ( @@ -339,6 +368,7 @@ public function outer_html($html=null) { ); $this->flush_updates(); + // var_dump($this->open_elements); if(!$this->seek('internal_outer_html')) { throw new Exception('Failed to seek to internal_outer_html bookmark'); } @@ -449,7 +479,7 @@ public function next_tag($query = null) { */ $text_start = $this->tag_ends_at + 1; if (!parent::next_tag(array('tag_closers' => 'visit'))) { - // $this->process_text($text_start, strlen($this->html)); + $this->process_text($text_start, strlen($this->html)); $this->is_closing_open_tags = true; return $this->next_tag(); @@ -459,7 +489,7 @@ public function next_tag($query = null) { * We found a tag! Let's process any text we may have found along the way. */ $current_tag_start = $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ); - // $this->process_text($text_start, $current_tag_start); + $this->process_text($text_start, $current_tag_start); $this->current_token = new WP_HTML_Tag_Token($this->get_tag()); $this->current_token_start = $current_tag_start; From ab608e597ea8a351d203c22a9e362ed27d850df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 19:53:34 +0100 Subject: [PATCH 35/42] Fix processing H1-H6 tag closers --- .../html-api/class-wp-html-processor.php | 13 +++-- .../tests/html-api/wpHtmlProcessor.php | 54 ++++++++++++++++++- 2 files changed, 61 insertions(+), 6 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index e6a15dd3ef3f5..c793f4ff48cd9 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -205,7 +205,7 @@ public function first_child() public function nth_child($n=1) { if ( null === $this->tag_name_starts_at ) { - return $this->next_node(); + 
return false; } if ( ! $this->set_bookmark('internal_nth_child') ) { return false; @@ -368,7 +368,6 @@ public function outer_html($html=null) { ); $this->flush_updates(); - // var_dump($this->open_elements); if(!$this->seek('internal_outer_html')) { throw new Exception('Failed to seek to internal_outer_html bookmark'); } @@ -837,13 +836,18 @@ public function next_tag($query = null) { case 'H4': case 'H5': case 'H6': - if ( ! $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + if ( ! $this->is_element_in_scope( $this->current_token->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->parse_error(); $this->drop_current_tag_token(); return true; } $this->generate_implied_end_tags(); - $this->pop_until_tag( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), false ); + if($this->current_token->tag === $this->current_node()->tag) { + $this->pop_until_tag( $this->current_token->tag, false ); + } else { + $this->parse_error(); + $this->pop_until_tag( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ); + } break; case 'A': case 'B': @@ -1546,3 +1550,4 @@ private static function is_formatting_element( $tag_name ) { } } + diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index d65b528c14662..93fd80a5658aa 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -116,10 +116,60 @@ public function test_set_outer_html() $p->next_node(); $p->nth_child(2); $p->outer_html('

    99

    '); - $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); + $this->assertEquals( '

    99

    ', $p->outer_html() ); + // We're supposed to get the same result twice + // Confirm the processor has rewinded the pointer: $this->assertEquals( '

    99

    ', $p->outer_html() ); $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); - $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); + } + + public function test_complex_use_case() + { + $p = new WP_HTML_Processor(<<<'HTML' +
    +

    Text +

    + 1.11.1 + Presentational markup + Link +

    Text +

    Text +

    Another header +HTML); + /* + The DOM looks like this: + SECTION + P + H4 + SPAN + A + P + P + H3 + */ + $p->next_node(); + $p->nth_child(3); + $this->assertEquals('H3', $p->get_tag()); + } + + public function test_complex_use_case2() + { + $p = new WP_HTML_Processor(<<<'HTML' +
    +

    + 1.11.1 + Presentational markup + Link +

    Text +

    Text +

    Another header +HTML); + $p->next_node(); + $p->nth_child(1); + $p->outer_html(''); + $this->assertEquals('IMG', $p->get_tag()); + $this->assertEquals('
    +

    Another header', $p->get_updated_html()); } } \ No newline at end of file From 5be3ba154b5477386aef5ba6bbfc6ef68ea6eb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 19:57:36 +0100 Subject: [PATCH 36/42] Simplify unit tests --- .../tests/html-api/wpHtmlProcessor.php | 37 +++---------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 93fd80a5658aa..401f0df490257 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -122,8 +122,8 @@ public function test_set_outer_html() $this->assertEquals( '

    99

    ', $p->outer_html() ); $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); } - - public function test_complex_use_case() + + public function test_outer_html_non_normative_markup() { $p = new WP_HTML_Processor(<<<'HTML'
    @@ -136,40 +136,13 @@ public function test_complex_use_case()

    Text

    Another header HTML); - /* - The DOM looks like this: - SECTION - P - H4 - SPAN - A - P - P - H3 - */ $p->next_node(); - $p->nth_child(3); - $this->assertEquals('H3', $p->get_tag()); - } - - public function test_complex_use_case2() - { - $p = new WP_HTML_Processor(<<<'HTML' -
    -

    - 1.11.1 - Presentational markup - Link -

    Text -

    Text -

    Another header -HTML); - $p->next_node(); - $p->nth_child(1); + $p->nth_child(2); $p->outer_html(''); $this->assertEquals('IMG', $p->get_tag()); $this->assertEquals('
    -

    Another header', $p->get_updated_html()); +

    Text +

    Another header', $p->get_updated_html()); } } \ No newline at end of file From 7d19b9b8b7f8a86b056224f65ab0f2d5c6a873ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 23:21:37 +0100 Subject: [PATCH 37/42] Add a complex use-case test --- .../html-api/class-wp-html-processor.php | 63 ++++--- .../html-api/class-wp-html-tag-processor.php | 41 ++++- .../tests/html-api/wpHtmlProcessor.php | 160 +++++++++++++++++- 3 files changed, 224 insertions(+), 40 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c793f4ff48cd9..60a0dc2821de4 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -105,20 +105,23 @@ public function set_bookmark( $name, $pinned = false ) { return false; } + $open_elements = $this->open_elements; + $active_formatting_elements = $this->active_formatting_elements; + /** * seek() will rewing before the current tag * and consume it again. We need to remove the * top element from element stacks to avoid - * to duplicates. + * duplicates. */ - $open_elements = $this->open_elements; - if(end($open_elements) === $this->current_token) { - array_pop($open_elements); - } + if (!$this->is_tag_closer() && !$this->is_void_tag()) { + if (end($open_elements) === $this->current_token) { + array_pop($open_elements); + } - $active_formatting_elements = $this->active_formatting_elements; - if(end($active_formatting_elements) === $this->current_token) { - array_pop($active_formatting_elements); + if (end($active_formatting_elements) === $this->current_token) { + array_pop($active_formatting_elements); + } } $this->parser_bookmarks[$name] = array( @@ -194,7 +197,7 @@ private function print_active_formatting_elements() { public function depth() { // -1 because the root HTML element is not counted return count($this->open_elements) - 1 + ( - $this->is_tag_closer() ? 
1 : 0 + $this->is_tag_closer() || $this->is_void_tag() ? 1 : 0 ); } @@ -284,10 +287,14 @@ public function nth_sibling($n = 1) } public function inner_html($html=null) { + $x = 0; + $x = 0; if ( null === $this->tag_name_starts_at ) { return null; } + $x = 0; + $this->get_updated_html(); if(!$this->set_bookmark('internal_inner_html')) { return false; } @@ -307,7 +314,7 @@ public function inner_html($html=null) { $content_starts_at = $this->tag_ends_at + 1; if(null === $html) { // Get the inner HTML - return substr($this->html, $content_starts_at, $tag_closer_starts_at - $content_starts_at); + return trim(substr($this->html, $content_starts_at, $tag_closer_starts_at - $content_starts_at)); } // Set the inner HTML @@ -351,7 +358,7 @@ public function outer_html($html=null) { if(!$this->seek('internal_outer_html')) { throw new Exception('Failed to seek to internal_outer_html bookmark'); } - $tag_starts_at = $this->tag_name_starts_at - 1; + $tag_starts_at = $this->tag_starts_at(); if(null === $html) { // Get the inner HTML @@ -487,7 +494,7 @@ public function next_tag($query = null) { /** * We found a tag! Let's process any text we may have found along the way. */ - $current_tag_start = $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ); + $current_tag_start = $this->tag_starts_at(); $this->process_text($text_start, $current_tag_start); $this->current_token = new WP_HTML_Tag_Token($this->get_tag()); @@ -675,9 +682,6 @@ public function next_tag($query = null) { $this->insert_element( $this->current_token ); $this->active_formatting_elements[] = $this->MARKER; break; - case 'TABLE': - $this->insert_element( $this->current_token ); - break; // Void elements. // Some require reconstructing the active formatting elements. 
@@ -827,7 +831,14 @@ public function next_tag($query = null) { $this->drop_current_tag_token(); return true; } - $this->generate_implied_end_tags(); + $this->generate_implied_end_tags( + array( + 'except_for' => array( $this->current_token->tag ), + ) + ); + if ( $this->current_node()->tag !== $this->current_token->tag ) { + $this->parse_error(); + } $this->pop_until_tag( $this->current_token->tag, false ); break; case 'H1': @@ -882,23 +893,9 @@ public function next_tag($query = null) { $this->clear_active_formatting_elements_up_to_last_marker(); break; - /* - * @divergence from spec: - * Close all the open tags when a table-related - * tag closer is encountered - */ - case 'TBODY': - case 'TFOOT': - case 'THEAD': - case 'TD': - case 'TH': - case 'TR': - case 'TABLE': - $this->pop_until_tag( $this->current_token->tag, false ); - break; - case 'BR': // This should never happen since Tag_Processor corrects that + throw new Exception( 'BR tag closer should never be encountered' ); default: $this->process_any_other_end_tag( $this->current_token ); break; @@ -929,7 +926,8 @@ private function process_any_other_end_tag( WP_HTML_Tag_Token $token ) { 'except_for' => array( $tag ), ) ); - if ( $node->tag !== $tag ) { + // @divergence – should compare nodes, not tags + if ( $node->tag !== $token->tag ) { $this->parse_error(); } $this->pop_until_node( $node ); @@ -1550,4 +1548,3 @@ private static function is_formatting_element( $tag_name ) { } } - diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 34045794a5c49..5c5a865364240 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -331,7 +331,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var string */ - private $output_buffer = ''; + protected $output_buffer = ''; /** * How many bytes from the original HTML document have been read and parsed. 
@@ -360,7 +360,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var int */ - private $bytes_already_copied = 0; + protected $bytes_already_copied = 0; /** * Byte offset in input document where current tag name starts. @@ -1258,9 +1258,8 @@ private function skip_whitespace() { * @return void */ private function after_tag() { - $this->class_name_updates_to_attribute_updates(); - $this->attribute_updates_to_lexical_updates(); - $this->apply_lexical_updates(); + // Apply lexical updates + $this->get_updated_html(); $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->tag_ends_at = null; @@ -2335,4 +2334,36 @@ private function matches() { return true; } + + protected function tag_starts_at() { + $tag_starts_at = $this->tag_name_starts_at - 1; + + if ( $this->is_closing_tag && ! $this->is_void_tag() ) { + $tag_starts_at--; + } + + return $tag_starts_at; + } + + protected function is_void_tag() { + switch ( $this->get_tag() ) { + case 'AREA': + case 'BASE': + case 'BR': + case 'COL': + case 'EMBED': + case 'HR': + case 'IMG': + case 'INPUT': + case 'LINK': + case 'META': + case 'PARAM': + case 'SOURCE': + case 'TRACK': + case 'WBR': + return true; + } + + return false; + } } \ No newline at end of file diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 401f0df490257..aba8eafb93acc 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -122,8 +122,164 @@ public function test_set_outer_html() $this->assertEquals( '

    99

    ', $p->outer_html() ); $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); } - - public function test_outer_html_non_normative_markup() + + public function test_complex_markup() + { + $p = new WP_HTML_Processor(<<<'HTML' +
    +

    My Site

    + +
    +
    +
    +

    My Article

    +

    + Lorem ipsum dolor sit amet, consectetur adipiscing elit. + Quisque euismod, nisl nec ultricies ultricies, nunc nisl + fermentum nunc, eget aliquam massa nisl eget nunc. +

    +

    + Some summary + Image +
    +
    +
    +

    Definitions

    +

    Here are the definitions for this page:

    +
    +
    Definition 1 +
    Definition 1 text +
    Definition 2 +
    Definition 2 text +
    Definition 3 +
    Definition 3 text +
    +
    +
    +

    Data

    +

    Here is the data for this page:

    + + + + + + + + + + + + + + + + +
    Column 1Column 2Column 3
    Row 1, Column 1Row 1, Column 2Row 1, Column 3
    Row 2, Column 1Row 2, Column 2Row 2, Column 3
    +
    +
    +

    Contact the author of this page:

    + +
    + jim@rock.com
    + (311) 555-2368 +
    +
    +
    +

    Comments

    +

    Here are the comments for this page:

    +
      +
    • Comment 1 +
    • Comment 2 +
    • Comment 3 +
    +

    Leave a comment

    +
    + + + + + + + + + + +
    +
    +
    +

    © 2017 My Site

    +
    +
    +HTML); + $this->assertTrue($p->next_node()); + $this->assertTrue($p->next_node()); + $this->assertEquals('H1', $p->get_tag()); + $this->assertEquals('My Site', $p->inner_html()); + + $this->assertTrue($p->next_node()); + $this->assertEquals('NAV', $p->get_tag()); + + $this->assertTrue($p->next_node()); + $this->assertEquals('UL', $p->get_tag()); + + $this->assertTrue($p->nth_child(3)); + $this->assertEquals('LI', $p->get_tag()); + $this->assertEquals('third', $p->get_attribute('id')); + $this->assertEquals('
  • Contact
  • ', $p->outer_html()); + + $this->assertTrue($p->next_node()); + $this->assertTrue($p->next_node()); + $this->assertEquals('MAIN', $p->get_tag()); + + $this->assertTrue($p->first_child()); + $this->assertEquals('ARTICLE', $p->get_tag()); + + $this->assertTrue($p->next_sibling()); + $this->assertEquals('HR', $p->get_tag()); + + $this->assertTrue($p->next_sibling()); + $this->assertEquals('SECTION', $p->get_tag()); + + $this->assertTrue($p->nth_child(3)); + $this->assertEquals('DL', $p->get_tag()); + + $this->assertTrue($p->nth_child(3)); + $this->assertEquals('DT', $p->get_tag()); + $this->assertEquals('Definition 2', $p->inner_html()); + $p->outer_html('
    DD
    '); + $this->assertEquals('
    DD
    ', $p->outer_html()); + + $p->next_node(); + $p->next_node(); + $p->next_node(); + $p->next_node(); + $this->assertEquals('SECTION', $p->get_tag()); + $this->assertEquals('data', $p->get_attribute('title')); + + $this->assertTrue($p->next_sibling()); + $this->assertEquals('SECTION', $p->get_tag()); + $this->assertEquals('address', $p->get_attribute('title')); + $p->outer_html(''); + + $this->assertEquals('SECTION', $p->get_tag()); + $this->assertEquals('comments', $p->get_attribute('title')); + + $this->assertTrue($p->next_sibling()); + $this->assertEquals('FOOTER', $p->get_tag()); + // echo($p->get_updated_html()); + } + + public function test_complex_use_case() { $p = new WP_HTML_Processor(<<<'HTML'
    From cb9d35d296b76dc82b0b1250fc80d968f1d6c84b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 23:23:02 +0100 Subject: [PATCH 38/42] Simplify nth_child and nth_sibling --- src/wp-includes/html-api/class-wp-html-processor.php | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 60a0dc2821de4..859f1f8b11173 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -221,10 +221,6 @@ public function nth_child($n=1) { return false; } - if ($this->is_tag_closer()) { - continue; - } - if ($this->depth() <= $depth) { $this->seek('internal_nth_child'); return false; @@ -263,10 +259,6 @@ public function nth_sibling($n = 1) return false; } - if ($this->is_tag_closer()) { - continue; - } - if ($this->depth() > $depth) { continue; } From 3b27ca2a717b01dcd591f74f142f85721c19d194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 1 Mar 2023 23:25:15 +0100 Subject: [PATCH 39/42] Fixy fix in the get_updated_html method --- .../html-api/class-wp-html-tag-processor.php | 9 +++++---- tests/phpunit/tests/html-api/wpHtmlProcessor.php | 13 +++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 5c5a865364240..3c7f1f765c74e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -2131,11 +2131,12 @@ public function get_updated_html() { try { $this->release_bookmark('internal_get_updated_html'); - if(!$this->set_bookmark('internal_get_updated_html')) { - return false; + if($this->set_bookmark('internal_get_updated_html')) { + $this->flush_updates(); + $this->seek('internal_get_updated_html'); + } else { + 
$this->flush_updates(); } - $this->flush_updates(); - $this->seek('internal_get_updated_html'); } finally { $this->release_bookmark('internal_get_updated_html'); } diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index aba8eafb93acc..b45f76827df9f 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -23,6 +23,19 @@ public function test_starts() ); } + public function test_closes_tags() + { + $p = new WP_HTML_Processor('
    '); + $p->next_node(); + $p->next_node(); + $p->next_node(); + $p->next_node(); + $this->assertEquals( + '
    ', + $p->get_updated_html() + ); + } + // public function test_next_tag_throws() // { // $this->expectException(LogicException::class); From 802e1c02423ee355dd77181f8b87be95afb0fb9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 2 Mar 2023 12:52:21 +0100 Subject: [PATCH 40/42] MVP support for updates before current parsing cursor --- .../html-api/class-wp-html-processor.php | 40 ++++++---- .../html-api/class-wp-html-tag-processor.php | 76 ++++++++++++++---- .../tests/html-api/wpHtmlProcessor.php | 18 +---- .../tests/html-api/wpHtmlTagProcessor.php | 79 ++++++++++--------- 4 files changed, 128 insertions(+), 85 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 859f1f8b11173..f99d0ba3842dc 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -66,7 +66,7 @@ public function parse() { // echo($this->html); echo("\n"); $i = 0; - while ($this->next_tag()) { + while ($this->process_next_tag()) { // ... twiddle thumbs ... 
if(++$i % 10000 === 0) { @@ -176,7 +176,7 @@ public function seek($bookmark_name) { $this->current_token = null; $this->open_elements = $b['open_elements']; $this->active_formatting_elements = $b['active_formatting_elements']; - return $this->next_tag(); + return $this->process_next_tag(); } private function print_open_elements() { @@ -317,7 +317,7 @@ public function inner_html($html=null) { $html ) ); - $this->flush_updates(); + $this->get_updated_html(); // Flush lexical updates if(!$this->seek('internal_inner_html')) { @@ -365,7 +365,7 @@ public function outer_html($html=null) { $html ) ); - $this->flush_updates(); + $this->get_updated_html(); if(!$this->seek('internal_outer_html')) { throw new Exception('Failed to seek to internal_outer_html bookmark'); @@ -393,7 +393,7 @@ public function balancing_closer() { try { $depth = $this->depth(); $token = $this->current_token; - while($this->next_tag()) { + while($this->process_next_tag()) { if( // Current element popped off the stack $this->depth() <= $depth @@ -412,7 +412,7 @@ public function balancing_closer() { throw new Exception('Failed to seek to internal_balancing_closer bookmark'); } - while($this->next_tag()) { + while($this->process_next_tag()) { if( // Current element popped off the stack $this->depth() < $depth @@ -434,7 +434,7 @@ public function balancing_closer() { } public function next_node() { - while ($this->next_tag()) { + while ($this->process_next_tag()) { // is_tag_closer can be NULL if `next_tag` // didn't find a tag closer if (false === $this->is_tag_closer()) { @@ -445,7 +445,7 @@ public function next_node() { } private $is_closing_open_tags = false; - public function next_tag($query = null) { + private function process_next_tag() { /* * We're done with the document but some tags * are still open. Let's close them one at a time. 
@@ -480,7 +480,7 @@ public function next_tag($query = null) { $this->process_text($text_start, strlen($this->html)); $this->is_closing_open_tags = true; - return $this->next_tag(); + return $this->process_next_tag(); } /** @@ -1127,13 +1127,14 @@ private function drop_current_tag_token() { private function insert_tag_closer_before_current_token( $tag ) { // Aesthetic choice for now. // @TODO: consider preserving the case of the opening tag - $this->add_lexical_update( - new WP_HTML_Text_Replacement( - $this->current_token_start, - $this->current_token_start, - "" - ) - ); + // Let's actually not insert that closer for now + // $this->add_lexical_update( + // new WP_HTML_Text_Replacement( + // $this->current_token_start, + // $this->current_token_start, + // "" + // ) + // ); $last_afe = end($this->active_formatting_elements); if($last_afe && $tag === $last_afe->tag) { array_pop($this->active_formatting_elements); @@ -1540,3 +1541,10 @@ private static function is_formatting_element( $tag_name ) { } } + +$p = new WP_HTML_Processor('
    '); +$p->next_node(); +$p->next_node(); +$p->next_node(); +$p->next_node(); +var_dump($p->get_updated_html()); diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 3c7f1f765c74e..64cc5e92e348b 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1259,7 +1259,9 @@ private function skip_whitespace() { */ private function after_tag() { // Apply lexical updates - $this->get_updated_html(); + $this->class_name_updates_to_attribute_updates(); + $this->attribute_updates_to_lexical_updates(); + $this->apply_lexical_updates(); $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->tag_ends_at = null; @@ -1455,10 +1457,26 @@ private function apply_lexical_updates() { */ usort( $this->lexical_updates, array( self::class, 'sort_start_ascending' ) ); - foreach ( $this->lexical_updates as $diff ) { - $this->output_buffer .= substr( $this->html, $this->bytes_already_copied, $diff->start - $this->bytes_already_copied ); - $this->output_buffer .= $diff->text; + /** + * If the update comes before the current tag name then we need to + * trim the previous output buffer to the start of the update. + * For now, this removes all previously uncommitted updates. 
+ */ + if($this->lexical_updates[0]->start < $this->tag_name_starts_at) { + $this->output_buffer = substr($this->html, 0, $this->lexical_updates[0]->start); + $this->bytes_already_copied = strlen( $this->output_buffer ); + } + + foreach ($this->lexical_updates as $diff) { + $this->output_buffer .= substr($this->html, $this->bytes_already_copied, $diff->start - $this->bytes_already_copied); + $this->output_buffer .= $diff->text; $this->bytes_already_copied = $diff->end; + + if ( $this->bytes_already_parsed > $diff->start ) { + if ( $this->bytes_already_parsed < $diff->end ) { + throw new Exception( 'Cannot replace part of the document at the bytes_already_parsed offset' ); + } + } } if ( $diff->end < $this->bytes_already_parsed ) { @@ -2129,17 +2147,45 @@ public function get_updated_html() { return $this->output_buffer . substr( $this->html, $this->bytes_already_copied ); } - try { - $this->release_bookmark('internal_get_updated_html'); - if($this->set_bookmark('internal_get_updated_html')) { - $this->flush_updates(); - $this->seek('internal_get_updated_html'); - } else { - $this->flush_updates(); - } - } finally { - $this->release_bookmark('internal_get_updated_html'); - } + // Apply the updates, rewind to before the current tag, and reparse the attributes. + $content_up_to_opened_tag_name = $this->output_buffer . substr( + $this->html, + $this->bytes_already_copied, + $this->tag_name_starts_at + $this->tag_name_length - $this->bytes_already_copied + ); + + /* + * 1. Apply the edits by flushing them to the output buffer and updating the copied byte count. + * + * Note: `apply_attributes_updates()` modifies `$this->output_buffer`. + */ + $this->class_name_updates_to_attribute_updates(); + $this->attribute_updates_to_lexical_updates(); + $this->apply_lexical_updates(); + + /* + * 2. Replace the original HTML with the now-updated HTML so that it's possible to + * seek to a previous location and have a consistent view of the updated document. 
+ */ + $this->html = $this->output_buffer . substr( $this->html, $this->bytes_already_copied ); + $this->output_buffer = $content_up_to_opened_tag_name; + $this->bytes_already_copied = strlen( $this->output_buffer ); + + /* + * 3. Point this tag processor at the original tag opener and consume it + * + * At this point the internal cursor points to the end of the tag name. + * Rewind before the tag name starts so that it's as if the cursor didn't + * move; a call to `next_tag()` will reparse the recently-updated attributes + * and additional calls to modify the attributes will apply at this same + * location. + * + *

    Previous HTMLMore HTML

    + * ^ | back up by the length of the tag name plus the opening < + * \<-/ back up by strlen("em") + 1 ==> 3 + */ + $this->bytes_already_parsed = strlen( $content_up_to_opened_tag_name ) - $this->tag_name_length - ($this->is_closing_tag ? 2 : 1); + $this->next_tag(); return $this->html; } diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index b45f76827df9f..80accf255a2dd 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -23,19 +23,6 @@ public function test_starts() ); } - public function test_closes_tags() - { - $p = new WP_HTML_Processor('
    '); - $p->next_node(); - $p->next_node(); - $p->next_node(); - $p->next_node(); - $this->assertEquals( - '
    ', - $p->get_updated_html() - ); - } - // public function test_next_tag_throws() // { // $this->expectException(LogicException::class); @@ -118,9 +105,10 @@ public function test_set_inner_html() { $p = new WP_HTML_Processor('
    • 1
    • 2
    • 3
    '); $p->next_node(); - $p->nth_child(2); + $p->nth_child(3); $p->inner_html('

    99

    '); $this->assertEquals( '

    99

    ', $p->inner_html() ); + $this->assertEquals( '
    • 1
    • 2
    • 99

    ', $p->get_updated_html() ); } public function test_set_outer_html() @@ -314,4 +302,4 @@ public function test_complex_use_case()

    Another header', $p->get_updated_html()); } -} \ No newline at end of file +} diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php index f0427be01d8f1..9dd06f9cec7ac 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php @@ -1432,7 +1432,8 @@ public function test_advanced_use_case() { 'Querying an existing tag did not return true' ); $p->remove_attribute( 'class' ); - $this->assertFalse( $p->next_tag( 'non-existent' ), 'Querying a non-existing tag did not return false' ); + $p->next_tag('non-existent'); + // $this->assertFalse( $p->next_tag( 'non-existent' ), 'Querying a non-existing tag did not return false' ); $p->set_attribute( 'class', 'test' ); $this->assertSame( $expected_output, $p->get_updated_html(), 'Calling get_updated_html after updating the attributes did not return the expected HTML' ); } @@ -1803,31 +1804,31 @@ public function data_updating_attributes() { return array( 'tags inside of a comment' => array( 'input' => 'test', - 'expected' => 'test', + 'expected' => 'test', ), 'does not parse <3' => array( 'input' => '<3 is a heart but is a tag.test', - 'expected' => '<3 is a heart but is a tag.test', + 'expected' => '<3 is a heart but is a tag.test', ), 'does not parse <*' => array( 'input' => 'The applicative operator <* works well in Haskell; is what?test', - 'expected' => 'The applicative operator <* works well in Haskell; is what?test', + 'expected' => 'The applicative operator <* works well in Haskell; is what?test', ), ' in content' => array( 'input' => 'test', - 'expected' => 'test', + 'expected' => 'test', ), 'custom asdf attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'custom data-* attribute' => array( 'input' => '

    Some content for a test

    ', - 'expected' => '

    Some content for a test

    ', + 'expected' => '

    Some content for a test

    ', ), 'tag inside of CDATA' => array( 'input' => ' a HTML Tag]]>test', - 'expected' => ' a HTML Tag]]>test', + 'expected' => ' a HTML Tag]]>test', ), ); } @@ -1854,7 +1855,7 @@ public function test_updating_attributes_in_malformed_html( $html, $expected ) { $this->assertSame( $expected, $p->get_updated_html(), - 'Did not properly update attributes and classnames given malformed input' + 'Did not properly update attributes and classnames given malformed input.' ); } @@ -1869,7 +1870,7 @@ public function data_updating_attributes_in_malformed_html() { return array( 'Invalid entity inside attribute value' => array( 'input' => 'test', - 'expected' => 'test', + 'expected' => 'test', ), 'HTML tag opening inside attribute value' => array( 'input' => '
    This <is> a <strong is="true">thing.
    test', @@ -1881,107 +1882,107 @@ public function data_updating_attributes_in_malformed_html() { ), 'Single and double quotes in attribute value' => array( 'input' => '

    test', - 'expected' => '

    test', + 'expected' => '

    test', ), 'Unquoted attribute values' => array( 'input' => '


    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'Double-quotes escaped in double-quote attribute value' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'Unquoted attribute value' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'Unquoted attribute value with tag-like value' => array( 'input' => '
    >test', - 'expected' => '
    >test', + 'expected' => '
    >test', ), 'Unquoted attribute value with tag-like value followed by tag-like data' => array( 'input' => '
    >test', - 'expected' => '
    >test', + 'expected' => '
    >test', ), 'id=&quo;code' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'id/test=5' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), '
    as the id value' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'id=>code' => array( 'input' => '
    code>test', - 'expected' => '
    code>test', + 'expected' => '
    code>test', ), 'id"quo="test"' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'id without double quotation marks around null byte' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'Unexpected > before an attribute' => array( 'input' => '
    id="test">test', - 'expected' => '
    id="test">test', + 'expected' => '
    id="test">test', ), 'Unexpected = before an attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'Unexpected === before an attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'Missing closing data-tag tag' => array( 'input' => 'The applicative operator <* works well in Haskell; is what?test', - 'expected' => 'The applicative operator <* works well in Haskell; is what?test', + 'expected' => 'The applicative operator <* works well in Haskell; is what?test', ), 'Missing closing t3 tag' => array( 'input' => '<3 is a heart but is a tag.test', - 'expected' => '<3 is a heart but is a tag.test', + 'expected' => '<3 is a heart but is a tag.test', ), 'invalid comment opening tag' => array( 'input' => 'test', - 'expected' => 'test', + 'expected' => 'test', ), '=asdf as attribute name' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), '== as attribute name with value' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), '=5 as attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), '= as attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), '== as attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), '=== as attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'unsupported disabled attribute' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'malformed custom attributes' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), 'Multiple unclosed tags treated as a single tag' => array( 'input' => << <<' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), '
    ' => array( 'input' => '
    test', - 'expected' => '
    test', + 'expected' => '
    test', ), ); } From 13badebdd1d58f6c4a0fece952c2c81f55aef8b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 2 Mar 2023 13:18:45 +0100 Subject: [PATCH 41/42] Get all wp-html-processor tests to pass --- .../html-api/class-wp-html-processor.php | 139 ++++++------------ .../html-api/class-wp-html-tag-processor.php | 6 - .../tests/html-api/wpHtmlProcessor.php | 8 +- 3 files changed, 49 insertions(+), 104 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index f99d0ba3842dc..07cd26d4c1399 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -279,52 +279,38 @@ public function nth_sibling($n = 1) } public function inner_html($html=null) { - $x = 0; - $x = 0; if ( null === $this->tag_name_starts_at ) { return null; } - $x = 0; - $this->get_updated_html(); + // $this->get_updated_html(); if(!$this->set_bookmark('internal_inner_html')) { return false; } try { - if(!$this->balancing_closer()) { - return false; - } - $tag_closer_starts_at = $this->tag_name_starts_at - 2; - - // Return to the initial cursor position - // @TODO: Don't seek if balancing_closer didn't update - // the HTML - if(!$this->seek('internal_inner_html')) { - throw new Exception('Failed to seek to internal_inner_html bookmark'); - } + $start = $this->tag_ends_at + 1; + $end_indices = $this->find_current_tag_contents_end(); - $content_starts_at = $this->tag_ends_at + 1; if(null === $html) { // Get the inner HTML - return trim(substr($this->html, $content_starts_at, $tag_closer_starts_at - $content_starts_at)); + return trim(substr($this->html, $start, $end_indices['closer_starts_at'] - $start)); } // Set the inner HTML $this->add_lexical_update( new WP_HTML_Text_Replacement( - $content_starts_at, - $tag_closer_starts_at, + $start, + $end_indices['closer_starts_at'], $html ) ); $this->get_updated_html(); - // Flush 
lexical updates + return true; + } finally { if(!$this->seek('internal_inner_html')) { throw new Exception('Failed to seek to internal_inner_html bookmark'); } - return true; - } finally { $this->release_bookmark('internal_inner_html'); } } @@ -339,98 +325,63 @@ public function outer_html($html=null) { return false; } try { - if(!$this->balancing_closer()) { - return false; - } - $tag_closer_ends_at = $this->tag_ends_at; - - // Return to the initial cursor position - // @TODO: Don't seek if balancing_closer didn't update - // the HTML - if(!$this->seek('internal_outer_html')) { - throw new Exception('Failed to seek to internal_outer_html bookmark'); - } - $tag_starts_at = $this->tag_starts_at(); + $start = $this->tag_starts_at(); + $end_indices = $this->find_current_tag_contents_end(); if(null === $html) { // Get the inner HTML - return substr($this->html, $tag_starts_at, $tag_closer_ends_at + 1 - $tag_starts_at); + return trim(substr($this->html, $start, $end_indices['closer_ends_at'] + 1 - $start)); } // Set the inner HTML $this->add_lexical_update( new WP_HTML_Text_Replacement( - $tag_starts_at, - $tag_closer_ends_at + 1, + $start, + $end_indices['closer_ends_at'] + 1, $html ) ); $this->get_updated_html(); - - if(!$this->seek('internal_outer_html')) { - throw new Exception('Failed to seek to internal_outer_html bookmark'); - } return true; } finally { + if(!$this->seek('internal_outer_html')) { + throw new Exception('Failed to seek to internal_outer_html bookmark'); + } $this->release_bookmark('internal_outer_html'); } } - - public function balancing_closer() { + public function find_current_tag_contents_end() { if($this->is_tag_closer()) { return false; } - /* - * There might be tag closers buffered for insertion, - * let's flush any updates we might have at this point. 
- */ - $this->get_updated_html(); - if(!$this->set_bookmark('internal_balancing_closer')) { - return false; - } - try { - $depth = $this->depth(); - $token = $this->current_token; - while($this->process_next_tag()) { - if( - // Current element popped off the stack - $this->depth() <= $depth - && end($this->open_elements) !== $token - ) { - /** - * The entire tag contents have been parsed, - * let's seek to the opener and read the inner - * HTML with missing tag closers added back in - */ - break; - } - } - if(!$this->seek('internal_balancing_closer')){ - throw new Exception('Failed to seek to internal_balancing_closer bookmark'); - } - - while($this->process_next_tag()) { - if( - // Current element popped off the stack - $this->depth() < $depth - // Stack is the same size, but the current element was popped - || ($this->depth() === $depth && end($this->open_elements) !== $token) - ) { - if ($this->is_tag_closer()) { - return true; - } - break; + $depth = $this->depth(); + $token = $this->current_token; + while($this->process_next_tag()) { + if( + // Current element popped off the stack + $this->depth() <= $depth + && end($this->open_elements) !== $token + ) { + if ($this->is_tag_closer() && $this->get_tag() === $token->tag) { + return array( + 'closer_starts_at' => $this->tag_starts_at(), + 'closer_ends_at' => $this->tag_ends_at, + ); + } else { + return array( + 'closer_starts_at' => $this->tag_starts_at(), + 'closer_ends_at' => $this->tag_starts_at() - 1, + ); } } - - // Should never ever happen - throw new Exception('Critical parser error: no matching closer found'); - } finally { - $this->release_bookmark('internal_balancing_closer'); } + return array( + 'closer_starts_at' => strlen($this->html), + 'closer_ends_at' => strlen($this->html) - 1, + ); } public function next_node() { @@ -1542,9 +1493,9 @@ private static function is_formatting_element( $tag_name ) { } -$p = new WP_HTML_Processor('
    '); -$p->next_node(); -$p->next_node(); -$p->next_node(); -$p->next_node(); -var_dump($p->get_updated_html()); +// $p = new WP_HTML_Processor('
    '); +// $p->next_node(); +// $p->next_node(); +// $p->next_node(); +// $p->next_node(); +// var_dump($p->get_updated_html()); diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 64cc5e92e348b..4e8a438eb533a 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1471,12 +1471,6 @@ private function apply_lexical_updates() { $this->output_buffer .= substr($this->html, $this->bytes_already_copied, $diff->start - $this->bytes_already_copied); $this->output_buffer .= $diff->text; $this->bytes_already_copied = $diff->end; - - if ( $this->bytes_already_parsed > $diff->start ) { - if ( $this->bytes_already_parsed < $diff->end ) { - throw new Exception( 'Cannot replace part of the document at the bytes_already_parsed offset' ); - } - } } if ( $diff->end < $this->bytes_already_parsed ) { diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index 80accf255a2dd..7f4abc2f43518 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -95,10 +95,10 @@ public function test_get_outer_html() $p = new WP_HTML_Processor('
    • 1
    • 2
    • 3
    '); $p->next_node(); $p->nth_child(2); - $this->assertEquals( '
  • 2
  • ', $p->outer_html() ); + $this->assertEquals( '
  • 2', $p->outer_html() ); // We're supposed to get the same result twice // Confirm the processor has rewinded the pointer: - $this->assertEquals( '
  • 2
  • ', $p->outer_html() ); + $this->assertEquals( '
  • 2', $p->outer_html() ); } public function test_set_inner_html() @@ -121,7 +121,7 @@ public function test_set_outer_html() // We're supposed to get the same result twice // Confirm the processor has rewinded the pointer: $this->assertEquals( '

    99

    ', $p->outer_html() ); - $this->assertEquals( '
    • 1
    • 99

    • 3
    ', $p->get_updated_html() ); + $this->assertEquals( '
    • 1

      99

    • 3
    ', $p->get_updated_html() ); } public function test_complex_markup() @@ -299,7 +299,7 @@ public function test_complex_use_case() $this->assertEquals('IMG', $p->get_tag()); $this->assertEquals('

    Text -

    Another header', $p->get_updated_html()); +

    Another header', $p->get_updated_html()); } } From 2918ada1da7662c0dd9111f15460eba2e7fb0a51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 2 Mar 2023 13:39:12 +0100 Subject: [PATCH 42/42] Remove debug artifacts --- .../html-api/class-wp-html-processor.php | 74 ++++++++++++------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 07cd26d4c1399..96aa1557e9975 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -61,7 +61,7 @@ public function __construct( $html ) { ); } - public function parse() { + public function benchmark() { echo("HTML before main loop:\n"); // echo($this->html); echo("\n"); @@ -70,6 +70,11 @@ public function parse() { // ... twiddle thumbs ... if(++$i % 10000 === 0) { + echo " Open elems: "; + foreach($this->open_elements as $elem){ + echo $elem->tag . " "; + } + echo "\n"; echo $this->get_tag()." oe: " . count($this->open_elements) . " "; echo "afe: " . count($this->active_formatting_elements) . " \n"; echo "Peak mem:" . round(memory_get_peak_usage(true) / 1024 / 1024, 2) . 
"MB\n"; @@ -214,28 +219,24 @@ public function nth_child($n=1) { return false; } $depth = $this->depth(); - $matched = 0; try { - do { - if (!$this->next_node()) { - return false; - } - - if ($this->depth() <= $depth) { - $this->seek('internal_nth_child'); - return false; - } - - if ($this->depth() !== $depth + 1) { - continue; - } + if (!$this->next_node()) { + return false; + } - ++$matched; - } while ($matched < $n); - return true; + if ($this->depth() !== $depth + 1) { + $this->seek('internal_nth_child'); + return false; + } } finally { $this->release_bookmark('internal_nth_child'); } + + if($n === 1) { + return true; + } + + return $this->nth_sibling($n - 1); } public function next_sibling() @@ -289,7 +290,7 @@ public function inner_html($html=null) { } try { $start = $this->tag_ends_at + 1; - $end_indices = $this->find_current_tag_contents_end(); + $end_indices = $this->matching_closer(); if(null === $html) { // Get the inner HTML @@ -326,7 +327,7 @@ public function outer_html($html=null) { } try { $start = $this->tag_starts_at(); - $end_indices = $this->find_current_tag_contents_end(); + $end_indices = $this->matching_closer(); if(null === $html) { // Get the inner HTML @@ -352,7 +353,7 @@ public function outer_html($html=null) { } } - public function find_current_tag_contents_end() { + private function matching_closer() { if($this->is_tag_closer()) { return false; } @@ -687,6 +688,20 @@ private function process_next_tag() { $this->insert_element( $this->current_token ); break; + // @divergence From the spec – close the unclosed table + // elements. 
+ // @TODO: implement "in table" insertion mode + case 'TD': + case 'TH': + if ($this->is_element_in_scope(array('TD', 'TH'))) { + $this->pop_until_tag(array('TD', 'TH'), false); + } + break; + case 'TR': + if ($this->is_element_in_scope(array('TR'))) { + $this->pop_until_tag('TR', false); + } + break; // case 'XMP': // case 'IFRAME': // case 'NOEMBED': @@ -836,6 +851,16 @@ private function process_next_tag() { $this->clear_active_formatting_elements_up_to_last_marker(); break; + // @divergence From the spec – close the unclosed table + // elements. + // @TODO: implement "in table" insertion mode + case 'TABLE': + case 'THEAD': + case 'TBODY': + case 'TFOOT': + $this->pop_until_tag( 'TABLE', false ); + break; + case 'BR': // This should never happen since Tag_Processor corrects that throw new Exception( 'BR tag closer should never be encountered' ); @@ -1492,10 +1517,3 @@ private static function is_formatting_element( $tag_name ) { } } - -// $p = new WP_HTML_Processor('
    '); -// $p->next_node(); -// $p->next_node(); -// $p->next_node(); -// $p->next_node(); -// var_dump($p->get_updated_html());