From 7e7602cc5c4f898ba4ef6f9a1cfcf3dea4fd5bea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Tue, 21 Feb 2023 18:13:07 +0100 Subject: [PATCH 01/14] Explore HTML parsing and Adoption Agency Algorithm --- .../html-api/class-wp-html-processor.php | 1376 +++++++++++++++++ .../html-api/class-wp-html-tag-processor.php | 113 +- src/wp-settings.php | 1 + .../tests/html-api/wpHtmlProcessor.php | 22 + 4 files changed, 1481 insertions(+), 31 deletions(-) create mode 100644 src/wp-includes/html-api/class-wp-html-processor.php create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessor.php diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php new file mode 100644 index 0000000000000..da0c95738b2a8 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -0,0 +1,1376 @@ +tag = $tag; + $this->attributes = $attributes; + $this->is_opener = $is_opener; + $this->is_closer = ! $is_opener; + } + + public function equivalent( WP_HTML_Element $other ) { + if ( $this->is_closer !== $other->is_closer ) { + return false; + } + + if ( $this->tag !== $other->tag ) { + return false; + } + + if ( count( $this->attributes ) !== count( $other->attributes ) ) { + return false; + } + + $attributes_match = true; + foreach ( $other->attributes as $name => $value ) { + if ( ! 
isset( $this->attributes[ $name ] ) || $this->attributes[ $name ] !== $value ) {
				$attributes_match = false;
				break;
			}
		}
		return $attributes_match;
	}

	/**
	 * Tells whether this entry is a marker rather than a real element.
	 *
	 * @return bool True when the tag equals the MARKER constant.
	 */
	public function is_marker() {
		return self::MARKER === $this->tag;
	}
}

/**
 * Names of the tree-construction insertion modes used by WP_HTML_Processor.
 */
class WP_HTML_Insertion_Mode {

	const INITIAL            = 'INITIAL';
	const IN_SELECT          = 'IN_SELECT';
	const IN_SELECT_IN_TABLE = 'IN_SELECT_IN_TABLE';
	const IN_CELL            = 'IN_CELL';
	const IN_ROW             = 'IN_ROW';
	const IN_TABLE_BODY      = 'IN_TABLE_BODY';
	const IN_CAPTION         = 'IN_CAPTION';
	const IN_COLUMN_GROUP    = 'IN_COLUMN_GROUP';
	const IN_TABLE           = 'IN_TABLE';
	const IN_HEAD            = 'IN_HEAD';
	const IN_BODY            = 'IN_BODY';
	const IN_FRAMESET        = 'IN_FRAMESET';
	const BEFORE_HEAD        = 'BEFORE_HEAD';
	const TEXT               = 'TEXT';

}

/**
 * Experimental HTML processor that layers a stack of open elements and a list
 * of active formatting elements on top of WP_HTML_Tag_Processor.
 *
 * Only the "in body" insertion mode is dispatched at the moment; see parse_next().
 */
class WP_HTML_Processor extends WP_HTML_Tag_Processor {

	private $tag_processor;
	/**
	 * Stack of open elements; the last entry is the current node.
	 *
	 * @var WP_HTML_Element[]
	 */
	private $open_elements = array();
	/**
	 * List of active formatting elements, possibly interleaved with markers.
	 *
	 * @var WP_HTML_Element[]
	 */
	private $active_formatting_elements = array();
	private $root_node                  = null;
	private $context_node               = null;
	private $original_insertion_mode    = null;
	private $insertion_mode             = null;

	private $inserted_tokens = array();

	private $head_pointer;
	private $form_pointer;

	/**
	 * Seeds the stack of open elements with an HTML root node, creates a
	 * DOCUMENT context node and resets the insertion mode.
	 *
	 * @param string $html HTML to process; passed to the parent constructor.
	 */
	public function __construct( $html ) {
		parent::__construct( $html );
		$this->root_node     = new WP_HTML_Element( 'HTML' );
		$this->context_node  = new WP_HTML_Element( 'DOCUMENT' );
		$this->open_elements = array( $this->root_node );
		$this->reset_insertion_mode();
	}

	/**
	 * Processes the next token.
	 *
	 * Currently always delegates to the "in body" handler; the commented-out
	 * switch below sketches the intended per-insertion-mode dispatch.
	 *
	 * @return mixed Whatever next_tag_in_body_insertion_mode() returns.
	 */
	public function parse_next() {
		return $this->next_tag_in_body_insertion_mode();
		// @TODO:
		// switch($this->insertion_mode) {
		// 	case WP_HTML_Insertion_Mode::INITIAL:
		// 		$this->next_tag_in_initial_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::BEFORE_HEAD:
		// 		$this->next_tag_in_before_head_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_HEAD:
		// 		$this->next_tag_in_head_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_BODY:
		// 		$this->next_tag_in_body_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_TABLE:
		// 		$this->next_tag_in_table_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_TABLE_BODY:
		// 		$this->next_tag_in_table_body_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_ROW:
		// 		$this->next_tag_in_row_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_CELL:
		// 		$this->next_tag_in_cell_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_SELECT:
		// 		$this->next_tag_in_select_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE:
		// 		$this->next_tag_in_select_in_table_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_CAPTION:
		// 		$this->next_tag_in_caption_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_COLUMN_GROUP:
		// 		$this->next_tag_in_column_group_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::IN_FRAMESET:
		// 		$this->next_tag_in_frameset_insertion_mode();
		// 		break;
		// 	case WP_HTML_Insertion_Mode::TEXT:
		// 		$this->next_tag_in_text_insertion_mode();
		// 		break;
		// }
	}

	/**
	 * Consumes one tag token and applies the "in body" insertion mode rules
	 * to the stack of open elements and the list of active formatting elements.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
	 *
	 * @throws Exception For tags whose handling is not implemented yet.
	 *
	 * @return mixed False when there are no more tokens. When a token is ignored,
	 *               the result of next_tag() (or null for table-structure start
	 *               tags). Otherwise nothing is returned explicitly.
	 */
	public function next_tag_in_body_insertion_mode() {
		$token = $this->next_token();
		// next_token() yields false once the input is exhausted.
		if ( false === $token ) {
			return false;
		}

		if ( $token->is_opener ) {
			// Should we care?
			// if(self::is_rcdata_element($token->tag)) {
			// 	$this->original_insertion_mode = $this->insertion_mode;
			// 	$this->insertion_mode = WP_HTML_Insertion_Mode::TEXT;
			// }
			switch ( $token->tag ) {
				case 'ADDRESS':
				case 'ARTICLE':
				case 'ASIDE':
				case 'BLOCKQUOTE':
				case 'CENTER':
				case 'DETAILS':
				case 'DIALOG':
				case 'DIR':
				case 'DIV':
				case 'DL':
				case 'FIELDSET':
				case 'FIGCAPTION':
				case 'FIGURE':
				case 'FOOTER':
				case 'HEADER':
				case 'HGROUP':
				case 'MAIN':
				case 'MENU':
				case 'NAV':
				case 'OL':
				case 'P':
				case 'SECTION':
				case 'SUMMARY':
				case 'UL':
				// Ignore special rules for 'PRE' and 'LISTING'
				case 'PRE':
				case 'LISTING':
					if ( $this->is_element_in_button_scope( 'P' ) ) {
						$this->close_p_element();
					}
					$this->insert_element( $token );
					break;
				// A start tag whose tag name is "h1", "h2", "h3", "h4", "h5", or "h6"
				case 'H1':
				case 'H2':
				case 'H3':
				case 'H4':
				case 'H5':
				case 'H6':
					if ( $this->is_element_in_button_scope( 'P' ) ) {
						$this->close_p_element();
					}
					if ( in_array( $this->current_node()->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ), true ) ) {
						$this->pop_open_element();
					}
					$this->insert_element( $token );
					break;
				case 'FORM':
					if ( $this->form_pointer ) {
						$this->ignore_token( $token );
						return $this->next_tag();
					}
					if ( $this->is_element_in_button_scope( 'P' ) ) {
						$this->close_p_element();
					}
					$this->form_pointer = $token;
					$this->insert_element( $token );
					break;
				case 'LI':
					$i = count( $this->open_elements ) - 1;
					while ( true ) {
						$node = $this->open_elements[ $i ];
						if ( $node->tag === 'LI' ) {
							$this->generate_implied_end_tags(
								array(
									'except_for' => array( 'LI' ),
								)
							);
							$this->pop_until_tag_name( 'LI' );
							break;
						} elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) {
							break;
						} else {
							--$i;
							$node = $this->open_elements[ $i ];
						}
					}

					if ( $this->is_element_in_button_scope( 'P' ) ) {
						$this->close_p_element();
					}
					$this->insert_element( $token );
					break;
				case 'DD':
				case 'DT':
					$i = count( $this->open_elements ) - 1;
					while ( true ) {
						$node = $this->open_elements[ $i ];
						if ( $node->tag === 'DD' ) {
							$this->generate_implied_end_tags(
								array(
									'except_for' => array( 'DD' ),
								)
							);
							$this->pop_until_tag_name( 'DD' );
							break;
						} elseif ( $node->tag === 'DT' ) {
							$this->generate_implied_end_tags(
								array(
									'except_for' => array( 'DT' ),
								)
							);
							$this->pop_until_tag_name( 'DT' );
							break;
						} elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) {
							break;
						} else {
							--$i;
							$node = $this->open_elements[ $i ];
						}
					}

					if ( $this->is_element_in_button_scope( 'P' ) ) {
						$this->close_p_element();
					}
					$this->insert_element( $token );
					break;
				case 'PLAINTEXT':
					throw new Exception( 'PLAINTEXT not implemented yet' );
				case 'BUTTON':
					if ( $this->is_element_in_button_scope( 'BUTTON' ) ) {
						$this->generate_implied_end_tags();
						$this->pop_until_tag_name( 'BUTTON' );
					}
					$this->reconstruct_active_formatting_elements();
					$this->insert_element( $token );
					break;
				case 'A':
					// Look for an "a" entry after the last marker in the list.
					$active_a = null;
					for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; --$i ) {
						$elem = $this->active_formatting_elements[ $i ];
						if ( $elem->tag === 'A' ) {
							$active_a = $elem;
							break;
						} elseif ( $elem->is_marker() ) {
							break;
						}
					}

					if ( $active_a ) {
						$this->parse_error();
						$this->adoption_agency_algorithm( $token );

						// If the algorithm left the old "a" behind, drop it from both lists.
						$stale_index = array_search( $active_a, $this->active_formatting_elements, true );
						if ( false !== $stale_index ) {
							array_splice( $this->active_formatting_elements, $stale_index, 1 );
						}
						$stale_index = array_search( $active_a, $this->open_elements, true );
						if ( false !== $stale_index ) {
							array_splice( $this->open_elements, $stale_index, 1 );
						}
					}

					$this->reconstruct_active_formatting_elements();
					$this->insert_element( $token );
					// Without this push the lookup above can never find an active "a".
					$this->push_active_formatting_element( $token );
					break;
				case 'B':
				case 'BIG':
				case 'CODE':
				case 'EM':
				case 'FONT':
				case 'I':
				case 'S':
				case 'SMALL':
				case 'STRIKE':
				case 'STRONG':
				case 'TT':
				case 'U':
					$this->reconstruct_active_formatting_elements();
					$this->push_active_formatting_element( $token );
					$this->insert_element( $token );
					break;
				case 'NOBR':
					$this->reconstruct_active_formatting_elements();
					if ( $this->is_element_in_scope( 'NOBR' ) ) {
						$this->parse_error();
						$this->adoption_agency_algorithm( $token );
						$this->reconstruct_active_formatting_elements();
					}
					$this->insert_element( $token );
					$this->push_active_formatting_element( $token );
					break;
				case 'APPLET':
				case 'MARQUEE':
				case 'OBJECT':
					$this->reconstruct_active_formatting_elements();
					$this->insert_element( $token );
					$this->active_formatting_elements[] = new WP_HTML_Element( WP_HTML_Element::MARKER );
					break;
				case 'TABLE':
					$this->insert_element( $token );
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_TABLE;
					break;
				case 'AREA':
				case 'BR':
				case 'EMBED':
				case 'IMG':
				case 'KEYGEN':
				case 'WBR':
					$this->reconstruct_active_formatting_elements();
					$this->insert_element( $token );
					$this->pop_open_element();
					// @TODO: Acknowledge the token's self-closing flag, if it is set.
					break;
				case 'PARAM':
				case 'SOURCE':
				case 'TRACK':
					$this->insert_element( $token );
					$this->pop_open_element();
					break;
				case 'HR':
					if ( $this->is_element_in_button_scope( 'P' ) ) {
						$this->close_p_element();
					}
					$this->insert_element( $token );
					$this->pop_open_element();
					break;
				case 'IMAGE':
					$this->parse_error();
					// Change the tag name to "img" and reprocess the token.
					throw new Exception( 'IMAGE not implemented yet' );
				case 'TEXTAREA':
					$this->insert_element( $token );
					$this->original_insertion_mode = $this->insertion_mode;
					$this->insertion_mode          = WP_HTML_Insertion_Mode::TEXT;
					break;

				case 'XMP':
					if ( $this->is_element_in_button_scope( 'P' ) ) {
						$this->close_p_element();
					}
					$this->reconstruct_active_formatting_elements();
					// @TODO: Follow the generic raw text element parsing algorithm.
					throw new Exception( 'XMP not implemented yet' );
				case 'IFRAME':
				case 'NOEMBED':
				case 'NOSCRIPT':
					// @TODO: Follow the generic raw text element parsing algorithm.
					throw new Exception( $token->tag . ' not implemented yet' );
				case 'SELECT':
					$this->reconstruct_active_formatting_elements();
					$this->insert_element( $token );
					if ( in_array(
						$this->insertion_mode,
						array(
							WP_HTML_Insertion_Mode::IN_TABLE,
							WP_HTML_Insertion_Mode::IN_CAPTION,
							WP_HTML_Insertion_Mode::IN_TABLE_BODY,
							WP_HTML_Insertion_Mode::IN_ROW,
							WP_HTML_Insertion_Mode::IN_CELL,
						),
						true
					) ) {
						$this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE;
					} else {
						$this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT;
					}
					break;
				case 'OPTGROUP':
				case 'OPTION':
					// The rule looks at the element currently open, not at the incoming token.
					if ( 'OPTION' === $this->current_node()->tag ) {
						$this->pop_open_element();
					}
					$this->reconstruct_active_formatting_elements();
					$this->insert_element( $token );
					break;
				case 'RB':
				case 'RTC':
					if ( $this->is_element_in_scope( 'RUBY' ) ) {
						$this->generate_implied_end_tags();
						if ( 'RUBY' !== $this->current_node()->tag ) {
							$this->parse_error();
						}
					}
					$this->insert_element( $token );
					break;
				case 'RP':
				case 'RT':
					if ( $this->is_element_in_scope( 'RUBY' ) ) {
						$this->generate_implied_end_tags(
							array(
								'except_for' => array( 'RTC' ),
							)
						);
						if ( ! in_array( $this->current_node()->tag, array( 'RTC', 'RUBY' ), true ) ) {
							$this->parse_error();
						}
					}
					$this->insert_element( $token );
					break;
				case 'MATH':
					throw new Exception( 'MATH not implemented yet' );
				case 'SVG':
					throw new Exception( 'SVG not implemented yet' );
				case 'CAPTION':
				case 'COL':
				case 'COLGROUP':
				case 'FRAME':
				case 'HEAD':
				case 'TBODY':
				case 'TD':
				case 'TFOOT':
				case 'TH':
				case 'THEAD':
				case 'TR':
					$this->parse_error();
					// Ignore the token.
					return;
				default:
					$this->reconstruct_active_formatting_elements();
					$this->insert_element( $token );
					break;
			}
		} else {
			switch ( $token->tag ) {
				// "P" is intentionally absent here: it has its own case below.
				case 'ADDRESS':
				case 'ARTICLE':
				case 'ASIDE':
				case 'BLOCKQUOTE':
				case 'CENTER':
				case 'DETAILS':
				case 'DIALOG':
				case 'DIR':
				case 'DIV':
				case 'DL':
				case 'FIELDSET':
				case 'FIGCAPTION':
				case 'FIGURE':
				case 'FOOTER':
				case 'HEADER':
				case 'HGROUP':
				case 'MAIN':
				case 'MENU':
				case 'NAV':
				case 'OL':
				case 'SECTION':
				case 'SUMMARY':
				case 'UL':
					// An end tag with no matching element in scope is ignored.
					if ( ! $this->is_element_in_scope( $token->tag ) ) {
						$this->ignore_token( $token );
						$this->parse_error();
						return $this->next_tag();
					}
					$this->generate_implied_end_tags();
					$this->pop_until_tag_name( $token->tag );
					break;
				case 'FORM':
					$form_node          = $this->form_pointer;
					$this->form_pointer = null;
					if ( ! $form_node || ! $this->is_element_in_scope( $form_node ) ) {
						$this->ignore_token( $token );
						$this->parse_error();
						return $this->next_tag();
					}
					$this->generate_implied_end_tags();
					if ( $this->current_node() !== $form_node ) {
						$this->parse_error();
					}
					$form_index = array_search( $form_node, $this->open_elements, true );
					if ( false !== $form_index ) {
						array_splice( $this->open_elements, $form_index, 1 );
					}
					break;
				case 'P':
					if ( ! $this->is_element_in_button_scope( 'P' ) ) {
						// Parse error, insert an HTML element for a "p" start tag token with no attributes.
						$this->parse_error();
						$this->insert_element( new WP_HTML_Element( 'P', array() ) );
					}
					$this->close_p_element();
					break;
				case 'LI':
					if ( ! $this->is_element_in_list_item_scope( 'LI' ) ) {
						$this->ignore_token( $token );
						$this->parse_error();
						return $this->next_tag();
					}
					// LI itself has an implied end tag, so it must be excepted here.
					$this->generate_implied_end_tags(
						array(
							'except_for' => array( 'LI' ),
						)
					);
					$this->pop_until_tag_name( 'LI' );
					break;
				case 'DD':
				case 'DT':
					if ( ! $this->is_element_in_scope( $token->tag ) ) {
						$this->ignore_token( $token );
						$this->parse_error();
						return $this->next_tag();
					}
					$this->generate_implied_end_tags(
						array(
							'except_for' => array( $token->tag ),
						)
					);
					$this->pop_until_tag_name( $token->tag );
					break;
				case 'H1':
				case 'H2':
				case 'H3':
				case 'H4':
				case 'H5':
				case 'H6':
					if ( ! $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) {
						$this->ignore_token( $token );
						$this->parse_error();
						return $this->next_tag();
					}
					$this->generate_implied_end_tags();
					$this->pop_until_tag_name( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) );
					break;
				case 'APPLET':
				case 'MARQUEE':
				case 'OBJECT':
					if ( ! $this->is_element_in_scope( $token->tag ) ) {
						$this->ignore_token( $token );
						$this->parse_error();
						return $this->next_tag();
					}
					$this->generate_implied_end_tags();
					if ( $this->current_node()->tag !== $token->tag ) {
						$this->parse_error();
					}
					$this->pop_until_tag_name( $token->tag );
					$this->clear_active_formatting_elements_up_to_last_marker();
					break;
				case 'A':
				case 'B':
				case 'BIG':
				case 'CODE':
				case 'EM':
				case 'FONT':
				case 'I':
				case 'S':
				case 'SMALL':
				case 'STRIKE':
				case 'STRONG':
				case 'TT':
				case 'U':
					$this->parse_error();
					if ( self::ANY_OTHER_END_TAG !== $this->adoption_agency_algorithm( $token ) ) {
						break;
					}
					// No formatting element matched: fall through to "any other end tag".
				case 'BR':
					// This should never happen since Tag_Processor corrects that
				default:
					$i = count( $this->open_elements ) - 1;
					while ( true ) {
						$node = $this->open_elements[ $i ];
						if ( $node->tag === $token->tag ) {
							$this->generate_implied_end_tags(
								array(
									'except_for' => array(
$token->tag ),
								)
							);
							$this->pop_until_node( $node );
							break;
						} elseif ( $this->is_special_element( $node->tag ) ) {
							$this->ignore_token( $token );
							$this->parse_error();
							return $this->next_tag();
						} else {
							--$i;
						}
					}
					break;
			}
		}
	}

	/**
	 * Counter used to build unique internal bookmark names.
	 *
	 * @var int
	 */
	private $element_bookmark_idx = 0;

	/**
	 * Advances to the next tag (openers and closers) and wraps it in a
	 * WP_HTML_Element with an internal bookmark attached.
	 *
	 * @return WP_HTML_Element|false The consumed token, or false when next_tag() finds nothing.
	 */
	private function next_token() {
		if ( ! $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) {
			return false;
		}

		$consumed_node = new WP_HTML_Element(
			$this->get_tag(),
			array(),
			! $this->is_tag_closer()
		);

		/*
		 * Keep the bookmark *name* on the element: release_bookmark() is later
		 * called with this value, so storing set_bookmark()'s result would not
		 * work if that result is a success flag.
		 */
		$bookmark_name = '__internal_' . ( $this->element_bookmark_idx++ );
		if ( $this->set_bookmark( $bookmark_name ) ) {
			$consumed_node->tag_processor_bookmark = $bookmark_name;
		} else {
			$consumed_node->tag_processor_bookmark = null;
		}

		return $consumed_node;
	}

	const ANY_OTHER_END_TAG = 1;

	/**
	 * Runs the adoption agency algorithm for the given formatting tag token.
	 *
	 * Only the bookkeeping on the stack of open elements and on the list of
	 * active formatting elements is performed; the steps that move nodes
	 * around are still marked @TODO.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#adoption-agency-algorithm
	 *
	 * @param WP_HTML_Element $token Token whose tag name is the algorithm's subject.
	 * @return int|null self::ANY_OTHER_END_TAG when no formatting element matches
	 *                  and the caller should act as for "any other end tag",
	 *                  null otherwise.
	 */
	private function adoption_agency_algorithm( WP_HTML_Element $token ) {
		$subject      = $token->tag;
		$current_node = $this->current_node();

		// The list holds elements, so membership must be tested with the node, not its tag name.
		if (
			$current_node
			&& $current_node->tag === $subject
			&& ! in_array( $current_node, $this->active_formatting_elements, true )
		) {
			$this->pop_open_element();
			return;
		}

		// The outer loop runs at most eight times.
		$outer_loop_counter = 0;
		while ( $outer_loop_counter++ < 8 ) {
			/*
			 * Let __formatting element__ be the last element in the list of active
			 * formatting elements that:
			 * - is between the end of the list and the last marker in the list,
			 *   if any, or the start of the list otherwise, and
			 * - has the same tag name as the token.
			 */
			$formatting_element     = null;
			$formatting_element_idx = -1;
			for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) {
				$candidate = $this->active_formatting_elements[ $i ];
				if ( $candidate->is_marker() ) {
					break;
				}
				if ( $candidate->tag === $subject ) {
					$formatting_element     = $candidate;
					$formatting_element_idx = $i;
					break;
				}
			}

			// If there is no such element, then abort these steps and instead act as
			// described in the "any other end tag" entry below.
			if ( null === $formatting_element ) {
				return self::ANY_OTHER_END_TAG;
			}

			// If formatting element is not in the stack of open elements, then this is
			// a parse error; remove the element from the list, and return.
			$formatting_elem_stack_index = array_search( $formatting_element, $this->open_elements, true );
			if ( false === $formatting_elem_stack_index ) {
				array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 );
				$this->parse_error();
				return;
			}

			// If formatting element is not in scope, then this is a parse error; return
			if ( ! $this->is_element_in_scope( $formatting_element ) ) {
				$this->parse_error();
				return;
			}

			// If formatting element is not the current node, then this is a parse error.
			// (But do not return.)
			if ( $formatting_element !== $this->current_node() ) {
				$this->parse_error();
			}

			/*
			 * Let furthest block be the topmost node in the stack of open elements that
			 * is lower in the stack than formatting element, and is an element in the
			 * special category. There might not be one.
			 *
			 * "Topmost" means closest to formatting element, so scan downwards from it.
			 */
			$furthest_block = null;
			$stack_size     = count( $this->open_elements );
			for ( $i = $formatting_elem_stack_index + 1; $i < $stack_size; $i++ ) {
				if ( self::is_special_element( $this->open_elements[ $i ]->tag ) ) {
					$furthest_block = $this->open_elements[ $i ];
					break;
				}
			}

			// If there is no such node, then the UA must first pop all the nodes from
			// the bottom of the stack of open elements, from the current node up to
			// and including formatting element, then remove formatting element from
			// the list of active formatting elements, and finally abort these steps.
			if ( null === $furthest_block ) {
				$this->pop_until_node( $formatting_element );
				array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 );
				return;
			}

			// Let common ancestor be the element immediately above formatting element
			// in the stack of open elements. (Only needed by the @TODO steps below.)
			$common_ancestor = $formatting_elem_stack_index > 0
				? $this->open_elements[ $formatting_elem_stack_index - 1 ]
				: null;

			// Let a bookmark note the position of formatting element in the list of
			// active formatting elements relative to the elements on either side of it
			// in the list.
			$bookmark = $formatting_element_idx;

			// Let node and last node be furthest block.
			$node       = $furthest_block;
			$last_node  = $furthest_block;
			$node_index = array_search( $node, $this->open_elements, true );

			$inner_loop_counter = 0;
			while ( true ) {
				$inner_loop_counter++;

				/*
				 * Let node be the element immediately above node in the stack of open elements,
				 * or if node is no longer in the stack of open elements (e.g. because it got
				 * removed by this algorithm), the element that was immediately above node in
				 * the stack of open elements before node was removed.
				 *
				 * When node was removed, $node_index still holds the slot it occupied,
				 * so stepping one up from there lands on the right element.
				 */
				$found_index = array_search( $node, $this->open_elements, true );
				if ( false !== $found_index ) {
					$node_index = $found_index;
				}
				--$node_index;
				$node = $this->open_elements[ $node_index ];

				// If node is formatting element, then break.
				if ( $node === $formatting_element ) {
					break;
				}

				/*
				 * If inner loop counter is greater than 3 and node is in the list
				 * of active formatting elements, then remove node from the list of
				 * active formatting elements.
				 */
				$node_formatting_idx = array_search( $node, $this->active_formatting_elements, true );
				if ( $inner_loop_counter > 3 && false !== $node_formatting_idx ) {
					array_splice( $this->active_formatting_elements, $node_formatting_idx, 1 );
					// Keep the bookmark pointing at the same neighbours.
					if ( $node_formatting_idx < $bookmark ) {
						--$bookmark;
					}
					$node_formatting_idx = false;
				}

				/*
				 * If node is not in the list of active formatting elements, then remove
				 * node from the stack of open elements and continue.
				 */
				if ( false === $node_formatting_idx ) {
					array_splice( $this->open_elements, $node_index, 1 );
					continue;
				}

				/*
				 * Create an element for the token for which the element node was created,
				 * in the HTML namespace, with common ancestor as the intended parent.
				 *
				 * Replace the entry for node in the list of active formatting elements with an entry
				 * for the new element.
				 *
				 * Replace the entry for node in the stack of open elements with an entry for
				 * the new element.
				 *
				 * Let node be the new element.
				 */
				$new_node = new WP_HTML_Element( $node->tag, array() );
				$this->active_formatting_elements[ $node_formatting_idx ] = $new_node;
				$this->open_elements[ $node_index ]                       = $new_node;
				$node = $new_node;

				/*
				 * If last node is furthest block, then move the aforementioned bookmark to be
				 * immediately after the new node in the list of active formatting elements.
				 */
				if ( $last_node === $furthest_block ) {
					$bookmark = $node_formatting_idx + 1;
				}

				// Append last node to node.
				// @TODO

				// Set last node to node.
				$last_node = $node;
			}

			// Insert whatever last node ended up being in the previous step at the appropriate place
			// for inserting a node, but using common ancestor as the override target.
			// @TODO

			// Create an element for the token for which formatting element was created, in the HTML
			// namespace, with furthest block as the intended parent.
			$new_element = new WP_HTML_Element( $formatting_element->tag, array() );

			// Take all of the child nodes of furthest block and append them to the element created in
			// the last step.
			// @TODO

			// Append that new element to furthest block.
			// @TODO

			// Remove formatting element from the list of active formatting elements, and insert the new
			// element into the list of active formatting elements at the position of the aforementioned
			// bookmark.
			$formatting_element_idx = array_search( $formatting_element, $this->active_formatting_elements, true );
			if ( false !== $formatting_element_idx ) {
				array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 );
				if ( $formatting_element_idx < $bookmark ) {
					--$bookmark;
				}
			}
			array_splice( $this->active_formatting_elements, $bookmark, 0, array( $new_element ) );

			// Remove formatting element from the stack of open elements, and insert the new element into
			// the stack of open elements immediately below the position of furthest block in that stack.
			$formatting_elem_stack_index = array_search( $formatting_element, $this->open_elements, true );
			if ( false !== $formatting_elem_stack_index ) {
				array_splice( $this->open_elements, $formatting_elem_stack_index, 1 );
			}

			$furthest_block_idx = array_search( $furthest_block, $this->open_elements, true );
			array_splice( $this->open_elements, $furthest_block_idx + 1, 0, array( $new_element ) );
		}
	}

	/*
	@TODO Implement https://html.spec.whatwg.org/multipage/parsing.html#insert-a-foreign-element

	Let the adjusted insertion location be the appropriate place for inserting a node.

	Let element be the result of creating an element for the token in the given namespace, with the intended parent being the element in which the adjusted insertion location finds itself.
If it is possible to insert element at the adjusted insertion location, then:

	If the parser was not created as part of the HTML fragment parsing algorithm, then push a new element queue onto element's relevant agent's custom element reactions stack.

	Insert element at the adjusted insertion location.

	If the parser was not created as part of the HTML fragment parsing algorithm, then pop the element queue from element's relevant agent's custom element reactions stack, and invoke custom element reactions in that queue.

	If the adjusted insertion location cannot accept more elements, e.g. because it's a Document that already has an element child, then element is dropped on the floor.

	Push element onto the stack of open elements so that it is the new current node.

	Return element.

	*/
	private function insert_html_element( $node ) {
		if ( ! $node->is_closer ) {
			$this->insert_element( $node );
		}
		$this->inserted_tokens[] = $node;
	}

	/**
	 * Drops a token: releases its internal bookmark, if it has one.
	 *
	 * @param WP_HTML_Element $token Token to discard.
	 */
	private function ignore_token( $token ) {
		if ( $token->tag_processor_bookmark ) {
			$this->release_bookmark( $token->tag_processor_bookmark );
			$token->tag_processor_bookmark = null;
		}
		return;
	}

	/**
	 * Pushes a node onto the stack of open elements, making it the current node.
	 *
	 * @param WP_HTML_Element $node Node to push.
	 */
	private function insert_element( $node ) {
		$this->open_elements[] = $node;
	}

	private function parse_error() {
		// Noop for now
	}

	/**
	 * Pops open elements until an element with one of the given tag names
	 * has been popped (that element is popped too).
	 *
	 * Stops quietly when the stack runs empty without finding a match.
	 *
	 * @param string|string[] $tags Tag name or list of tag names.
	 */
	private function pop_until_tag_name( $tags ) {
		if ( ! is_array( $tags ) ) {
			$tags = array( $tags );
		}
		while ( ! empty( $this->open_elements ) ) {
			$popped = $this->pop_open_element();
			if ( in_array( $popped->tag, $tags, true ) ) {
				break;
			}
		}
	}

	/**
	 * Pops open elements up to and including the given node.
	 *
	 * Stops when the stack runs empty, so a node that is not on the stack
	 * cannot cause an endless loop.
	 *
	 * @param WP_HTML_Element $node Node to pop up to.
	 */
	private function pop_until_node( $node ) {
		do {
			$popped = $this->pop_open_element();
		} while ( null !== $popped && $popped !== $node );
	}

	/**
	 * Pops the current node and releases its internal bookmark, if any.
	 *
	 * @return WP_HTML_Element|null The popped node, or null when the stack was empty.
	 */
	private function pop_open_element() {
		$popped = array_pop( $this->open_elements );
		// Elements created by the processor itself may carry no bookmark.
		if ( null !== $popped && ! empty( $popped->tag_processor_bookmark ) ) {
			$this->release_bookmark( $popped->tag_processor_bookmark );
			$popped->tag_processor_bookmark = null;
		}
		return $popped;
	}

	/**
	 * Pops the current node for as long as it has an implied end tag.
	 *
	 * This must be a plain function: with a `yield` in the body PHP turns it
	 * into a generator, and since no caller iterates the result nothing
	 * would ever be popped.
	 *
	 * @param array|null $options Optional 'except_for' (tag names to keep) and 'thoroughly' flags.
	 */
	private function generate_implied_end_tags( $options = null ) {
		while ( $this->should_generate_implied_end_tags( $options ) ) {
			$this->pop_open_element();
		}
	}

	/**
	 * Returns the current node, i.e. the last entry of the stack of open elements.
	 *
	 * @return WP_HTML_Element|false False when the stack is empty.
	 */
	private function current_node() {
		return end( $this->open_elements );
	}

	/**
	 * Closes a P element: generates implied end tags except for P, then pops
	 * up to and including the nearest P.
	 */
	private function close_p_element() {
		$this->generate_implied_end_tags(
			array(
				'except_for' => array( 'P' ),
			)
		);
		// If the current node is not a p element, then this is a parse error.
		$current_node = $this->current_node();
		if ( ! $current_node || 'P' !== $current_node->tag ) {
			$this->parse_error();
		}
		$this->pop_until_tag_name( 'P' );
	}

	/**
	 * Tells whether the current node is one whose end tag is implied.
	 *
	 * @param array|null $options Optional 'except_for' (tag names to keep) and 'thoroughly' flags.
	 * @return bool
	 */
	private function should_generate_implied_end_tags( $options = null ) {
		$current_node = $this->current_node();
		if ( ! $current_node ) {
			return false;
		}

		$current_tag_name = $current_node->tag;
		if ( null !== $options && isset( $options['except_for'] ) && in_array( $current_tag_name, $options['except_for'], true ) ) {
			return false;
		}
		switch ( $current_tag_name ) {
			case 'DD':
			case 'DT':
			case 'LI':
			case 'OPTION':
			case 'OPTGROUP':
			case 'P':
			case 'RB':
			case 'RP':
			case 'RT':
			case 'RTC':
				return true;
		}

		$thoroughly = null !== $options && isset( $options['thoroughly'] ) && $options['thoroughly'];
		if ( $thoroughly ) {
			switch ( $current_tag_name ) {
				case 'TBODY':
				case 'TFOOT':
				case 'THEAD':
				case 'TD':
				case 'TH':
				case 'TR':
					return true;
			}
		}

		return false;
	}

	/**
	 * Pushes a node onto the list of active formatting elements.
	 *
	 * If three equivalent entries already exist after the last marker, the
	 * earliest of them is removed first.
	 *
	 * https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements
	 *
	 * @param WP_HTML_Element $node Node to push.
	 */
	private function push_active_formatting_element( $node ) {
		$count = 0;
		for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) {
			$formatting_element = $this->active_formatting_elements[ $i ];
			if ( $formatting_element->is_marker() ) {
				break;
			}
			// Compare against the list entry, not against the node itself.
			if ( ! $node->equivalent( $formatting_element ) ) {
				continue;
			}
			$count++;
			if ( $count === 3 ) {
				array_splice( $this->active_formatting_elements, $i, 1 );
				break;
			}
		}
		$this->active_formatting_elements[] = $node;
	}

	/**
	 * Reconstructs the active formatting elements.
	 *
	 * Every trailing entry that is neither a marker nor on the stack of open
	 * elements is replaced by a fresh element with the same tag and
	 * attributes, and that element is pushed onto the stack of open elements.
	 *
	 * @see https://html.spec.whatwg.org/multipage/parsing.html#reconstruct-the-active-formatting-elements
	 */
	private function reconstruct_active_formatting_elements() {
		if ( empty( $this->active_formatting_elements ) ) {
			return;
		}
		$last_index = count( $this->active_formatting_elements ) - 1;
		$last_entry = $this->active_formatting_elements[ $last_index ];
		if ( $last_entry->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) {
			return;
		}

		// Rewind: find the earliest entry of the trailing run that needs re-creating.
		$i = $last_index;
		while ( $i > 0 ) {
			$previous = $this->active_formatting_elements[ $i - 1 ];
			if ( $previous->is_marker() || in_array( $previous, $this->open_elements, true ) ) {
				break;
			}
			--$i;
		}

		// Advance/Create: re-create every entry from there through the last one.
		for ( ; $i <= $last_index; $i++ ) {
			$entry = $this->active_formatting_elements[ $i ];

			// @TODO: the new element is only tracked on the stack; no markup is produced for it.
			$new_element = new WP_HTML_Element( $entry->tag, $entry->attributes );
			$this->insert_element( $new_element );

			// Replace the entry for entry in the list with an entry for new element.
			$this->active_formatting_elements[ $i ] = $new_element;
		}
	}

	/**
	 * Removes entries from the end of the list of active formatting elements
	 * up to and including the last marker (or all entries when there is none).
	 */
	private function clear_active_formatting_elements_up_to_last_marker() {
		while ( ! empty( $this->active_formatting_elements ) ) {
			$entry = array_pop( $this->active_formatting_elements );
			if ( $entry->is_marker() ) {
				break;
			}
		}
	}

	/**
	 * Select scope: every element type except OPTGROUP and OPTION is a boundary.
	 *
	 * @param string|string[]|WP_HTML_Element $target_node Tag name(s) or element to look for.
	 * @return bool
	 */
	private function is_element_in_select_scope( $target_node ) {
		return $this->is_element_in_specific_scope(
			$target_node,
			array(
				'OPTGROUP',
				'OPTION',
			),
			array(
				'negative_match' => true,
			)
		);
	}

	/**
	 * Table scope: HTML, TABLE and TEMPLATE are the boundaries.
	 *
	 * @param string|string[]|WP_HTML_Element $target_node Tag name(s) or element to look for.
	 * @return bool
	 */
	private function is_element_in_table_scope( $target_node ) {
		return $this->is_element_in_specific_scope(
			$target_node,
			array(
				'HTML',
				'TABLE',
				'TEMPLATE',
			)
		);
	}

	/**
	 * Button scope: the default scope boundaries plus BUTTON.
	 *
	 * @param string|string[]|WP_HTML_Element $target_node Tag name(s) or element to look for.
	 * @return bool
	 */
	private function is_element_in_button_scope( $target_node ) {
		return $this->is_element_in_scope(
			$target_node,
			array(
				'BUTTON',
			)
		);
	}

	/**
	 * List item scope: the default scope boundaries plus OL and UL.
	 *
	 * @param string|string[]|WP_HTML_Element $target_node Tag name(s) or element to look for.
	 * @return bool
	 */
	private function is_element_in_list_item_scope( $target_node ) {
		return $this->is_element_in_scope(
			$target_node,
			array(
				'OL',
				'UL',
			)
		);
	}

	/**
	 * Default scope check, optionally with extra boundary element types.
	 *
	 * @param string|string[]|WP_HTML_Element $target_node         Tag name(s) or element to look for.
	 * @param string[]                        $additional_elements Extra boundary tag names (upper case).
	 * @return bool
	 */
	private function is_element_in_scope( $target_node, $additional_elements = array() ) {
		return $this->is_element_in_specific_scope(
			$target_node,
			array_merge(
				array(
					'APPLET',
					'CAPTION',
					'HTML',
					'TABLE',
					'TD',
					'TH',
					'MARQUEE',
					'OBJECT',
					'TEMPLATE',
				),
				$additional_elements
			)
		);
	}

	/**
	 * Walks the stack of open elements from the current node upwards and
	 * reports whether the target is reached before a scope boundary.
	 *
	 * Tag names on the stack are compared as upper-case strings, so the
	 * boundary list must be upper case too.
	 *
	 * https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
	 *
	 * @param string|string[]|WP_HTML_Element $target_node        Tag name, list of tag names, or a specific element.
	 * @param string[]                        $element_types_list Boundary tag names.
	 * @param array                           $options            With 'negative_match' set, every tag NOT in the
	 *                                                            list is a boundary instead.
	 * @return bool True when the target is in scope.
	 */
	private function is_element_in_specific_scope( $target_node, $element_types_list, $options = array() ) {
		$negative_match = ! empty( $options['negative_match'] );

		for ( $i = count( $this->open_elements ) - 1; $i >= 0; $i-- ) {
			$node = $this->open_elements[ $i ];

			if ( is_array( $target_node ) ) {
				$is_target = in_array( $node->tag, $target_node, true );
			} elseif ( is_string( $target_node ) ) {
				$is_target = $node->tag === $target_node;
			} else {
				$is_target = $node === $target_node;
			}
			if ( $is_target ) {
				return true;
			}

			// Hitting a boundary element before the target means "not in scope".
			$is_in_the_list = in_array( $node->tag, $element_types_list, true );
			$failure        = $negative_match ? ! $is_in_the_list : $is_in_the_list;
			if ( $failure ) {
				return false;
			}
		}

		// Ran off the top of the stack without meeting the target.
		return false;
	}

	/**
	 * Picks the insertion mode that matches the current stack of open elements.
	 *
	 * https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
	 */
	private function reset_insertion_mode() {
		$last = false;
		$node = end( $this->open_elements );

		while ( true ) {
			if ( count( $this->open_elements ) === 1 && $node === reset( $this->open_elements ) ) {
				$last = true;
				$node = $this->context_node;
			}

			if ( 'SELECT' === $node->tag ) {
				if ( ! $last ) {
					$ancestor = $node;
					while ( $ancestor !== $this->open_elements[0] ) {
						$index    = array_search( $ancestor, $this->open_elements, true );
						$ancestor = $this->open_elements[ $index - 1 ];
						if ( 'TEMPLATE' === $ancestor->tag ) {
							break;
						}

						if ( 'TABLE' === $ancestor->tag ) {
							$this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE;
							return;
						}
					}
				}

				$this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT;
				return;
			}

			switch ( $node->tag ) {
				case 'TD':
				case 'TH':
					if ( ! $last ) {
						$this->insertion_mode = WP_HTML_Insertion_Mode::IN_CELL;
						return;
					}
					break;
				case 'TR':
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_ROW;
					return;
				case 'TBODY':
				case 'THEAD':
				case 'TFOOT':
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_TABLE_BODY;
					return;
				case 'CAPTION':
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_CAPTION;
					return;
				case 'COLGROUP':
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_COLUMN_GROUP;
					return;
				case 'TABLE':
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_TABLE;
					return;
				case 'TEMPLATE':
					// TODO: implement the current template insertion mode
					$this->insertion_mode = 0;
					return;
				case 'HEAD':
					if ( ! $last ) {
						$this->insertion_mode = WP_HTML_Insertion_Mode::IN_HEAD;
						return;
					}
					break;
				case 'BODY':
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_BODY;
					return;
				case 'FRAMESET':
					$this->insertion_mode = WP_HTML_Insertion_Mode::IN_FRAMESET;
					return;
				case 'HTML':
					// TODO: implement the head element pointer
					$this->insertion_mode = WP_HTML_Insertion_Mode::BEFORE_HEAD;
					return;
				default:
					if ( $last ) {
						$this->insertion_mode = WP_HTML_Insertion_Mode::IN_BODY;
						return;
					}
			}

			// The context node is not on the stack, so there is nothing above it to visit.
			if ( $last ) {
				$this->insertion_mode = WP_HTML_Insertion_Mode::IN_BODY;
				return;
			}

			$index = array_search( $node, $this->open_elements, true );
			if ( false === $index || 0 === $index ) {
				$this->insertion_mode = WP_HTML_Insertion_Mode::IN_BODY;
				return;
			}
			$node = $this->open_elements[ $index - 1 ];
		}
	}


	private static function is_special_element( $tag_name, $except = null ) {
		if ( null !== $except && in_array( $tag_name, $except, true ) ) {
			return false;
		}

		switch ( $tag_name ) {
			case 'ADDRESS':
			case 'APPLET':
			case 'AREA':
			case 'ARTICLE':
			case 'ASIDE':
			case 'BASE':
			case 'BASEFONT':
			case 'BGSOUND':
			case 'BLOCKQUOTE':
			case 'BODY':
			case 'BR':
			case 'BUTTON':
			case 'CAPTION':
			case 'CENTER':
			case 'COL':
			case 'COLGROUP':
			case 'DD':
			case 'DETAILS':
			case 'DIR':
			case 'DIV':
			case 'DL':
			case 'DT':
			case 'EMBED':
			case 'FIELDSET':
			case 'FIGCAPTION':
			case 'FIGURE':
			case 'FOOTER':
			case 'FORM':
			case 'FRAME':
			case 'FRAMESET':
			case 'H1':
			case 'H2':
			case 'H3':
			case 'H4':
			case 'H5':
			case 'H6':
			case 'HEAD':
			case 'HEADER':
			case 'HGROUP':
			case 'HR':
			case 'HTML':
			case 'IFRAME':
			case 'IMG':
			case 'INPUT':
			case 'ISINDEX':
			case 'LI':
			case 'LINK':
			case 'LISTING':
			case 'MAIN':
			case 'MARQUEE':
			case 'MENU':
			case 'MENUITEM':
			case 'META':
			case 'NAV':
			case 'NOEMBED':
			case 'NOFRAMES':
			case 'NOSCRIPT':
			case 'OBJECT':
			case 'OL':
			case 'P':
			case 'PARAM':
			case 'PLAINTEXT':
			case 'PRE':
			case 'SCRIPT':
			case 'SECTION':
			case 'SELECT':
			case 'SOURCE':
			case 'STYLE':
			case 'SUMMARY':
			case 'TABLE':
+ case 'TBODY': + case 'TD': + case 'TEMPLATE': + case 'TEXTAREA': + case 'TFOOT': + case 'TH': + case 'THEAD': + case 'TITLE': + case 'TR': + case 'TRACK': + case 'UL': + case 'WBR': + case 'XMP': + return true; + default: + return false; + } + } + + private static function is_rcdata_element( $tag_name ) { + switch ( $tag_name ) { + case 'TITLE': + case 'TEXTAREA': + case 'STYLE': + case 'XMP': + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + case 'NOSCRIPT': + return true; + default: + return false; + } + } + + private static function is_formatting_element( $tag_name ) { + switch ( strtoupper( $tag_name ) ) { + case 'A': + case 'B': + case 'BIG': + case 'CODE': + case 'EM': + case 'FONT': + case 'I': + case 'NOBR': + case 'S': + case 'SMALL': + case 'STRIKE': + case 'STRONG': + case 'TT': + case 'U': + return true; + default: + return false; + } + } + +} + + +$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); +// The controller's schema is hardcoded, so tests would not be meaningful. +$p->parse_next(); + +// $this->tag_processor->next_tag( +// array( +// 'tag_closers' => 'visit', +// ) +// ); +// var_dump( $this->tag_processor->get_tag() ); +// var_dump( $this->tag_processor->is_tag_closer() ); +// $last_parent = end( $this->open_elements ); diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 31db41a3c86ad..68f6d213155d4 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -528,6 +528,18 @@ class WP_HTML_Tag_Processor { */ protected $lexical_updates = array(); + /** + * Attribute replacements to apply to input HTML document. + * + * Unlike more generic lexical updates, attribute updates are stored + * in an associative array, where the keys are (lowercase-normalized) + * attribute names, in order to avoid duplication. + * + * @since 6.3.0 + * @var WP_HTML_Text_Replacement[] + */ + private $attribute_updates = array(); + /** * Tracks and limits `seek()` calls to prevent accidental infinite loops. * @@ -1237,15 +1249,16 @@ private function skip_whitespace() { } /** - * Applies attribute updates and cleans up once a tag is fully parsed. + * Applies lexical updates and cleans up once a tag is fully parsed. 
* * @since 6.2.0 * * @return void */ private function after_tag() { - $this->class_name_updates_to_attributes_updates(); - $this->apply_attributes_updates(); + $this->class_name_updates_to_attribute_updates(); + $this->attribute_updates_to_lexical_updates(); + $this->apply_lexical_updates(); $this->tag_name_starts_at = null; $this->tag_name_length = null; $this->tag_ends_at = null; @@ -1254,17 +1267,17 @@ private function after_tag() { } /** - * Converts class name updates into tag attributes updates + * Converts class name updates into tag attribute updates * (they are accumulated in different data formats for performance). * - * @see $lexical_updates + * @see $attribute_updates * @see $classname_updates * * @since 6.2.0 * * @return void */ - private function class_name_updates_to_attributes_updates() { + private function class_name_updates_to_attribute_updates() { if ( count( $this->classname_updates ) === 0 ) { return; } @@ -1398,13 +1411,33 @@ private function class_name_updates_to_attributes_updates() { } /** - * Applies attribute updates to HTML document. + * Converts attribute updates into lexical updates. + * + * This method is only meant to run right before the attribute updates are applied. + * The behavior in all other cases is undefined. + * + * @return void + * @since 6.3.0 + * + * @see $attribute_updates + * @see $lexical_updates + */ + private function attribute_updates_to_lexical_updates() { + foreach ( $this->attribute_updates as $update ) { + $this->lexical_updates[] = $update; + } + $this->attribute_updates = array(); + } + + /** + * Applies lexical updates to HTML document. * * @since 6.2.0 + * @since 6.3.0 Invalidate any bookmarks whose targets are overwritten. * * @return void */ - private function apply_attributes_updates() { + private function apply_lexical_updates() { if ( ! 
count( $this->lexical_updates ) ) { return; } @@ -1431,7 +1464,7 @@ private function apply_attributes_updates() { * Adjust bookmark locations to account for how the text * replacements adjust offsets in the input document. */ - foreach ( $this->bookmarks as $bookmark ) { + foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { /* * Each lexical update which appears before the bookmark's endpoints * might shift the offsets for those endpoints. Loop through each change @@ -1442,20 +1475,22 @@ private function apply_attributes_updates() { $tail_delta = 0; foreach ( $this->lexical_updates as $diff ) { - $update_head = $bookmark->start >= $diff->start; - $update_tail = $bookmark->end >= $diff->start; - - if ( ! $update_head && ! $update_tail ) { + if ( $bookmark->start < $diff->start && $bookmark->end < $diff->start ) { break; } + if ( $bookmark->start >= $diff->start && $bookmark->end < $diff->end ) { + $this->release_bookmark( $bookmark_name ); + continue 2; + } + $delta = strlen( $diff->text ) - ( $diff->end - $diff->start ); - if ( $update_head ) { + if ( $bookmark->start >= $diff->start ) { $head_delta += $delta; } - if ( $update_tail ) { + if ( $bookmark->end >= $diff->end ) { $tail_delta += $delta; } } @@ -1467,6 +1502,18 @@ private function apply_attributes_updates() { $this->lexical_updates = array(); } + /** + * Checks whether a bookmark with the given name exists. + * + * @since 6.3.0 + * + * @param string $bookmark_name Name to identify a bookmark that potentially exists. + * @return bool Whether that bookmark exists. + */ + public function has_bookmark( $bookmark_name ) { + return array_key_exists( $bookmark_name, $this->bookmarks ); + } + /** * Move the internal cursor in the Tag Processor to a given bookmark's location. * @@ -1512,8 +1559,8 @@ public function seek( $bookmark_name ) { * * @since 6.2.0 * - * @param WP_HTML_Text_Replacement $a First attribute update. - * @param WP_HTML_Text_Replacement $b Second attribute update. 
+ * @param WP_HTML_Text_Replacement $a First lexical update. + * @param WP_HTML_Text_Replacement $b Second lexical update. * @return int Comparison value for string order. */ private static function sort_start_ascending( $a, $b ) { @@ -1549,11 +1596,11 @@ private static function sort_start_ascending( $a, $b ) { * @return string|boolean|null Value of enqueued update if present, otherwise false. */ private function get_enqueued_attribute_value( $comparable_name ) { - if ( ! isset( $this->lexical_updates[ $comparable_name ] ) ) { + if ( ! isset( $this->attribute_updates[ $comparable_name ] ) ) { return false; } - $enqueued_text = $this->lexical_updates[ $comparable_name ]->text; + $enqueued_text = $this->attribute_updates[ $comparable_name ]->text; // Removed attributes erase the entire span. if ( '' === $enqueued_text ) { @@ -1626,7 +1673,7 @@ public function get_attribute( $name ) { /* * For every attribute other than `class` it's possible to perform a quick check if - * there's an enqueued lexical update whose value takes priority over what's found in + * there's an enqueued attribute update whose value takes priority over what's found in * the input document. * * The `class` attribute is special though because of the exposed helpers `add_class` @@ -1636,7 +1683,7 @@ public function get_attribute( $name ) { * into an attribute value update. */ if ( 'class' === $name ) { - $this->class_name_updates_to_attributes_updates(); + $this->class_name_updates_to_attribute_updates(); } // Return any enqueued attribute value updates if they exist. @@ -1864,8 +1911,8 @@ public function set_attribute( $name, $value ) { * * Result:
*/ - $existing_attribute = $this->attributes[ $comparable_name ]; - $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( + $existing_attribute = $this->attributes[ $comparable_name ]; + $this->attribute_updates[ $name ] = new WP_HTML_Text_Replacement( $existing_attribute->start, $existing_attribute->end, $updated_attribute @@ -1882,7 +1929,7 @@ public function set_attribute( $name, $value ) { * * Result:
*/ - $this->lexical_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( + $this->attribute_updates[ $comparable_name ] = new WP_HTML_Text_Replacement( $this->tag_name_starts_at + $this->tag_name_length, $this->tag_name_starts_at + $this->tag_name_length, ' ' . $updated_attribute @@ -1940,8 +1987,8 @@ public function remove_attribute( $name ) { * and when that attribute wasn't originally present. */ if ( ! isset( $this->attributes[ $name ] ) ) { - if ( isset( $this->lexical_updates[ $name ] ) ) { - unset( $this->lexical_updates[ $name ] ); + if ( isset( $this->attribute_updates[ $name ] ) ) { + unset( $this->attribute_updates[ $name ] ); } return false; } @@ -1957,7 +2004,7 @@ public function remove_attribute( $name ) { * * Result:
*/ - $this->lexical_updates[ $name ] = new WP_HTML_Text_Replacement( + $this->attribute_updates[ $name ] = new WP_HTML_Text_Replacement( $this->attributes[ $name ]->start, $this->attributes[ $name ]->end, '' @@ -2026,7 +2073,10 @@ public function __toString() { * @return string The processed HTML. */ public function get_updated_html() { - $requires_no_updating = 0 === count( $this->classname_updates ) && 0 === count( $this->lexical_updates ); + $requires_no_updating = + 0 === count( $this->classname_updates ) && + 0 === count( $this->attribute_updates ) && + 0 === count( $this->lexical_updates ); /* * When there is nothing more to update and nothing has already been @@ -2057,8 +2107,9 @@ public function get_updated_html() { * * Note: `apply_attributes_updates()` modifies `$this->output_buffer`. */ - $this->class_name_updates_to_attributes_updates(); - $this->apply_attributes_updates(); + $this->class_name_updates_to_attribute_updates(); + $this->attribute_updates_to_lexical_updates(); + $this->apply_lexical_updates(); /* * 2. Replace the original HTML with the now-updated HTML so that it's possible to @@ -2261,4 +2312,4 @@ private function matches() { return true; } -} +} \ No newline at end of file diff --git a/src/wp-settings.php b/src/wp-settings.php index a11b07ca28d07..ef5c6abc4355e 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -238,6 +238,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-span.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-processor.php'; require ABSPATH . WPINC . '/class-wp-http.php'; require ABSPATH . WPINC . '/class-wp-http-streams.php'; require ABSPATH . WPINC . 
'/class-wp-http-curl.php'; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php new file mode 100644 index 0000000000000..1f1bf02237b39 --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -0,0 +1,22 @@ +LoremIpsum

Dolor
Sit' ); + // The controller's schema is hardcoded, so tests would not be meaningful. + $p->next_tag_in_body_insertion_mode(); + } + +} From 0bdd4f6994429b1733558f92af60330d6a4b0f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 10:48:04 +0100 Subject: [PATCH 02/14] Emit text tokens --- .../html-api/class-wp-html-processor.php | 189 ++++++++++++++---- .../html-api/class-wp-html-tag-processor.php | 8 +- 2 files changed, 150 insertions(+), 47 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index da0c95738b2a8..5a8a89588a7f0 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,22 +11,68 @@ function esc_attr( $text ) { } } -// Could be just WP_HTML_Node actually -class WP_HTML_Element { - const MARKER = -1; +class WP_HTML_Token { + const MARKER = 1; + const TAG = 2; + const TEXT = 3; + + public $type; + + // For tag tokens public $tag; public $attributes; public $is_closer; public $is_opener; - public $tag_processor_bookmark; - public function __construct( $tag, $attributes = null, $is_opener = true ) { - $this->tag = $tag; - $this->attributes = $attributes; - $this->is_opener = $is_opener; - $this->is_closer = ! $is_opener; + public $bookmark; + + // For text tokens + public $value; + + static public function marker() { + return new WP_HTML_Token( self::MARKER ); + } + + static public function tag( $tag, $attributes = null, $is_opener = true, $bookmark = null ) { + $token = new WP_HTML_Token( self::TAG, $tag ); + $token->tag = $tag; + $token->attributes = $attributes; + $token->is_opener = $is_opener; + $token->is_closer = ! 
$is_opener; + $token->bookmark = $bookmark; + return $token; + } + + static public function text( $text ) { + $token = new WP_HTML_Token( self::TEXT ); + $token->value = $text; + return $token; + } + + public function __construct( $type ) { + $this->type = $type; } - public function equivalent( WP_HTML_Element $other ) { + public function __toString() { + switch ( $this->type ) { + case self::MARKER: + return 'MARKER'; + case self::TAG: + return sprintf( + '<%s%s%s>', + $this->is_closer ? '/' : '', + $this->tag, + $this->attributes ? ' ' . implode( ' ', $this->attributes ) : '' + ); + case self::TEXT: + return $this->value; + } + } + + public function equivalent( WP_HTML_Token $other ) { + if ( ! $this->tag || ! $other->tag ) { + throw new Exception( 'Cannot compare non-tag tokens' ); + } + if ( $this->is_closer !== $other->is_closer ) { return false; } @@ -50,7 +96,15 @@ public function equivalent( WP_HTML_Element $other ) { } public function is_marker() { - return self::MARKER === $this->tag; + return self::MARKER === $this->type; + } + + public function is_tag() { + return self::TAG === $this->type; + } + + public function is_text() { + return self::TEXT === $this->type; } } @@ -78,13 +132,12 @@ class WP_HTML_Insertion_Mode { */ class WP_HTML_Processor extends WP_HTML_Tag_Processor { - private $tag_processor; /** - * @var WP_HTML_Element[] + * @var WP_HTML_Token[] */ private $open_elements = array(); /** - * @var WP_HTML_Element[] + * @var WP_HTML_Token[] */ private $active_formatting_elements = array(); private $root_node = null; @@ -92,6 +145,20 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $original_insertion_mode = null; private $insertion_mode = null; + /* + * WP_HTML_Tag_Processor skips over text nodes and only + * processes tags. + * + * WP_HTML_Processor needs to process text nodes as well. 
+ * + * Whenever the tag processor skips over text to move to + * the next tag, the next_token() method emits that text + * as a token and stores the tag in $buffered_tag to be + * returned the next time. + */ + private $buffered_tag = null; + + private $last_token = null; private $inserted_tokens = array(); private $head_pointer; @@ -99,14 +166,22 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function __construct( $html ) { parent::__construct( $html ); - $this->root_node = new WP_HTML_Element( 'HTML' ); - $this->context_node = new WP_HTML_Element( 'DOCUMENT' ); + $this->root_node = WP_HTML_Token::tag( 'HTML' ); + $this->context_node = WP_HTML_Token::tag( 'DOCUMENT' ); $this->open_elements = array( $this->root_node ); $this->reset_insertion_mode(); } - public function parse_next() { - return $this->next_tag_in_body_insertion_mode(); + public function main() { + for ($i = 0; $i < 10; $i++) { + $token = $this->next_token(); + if(!$token) { + break; + } + echo "TOKEN: $token\n"; + $processed_token = $this->process_in_body_insertion_mode($token); + $this->last_token = $processed_token; + } // @TODO: // switch($this->insertion_mode) { // case WP_HTML_Insertion_Mode::INITIAL: @@ -154,9 +229,11 @@ public function parse_next() { // } } - public function next_tag_in_body_insertion_mode() { - $token = $this->next_token(); - if ( $token->is_opener ) { + public function process_in_body_insertion_mode(WP_HTML_Token $token) { + if ( $token->is_text() ) { + // ? + } + else if ( $token->is_opener ) { // Should we care? // if(self::is_rcdata_element($token->tag)) { // $this->original_insertion_mode = $this->insertion_mode; @@ -304,8 +381,7 @@ public function next_tag_in_body_insertion_mode() { if ( $active_a ) { $this->parse_error(); - // @TODO: - // Run the adoption agency algorithm with the tag name "a". 
+ $this->adoption_agency_algorithm( $token ); } $this->reconstruct_active_formatting_elements(); @@ -342,7 +418,7 @@ public function next_tag_in_body_insertion_mode() { case 'OBJECT': $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); - $this->active_formatting_elements[] = new WP_HTML_Element( WP_HTML_Element::MARKER ); + $this->active_formatting_elements[] = WP_HTML_Token::marker(); break; case 'TABLE': $this->insert_element( $token ); @@ -515,7 +591,7 @@ public function next_tag_in_body_insertion_mode() { if ( ! $this->is_element_in_button_scope( 'P' ) ) { // Parse error, insert an HTML element for a "p" start tag token with no attributes. $this->parse_error(); - $this->insert_element( new WP_HTML_Element( 'P', array() ) ); + $this->insert_element( WP_HTML_Token::tag( 'P' ) ); } $this->close_p_element(); break; @@ -609,29 +685,56 @@ public function next_tag_in_body_insertion_mode() { break; } } + return $token; } private $element_bookmark_idx = 0; private function next_token() { + if($this->buffered_tag){ + $next_tag = $this->buffered_tag; + $this->buffered_tag = null; + return $next_tag; + } + if ( ! $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { return false; } - $consumed_node = new WP_HTML_Element( + $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $next_tag = WP_HTML_Token::tag( $this->get_tag(), array(), - ! $this->is_tag_closer() + ! $this->is_tag_closer(), + $bookmark ); - $consumed_node->tag_processor_bookmark = $this->set_bookmark( - '__internal_' . ( $this->element_bookmark_idx++ ) - ); + /* + * If any text was found between the last tag and this one, + * save the next tag for later and return the text token. 
+ */ + $last = $this->last_token; + if ( + $last + && $last->is_tag() + && $last->bookmark + && $this->has_bookmark($last->bookmark) + ) { + $this->buffered_tag = $next_tag; + + $text_start = $this->bookmarks[$last->bookmark]->end + 1; + $text_end = $this->bookmarks[$bookmark]->start; + if ($text_start < $text_end) { + $text = substr($this->html, $text_start, $text_end - $text_start); + return WP_HTML_Token::text($text); + } + } - return $consumed_node; + return $next_tag; } const ANY_OTHER_END_TAG = 1; - private function adoption_agency_algorithm( WP_HTML_Element $token ) { + private function adoption_agency_algorithm( WP_HTML_Token $token ) { $subject = $token->tag; if ( $this->current_node()->tag === $subject @@ -786,7 +889,7 @@ private function adoption_agency_algorithm( WP_HTML_Element $token ) { * * Let node be the new element. */ - $new_node = new WP_HTML_Element( $node->tag, array() ); + $new_node = WP_HTML_Token::tag( $node->tag ); $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); $this->active_formatting_elements[ $node_formatting_idx ] = $new_node; @@ -815,7 +918,7 @@ private function adoption_agency_algorithm( WP_HTML_Element $token ) { // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. - $new_element = new WP_HTML_Element( $formatting_element->tag, array() ); + $new_element = WP_HTML_Token::tag( $formatting_element->tag ); // Take all of the child nodes of furthest block and append them to the element created in // the last step. 
@@ -871,9 +974,9 @@ private function insert_html_element( $node ) { } private function ignore_token( $token ) { - if ( $token->tag_processor_bookmark ) { - $this->release_bookmark( $token->tag_processor_bookmark ); - $token->tag_processor_bookmark = null; + if ( $token->bookmark ) { + $this->release_bookmark( $token->bookmark ); + $token->bookmark = null; } return; } @@ -903,9 +1006,9 @@ private function pop_until_node( $node ) { private function pop_open_element() { $popped = array_pop( $this->open_elements ); - if ( $popped->tag_processor_bookmark ) { - $this->release_bookmark( $popped->tag_processor_bookmark ); - $popped->tag_processor_bookmark = null; + if ( $popped->bookmark ) { + $this->release_bookmark( $popped->bookmark ); + $popped->bookmark = null; } return $popped; } @@ -1020,7 +1123,7 @@ private function reconstruct_active_formatting_elements() { // @TODO: // Create: Insert an HTML element for the token for which the element entry // was created, to obtain new element. - $new_element = new WP_HTML_Element( $entry->tag, $entry->attributes ); + $new_element = WP_HTML_Token::tag( $entry->tag, $entry->attributes ); // Replace the entry for entry in the list with an entry for new element. $index = array_search( $entry, $this->active_formatting_elements, true ); @@ -1362,9 +1465,9 @@ private static function is_formatting_element( $tag_name ) { } -$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); +$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); // The controller's schema is hardcoded, so tests would not be meaningful. -$p->parse_next(); +$p->main(); // $this->tag_processor->next_tag( // array( diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 68f6d213155d4..94a06fea79072 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -274,7 +274,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var string */ - private $html; + public $html; /** * The last query passed to next_tag(). @@ -343,7 +343,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var int */ - private $bytes_already_parsed = 0; + protected $bytes_already_parsed = 0; /** * How many bytes from the input HTML document have already been @@ -406,7 +406,7 @@ class WP_HTML_Tag_Processor { * @since 6.2.0 * @var int|null */ - private $tag_ends_at; + protected $tag_ends_at; /** * Whether the current tag is an opening tag, e.g.
, or a closing tag, e.g.
. @@ -734,7 +734,7 @@ public function set_bookmark( $name ) { } $this->bookmarks[ $name ] = new WP_HTML_Span( - $this->tag_name_starts_at - 1, + $this->tag_name_starts_at - ($this->is_closing_tag ? 2 : 1), $this->tag_ends_at ); From 9026d1c3f78a532a8b04e42349e7fea3f70ad369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 10:49:18 +0100 Subject: [PATCH 03/14] WP_HTML_Tag_Processor: Use the correct starting position when bookmarking a tag closer This commit marks the start of a bookmark one byte before the tag name start for tag openers, and two bytes before the tag name for tag closers. Setting a bookmark on a tag should set its "start" position before the opening "<", e.g.: ```
Testing a Bookmark ----------------^ ``` The current calculation assumes this is always one byte to the left from $tag_name_starts_at. However, in tag closers that index points to a solidus symbol "/": ```
Testing a Bookmark ----------------------------^ ``` The bookmark should therefore start two bytes before the tag name: ```
Testing a Bookmark ---------------------------^ ``` --- .../html-api/class-wp-html-tag-processor.php | 4 ++-- .../html-api/wpHtmlTagProcessor-bookmark.php | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 31db41a3c86ad..aa52dcb37b283 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -722,7 +722,7 @@ public function set_bookmark( $name ) { } $this->bookmarks[ $name ] = new WP_HTML_Span( - $this->tag_name_starts_at - 1, + $this->tag_name_starts_at - ( $this->is_closing_tag ? 2 : 1 ), $this->tag_ends_at ); @@ -1504,7 +1504,7 @@ public function seek( $bookmark_name ) { $this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start; $this->bytes_already_copied = $this->bytes_already_parsed; $this->output_buffer = substr( $this->html, 0, $this->bytes_already_copied ); - return $this->next_tag(); + return $this->next_tag( array( 'tag_closers' => 'visit' ) ); } /** diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php index 04a6ae590cd7d..69a9695d1fd59 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor-bookmark.php @@ -63,6 +63,28 @@ public function test_seek() { ); } + /** + * @ticket 56299 + * + * @covers WP_HTML_Tag_Processor::seek + */ + public function test_seeks_to_tag_closer_bookmark() { + $p = new WP_HTML_Tag_Processor( '
First
Second' ); + $p->next_tag( array( 'tag_closers' => 'visit' ) ); + $p->set_bookmark( 'first' ); + $p->next_tag( array( 'tag_closers' => 'visit' ) ); + $p->set_bookmark( 'second' ); + + $p->seek( 'first' ); + $p->seek( 'second' ); + + $this->assertSame( + 'DIV', + $p->get_tag(), + 'Did not seek to the intended bookmark location' + ); + } + /** * WP_HTML_Tag_Processor used to test for the diffs affecting * the adjusted bookmark position while simultaneously adjusting From ff9505b3a00f88b9430d99da5f070011ee7f2b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 11:27:45 +0100 Subject: [PATCH 04/14] Consume HTML text nodes as tokens --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 5a8a89588a7f0..8616d9e4736fe 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -179,8 +179,9 @@ public function main() { break; } echo "TOKEN: $token\n"; - $processed_token = $this->process_in_body_insertion_mode($token); - $this->last_token = $processed_token; + $this->last_token = $token; + // $processed_token = $this->process_in_body_insertion_mode($token); + // $this->last_token = $processed_token; } // @TODO: // switch($this->insertion_mode) { @@ -720,11 +721,10 @@ private function next_token() { && $last->bookmark && $this->has_bookmark($last->bookmark) ) { - $this->buffered_tag = $next_tag; - $text_start = $this->bookmarks[$last->bookmark]->end + 1; $text_end = $this->bookmarks[$bookmark]->start; if ($text_start < $text_end) { + $this->buffered_tag = $next_tag; $text = substr($this->html, $text_start, $text_end - $text_start); return WP_HTML_Token::text($text); } From afbfdc5ac251fe4cdc119c76a935fae29081618b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: 
Wed, 22 Feb 2023 14:03:56 +0100 Subject: [PATCH 05/14] Implement DOM insertion --- .../html-api/class-wp-html-processor.php | 543 ++++++++++++------ 1 file changed, 357 insertions(+), 186 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 8616d9e4736fe..d2ac45a14ccb4 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,6 +11,15 @@ function esc_attr( $text ) { } } +function dbg( $message, $indent = 0 ) { + $show_debug = true; + // $show_debug = false; + if( $show_debug ) { + $indent = str_repeat( ' ', $indent * 2 ); + echo $indent . $message . "\n"; + } +} + class WP_HTML_Token { const MARKER = 1; const TAG = 2; @@ -33,7 +42,7 @@ static public function marker() { } static public function tag( $tag, $attributes = null, $is_opener = true, $bookmark = null ) { - $token = new WP_HTML_Token( self::TAG, $tag ); + $token = new WP_HTML_Token( self::TAG ); $token->tag = $tag; $token->attributes = $attributes; $token->is_opener = $is_opener; @@ -108,6 +117,46 @@ public function is_text() { } } +class WP_HTML_Node { + public $parent; + public $children = array(); + public $token; + public $depth = 1; + + // For the adoption agency algorithm: + public $intended_parent = null; + + public function __construct( WP_HTML_Token $token ) { + $this->token = $token; + } + + public function append_child( WP_HTML_Node $node ) { + if($node->parent) { + $node->parent->remove($node); + } + $node->parent = $this; + $this->children[] = $node; + $node->depth = $this->depth + 1; + } + + public function remove( WP_HTML_Node $node ) { + $index = array_search( $node, $this->children, true ); + if ( false !== $index ) { + unset( $this->children[ $index ] ); + } + } + + public function __toString() { + $out = ''; + $indent = str_repeat( ' ', $this->depth ); + $out .= $indent . $this->token . 
"\n"; + foreach ( $this->children as $child ) { + $out .= $child; + } + return $out; + } +} + class WP_HTML_Insertion_Mode { const INITIAL = 'INITIAL'; @@ -133,11 +182,11 @@ class WP_HTML_Insertion_Mode { class WP_HTML_Processor extends WP_HTML_Tag_Processor { /** - * @var WP_HTML_Token[] + * @var WP_HTML_Node[] */ private $open_elements = array(); /** - * @var WP_HTML_Token[] + * @var WP_HTML_Node[] */ private $active_formatting_elements = array(); private $root_node = null; @@ -166,23 +215,24 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { public function __construct( $html ) { parent::__construct( $html ); - $this->root_node = WP_HTML_Token::tag( 'HTML' ); - $this->context_node = WP_HTML_Token::tag( 'DOCUMENT' ); + $this->root_node = new WP_HTML_Node(WP_HTML_Token::tag( 'HTML' )); + $this->context_node = new WP_HTML_Node(WP_HTML_Token::tag( 'DOCUMENT' )); $this->open_elements = array( $this->root_node ); $this->reset_insertion_mode(); } - public function main() { - for ($i = 0; $i < 10; $i++) { - $token = $this->next_token(); - if(!$token) { - break; - } - echo "TOKEN: $token\n"; + public function parse() { + echo("HTML before main loop:\n"); + echo($this->html); + echo("\n\n"); + while ($token = $this->next_token()) { $this->last_token = $token; - // $processed_token = $this->process_in_body_insertion_mode($token); - // $this->last_token = $processed_token; + $processed_token = $this->process_in_body_insertion_mode($token); + $this->last_token = $processed_token; } + echo("\n"); + echo("DOM after main loop:\n"); + echo($this->root_node.''); // @TODO: // switch($this->insertion_mode) { // case WP_HTML_Insertion_Mode::INITIAL: @@ -232,7 +282,10 @@ public function main() { public function process_in_body_insertion_mode(WP_HTML_Token $token) { if ( $token->is_text() ) { - // ? + dbg( "Found text node '$token'" ); + dbg( "Inserting text to current node " . 
$this->current_node()->token->tag, 1 ); + $this->reconstruct_active_formatting_elements(); + $this->insert_text( $token ); } else if ( $token->is_opener ) { // Should we care? @@ -268,6 +321,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { // Ignore special rules for 'PRE' and 'LISTING' case 'PRE': case 'LISTING': + dbg( "Found {$token->tag} tag opener" ); if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } @@ -283,7 +337,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - if ( in_array( $this->current_node()->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + if ( in_array( $this->current_node()->token->tag, array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->pop_open_element(); } $this->insert_element( $token ); @@ -303,7 +357,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $i = count( $this->open_elements ) - 1; while ( true ) { $node = $this->open_elements[ $i ]; - if ( $node->tag === 'LI' ) { + if ( $node->token->tag === 'LI' ) { $this->generate_implied_end_tags( array( 'except_for' => array( 'LI' ), @@ -311,7 +365,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); $this->pop_until_tag_name( 'LI' ); break; - } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + } elseif ( self::is_special_element( $node->token->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; } else { --$i; @@ -329,7 +383,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $i = count( $this->open_elements ) - 1; while ( true ) { $node = $this->open_elements[ $i ]; - if ( $node->tag === 'DD' ) { + if ( $node->token->tag === 'DD' ) { $this->generate_implied_end_tags( array( 'except_for' => array( 'DD' ), @@ -337,7 +391,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); 
$this->pop_until_tag_name( 'DD' ); break; - } elseif ( $node->tag === 'DT' ) { + } elseif ( $node->token->tag === 'DT' ) { $this->generate_implied_end_tags( array( 'except_for' => array( 'DT' ), @@ -345,7 +399,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); $this->pop_until_tag_name( 'DT' ); break; - } elseif ( self::is_special_element( $node->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { + } elseif ( self::is_special_element( $node->token->tag, array( 'ADDRESS', 'DIV', 'P' ) ) ) { break; } else { --$i; @@ -371,11 +425,11 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'A': $active_a = null; for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; --$i ) { - $elem = $this->active_formatting_elements[ $i ]; - if ( $elem->tag === 'A' ) { - $active_a = $elem; + $node = $this->active_formatting_elements[ $i ]; + if ( $node->token->tag === 'A' ) { + $active_a = $node; break; - } elseif ( $elem->is_marker() ) { + } elseif ( $node->token->is_marker() ) { break; } } @@ -400,9 +454,10 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'STRONG': case 'TT': case 'U': + dbg( "Found {$token->tag} tag opener" ); $this->reconstruct_active_formatting_elements(); - $this->push_active_formatting_element( $token ); - $this->insert_element( $token ); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); break; case 'NOBR': $this->reconstruct_active_formatting_elements(); @@ -411,8 +466,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->adoption_agency_algorithm( $token ); $this->reconstruct_active_formatting_elements(); } - $this->insert_element( $token ); - $this->push_active_formatting_element( $token ); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); break; case 'APPLET': case 'MARQUEE': @@ -466,7 +521,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token 
$token) { $this->reconstruct_active_formatting_elements(); // @TODO: Follow the generic raw text element parsing algorithm. throw new Exception( 'XMP not implemented yet' ); - break; case 'IFRAME': case 'NOEMBED': case 'NOSCRIPT': @@ -561,7 +615,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'MENU': case 'NAV': case 'OL': - case 'P': + case 'PRE': case 'SECTION': case 'SUMMARY': case 'UL': @@ -589,6 +643,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->form_pointer = null; break; case 'P': + dbg( "Found {$token->tag} tag closer" ); if ( ! $this->is_element_in_button_scope( 'P' ) ) { // Parse error, insert an HTML element for a "p" start tag token with no attributes. $this->parse_error(); @@ -642,7 +697,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'STRONG': case 'TT': case 'U': - $this->parse_error(); + dbg( "Found {$token->tag} tag closer" ); $this->adoption_agency_algorithm( $token ); break; @@ -655,7 +710,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { return $this->next_tag(); } $this->generate_implied_end_tags(); - if ( $this->current_node()->tag !== $token->tag ) { + if ( $this->current_node()->token->tag !== $token->tag ) { $this->parse_error(); } $this->pop_until_tag_name( $token->tag ); @@ -667,7 +722,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $i = count( $this->open_elements ) - 1; while ( true ) { $node = $this->open_elements[ $i ]; - if ( $node->tag === $token->tag ) { + if ( $node->token->tag === $token->tag ) { $this->generate_implied_end_tags( array( 'except_for' => array( $token->tag ), @@ -675,7 +730,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { ); $this->pop_until_node( $node ); break; - } elseif ( $this->is_special_element( $node->tag ) ) { + } elseif ( $this->is_special_element( $node->token->tag ) ) { $this->ignore_token( $token ); $this->parse_error(); 
return $this->next_tag(); @@ -697,19 +752,21 @@ private function next_token() { return $next_tag; } - if ( ! $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { - return false; + $next_tag = false; + if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { + $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); + $this->set_bookmark($bookmark); + $next_tag = WP_HTML_Token::tag( + $this->get_tag(), + array(), + ! $this->is_tag_closer(), + $bookmark + ); + $text_end = $this->bookmarks[$bookmark]->start; + } else { + $text_end = strlen($this->html); } - $bookmark = '__internal_' . ( $this->element_bookmark_idx++ ); - $this->set_bookmark($bookmark); - $next_tag = WP_HTML_Token::tag( - $this->get_tag(), - array(), - ! $this->is_tag_closer(), - $bookmark - ); - /* * If any text was found between the last tag and this one, * save the next tag for later and return the text token. @@ -722,7 +779,6 @@ private function next_token() { && $this->has_bookmark($last->bookmark) ) { $text_start = $this->bookmarks[$last->bookmark]->end + 1; - $text_end = $this->bookmarks[$bookmark]->start; if ($text_start < $text_end) { $this->buffered_tag = $next_tag; $text = substr($this->html, $text_start, $text_end - $text_start); @@ -735,12 +791,15 @@ private function next_token() { const ANY_OTHER_END_TAG = 1; private function adoption_agency_algorithm( WP_HTML_Token $token ) { + dbg("Adoption Agency Algorithm", 1); $subject = $token->tag; + $current_node = $this->current_node(); if ( - $this->current_node()->tag === $subject - && ! in_array( $subject, $this->active_formatting_elements, true ) + $current_node->token->tag === $subject + && ! 
in_array( $current_node, $this->active_formatting_elements, true ) ) { $this->pop_open_element(); + dbg("Skipping AAA: current node is \$subject ($subject) and is not AFE", 2); return; } @@ -757,18 +816,21 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { $formatting_element_idx = -1; for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { $candidate = $this->active_formatting_elements[ $i ]; - if ( $candidate->is_marker() ) { + if ( $candidate->token->is_marker() ) { break; } - if ( $candidate->tag === $subject ) { + if ( $candidate->token->tag === $subject ) { $formatting_element = $candidate; $formatting_element_idx = $i; break; } } + dbg("AAA: Formatting element = {$formatting_element->token->tag}", 2); + // If there is no such element, then abort these steps and instead act as // described in the "any other end tag" entry below. if ( null === $formatting_element ) { + dbg("Skipping AAA: no formatting element found", 2); return self::ANY_OTHER_END_TAG; } @@ -777,12 +839,15 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { if ( ! in_array( $formatting_element, $this->open_elements, true ) ) { array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); $this->parse_error(); + dbg("Skipping AAA: formatting element is not in the stack of open elements", 2); return; } // If formatting element is not in scope, then this is a parse error; return - if ( ! $this->is_element_in_scope( $formatting_element->tag ) ) { + if ( ! 
$this->is_element_in_scope( $formatting_element ) ) { $this->parse_error(); + dbg("Skipping AAA: formatting element {$formatting_element->token->tag} is not in scope", 2); + $this->print_open_elements('Open elements: ', 2); return; } @@ -803,9 +868,8 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { if ( $node === $formatting_element ) { break; } - if ( $this->is_special_element( $node->tag ) ) { + if ( $this->is_special_element( $node->token->tag ) ) { $furthest_block = $node; - break; } } @@ -816,14 +880,22 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { if ( null === $furthest_block ) { $this->pop_until_node( $formatting_element ); array_splice( $this->active_formatting_elements, $formatting_element_idx, 1 ); + dbg("Skipping AAA: no furthest block found", 2); return; } + dbg("AAA: Furthest block = {$furthest_block->token->tag}", 2); + // Let common ancestor be the element immediately above formatting element // in the stack of open elements. $formatting_elem_stack_index = array_search( $formatting_element, $this->open_elements, true ); $common_ancestor = $this->open_elements[ $formatting_elem_stack_index - 1 ]; + dbg("AAA: Common ancestor = {$common_ancestor->token->tag}", 2); + + $this->print_open_elements('AAA: Open elements: ', 2); + $this->print_rafe_formats('AAA: Formatting elements: ', 2); + // Let a bookmark note the position of formatting element in the list of // active formatting elements relative to the elements on either side of it // in the list. 
@@ -833,8 +905,8 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { $node = $last_node = $furthest_block; $node_open_elements_index = array_search( $node, $this->open_elements, true ); - $prev_node_open_elements_index = -1; - $inner_loop_counter = 0; + $prev_open_element_index = false; + $inner_loop_counter = 0; while ( true ) { $inner_loop_counter++; @@ -846,15 +918,21 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { */ $node_open_elements_index = array_search( $node, $this->open_elements, true ); if ( false === $node_open_elements_index ) { - $node_open_elements_index = $prev_node_open_elements_index; - return; + if ( false === $prev_open_element_index ) { + throw new Exception( 'Unexpected error in AAA algorithm – cannot find node.' ); + } + $node_open_elements_index = $prev_open_element_index; } --$node_open_elements_index; - $node = $this->open_elements[ $node_open_elements_index ]; - $prev_node_open_elements_index = $node_open_elements_index; + if( $node_open_elements_index < 0 ) { + throw new Exception( 'Unexpected error in AAA algorithm – node is not in the stack of open elements.' ); + } + $node = $this->open_elements[ $node_open_elements_index ]; + $prev_open_element_index = $node_open_elements_index; // If node is formatting element, then break. if ( $node === $formatting_element ) { + dbg("AAA: Inner loop break – node is formatting element", 3); break; } @@ -873,28 +951,34 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { * node from the stack of open elements and continue. */ if ( ! in_array( $node, $this->active_formatting_elements, true ) ) { + dbg("AAA: Inner loop – removing node from the stack of open elements", 3); array_splice( $this->open_elements, $node_open_elements_index, 1 ); - continue; } /* * Create an element for the token for which the element node was created, * in the HTML namespace, with common ancestor as the intended parent. 
- * + */ + $new_node = $this->create_element_for_token( $node->token ); + $new_node->intended_parent = $common_ancestor; + + /* * Replace the entry for node in the list of active formatting elements with an entry * for the new element. - * - * Replace the entry for node in the stack of open elements with an entry for - * the new element. - * - * Let node be the new element. */ - $new_node = WP_HTML_Token::tag( $node->tag ); $node_formatting_idx = array_search( $node, $this->active_formatting_elements, true ); $this->active_formatting_elements[ $node_formatting_idx ] = $new_node; - $node_open_elements_index = array_search( $node, $this->open_elements, true ); - $this->open_elements[ $node_open_elements_index ] = $new_node; + /* + * Replace the entry for node in the stack of open elements with an entry for + * the new element. + */ + $idx = array_search( $node, $this->open_elements, true ); + $this->open_elements[ $idx ] = $new_node; + + /* + * Let node be the new element. + */ $node = $new_node; /* @@ -906,7 +990,8 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { } // Append last node to node. - // @TODO + dbg("AAA: Appending {$last_node->token->tag} to {$node->token->tag}", 3); + $node->append_child( $last_node ); // Set last node to node. $last_node = $node; @@ -914,63 +999,77 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { // Insert whatever last node ended up being in the previous step at the appropriate place // for inserting a node, but using common ancestor as the override target. - // @TODO + $this->insert_element( $last_node, $common_ancestor ); // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. 
- $new_element = WP_HTML_Token::tag( $formatting_element->tag ); + $new_element = $this->create_element_for_token( $formatting_element->token ); + $new_element->intended_parent = $furthest_block; // Take all of the child nodes of furthest block and append them to the element created in // the last step. - // @TODO + foreach ($furthest_block->children as $child) { + $new_element->append_child( $child ); + } // Append that new element to furthest block. - // @TODO + $furthest_block->append_child( $new_element ); - // Remove formatting element from the list of active formatting elements, and insert the new - // element into the list of active formatting elements at the position of the aforementioned - // bookmark. - $formatting_element_idx = array_search( $formatting_element, $this->active_formatting_elements, true ); - array_splice( $this->active_formatting_elements, $formatting_element_idx, 1, array( $new_element ) ); + // Remove formatting element from the list of active formatting elements + $idx = array_search( $formatting_element, $this->active_formatting_elements, true ); + array_splice( $this->active_formatting_elements, $idx, 1 ); + + // Insert the new element into the list of active formatting elements at the + // position of the aforementioned bookmark. array_splice( $this->active_formatting_elements, $bookmark, 0, array( $new_element ) ); - // Remove formatting element from the stack of open elements, and insert the new element into - // the stack of open elements immediately below the position of furthest block in that stack. 
- $formatting_element_idx = array_search( $formatting_element, $this->active_formatting_elements, true ); - array_splice( $this->active_formatting_elements, $formatting_element_idx, 1, array( $new_element ) ); - - $furthest_block_idx = array_search( $furthest_block, $this->open_elements, true ); - array_splice( $this->open_elements, $furthest_block_idx + 1, 0, array( $new_element ) ); + // Remove formatting element from the stack of open elements + $idx = array_search( $formatting_element, $this->open_elements, true ); + array_splice( $this->open_elements, $idx, 1 ); + + // Insert the new element into the stack of open elements immediately below the + // position of furthest block in that stack. + $idx = array_search( $furthest_block, $this->open_elements, true ); + array_splice( $this->open_elements, $idx + 1, 0, array( $new_element ) ); } } - /* - @TODO Implement https://html.spec.whatwg.org/multipage/parsing.html#insert-a-foreign-element - - Let the adjusted insertion location be the appropriate place for inserting a node. - - Let element be the result of creating an element for the token in the given namespace, with the intended parent being the element in which the adjusted insertion location finds itself. - - If it is possible to insert element at the adjusted insertion location, then: - - If the parser was not created as part of the HTML fragment parsing algorithm, then push a new element queue onto element's relevant agent's custom element reactions stack. - - Insert element at the adjusted insertion location. - - If the parser was not created as part of the HTML fragment parsing algorithm, then pop the element queue from element's relevant agent's custom element reactions stack, and invoke custom element reactions in that queue. 
+ private function insert_element( $token_or_node, $override_target = null ) { + // Create element for a token + // Skip reset algorithm for now + // Skip form-association for now + if($token_or_node instanceof WP_HTML_Token) { + $node = $this->create_element_for_token($token_or_node); + } else { + $node = $token_or_node; + } - If the adjusted insertion location cannot accept more elements, e.g. because it's a Document that already has an element child, then element is dropped on the floor. + $target = $override_target ?: $this->current_node(); - Push element onto the stack of open elements so that it is the new current node. + // Appropriate place for inserting a node: + // For now skip foster parenting and always use the + // location after the last child of the target + $target->append_child($node); + array_push($this->open_elements, $node); + dbg("inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); + return $node; + } - Return element. + private function create_element_for_token( WP_HTML_Token $token ) { + $node = new WP_HTML_Node($token); + return $node; + } - */ - private function insert_html_element( $node ) { - if ( ! $node->is_closer ) { - $this->insert_element( $node ); + private function insert_text( WP_HTML_Token $token ) { + $target = $this->current_node(); + if(count($target->children)){ + $last_child = end($target->children); + if ( $last_child && $last_child->token->is_text() ) { + $last_child->token->value .= $token->value; + return; + } } - $this->inserted_tokens[] = $node; + $target->append_child(new WP_HTML_Node($token)); } private function ignore_token( $token ) { @@ -981,10 +1080,6 @@ private function ignore_token( $token ) { return; } - private function insert_element( $node ) { - $this->open_elements[] = $node; - } - private function parse_error() { // Noop for now } @@ -993,9 +1088,12 @@ private function pop_until_tag_name( $tags ) { if ( ! is_array( $tags ) ) { $tags = array( $tags ); } - while ( ! 
in_array( $this->current_node()->tag, $tags ) ) { + dbg( "Popping until tag names: " . implode(', ', $tags), 1 ); + $this->print_open_elements( "Open elements before: " ); + while ( ! in_array( $this->current_node()->token->tag, $tags ) ) { $this->pop_open_element(); } + $this->print_open_elements( "Open elements after: " ); } private function pop_until_node( $node ) { @@ -1006,9 +1104,9 @@ private function pop_until_node( $node ) { private function pop_open_element() { $popped = array_pop( $this->open_elements ); - if ( $popped->bookmark ) { - $this->release_bookmark( $popped->bookmark ); - $popped->bookmark = null; + if ( $popped->token->bookmark ) { + $this->release_bookmark( $popped->token->bookmark ); + $popped->token->bookmark = null; } return $popped; } @@ -1024,20 +1122,21 @@ private function current_node() { } private function close_p_element() { + dbg( "close_p_element" ); $this->generate_implied_end_tags( array( 'except_for' => array( 'P' ), ) ); // If the current node is not a p element, then this is a parse error. 
- if ( $this->current_node()->tag !== 'P' ) { + if ( $this->current_node()->token->tag !== 'P' ) { $this->parse_error(); } $this->pop_until_tag_name( 'P' ); } private function should_generate_implied_end_tags( $options = null ) { - $current_tag_name = $this->current_node()->tag; + $current_tag_name = $this->current_node()->token->tag; if ( null !== $options && isset( $options['except_for'] ) && in_array( $current_tag_name, $options['except_for'] ) ) { return false; } @@ -1074,14 +1173,14 @@ private function should_generate_implied_end_tags( $options = null ) { /** * https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements */ - private function push_active_formatting_element( $node ) { + private function push_active_formatting_element( WP_HTML_Node $node ) { $count = 0; for ( $i = count( $this->active_formatting_elements ) - 1; $i >= 0; $i-- ) { $formatting_element = $this->active_formatting_elements[ $i ]; - if ( $formatting_element->is_marker() ) { + if ( $formatting_element->token->is_marker() ) { break; } - if ( ! $node->equivalent( $node ) ) { + if ( ! $formatting_element->token->equivalent( $node->token ) ) { continue; } $count++; @@ -1093,63 +1192,100 @@ private function push_active_formatting_element( $node ) { $this->active_formatting_elements[] = $node; } + private function print_rafe_formats($msg, $indent=1) { + $formats = array_map( function( $node ) { + return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); + }, $this->active_formatting_elements); + dbg( "$msg " . implode(', ', $formats), $indent ); + } + + private function print_open_elements($msg, $indent=1) { + $elems = array_map(function ($node) { + return $node->token->tag; + }, $this->open_elements); + dbg( "$msg " . 
implode(', ', $elems), $indent ); + } + private function reconstruct_active_formatting_elements() { + $this->print_rafe_formats('RAFE: before'); if ( empty( $this->active_formatting_elements ) ) { + dbg( "Skipping RAFE: empty list", 1 ); return; } - $i = count( $this->active_formatting_elements ) - 1; - $last_entry = $this->active_formatting_elements[ $i ]; - if ( $last_entry->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { + $entry_idx = count( $this->active_formatting_elements ) - 1; + $last_entry = $this->active_formatting_elements[ $entry_idx ]; + if ( $last_entry->token->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { + dbg( "Skipping RAFE: marker or open element", 1 ); return; } + + // Let entry be the last (most recently added) element in the list of active formatting elements. $entry = $last_entry; + + $is_rewinding = true; while ( true ) { - if ( $i <= 0 ) { - break; - } - --$i; - $entry = $this->active_formatting_elements[ $i ]; - if ( $entry->is_marker() || in_array( $entry, $this->open_elements, true ) ) { - break; - } - } - while ( true ) { - ++$i; - $entry = $this->active_formatting_elements[ $i ]; - if ( $entry === $last_entry ) { - break; + if ( $is_rewinding ) { + // Rewind: + /* + * If there are no entries before entry in the list of active formatting elements, + * then jump to the step labeled create. + */ + if ( $entry_idx === 0 ) { + $is_rewinding = false; + } else { + // Let entry be the entry one earlier than entry in the list of active formatting elements. + $entry = $this->active_formatting_elements[ --$entry_idx ]; + + // If entry is neither a marker nor an element that is also in the stack of open elements, + // go to the step labeled rewind. + if ( ! $entry->token->is_marker() && ! in_array( $entry, $this->open_elements, true ) ) { + continue; + } + } + } else { + // Advance: + // Let entry be the element one later than entry in the list of active formatting elements. 
+ $entry = $this->active_formatting_elements[ ++$entry_idx ]; } - // @TODO: - // Create: Insert an HTML element for the token for which the element entry - // was created, to obtain new element. - $new_element = WP_HTML_Token::tag( $entry->tag, $entry->attributes ); + // Create: Insert an HTML element for the token for which the element entry was created, + // to obtain new element. + $new_element = $this->insert_element( $entry->token ); // Replace the entry for entry in the list with an entry for new element. - $index = array_search( $entry, $this->active_formatting_elements, true ); + $this->active_formatting_elements[ $entry_idx ] = $new_element; - $this->active_formatting_elements[ $index ] = $new_element; - if ( $index === count( $this->active_formatting_elements ) - 1 ) { + // If the entry for new element in the list of active formatting elements is not the last entry + // in the list, return to the step labeled advance. + if ( $entry_idx === count( $this->active_formatting_elements ) - 1 ) { break; } } + $this->print_rafe_formats('RAFE: after'); } private function clear_active_formatting_elements_up_to_last_marker() { while ( ! 
empty( $this->active_formatting_elements ) ) { $entry = array_pop( $this->active_formatting_elements ); - if ( $entry->is_marker() ) { + if ( $entry->token->is_marker() ) { break; } } } + /** + * The stack of open elements is said to have a particular element in + * select scope when it has that element in the specific scope consisting + * of all element types except the following: + * * optgroup + * * option + */ private function is_element_in_select_scope( $target_node ) { return $this->is_element_in_specific_scope( $target_node, array( - 'optgroup', - 'option', + 'OPTGROUP', + 'OPTION', ), array( 'negative_match' => 'true', @@ -1161,9 +1297,9 @@ private function is_element_in_table_scope( $target_node ) { return $this->is_element_in_specific_scope( $target_node, array( - 'html', - 'table', - 'template', + 'HTML', + 'TABLE', + 'TEMPLATE', ) ); } @@ -1172,7 +1308,7 @@ private function is_element_in_button_scope( $target_node ) { return $this->is_element_in_scope( $target_node, array( - 'button', + 'BUTTON', ) ); } @@ -1181,9 +1317,9 @@ private function is_element_in_list_item_scope( $target_node ) { return $this->is_element_in_scope( $target_node, array( - 'li', - 'dd', - 'dt', + 'LI', + 'DD', + 'DT', ) ); } @@ -1193,39 +1329,60 @@ private function is_element_in_scope( $target_node, $additional_elements = array $target_node, array_merge( array( - 'applet', - 'caption', - 'html', - 'table', - 'td', - 'th', - 'marquee', - 'object', - 'template', + 'APPLET', + 'CAPTION', + 'HTML', + 'TABLE', + 'TD', + 'TH', + 'MARQUEE', + 'OBJECT', + 'TEMPLATE', ), $additional_elements ) ); } - /** + /* * https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements */ private function is_element_in_specific_scope( $target_node, $element_types_list, $options = array() ) { $negative_match = isset( $options['negative_match'] ) ? 
$options['negative_match'] : false; - $i = count( $this->open_elements ) - 1; - while ( true ) { - $node = $this->open_elements[ $i ]; - if ( $node === $target_node ) { + /** + * The stack of open elements is said to have an element target node in a + * specific scope consisting of a list of element types list when the following + * algorithm terminates in a match state: + */ + $i = count( $this->open_elements ) - 1; + // 1. Initialize node to be the current node (the bottommost node of the stack). + $node = $this->open_elements[ $i ]; + + while ( true ) { + // 2. If node is the target node, terminate in a match state. + if ( $node === $target_node || $node->token->tag === $target_node ) { return true; } - $is_in_the_list = in_array( $node->tag, $element_types_list, true ); - $failure = $negative_match ? $is_in_the_list : ! $is_in_the_list; + // 3. Otherwise, if node is one of the element types in list, terminate in a failure state. + $failure = in_array( $node->token->tag, $element_types_list, true ); + + // Some elements say: + // > If has that element in the specific scope consisting of all element types + // > except the following + // So we need to invert the result. + if($negative_match) { + $failure = ! $failure; + } if ( $failure ) { return false; } + + // Otherwise, set node to the previous entry in the stack of open elements and + // return to step 2. (This will never fail, since the loop will always terminate + // in the previous step if the top of the stack — an html element — is reached.) + $node = $this->open_elements[ --$i ]; } } @@ -1242,7 +1399,7 @@ private function reset_insertion_mode() { $node = $this->context_node; } - if ( $node->tag === 'select' ) { + if ( $node->token->tag === 'select' ) { if ( $last ) { break; } @@ -1269,7 +1426,7 @@ private function reset_insertion_mode() { return; } - switch ( $node->tag ) { + switch ( $node->token->tag ) { case 'TD': case 'TH': if ( ! 
$last ) { @@ -1465,15 +1622,29 @@ private static function is_formatting_element( $tag_name ) { } -$p = new WP_HTML_Processor( '

LoremIpsum

DolorSit' ); -// The controller's schema is hardcoded, so tests would not be meaningful. -$p->main(); - -// $this->tag_processor->next_tag( -// array( -// 'tag_closers' => 'visit', -// ) -// ); -// var_dump( $this->tag_processor->get_tag() ); -// var_dump( $this->tag_processor->is_tag_closer() ); -// $last_parent = end( $this->open_elements ); +// $p = new WP_HTML_Processor( '

12345

' ); +// $p->parse(); +/* +Should output: + p + ├─ #text: 1 + ├─ b + │ ├─ #text: 2 + │ └─ i + │ └─ #text: 3 + ├─ i + │ └─ #text: 4 + └─ #text: 5 +*/ + +$p = new WP_HTML_Processor( '1

23

' ); +$p->parse(); +/* +Should output: +b +└─ #text: 1 +p +├─ b +│ └─ #text: 2 +└─ #text: 3 +*/ From 9d31cb7fdca4d9d40cf74755ac04dc6eee67977b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 17:11:58 +0100 Subject: [PATCH 06/14] Fix a bug in the adoption agency algorithm --- .../html-api/class-wp-html-processor.php | 100 +++++++++++------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index d2ac45a14ccb4..68fc9028b18a8 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -12,8 +12,8 @@ function esc_attr( $text ) { } function dbg( $message, $indent = 0 ) { - $show_debug = true; - // $show_debug = false; + // $show_debug = true; + $show_debug = false; if( $show_debug ) { $indent = str_repeat( ' ', $indent * 2 ); echo $indent . $message . "\n"; @@ -21,9 +21,9 @@ function dbg( $message, $indent = 0 ) { } class WP_HTML_Token { - const MARKER = 1; - const TAG = 2; - const TEXT = 3; + const MARKER = 'MARKER'; + const TAG = 'TAG'; + const TEXT = 'TEXT'; public $type; @@ -67,13 +67,13 @@ public function __toString() { return 'MARKER'; case self::TAG: return sprintf( - '<%s%s%s>', + '%s%s%s', $this->is_closer ? '/' : '', $this->tag, $this->attributes ? ' ' . implode( ' ', $this->attributes ) : '' ); case self::TEXT: - return $this->value; + return '#text: ' . 
trim($this->value); } } @@ -118,16 +118,32 @@ public function is_text() { } class WP_HTML_Node { + /** + * @var WP_HTML_Node + */ public $parent; + /** + * @var WP_HTML_Node[] + */ public $children = array(); + /** + * @var string + */ public $token; public $depth = 1; // For the adoption agency algorithm: public $intended_parent = null; + private $type; + private $value; + private $tag; public function __construct( WP_HTML_Token $token ) { $this->token = $token; + // Just for debugging convenience – remove eventually + $this->type = $token->type; + $this->value = $token->value; + $this->tag = $token->tag; } public function append_child( WP_HTML_Node $node ) { @@ -147,16 +163,26 @@ public function remove( WP_HTML_Node $node ) { } public function __toString() { - $out = ''; - $indent = str_repeat( ' ', $this->depth ); - $out .= $indent . $this->token . "\n"; - foreach ( $this->children as $child ) { - $out .= $child; - } - return $out; + return wp_html_node_to_ascii_tree( $this ); } } + +function wp_html_node_to_ascii_tree( WP_HTML_Node $node, $prefix = '', $is_last = false ) { + $ascii_tree = $prefix . ( $node->parent ? ($is_last ? '└─ ' : '├─ ') : ' ' ) . $node->token . "\n"; + + // Recursively process the children of the current node + $children = array_values($node->children); + $num_children = count( $children ); + for ( $i = 0; $i < $num_children; $i++ ) { + $child_prefix = $prefix . ( $i == $num_children - 1 ? 
' ' : ' ' ); + $is_last_child = ( $i == $num_children - 1 ); + $ascii_tree .= wp_html_node_to_ascii_tree( $children[ $i ], $child_prefix, $is_last_child ); + } + + return $ascii_tree; +} + class WP_HTML_Insertion_Mode { const INITIAL = 'INITIAL'; @@ -224,7 +250,7 @@ public function __construct( $html ) { public function parse() { echo("HTML before main loop:\n"); echo($this->html); - echo("\n\n"); + echo("\n"); while ($token = $this->next_token()) { $this->last_token = $token; $processed_token = $this->process_in_body_insertion_mode($token); @@ -233,6 +259,7 @@ public function parse() { echo("\n"); echo("DOM after main loop:\n"); echo($this->root_node.''); + echo "\n\n"; // @TODO: // switch($this->insertion_mode) { // case WP_HTML_Insertion_Mode::INITIAL: @@ -999,7 +1026,7 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { // Insert whatever last node ended up being in the previous step at the appropriate place // for inserting a node, but using common ancestor as the override target. - $this->insert_element( $last_node, $common_ancestor ); + $this->insert_node( $last_node, $common_ancestor ); // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. 
@@ -1034,30 +1061,28 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { } } - private function insert_element( $token_or_node, $override_target = null ) { + private function insert_element( WP_HTML_Token $token, $override_target = null ) { // Create element for a token // Skip reset algorithm for now // Skip form-association for now - if($token_or_node instanceof WP_HTML_Token) { - $node = $this->create_element_for_token($token_or_node); - } else { - $node = $token_or_node; - } + $node = $this->create_element_for_token($token); + $this->insert_node($node, $override_target); + array_push($this->open_elements, $node); + return $node; + } + private function insert_node( WP_HTML_Node $node, $override_target = null ) { $target = $override_target ?: $this->current_node(); // Appropriate place for inserting a node: // For now skip foster parenting and always use the // location after the last child of the target $target->append_child($node); - array_push($this->open_elements, $node); - dbg("inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); - return $node; + dbg("Inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); } private function create_element_for_token( WP_HTML_Token $token ) { - $node = new WP_HTML_Node($token); - return $node; + return new WP_HTML_Node($token); } private function insert_text( WP_HTML_Token $token ) { @@ -1622,10 +1647,10 @@ private static function is_formatting_element( $tag_name ) { } -// $p = new WP_HTML_Processor( '

12345

' ); -// $p->parse(); +$p = new WP_HTML_Processor( '

12345

' ); +$p->parse(); /* -Should output: +Outputs: p ├─ #text: 1 ├─ b @@ -1640,11 +1665,12 @@ private static function is_formatting_element( $tag_name ) { $p = new WP_HTML_Processor( '1

23

' ); $p->parse(); /* -Should output: -b -└─ #text: 1 -p -├─ b -│ └─ #text: 2 -└─ #text: 3 +Outputs the correct result: + HTML + ├─ B + └─ #text: 1 + └─ P + ├─ B + └─ #text: 2 + └─ #text: 3 */ From eeea95ae21c74ad6c97c07997a112ef0a3838c5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 17:30:23 +0100 Subject: [PATCH 07/14] Correctly cose the p tags --- .../html-api/class-wp-html-processor.php | 93 +++++++++++++++---- 1 file changed, 76 insertions(+), 17 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 68fc9028b18a8..28f27d0d9040d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -66,11 +66,17 @@ public function __toString() { case self::MARKER: return 'MARKER'; case self::TAG: + $attributes = ''; + if($this->attributes) { + foreach( $this->attributes as $name => $value ) { + $attributes .= ' ' . $name . '="' . esc_attr( $value ) . '"'; + } + } return sprintf( '%s%s%s', $this->is_closer ? '/' : '', $this->tag, - $this->attributes ? ' ' . implode( ' ', $this->attributes ) : '' + $attributes ); case self::TEXT: return '#text: ' . trim($this->value); @@ -127,7 +133,7 @@ class WP_HTML_Node { */ public $children = array(); /** - * @var string + * @var WP_HTML_Token */ public $token; public $depth = 1; @@ -315,6 +321,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->insert_text( $token ); } else if ( $token->is_opener ) { + dbg( "Found {$token->tag} tag opener" ); // Should we care? 
// if(self::is_rcdata_element($token->tag)) { // $this->original_insertion_mode = $this->insertion_mode; @@ -348,7 +355,10 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { // Ignore special rules for 'PRE' and 'LISTING' case 'PRE': case 'LISTING': - dbg( "Found {$token->tag} tag opener" ); + /* + * If the stack of open elements has a p element in button scope, + * then close a p element. + */ if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } @@ -481,7 +491,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'STRONG': case 'TT': case 'U': - dbg( "Found {$token->tag} tag opener" ); $this->reconstruct_active_formatting_elements(); $node = $this->insert_element( $token ); $this->push_active_formatting_element( $node ); @@ -621,6 +630,7 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { break; } } else { + dbg( "Found {$token->tag} tag closer" ); switch ( $token->tag ) { case 'ADDRESS': case 'ARTICLE': @@ -670,12 +680,16 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->form_pointer = null; break; case 'P': - dbg( "Found {$token->tag} tag closer" ); + /* + * If the stack of open elements does not have a p element in button scope, + * then this is a parse error; insert an HTML element for a "p" start tag + * token with no attributes. + */ if ( ! $this->is_element_in_button_scope( 'P' ) ) { - // Parse error, insert an HTML element for a "p" start tag token with no attributes. $this->parse_error(); $this->insert_element( WP_HTML_Token::tag( 'P' ) ); } + // Close a p element. $this->close_p_element(); break; case 'LI': @@ -783,9 +797,16 @@ private function next_token() { if ( $this->next_tag( array( 'tag_closers' => 'visit' ) ) ) { $bookmark = '__internal_' . 
( $this->element_bookmark_idx++ ); $this->set_bookmark($bookmark); + $attributes = array(); + $attrs = $this->get_attribute_names_with_prefix(''); + if ($attrs) { + foreach ($attrs as $name) { + $attributes[$name] = $this->get_attribute($name); + } + } $next_tag = WP_HTML_Token::tag( $this->get_tag(), - array(), + $attributes, ! $this->is_tag_closer(), $bookmark ); @@ -852,7 +873,6 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { break; } } - dbg("AAA: Formatting element = {$formatting_element->token->tag}", 2); // If there is no such element, then abort these steps and instead act as // described in the "any other end tag" entry below. @@ -860,6 +880,7 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { dbg("Skipping AAA: no formatting element found", 2); return self::ANY_OTHER_END_TAG; } + dbg("AAA: Formatting element = {$formatting_element->token->tag}", 2); // If formatting element is not in the stack of open elements, then this is // a parse error; remove the element from the list, and return. @@ -921,7 +942,7 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { dbg("AAA: Common ancestor = {$common_ancestor->token->tag}", 2); $this->print_open_elements('AAA: Open elements: ', 2); - $this->print_rafe_formats('AAA: Formatting elements: ', 2); + $this->print_active_formatting_elements('AAA: Formatting elements: ', 2); // Let a bookmark note the position of formatting element in the list of // active formatting elements relative to the elements on either side of it @@ -1115,9 +1136,9 @@ private function pop_until_tag_name( $tags ) { } dbg( "Popping until tag names: " . implode(', ', $tags), 1 ); $this->print_open_elements( "Open elements before: " ); - while ( ! 
in_array( $this->current_node()->token->tag, $tags ) ) { - $this->pop_open_element(); - } + do { + $popped = $this->pop_open_element(); + } while (!in_array($popped->token->tag, $tags)); $this->print_open_elements( "Open elements after: " ); } @@ -1217,7 +1238,7 @@ private function push_active_formatting_element( WP_HTML_Node $node ) { $this->active_formatting_elements[] = $node; } - private function print_rafe_formats($msg, $indent=1) { + private function print_active_formatting_elements($msg, $indent=1) { $formats = array_map( function( $node ) { return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); }, $this->active_formatting_elements); @@ -1232,15 +1253,15 @@ private function print_open_elements($msg, $indent=1) { } private function reconstruct_active_formatting_elements() { - $this->print_rafe_formats('RAFE: before'); + $this->print_active_formatting_elements('AFE: before'); if ( empty( $this->active_formatting_elements ) ) { - dbg( "Skipping RAFE: empty list", 1 ); + dbg( "Skipping AFE: empty list", 1 ); return; } $entry_idx = count( $this->active_formatting_elements ) - 1; $last_entry = $this->active_formatting_elements[ $entry_idx ]; if ( $last_entry->token->is_marker() || in_array( $last_entry, $this->open_elements, true ) ) { - dbg( "Skipping RAFE: marker or open element", 1 ); + dbg( "Skipping AFE: marker or open element", 1 ); return; } @@ -1286,7 +1307,7 @@ private function reconstruct_active_formatting_elements() { break; } } - $this->print_rafe_formats('RAFE: after'); + $this->print_active_formatting_elements('AFE: after'); } private function clear_active_formatting_elements_up_to_last_marker() { @@ -1674,3 +1695,41 @@ private static function is_formatting_element( $tag_name ) { └─ #text: 2 └─ #text: 3 */ + + +$p = new WP_HTML_Processor( '

X +

X +

X +

X' ); +$p->parse(); +/* +DOM after main loop: + HTML + ├─ P + └─ B class="x" + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ #text: X + ├─ P + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ #text: X + ├─ P + └─ B class="x" + └─ B + └─ B class="x" + └─ B class="x" + └─ B + └─ B + └─ B class="x" + └─ B + └─ #text: X + └─ P + └─ #text: X +*/ From ddf2c7311218318b03bbc7df1382ea02538ed9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:45:54 +0100 Subject: [PATCH 08/14] Simplify HTML Processor --- .../html-api/class-wp-html-processor.php | 392 ++++++------------ .../html-api/class-wp-html-tag-processor.php | 2 +- 2 files changed, 124 insertions(+), 270 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 28f27d0d9040d..c24eee4a430f2 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,10 +11,9 @@ function esc_attr( $text ) { } } +define('HTML_DEBUG_MODE', false); function dbg( $message, $indent = 0 ) { - // $show_debug = true; - $show_debug = false; - if( $show_debug ) { + if( HTML_DEBUG_MODE ) { $indent = str_repeat( ' ', $indent * 2 ); echo $indent . $message . 
"\n"; } @@ -138,8 +137,6 @@ class WP_HTML_Node { public $token; public $depth = 1; - // For the adoption agency algorithm: - public $intended_parent = null; private $type; private $value; private $tag; @@ -223,8 +220,6 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $active_formatting_elements = array(); private $root_node = null; private $context_node = null; - private $original_insertion_mode = null; - private $insertion_mode = null; /* * WP_HTML_Tag_Processor skips over text nodes and only @@ -242,78 +237,57 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $last_token = null; private $inserted_tokens = array(); - private $head_pointer; - private $form_pointer; + const MAX_BOOKMARKS = 1000000; public function __construct( $html ) { parent::__construct( $html ); $this->root_node = new WP_HTML_Node(WP_HTML_Token::tag( 'HTML' )); $this->context_node = new WP_HTML_Node(WP_HTML_Token::tag( 'DOCUMENT' )); $this->open_elements = array( $this->root_node ); - $this->reset_insertion_mode(); } public function parse() { echo("HTML before main loop:\n"); echo($this->html); echo("\n"); - while ($token = $this->next_token()) { - $this->last_token = $token; - $processed_token = $this->process_in_body_insertion_mode($token); - $this->last_token = $processed_token; + while ($this->process_next_token()) { + // ... twiddle thumbs ... 
} echo("\n"); echo("DOM after main loop:\n"); echo($this->root_node.''); echo "\n\n"; - // @TODO: - // switch($this->insertion_mode) { - // case WP_HTML_Insertion_Mode::INITIAL: - // $this->next_tag_in_initial_mode(); - // break; - // case WP_HTML_Insertion_Mode::BEFORE_HEAD: - // $this->next_tag_in_before_head_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_HEAD: - // $this->next_tag_in_head_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_BODY: - // $this->next_tag_in_body_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_TABLE: - // $this->next_tag_in_table_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_TABLE_BODY: - // $this->next_tag_in_table_body_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_ROW: - // $this->next_tag_in_row_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_CELL: - // $this->next_tag_in_cell_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_SELECT: - // $this->next_tag_in_select_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE: - // $this->next_tag_in_select_in_table_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_CAPTION: - // $this->next_tag_in_caption_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_COLUMN_GROUP: - // $this->next_tag_in_column_group_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::IN_FRAMESET: - // $this->next_tag_in_frameset_insertion_mode(); - // break; - // case WP_HTML_Insertion_Mode::TEXT: - // $this->next_tag_in_text_insertion_mode(); - // break; + + echo "Mem peak usage:" . memory_get_peak_usage(true) . 
"\n"; + } + + private function process_next_token() { + $token = $this->next_token(); + if(!$token){ + return false; + } + $this->last_token = $token; + $processed_token = $this->process_token($token); + $this->last_token = $processed_token; + return $processed_token; + } + + private function ignore_token( $ignored_token ) { + // if ( $ignored_token->bookmark ) { + // // $this->release_bookmark( $ignored_token->bookmark ); + // // $ignored_token->bookmark = null; // } + + $token = $this->next_token(); + if(!$token){ + return false; + } + $processed_token = $this->process_token($token); + $this->last_token = $processed_token; + return $processed_token; } - public function process_in_body_insertion_mode(WP_HTML_Token $token) { + public function process_token(WP_HTML_Token $token) { if ( $token->is_text() ) { dbg( "Found text node '$token'" ); dbg( "Inserting text to current node " . $this->current_node()->token->tag, 1 ); @@ -322,11 +296,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { } else if ( $token->is_opener ) { dbg( "Found {$token->tag} tag opener" ); - // Should we care? 
- // if(self::is_rcdata_element($token->tag)) { - // $this->original_insertion_mode = $this->insertion_mode; - // $this->insertion_mode = WP_HTML_Insertion_Mode::TEXT; - // } switch ( $token->tag ) { case 'ADDRESS': case 'ARTICLE': @@ -380,14 +349,9 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->insert_element( $token ); break; case 'FORM': - if ( $this->form_pointer ) { - $this->ignore_token( $token ); - return $this->next_tag(); - } if ( $this->is_element_in_button_scope( 'P' ) ) { $this->close_p_element(); } - $this->form_pointer = $token; $this->insert_element( $token ); break; case 'LI': @@ -449,8 +413,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { } $this->insert_element( $token ); break; - case 'PLAINTEXT': - throw new Exception( 'PLAINTEXT not implemented yet' ); case 'BUTTON': if ( $this->is_element_in_button_scope( 'BUTTON' ) ) { $this->generate_implied_end_tags(); @@ -514,7 +476,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { break; case 'TABLE': $this->insert_element( $token ); - $this->insertion_mode = WP_HTML_Insertion_Mode::IN_TABLE; break; case 'AREA': case 'BR': @@ -525,7 +486,6 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); $this->pop_open_element(); - // @TODO: Acknowledge the token's self-closing flag, if it is set. break; case 'PARAM': case 'SOURCE': @@ -540,45 +500,12 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->insert_element( $token ); $this->pop_open_element(); break; - case 'IMAGE': - $this->parse_error(); - // Change the tag name to "img" and reprocess the token. 
- throw new Exception( 'IMAGE not implemented yet' ); case 'TEXTAREA': $this->insert_element( $token ); - $this->original_insertion_mode = $this->insertion_mode; - $this->insertion_mode = WP_HTML_Insertion_Mode::TEXT; break; - - case 'XMP': - if ( $this->is_element_in_button_scope( 'P' ) ) { - $this->close_p_element(); - } - $this->reconstruct_active_formatting_elements(); - // @TODO: Follow the generic raw text element parsing algorithm. - throw new Exception( 'XMP not implemented yet' ); - case 'IFRAME': - case 'NOEMBED': - case 'NOSCRIPT': - // @TODO: Follow the generic raw text element parsing algorithm. - throw new Exception( $token->tag . ' not implemented yet' ); case 'SELECT': $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); - if ( in_array( - $this->insertion_mode, - array( - WP_HTML_Insertion_Mode::IN_TABLE, - WP_HTML_Insertion_Mode::IN_CAPTION, - WP_HTML_Insertion_Mode::IN_TABLE_BODY, - WP_HTML_Insertion_Mode::IN_ROW, - WP_HTML_Insertion_Mode::IN_CELL, - ) - ) ) { - $this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE; - } else { - $this->insertion_mode = WP_HTML_Insertion_Mode::IN_SELECT; - } break; case 'OPTGROUP': case 'OPTION': @@ -606,24 +533,17 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { } $this->insert_element( $token ); break; - case 'MATH': - throw new Exception( 'MATH not implemented yet' ); - case 'SVG': - throw new Exception( 'SVG not implemented yet' ); - case 'CAPTION': - case 'COL': - case 'COLGROUP': - case 'FRAME': - case 'HEAD': - case 'TBODY': - case 'TD': - case 'TFOOT': - case 'TH': - case 'THEAD': - case 'TR': - $this->parse_error(); - // Ignore the token. - return; + + // case 'XMP': + // case 'IFRAME': + // case 'NOEMBED': + // case 'MATH': + // case 'SVG': + // case 'NOSCRIPT': + // case 'PLAINTEXT': + // case 'IMAGE': + // throw new Exception( $token->tag . 
' not implemented yet' ); + default: $this->reconstruct_active_formatting_elements(); $this->insert_element( $token ); @@ -656,28 +576,16 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'SECTION': case 'SUMMARY': case 'UL': - if ( $this->is_element_in_scope( $token->tag ) ) { - $this->ignore_token( $token ); + if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( $token->tag ); break; case 'FORM': - if ( $this->form_pointer ) { - $this->ignore_token( $token ); - $this->parse_error(); - return $this->next_tag(); - } - if ( $this->is_element_in_scope( $this->form_pointer ) ) { - $this->ignore_token( $token ); - $this->parse_error(); - return $this->next_tag(); - } $this->generate_implied_end_tags(); - array_splice( $this->open_elements, array_search( $this->form_pointer, $this->open_elements ), 1 ); - $this->form_pointer = null; + $this->pop_until_tag_name( $token->tag ); break; case 'P': /* @@ -694,9 +602,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { break; case 'LI': if ( $this->is_element_in_list_item_scope( 'LI' ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( 'LI' ); @@ -704,9 +611,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'DD': case 'DT': if ( $this->is_element_in_scope( $token->tag ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( $token->tag ); @@ -718,9 +624,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'H5': case 'H6': if ( $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 
'H6' ) ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); $this->pop_until_tag_name( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ); @@ -746,9 +651,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { case 'MARQUEE': case 'OBJECT': if ( $this->is_element_in_scope( $token->tag ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } $this->generate_implied_end_tags(); if ( $this->current_node()->token->tag !== $token->tag ) { @@ -772,9 +676,8 @@ public function process_in_body_insertion_mode(WP_HTML_Token $token) { $this->pop_until_node( $node ); break; } elseif ( $this->is_special_element( $node->token->tag ) ) { - $this->ignore_token( $token ); $this->parse_error(); - return $this->next_tag(); + return $this->ignore_token( $token ); } else { --$i; } @@ -821,7 +724,7 @@ private function next_token() { */ $last = $this->last_token; if ( - $last + $last && $last->is_tag() && $last->bookmark && $this->has_bookmark($last->bookmark) @@ -1008,7 +911,6 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { * in the HTML namespace, with common ancestor as the intended parent. */ $new_node = $this->create_element_for_token( $node->token ); - $new_node->intended_parent = $common_ancestor; /* * Replace the entry for node in the list of active formatting elements with an entry @@ -1052,7 +954,6 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { // Create an element for the token for which formatting element was created, in the HTML // namespace, with furthest block as the intended parent. $new_element = $this->create_element_for_token( $formatting_element->token ); - $new_element->intended_parent = $furthest_block; // Take all of the child nodes of furthest block and append them to the element created in // the last step. 
@@ -1118,14 +1019,6 @@ private function insert_text( WP_HTML_Token $token ) { $target->append_child(new WP_HTML_Node($token)); } - private function ignore_token( $token ) { - if ( $token->bookmark ) { - $this->release_bookmark( $token->bookmark ); - $token->bookmark = null; - } - return; - } - private function parse_error() { // Noop for now } @@ -1239,17 +1132,21 @@ private function push_active_formatting_element( WP_HTML_Node $node ) { } private function print_active_formatting_elements($msg, $indent=1) { - $formats = array_map( function( $node ) { - return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); - }, $this->active_formatting_elements); - dbg( "$msg " . implode(', ', $formats), $indent ); + if (HTML_DEBUG_MODE) { + $formats = array_map(function ($node) { + return $node->token->tag ?: ($node->token->is_marker() ? 'M' : 'ERROR'); + }, $this->active_formatting_elements); + dbg("$msg " . implode(', ', $formats), $indent); + } } private function print_open_elements($msg, $indent=1) { - $elems = array_map(function ($node) { - return $node->token->tag; - }, $this->open_elements); - dbg( "$msg " . implode(', ', $elems), $indent ); + if (HTML_DEBUG_MODE) { + $elems = array_map(function ($node) { + return $node->token->tag; + }, $this->open_elements); + dbg("$msg " . implode(', ', $elems), $indent); + } } private function reconstruct_active_formatting_elements() { @@ -1407,7 +1304,11 @@ private function is_element_in_specific_scope( $target_node, $element_types_list while ( true ) { // 2. If node is the target node, terminate in a match state. 
- if ( $node === $target_node || $node->token->tag === $target_node ) { + if ( is_string( $target_node ) ) { + if ( $node->token->tag === $target_node ) { + return true; + } + } else if ( $node === $target_node ) { return true; } @@ -1432,106 +1333,6 @@ private function is_element_in_specific_scope( $target_node, $element_types_list } } - /** - * https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately - */ - private function reset_insertion_mode() { - $last = false; - $node = end( $this->open_elements ); - - while ( true ) { - if ( count( $this->open_elements ) === 1 && $node === reset( $this->open_elements ) ) { - $last = true; - $node = $this->context_node; - } - - if ( $node->token->tag === 'select' ) { - if ( $last ) { - break; - } - - $ancestor = $node; - while ( true ) { - if ( $ancestor === $this->open_elements[0] ) { - break; - } - - $index = array_search( $ancestor, $this->open_elements ); - $ancestor = $this->open_elements[ $index - 1 ]; - if ( $ancestor->tag === 'template' ) { - break; - } - - if ( $ancestor->tag === 'table' ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_SELECT_IN_TABLE; - return; - } - } - - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_SELECT; - return; - } - - switch ( $node->token->tag ) { - case 'TD': - case 'TH': - if ( ! 
$last ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_CELL; - return; - } - break; - case 'TR': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_ROW; - return; - case 'TBODY': - case 'THEAD': - case 'TFOOT': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_TABLE_BODY; - return; - case 'CAPTION': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_CAPTION; - return; - case 'COLGROUP': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_COLUMN_GROUP; - return; - case 'TABLE': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_TABLE; - return; - case 'TEMPLATE': - // TODO: implement the current template insertion mode - $this->insertion_mode = 0; - return; - case 'HEAD': - if ( ! $last ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_HEAD; - return; - } - break; - case 'BODY': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; - return; - case 'FRAMESET': - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_FRAMESET; - return; - case 'HTML': - // TODO: implement the head element pointer - $this->insertion_mode = WP_HTML_Insertion_Mode::BEFORE_HEAD; - return; - default: - if ( $last ) { - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; - return; - } - } - - $index = array_search( $node, $this->open_elements ); - $node = $this->open_elements[ $index - 1 ]; - } - - $this->insertion_mode = wP_HTML_Insertion_Mode::IN_BODY; - } - - private static function is_special_element( $tag_name, $except = null ) { if ( null !== $except && in_array( $tag_name, $except, true ) ) { return false; @@ -1667,6 +1468,59 @@ private static function is_formatting_element( $tag_name ) { } +// $dir = realpath( __DIR__ . '/../../../index.html' ); + +// $htmlspec = file_get_contents( $dir ); +// $p = new WP_HTML_Processor( $htmlspec ); +// $p->parse(); + +// die(); + +$p = new WP_HTML_Processor( '

  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
Sit
Amet' ); +$p->parse(); +/* +Outputs: + +DOM after main loop: + HTML + ├─ UL + ├─ LI + └─ #text: 1 + ├─ LI + └─ #text: 2 + ├─ LI + └─ #text: 3 + ├─ LI + ├─ #text: Lorem + └─ B + └─ #text: Ipsum + └─ LI + └─ B + └─ #text: Dolor + └─ B + ├─ #text: Sit + └─ DIV + └─ #text: Amet +*/ + +die(); + +$p = new WP_HTML_Processor( '
12
34' ); +$p->parse(); +/* +Outputs: + p + ├─ #text: 1 + ├─ b + │ ├─ #text: 2 + │ └─ i + │ └─ #text: 3 + ├─ i + │ └─ #text: 4 + └─ #text: 5 +*/ + +die(); $p = new WP_HTML_Processor( '

12345

' ); $p->parse(); diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 958e02cca7cfa..9aca0d6f28b85 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -724,7 +724,7 @@ public function set_bookmark( $name ) { return false; } - if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= self::MAX_BOOKMARKS ) { + if ( ! array_key_exists( $name, $this->bookmarks ) && count( $this->bookmarks ) >= static::MAX_BOOKMARKS ) { _doing_it_wrong( __METHOD__, __( 'Too many bookmarks: cannot create any more.' ), From db40a948624fe7cc167b9757f48c1ff3678c2a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:48:14 +0100 Subject: [PATCH 09/14] Correct the is_element_in_scope checks --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c24eee4a430f2..93b3b93fbdb93 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -601,7 +601,7 @@ public function process_token(WP_HTML_Token $token) { $this->close_p_element(); break; case 'LI': - if ( $this->is_element_in_list_item_scope( 'LI' ) ) { + if ( ! $this->is_element_in_list_item_scope( 'LI' ) ) { $this->parse_error(); return $this->ignore_token( $token ); } @@ -610,7 +610,7 @@ public function process_token(WP_HTML_Token $token) { break; case 'DD': case 'DT': - if ( $this->is_element_in_scope( $token->tag ) ) { + if ( ! 
$this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); return $this->ignore_token( $token ); } @@ -623,7 +623,7 @@ public function process_token(WP_HTML_Token $token) { case 'H4': case 'H5': case 'H6': - if ( $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { + if ( ! $this->is_element_in_scope( array( 'H1', 'H2', 'H3', 'H4', 'H5', 'H6' ) ) ) { $this->parse_error(); return $this->ignore_token( $token ); } @@ -650,7 +650,7 @@ public function process_token(WP_HTML_Token $token) { case 'APPLET': case 'MARQUEE': case 'OBJECT': - if ( $this->is_element_in_scope( $token->tag ) ) { + if ( ! $this->is_element_in_scope( $token->tag ) ) { $this->parse_error(); return $this->ignore_token( $token ); } From ea4f392f574fa165da5d34fb225b9b3d1e559ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:49:20 +0100 Subject: [PATCH 10/14] Uncomment some test inputs --- src/wp-includes/html-api/class-wp-html-processor.php | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 93b3b93fbdb93..521a924ee846f 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -1503,8 +1503,6 @@ private static function is_formatting_element( $tag_name ) { └─ #text: Amet */ -die(); - $p = new WP_HTML_Processor( '
12
34' ); $p->parse(); /* @@ -1520,8 +1518,6 @@ private static function is_formatting_element( $tag_name ) { └─ #text: 5 */ -die(); - $p = new WP_HTML_Processor( '

12345

' ); $p->parse(); /* From 66fd636c47f7125d3ea84070eba169dc81a0da9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Wed, 22 Feb 2023 18:53:06 +0100 Subject: [PATCH 11/14] Document insert_node --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 521a924ee846f..ffe10bc0f2be8 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -995,10 +995,10 @@ private function insert_element( WP_HTML_Token $token, $override_target = null ) private function insert_node( WP_HTML_Node $node, $override_target = null ) { $target = $override_target ?: $this->current_node(); - - // Appropriate place for inserting a node: - // For now skip foster parenting and always use the - // location after the last child of the target + /** + * Appropriate place for inserting a node is always the end of the + * target's children thanks to the assumptions this parser makes. 
+ */ $target->append_child($node); dbg("Inserted element: {$node->token->tag} to parent {$target->token->tag}", 2); } From 93fea6ccd499ed861ae7e4f0aa05c87a6f7ff0ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 23 Feb 2023 10:57:01 +0100 Subject: [PATCH 12/14] Simplify ignore_token() --- .../html-api/class-wp-html-processor.php | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ffe10bc0f2be8..11b1a5b52d237 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -149,7 +149,7 @@ public function __construct( WP_HTML_Token $token ) { $this->tag = $token->tag; } - public function append_child( WP_HTML_Node $node ) { + public function append_child( WP_HTML_Node $node ) { if($node->parent) { $node->parent->remove($node); } @@ -278,13 +278,8 @@ private function ignore_token( $ignored_token ) { // // $ignored_token->bookmark = null; // } - $token = $this->next_token(); - if(!$token){ - return false; - } - $processed_token = $this->process_token($token); - $this->last_token = $processed_token; - return $processed_token; + $this->last_token = $ignored_token; + return $this->process_next_token(); } public function process_token(WP_HTML_Token $token) { @@ -1476,7 +1471,7 @@ private static function is_formatting_element( $tag_name ) { // die(); -$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
Sit
Amet' ); +$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); $p->parse(); /* Outputs: @@ -1498,9 +1493,13 @@ private static function is_formatting_element( $tag_name ) { └─ B └─ #text: Dolor └─ B - ├─ #text: Sit - └─ DIV - └─ #text: Amet + └─ SPAN + ├─ #text: Sit + └─ SPAN + ├─ #text: Sit + └─ SPAN + └─ DIV + └─ #text: Amet */ $p = new WP_HTML_Processor( '
12
34' ); From fd2ddcfa086d6d0b3748155ae69294cb48ff45cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 23 Feb 2023 13:01:53 +0100 Subject: [PATCH 13/14] Start exploring a text-based API --- .../html-api/class-wp-html-processor.php | 88 ++++++++++++------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 11b1a5b52d237..c9f213b900494 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,7 +11,7 @@ function esc_attr( $text ) { } } -define('HTML_DEBUG_MODE', false); +define('HTML_DEBUG_MODE', true); function dbg( $message, $indent = 0 ) { if( HTML_DEBUG_MODE ) { $indent = str_repeat( ' ', $indent * 2 ); @@ -237,6 +237,8 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { private $last_token = null; private $inserted_tokens = array(); + public $reconstructed_html = ''; + const MAX_BOOKMARKS = 1000000; public function __construct( $html ) { @@ -253,6 +255,11 @@ public function parse() { while ($this->process_next_token()) { // ... twiddle thumbs ... 
} + + while ( count($this->open_elements) > 1 ) { + $this->pop_open_element(); + } + echo("\n"); echo("DOM after main loop:\n"); echo($this->root_node.''); @@ -979,6 +986,11 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { } private function insert_element( WP_HTML_Token $token, $override_target = null ) { + // Text API: + $this->reconstructed_html .= '<'.$token->tag.'>'; + + // Object-oriented API: + // Create element for a token // Skip reset algorithm for now // Skip form-association for now @@ -1003,6 +1015,10 @@ private function create_element_for_token( WP_HTML_Token $token ) { } private function insert_text( WP_HTML_Token $token ) { + // Text API: + $this->reconstructed_html .= $token->value; + + // Object-oriented API: $target = $this->current_node(); if(count($target->children)){ $last_child = end($target->children); @@ -1038,6 +1054,11 @@ private function pop_until_node( $node ) { private function pop_open_element() { $popped = array_pop( $this->open_elements ); + + // Text API: + $this->reconstructed_html .= 'token->tag.'>'; + + // Object-oriented API: if ( $popped->token->bookmark ) { $this->release_bookmark( $popped->token->bookmark ); $popped->token->bookmark = null; @@ -1471,8 +1492,36 @@ private static function is_formatting_element( $tag_name ) { // die(); -$p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); -$p->parse(); +// $p = new WP_HTML_Processor( '

12345

' ); +// $p->parse(); +/* +Outputs: + p + ├─ #text: 1 + ├─ b + │ ├─ #text: 2 + │ └─ i + │ └─ #text: 3 + ├─ i + │ └─ #text: 4 + └─ #text: 5 +*/ +// die(); + +// $p = new WP_HTML_Processor( '
12
34' ); +// $p->parse(); +/* +DOM after main loop: + HTML + ├─ DIV + ├─ #text: 1 + └─ SPAN + └─ #text: 2 + └─ #text: 34 +*/ + +// $p = new WP_HTML_Processor( '
  • 1
  • 2
  • 3
  • LoremIpsum
  • Dolor
SitSit
Amet' ); +// $p->parse(); /* Outputs: @@ -1502,35 +1551,6 @@ private static function is_formatting_element( $tag_name ) { └─ #text: Amet */ -$p = new WP_HTML_Processor( '
12
34' ); -$p->parse(); -/* -Outputs: - p - ├─ #text: 1 - ├─ b - │ ├─ #text: 2 - │ └─ i - │ └─ #text: 3 - ├─ i - │ └─ #text: 4 - └─ #text: 5 -*/ - -$p = new WP_HTML_Processor( '

12345

' ); -$p->parse(); -/* -Outputs: - p - ├─ #text: 1 - ├─ b - │ ├─ #text: 2 - │ └─ i - │ └─ #text: 3 - ├─ i - │ └─ #text: 4 - └─ #text: 5 -*/ $p = new WP_HTML_Processor( '1

23

' ); $p->parse(); @@ -1544,7 +1564,9 @@ private static function is_formatting_element( $tag_name ) { └─ #text: 2 └─ #text: 3 */ - +echo "\n\n"; +echo $p->reconstructed_html; +die(); $p = new WP_HTML_Processor( '

X

X From faf724e56dde1aa7d6e0c033e5b8e04f76d3ade3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 23 Feb 2023 15:31:25 +0100 Subject: [PATCH 14/14] Doodling more --- .../html-api/class-wp-html-processor.php | 40 ++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index c9f213b900494..707c82d0560eb 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -11,7 +11,7 @@ function esc_attr( $text ) { } } -define('HTML_DEBUG_MODE', true); +define('HTML_DEBUG_MODE', false); function dbg( $message, $indent = 0 ) { if( HTML_DEBUG_MODE ) { $indent = str_repeat( ' ', $indent * 2 ); @@ -441,7 +441,8 @@ public function process_token(WP_HTML_Token $token) { } $this->reconstruct_active_formatting_elements(); - $this->insert_element( $token ); + $node = $this->insert_element( $token ); + $this->push_active_formatting_element( $node ); break; case 'B': case 'BIG': @@ -949,6 +950,10 @@ private function adoption_agency_algorithm( WP_HTML_Token $token ) { $last_node = $node; } + // $this->reconstructed_html .= ''; + // $this->reconstructed_html .= '<'.$common_ancestor->token->tag.'>'; + // $this->reconstructed_html .= '<'.$last_node->token->tag.'>'; + // Insert whatever last node ended up being in the previous step at the appropriate place // for inserting a node, but using common ancestor as the override target. $this->insert_node( $last_node, $common_ancestor ); @@ -1551,24 +1556,29 @@ private static function is_formatting_element( $tag_name ) { └─ #text: Amet */ - -$p = new WP_HTML_Processor( '1

23

' ); +$p = new WP_HTML_Processor( ' +
+
+
+
+
' ); $p->parse(); -/* -Outputs the correct result: - HTML - ├─ B - └─ #text: 1 - └─ P - ├─ B - └─ #text: 2 - └─ #text: 3 -*/ +// $p = new WP_HTML_Processor( '1

23

' ); +// $p->parse(); +// /* +// Outputs the correct result: +// B +// └─ #text: 1 +// P +// ├─ B +// └─ #text: 2 +// └─ #text: 3 +// */ echo "\n\n"; echo $p->reconstructed_html; die(); -$p = new WP_HTML_Processor( '

X +$p = new WP_HTML_Processor( '

X

X

X

X' );