Parses next element in the ‘in body’ insertion mode.
Description
This internal function performs the ‘in body’ insertion mode logic for the generalized WP_HTML_Processor::step() function.
See also
Source
* which could open up surprising performance breakdowns.
*
* Example:
*
* $processor = WP_HTML_Processor::create_fragment( '<div><span><figure><img></figure></span></div>' );
* $processor->next_tag( 'img' );
* true === $processor->matches_breadcrumbs( array( 'figure', 'img' ) );
* true === $processor->matches_breadcrumbs( array( 'span', 'figure', 'img' ) );
* false === $processor->matches_breadcrumbs( array( 'span', 'img' ) );
* true === $processor->matches_breadcrumbs( array( 'span', '*', 'img' ) );
*
* @since 6.4.0
*
* @param string[] $breadcrumbs DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`.
* May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`.
* @return bool Whether the currently-matched tag is found at the given nested structure.
*/
public function matches_breadcrumbs( $breadcrumbs ): bool {
// Everything matches when there are zero constraints.
if ( 0 === count( $breadcrumbs ) ) {
return true;
}
// Start at the last crumb.
$crumb = end( $breadcrumbs );
if ( '*' !== $crumb && $this->get_tag() !== strtoupper( $crumb ) ) {
return false;
}
for ( $i = count( $this->breadcrumbs ) - 1; $i >= 0; $i-- ) {
$node = $this->breadcrumbs[ $i ];
$crumb = strtoupper( current( $breadcrumbs ) );
if ( '*' !== $crumb && $node !== $crumb ) {
return false;
}
if ( false === prev( $breadcrumbs ) ) {
return true;
}
}
return false;
}
/**
* Indicates if the currently-matched node expects a closing
* token, or if it will self-close on the next step.
*
* Most HTML elements expect a closer, such as a P element or
* a DIV element. Others, like an IMG element are void and don't
* have a closing tag. Special elements, such as SCRIPT and STYLE,
* are treated just like void tags. Text nodes and self-closing
* foreign content will also act just like a void tag, immediately
* closing as soon as the processor advances to the next token.
*
* @since 6.6.0
*
* @param WP_HTML_Token|null $node Optional. Node to examine, if provided.
* Default is to examine current node.
* @return bool|null Whether to expect a closer for the currently-matched node,
* or `null` if not matched on any token.
*/
public function expects_closer( ?WP_HTML_Token $node = null ): ?bool {
$token_name = $node->node_name ?? $this->get_token_name();
if ( ! isset( $token_name ) ) {
return null;
}
$token_namespace = $node->namespace ?? $this->get_namespace();
$token_has_self_closing = $node->has_self_closing_flag ?? $this->has_self_closing_flag();
return ! (
// Comments, text nodes, and other atomic tokens.
'#' === $token_name[0] ||
// Doctype declarations.
'html' === $token_name ||
// Void elements.
( 'html' === $token_namespace && self::is_void( $token_name ) ) ||
// Special atomic elements.
( 'html' === $token_namespace && in_array( $token_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) ||
// Self-closing elements in foreign content.
( 'html' !== $token_namespace && $token_has_self_closing )
);
}
/**
* Steps through the HTML document and stop at the next tag, if any.
*
* @since 6.4.0
*
* @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
*
* @see self::PROCESS_NEXT_NODE
* @see self::REPROCESS_CURRENT_NODE
*
* @param string $node_to_process Whether to parse the next node or reprocess the current node.
* @return bool Whether a tag was matched.
*/
public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {
// Refuse to proceed if there was a previous error.
if ( null !== $this->last_error ) {
return false;
}
if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) {
/*
* Void elements still hop onto the stack of open elements even though
* there's no corresponding closing tag. This is important for managing
* stack-based operations such as "navigate to parent node" or checking
* on an element's breadcrumbs.
*
* When moving on to the next node, therefore, if the bottom-most element
* on the stack is a void element, it must be closed.
*/
$top_node = $this->state->stack_of_open_elements->current_node();
if ( isset( $top_node ) && ! $this->expects_closer( $top_node ) ) {
$this->state->stack_of_open_elements->pop();
}
}
if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
parent::next_token();
if ( WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ) {
parent::subdivide_text_appropriately();
}
}
// Finish stepping when there are no more tokens in the document.
if (
WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
) {
return false;
}
$adjusted_current_node = $this->get_adjusted_current_node();
$is_closer = $this->is_tag_closer();
$is_start_tag = WP_HTML_Tag_Processor::STATE_MATCHED_TAG === $this->parser_state && ! $is_closer;
$token_name = $this->get_token_name();
if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) {
$this->state->current_token = new WP_HTML_Token(
$this->bookmark_token(),
$token_name,
$this->has_self_closing_flag(),
$this->release_internal_bookmark_on_destruct
);
}
$parse_in_current_insertion_mode = (
0 === $this->state->stack_of_open_elements->count() ||
'html' === $adjusted_current_node->namespace ||
(
'math' === $adjusted_current_node->integration_node_type &&
(
( $is_start_tag && ! in_array( $token_name, array( 'MGLYPH', 'MALIGNMARK' ), true ) ) ||
'#text' === $token_name
)
) ||
(
'math' === $adjusted_current_node->namespace &&
'ANNOTATION-XML' === $adjusted_current_node->node_name &&
$is_start_tag && 'SVG' === $token_name
) ||
(
'html' === $adjusted_current_node->integration_node_type &&
( $is_start_tag || '#text' === $token_name )
)
);
try {
if ( ! $parse_in_current_insertion_mode ) {
return $this->step_in_foreign_content();
}
switch ( $this->state->insertion_mode ) {
case WP_HTML_Processor_State::INSERTION_MODE_INITIAL:
return $this->step_initial();
case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML:
return $this->step_before_html();
case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD:
return $this->step_before_head();
case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD:
return $this->step_in_head();
case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT:
return $this->step_in_head_noscript();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD:
return $this->step_after_head();
case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY:
return $this->step_in_body();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE:
return $this->step_in_table();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT:
return $this->step_in_table_text();
case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION:
return $this->step_in_caption();
case WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP:
return $this->step_in_column_group();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY:
return $this->step_in_table_body();
case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW:
return $this->step_in_row();
case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL:
return $this->step_in_cell();
case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT:
return $this->step_in_select();
case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE:
return $this->step_in_select_in_table();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE:
return $this->step_in_template();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY:
return $this->step_after_body();
case WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET:
return $this->step_in_frameset();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET:
return $this->step_after_frameset();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY:
return $this->step_after_after_body();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET:
return $this->step_after_after_frameset();
// This should be unreachable but PHP doesn't have total type checking on switch.
default:
$this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." );
}
} catch ( WP_HTML_Unsupported_Exception $e ) {
/*
* Exceptions are used in this class to escape deep call stacks that
* otherwise might involve messier calling and return conventions.
*/
return false;
}
}
/**
* Computes the HTML breadcrumbs for the currently-matched node, if matched.
*
* Breadcrumbs start at the outermost parent and descend toward the matched element.
* They always include the entire path from the root HTML node to the matched element.
*
* Example:
*
* $processor = WP_HTML_Processor::create_fragment( '<p><strong><em><img></em></strong></p>' );
* $processor->next_tag( 'IMG' );
* $processor->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
*
* @since 6.4.0
*
* @return string[] Array of tag names representing path to matched node.
*/
public function get_breadcrumbs(): array {
return $this->breadcrumbs;
}
/**
* Returns the nesting depth of the current location in the document.
*
* Example:
*
* $processor = WP_HTML_Processor::create_fragment( '<div><p></p></div>' );
* // The processor starts in the BODY context, meaning it has depth from the start: HTML > BODY.
* 2 === $processor->get_current_depth();
*
* // Opening the DIV element increases the depth.
* $processor->next_token();
* 3 === $processor->get_current_depth();
*
* // Opening the P element increases the depth.
* $processor->next_token();
* 4 === $processor->get_current_depth();
*
* // The P element is closed during `next_token()` so the depth is decreased to reflect that.
* $processor->next_token();
* 3 === $processor->get_current_depth();
*
* @since 6.6.0
*
* @return int Nesting-depth of current location in the document.
*/
public function get_current_depth(): int {
return count( $this->breadcrumbs );
}
/**
* Normalizes an HTML fragment by serializing it.
*
* This method assumes that the given HTML snippet is found in BODY context.
* For normalizing full documents or fragments found in other contexts, create
* a new processor using WP_HTML_Processor::create_fragment or
* WP_HTML_Processor::create_full_parser and call WP_HTML_Processor::serialize
* on the created instances.
*
* Many aspects of an input HTML fragment may be changed during normalization.
*
* - Attribute values will be double-quoted.
* - Duplicate attributes will be removed.
* - Omitted tags will be added.
* - Tag and attribute name casing will be lower-cased,
* except for specific SVG and MathML tags or attributes.
* - Text will be re-encoded, null bytes handled,
* and invalid UTF-8 replaced with U+FFFD.
* - Any incomplete syntax trailing at the end will be omitted,
* for example, an unclosed comment opener will be removed.
*
* Example:
*
* echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
* // <a href="#anchor" v="5" enabled>One</a>
*
* echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
* // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
*
* echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
* // <!--[CDATA[invalid comment]]--> syntax < <> "oddities"
*
* @since 6.7.0
*
* @param string $html Input HTML to normalize.
*
* @return string|null Normalized output, or `null` if unable to normalize.
*/
public static function normalize( string $html ): ?string {
return static::create_fragment( $html )->serialize();
}
/**
* Returns normalized HTML for a fragment by serializing it.
*
* This differs from WP_HTML_Processor::normalize in that it starts with
* a specific HTML Processor, which _must_ not have already started scanning;
* it must be in the initial ready state and will be in the completed state once
* serialization is complete.
*
* Many aspects of an input HTML fragment may be changed during normalization.
*
* - Attribute values will be double-quoted.
* - Duplicate attributes will be removed.
* - Omitted tags will be added.
* - Tag and attribute name casing will be lower-cased,
* except for specific SVG and MathML tags or attributes.
* - Text will be re-encoded, null bytes handled,
* and invalid UTF-8 replaced with U+FFFD.
* - Any incomplete syntax trailing at the end will be omitted,
* for example, an unclosed comment opener will be removed.
*
* Example:
*
* $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
* echo $processor->serialize();
* // <a href="#anchor" v="5" enabled>One</a>
*
* $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
* echo $processor->serialize();
* // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
*
* $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
* echo $processor->serialize();
* // <!--[CDATA[invalid comment]]--> syntax < <> "oddities"
*
* @since 6.7.0
*
* @return string|null Normalized HTML markup represented by processor,
* or `null` if unable to generate serialization.
*/
public function serialize(): ?string {
if ( WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state ) {
wp_trigger_error(
__METHOD__,
'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
E_USER_WARNING
);
return null;
}
$html = '';
while ( $this->next_token() ) {
$html .= $this->serialize_token();
}
if ( null !== $this->get_last_error() ) {
wp_trigger_error(
__METHOD__,
"Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
E_USER_WARNING
);
return null;
}
return $html;
}
/**
* Serializes the currently-matched token.
*
* This method produces a fully-normative HTML string for the currently-matched token,
* if able. If not matched at any token or if the token doesn't correspond to any HTML
* it will return an empty string (for example, presumptuous end tags are ignored).
*
* @see static::serialize()
*
* @since 6.7.0
*
* @return string Serialization of token, or empty string if no serialization exists.
*/
protected function serialize_token(): string {
$html = '';
$token_type = $this->get_token_type();
switch ( $token_type ) {
case '#doctype':
$doctype = $this->get_doctype_info();
if ( null === $doctype ) {
break;
}
$html .= '<!DOCTYPE';
if ( $doctype->name ) {
$html .= " {$doctype->name}";
}
if ( null !== $doctype->public_identifier ) {
$quote = str_contains( $doctype->public_identifier, '"' ) ? "'" : '"';
$html .= " PUBLIC {$quote}{$doctype->public_identifier}{$quote}";
}
if ( null !== $doctype->system_identifier ) {
if ( null === $doctype->public_identifier ) {
$html .= ' SYSTEM';
}
$quote = str_contains( $doctype->system_identifier, '"' ) ? "'" : '"';
$html .= " {$quote}{$doctype->system_identifier}{$quote}";
}
$html .= '>';
break;
case '#text':
$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
break;
// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
case '#presumptuous-tag':
break;
case '#funky-comment':
case '#comment':
$html .= "<!--{$this->get_full_comment_text()}-->";
break;
case '#cdata-section':
$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
break;
}
if ( '#tag' !== $token_type ) {
return $html;
}
$tag_name = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
$in_html = 'html' === $this->get_namespace();
$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();
if ( $this->is_tag_closer() ) {
$html .= "</{$qualified_name}>";
return $html;
}
$attribute_names = $this->get_attribute_names_with_prefix( '' );
if ( ! isset( $attribute_names ) ) {
$html .= "<{$qualified_name}>";
return $html;
}
$html .= "<{$qualified_name}";
foreach ( $attribute_names as $attribute_name ) {
$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
$value = $this->get_attribute( $attribute_name );
if ( is_string( $value ) ) {
$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5 ) . '"';
}
$html = str_replace( "\x00", "\u{FFFD}", $html );
}
if ( ! $in_html && $this->has_self_closing_flag() ) {
$html .= ' /';
}
$html .= '>';
// Flush out self-contained elements.
if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
$text = $this->get_modifiable_text();
switch ( $tag_name ) {
case 'IFRAME':
case 'NOEMBED':
case 'NOFRAMES':
$text = '';
break;
case 'SCRIPT':
case 'STYLE':
break;
default:
$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
}
$html .= "{$text}</{$qualified_name}>";
}
return $html;
}
/**
* Parses next element in the 'initial' insertion mode.
*
* This internal function performs the 'initial' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#the-initial-insertion-mode
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_initial(): bool {
$token_name = $this->get_token_name();
$token_type = $this->get_token_type();
$op_sigil = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
$op = "{$op_sigil}{$token_name}";
switch ( $op ) {
/*
* > A character token that is one of U+0009 CHARACTER TABULATION,
* > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
* > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
*
* Parse error: ignore the token.
Changelog
Version | Description |
---|---|
6.4.0 | Introduced. |
User Contributed Notes
You must log in before being able to contribute a note or feedback.