WP_HTML_Processor::step_in_body(): bool

In this article

This function’s access is marked private. This means it is not intended for use by plugin or theme developers, only in other core functions. It is listed here for completeness. See https://html.spec.whatwg.org/#parsing-main-inbody for the section of the HTML specification that this function implements.

Parses next element in the ‘in body’ insertion mode.

Description

This internal function performs the ‘in body’ insertion mode logic for the generalized WP_HTML_Processor::step() function.

See also

Return

bool Whether an element was found.

Source

 * which could open up surprising performance breakdowns.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div><span><figure><img></figure></span></div>' );
 *     $processor->next_tag( 'img' );
 *     true  === $processor->matches_breadcrumbs( array( 'figure', 'img' ) );
 *     true  === $processor->matches_breadcrumbs( array( 'span', 'figure', 'img' ) );
 *     false === $processor->matches_breadcrumbs( array( 'span', 'img' ) );
 *     true  === $processor->matches_breadcrumbs( array( 'span', '*', 'img' ) );
 *
 * @since 6.4.0
 *
 * @param string[] $breadcrumbs DOM sub-path at which element is found, e.g. `array( 'FIGURE', 'IMG' )`.
 *                              May also contain the wildcard `*` which matches a single element, e.g. `array( 'SECTION', '*' )`.
 * @return bool Whether the currently-matched tag is found at the given nested structure.
 */
public function matches_breadcrumbs( $breadcrumbs ): bool {
	// With no constraints supplied there is nothing that could fail to match.
	if ( 0 === count( $breadcrumbs ) ) {
		return true;
	}

	/*
	 * `end()` leaves the array's internal pointer on the innermost crumb,
	 * which is where the backwards walk below picks up from.
	 */
	$innermost = end( $breadcrumbs );

	// The innermost crumb must describe the currently-matched tag itself.
	if ( '*' !== $innermost ) {
		if ( $this->get_tag() !== strtoupper( $innermost ) ) {
			return false;
		}
	}

	/*
	 * Walk the processor's own breadcrumbs from the deepest entry outward,
	 * stepping the requested crumbs backwards in lock-step.
	 */
	$depth = count( $this->breadcrumbs );
	while ( $depth-- > 0 ) {
		$ancestor = $this->breadcrumbs[ $depth ];
		$wanted   = strtoupper( current( $breadcrumbs ) );

		$is_wildcard = '*' === $wanted;
		if ( ! $is_wildcard && $ancestor !== $wanted ) {
			return false;
		}

		// Running off the front of the requested crumbs means all of them matched.
		if ( false === prev( $breadcrumbs ) ) {
			return true;
		}
	}

	// More crumbs were requested than there are open elements to match against.
	return false;
}

/**
 * Indicates if the currently-matched node expects a closing
 * token, or if it will self-close on the next step.
 *
 * Most HTML elements expect a closer, such as a P element or
 * a DIV element. Others, like an IMG element are void and don't
 * have a closing tag. Special elements, such as SCRIPT and STYLE,
 * are treated just like void tags. Text nodes and self-closing
 * foreign content will also act just like a void tag, immediately
 * closing as soon as the processor advances to the next token.
 *
 * @since 6.6.0
 *
 * @param WP_HTML_Token|null $node Optional. Node to examine, if provided.
 *                                 Default is to examine current node.
 * @return bool|null Whether to expect a closer for the currently-matched node,
 *                   or `null` if not matched on any token.
 */
public function expects_closer( ?WP_HTML_Token $node = null ): ?bool {
	// Prefer the supplied node; otherwise fall back to the currently-matched token.
	$name = $node->node_name ?? $this->get_token_name();
	if ( null === $name ) {
		return null;
	}

	$namespace       = $node->namespace ?? $this->get_namespace();
	$is_self_closing = $node->has_self_closing_flag ?? $this->has_self_closing_flag();

	// Comments, text nodes, and other atomic tokens carry a leading `#` in their name.
	if ( '#' === $name[0] ) {
		return false;
	}

	// Doctype declarations.
	if ( 'html' === $name ) {
		return false;
	}

	if ( 'html' === $namespace ) {
		// Void elements.
		if ( self::is_void( $name ) ) {
			return false;
		}

		// Special atomic elements.
		$atomic_elements = array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' );

		return ! in_array( $name, $atomic_elements, true );
	}

	// In foreign content only elements with the self-closing flag skip their closer.
	return ! $is_self_closing;
}

/**
 * Steps through the HTML document and stops at the next tag, if any.
 *
 * @since 6.4.0
 *
 * @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
 *
 * @see self::PROCESS_NEXT_NODE
 * @see self::REPROCESS_CURRENT_NODE
 *
 * @param string $node_to_process Whether to parse the next node or reprocess the current node.
 * @return bool Whether a tag was matched.
 */
public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {
	/*
	 * Overview of the phases below:
	 *
	 *  1. Abort if an earlier error was recorded.
	 *  2. Unless reprocessing, pop a top-of-stack node which doesn't expect a closer.
	 *  3. When processing the next node, advance the underlying tokenizer.
	 *  4. Stop if the tokenizer has no further token to offer.
	 *  5. Unless reprocessing, record a new current token.
	 *  6. Dispatch to foreign-content handling or to the handler for the
	 *     active insertion mode, and return that handler's result.
	 */

	// Refuse to proceed if there was a previous error.
	if ( null !== $this->last_error ) {
		return false;
	}

	// Applies to every mode other than an explicit request to reprocess.
	if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) {
		/*
		 * Void elements still hop onto the stack of open elements even though
		 * there's no corresponding closing tag. This is important for managing
		 * stack-based operations such as "navigate to parent node" or checking
		 * on an element's breadcrumbs.
		 *
		 * When moving on to the next node, therefore, if the bottom-most element
		 * on the stack is a void element, it must be closed.
		 */
		$top_node = $this->state->stack_of_open_elements->current_node();
		if ( isset( $top_node ) && ! $this->expects_closer( $top_node ) ) {
			$this->state->stack_of_open_elements->pop();
		}
	}

	// Only an explicit "next node" request advances the parent tokenizer.
	if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
		parent::next_token();
		// Text nodes get an extra pass through the parent's text subdivision step.
		if ( WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ) {
			parent::subdivide_text_appropriately();
		}
	}

	// Finish stepping when there are no more tokens in the document.
	if (
		WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT === $this->parser_state ||
		WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state
	) {
		return false;
	}

	// Snapshot the facts about the current token which drive the dispatch decision below.
	$adjusted_current_node = $this->get_adjusted_current_node();
	$is_closer             = $this->is_tag_closer();
	$is_start_tag          = WP_HTML_Tag_Processor::STATE_MATCHED_TAG === $this->parser_state && ! $is_closer;
	$token_name            = $this->get_token_name();

	// When reprocessing, the existing current token is kept rather than replaced.
	if ( self::REPROCESS_CURRENT_NODE !== $node_to_process ) {
		$this->state->current_token = new WP_HTML_Token(
			$this->bookmark_token(),
			$token_name,
			$this->has_self_closing_flag(),
			$this->release_internal_bookmark_on_destruct
		);
	}

	/*
	 * Decide between the active insertion mode and the foreign content rules.
	 * The insertion mode is used when any one of the following holds:
	 *
	 *  - the stack of open elements is empty;
	 *  - the adjusted current node is in the HTML namespace;
	 *  - the adjusted current node is a 'math' integration node and the token is
	 *    a start tag other than MGLYPH/MALIGNMARK, or is a text node;
	 *  - the adjusted current node is a MathML ANNOTATION-XML element and the
	 *    token is an SVG start tag;
	 *  - the adjusted current node is an 'html' integration node and the token
	 *    is a start tag or a text node.
	 *
	 * NOTE(review): this appears to follow the "tree construction dispatcher"
	 * in the HTML specification - confirm against the spec when changing it.
	 */
	$parse_in_current_insertion_mode = (
		0 === $this->state->stack_of_open_elements->count() ||
		'html' === $adjusted_current_node->namespace ||
		(
			'math' === $adjusted_current_node->integration_node_type &&
			(
				( $is_start_tag && ! in_array( $token_name, array( 'MGLYPH', 'MALIGNMARK' ), true ) ) ||
				'#text' === $token_name
			)
		) ||
		(
			'math' === $adjusted_current_node->namespace &&
			'ANNOTATION-XML' === $adjusted_current_node->node_name &&
			$is_start_tag && 'SVG' === $token_name
		) ||
		(
			'html' === $adjusted_current_node->integration_node_type &&
			( $is_start_tag || '#text' === $token_name )
		)
	);

	try {
		// Everything not claimed by the insertion mode goes to the foreign content handler.
		if ( ! $parse_in_current_insertion_mode ) {
			return $this->step_in_foreign_content();
		}

		// One handler per insertion mode; each handler's return value is passed straight through.
		switch ( $this->state->insertion_mode ) {
			case WP_HTML_Processor_State::INSERTION_MODE_INITIAL:
				return $this->step_initial();

			case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML:
				return $this->step_before_html();

			case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD:
				return $this->step_before_head();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD:
				return $this->step_in_head();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT:
				return $this->step_in_head_noscript();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD:
				return $this->step_after_head();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY:
				return $this->step_in_body();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE:
				return $this->step_in_table();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT:
				return $this->step_in_table_text();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION:
				return $this->step_in_caption();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP:
				return $this->step_in_column_group();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY:
				return $this->step_in_table_body();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW:
				return $this->step_in_row();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL:
				return $this->step_in_cell();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT:
				return $this->step_in_select();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE:
				return $this->step_in_select_in_table();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE:
				return $this->step_in_template();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY:
				return $this->step_after_body();

			case WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET:
				return $this->step_in_frameset();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET:
				return $this->step_after_frameset();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY:
				return $this->step_after_after_body();

			case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET:
				return $this->step_after_after_frameset();

			// This should be unreachable but PHP doesn't have total type checking on switch.
			default:
				/*
				 * NOTE(review): presumably bail() always throws; if it ever returned
				 * normally this branch would fall out of the function without a
				 * `bool` return value - confirm against bail()'s definition.
				 */
				$this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." );
		}
	} catch ( WP_HTML_Unsupported_Exception $e ) {
		/*
		 * Exceptions are used in this class to escape deep call stacks that
		 * otherwise might involve messier calling and return conventions.
		 *
		 * Only this exception type is converted into a `false` result here;
		 * any other exception propagates to the caller.
		 */
		return false;
	}
}

/**
 * Computes the HTML breadcrumbs for the currently-matched node, if matched.
 *
 * Breadcrumbs start at the outermost parent and descend toward the matched element.
 * They always include the entire path from the root HTML node to the matched element.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<p><strong><em><img></em></strong></p>' );
 *     $processor->next_tag( 'IMG' );
 *     $processor->get_breadcrumbs() === array( 'HTML', 'BODY', 'P', 'STRONG', 'EM', 'IMG' );
 *
 * @since 6.4.0
 *
 * @return string[] Array of tag names representing path to matched node.
 */
public function get_breadcrumbs(): array {
	// The path is maintained elsewhere in the class; this accessor only exposes it.
	$path_to_matched_node = $this->breadcrumbs;

	return $path_to_matched_node;
}

/**
 * Returns the nesting depth of the current location in the document.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div><p></p></div>' );
 *     // The processor starts in the BODY context, meaning it has depth from the start: HTML > BODY.
 *     2 === $processor->get_current_depth();
 *
 *     // Opening the DIV element increases the depth.
 *     $processor->next_token();
 *     3 === $processor->get_current_depth();
 *
 *     // Opening the P element increases the depth.
 *     $processor->next_token();
 *     4 === $processor->get_current_depth();
 *
 *     // The P element is closed during `next_token()` so the depth is decreased to reflect that.
 *     $processor->next_token();
 *     3 === $processor->get_current_depth();
 *
 * @since 6.6.0
 *
 * @return int Nesting-depth of current location in the document.
 */
public function get_current_depth(): int {
	// Depth is simply the number of entries in the stored breadcrumb path.
	$depth = count( $this->breadcrumbs );

	return $depth;
}

/**
 * Normalizes an HTML fragment by serializing it.
 *
 * This method assumes that the given HTML snippet is found in BODY context.
 * For normalizing full documents or fragments found in other contexts, create
 * a new processor using WP_HTML_Processor::create_fragment or
 * WP_HTML_Processor::create_full_parser and call WP_HTML_Processor::serialize
 * on the created instances.
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     echo WP_HTML_Processor::normalize( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     echo WP_HTML_Processor::normalize( '<div></p>fun<table><td>cell</div>' );
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     echo WP_HTML_Processor::normalize( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @param string $html Input HTML to normalize.
 *
 * @return string|null Normalized output, or `null` if unable to normalize.
 */
public static function normalize( string $html ): ?string {
	/*
	 * Late static binding means a subclass may supply the factory, and the
	 * factory is documented as able to return `null` when it cannot create
	 * a processor. Report that as "unable to normalize" rather than letting
	 * a method call on `null` raise a fatal error.
	 */
	$processor = static::create_fragment( $html );
	if ( null === $processor ) {
		return null;
	}

	// Serialization itself returns `null` when it cannot produce output.
	return $processor->serialize();
}

/**
 * Returns normalized HTML for a fragment by serializing it.
 *
 * This differs from WP_HTML_Processor::normalize in that it starts with
 * a specific HTML Processor, which _must_ not have already started scanning;
 * it must be in the initial ready state and will be in the completed state once
 * serialization is complete.
 *
 * Many aspects of an input HTML fragment may be changed during normalization.
 *
 *  - Attribute values will be double-quoted.
 *  - Duplicate attributes will be removed.
 *  - Omitted tags will be added.
 *  - Tag and attribute name casing will be lower-cased,
 *    except for specific SVG and MathML tags or attributes.
 *  - Text will be re-encoded, null bytes handled,
 *    and invalid UTF-8 replaced with U+FFFD.
 *  - Any incomplete syntax trailing at the end will be omitted,
 *    for example, an unclosed comment opener will be removed.
 *
 * Example:
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<a href=#anchor v=5 href="/" enabled>One</a another v=5><!--' );
 *     echo $processor->serialize();
 *     // <a href="#anchor" v="5" enabled>One</a>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<div></p>fun<table><td>cell</div>' );
 *     echo $processor->serialize();
 *     // <div><p></p>fun<table><tbody><tr><td>cell</td></tr></tbody></table></div>
 *
 *     $processor = WP_HTML_Processor::create_fragment( '<![CDATA[invalid comment]]> syntax < <> "oddities"' );
 *     echo $processor->serialize();
 *     // <!--[CDATA[invalid comment]]--> syntax &lt; &lt;&gt; &quot;oddities&quot;
 *
 * @since 6.7.0
 *
 * @return string|null Normalized HTML markup represented by processor,
 *                     or `null` if unable to generate serialization.
 */
public function serialize(): ?string {
	// Serialization must begin from a processor that has not scanned anything yet.
	$has_started = WP_HTML_Tag_Processor::STATE_READY !== $this->parser_state;
	if ( $has_started ) {
		wp_trigger_error(
			__METHOD__,
			'An HTML Processor which has already started processing cannot serialize its contents. Serialize immediately after creating the instance.',
			E_USER_WARNING
		);
		return null;
	}

	// Gather the serialization of every token, in document order.
	$pieces = array();
	while ( $this->next_token() ) {
		$pieces[] = $this->serialize_token();
	}

	// Without a recorded error the gathered pieces form the full output.
	if ( null === $this->get_last_error() ) {
		return implode( '', $pieces );
	}

	wp_trigger_error(
		__METHOD__,
		"Cannot serialize HTML Processor with parsing error: {$this->get_last_error()}.",
		E_USER_WARNING
	);
	return null;
}

/**
 * Serializes the currently-matched token.
 *
 * This method produces a fully-normative HTML string for the currently-matched token,
 * if able. If not matched at any token or if the token doesn't correspond to any HTML
 * it will return an empty string (for example, presumptuous end tags are ignored).
 *
 * @see static::serialize()
 *
 * @since 6.7.0
 *
 * @return string Serialization of token, or empty string if no serialization exists.
 */
protected function serialize_token(): string {
	$html       = '';
	$token_type = $this->get_token_type();

	// Non-tag tokens are fully handled by this switch; tags are handled after it.
	switch ( $token_type ) {
		case '#doctype':
			$doctype = $this->get_doctype_info();
			if ( null === $doctype ) {
				break;
			}

			$html .= '<!DOCTYPE';

			if ( $doctype->name ) {
				$html .= " {$doctype->name}";
			}

			// Quote identifiers with whichever quote character they don't contain.
			if ( null !== $doctype->public_identifier ) {
				$quote = str_contains( $doctype->public_identifier, '"' ) ? "'" : '"';
				$html .= " PUBLIC {$quote}{$doctype->public_identifier}{$quote}";
			}
			if ( null !== $doctype->system_identifier ) {
				// The SYSTEM keyword is only written when no PUBLIC identifier precedes it.
				if ( null === $doctype->public_identifier ) {
					$html .= ' SYSTEM';
				}
				$quote = str_contains( $doctype->system_identifier, '"' ) ? "'" : '"';
				$html .= " {$quote}{$doctype->system_identifier}{$quote}";
			}

			$html .= '>';
			break;

		case '#text':
			$html .= htmlspecialchars( $this->get_modifiable_text(), ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
			break;

		// Unlike the `<>` which is interpreted as plaintext, this is ignored entirely.
		case '#presumptuous-tag':
			break;

		case '#funky-comment':
		case '#comment':
			$html .= "<!--{$this->get_full_comment_text()}-->";
			break;

		case '#cdata-section':
			$html .= "<![CDATA[{$this->get_modifiable_text()}]]>";
			break;
	}

	if ( '#tag' !== $token_type ) {
		return $html;
	}

	// HTML-namespace tag names are lower-cased; foreign content uses its qualified name.
	$tag_name       = str_replace( "\x00", "\u{FFFD}", $this->get_tag() );
	$in_html        = 'html' === $this->get_namespace();
	$qualified_name = $in_html ? strtolower( $tag_name ) : $this->get_qualified_tag_name();

	// Closing tags are written without attributes.
	if ( $this->is_tag_closer() ) {
		$html .= "</{$qualified_name}>";
		return $html;
	}

	$attribute_names = $this->get_attribute_names_with_prefix( '' );
	if ( ! isset( $attribute_names ) ) {
		$html .= "<{$qualified_name}>";
		return $html;
	}

	$html .= "<{$qualified_name}";
	foreach ( $attribute_names as $attribute_name ) {
		$html .= " {$this->get_qualified_attribute_name( $attribute_name )}";
		$value = $this->get_attribute( $attribute_name );

		// Only string values are written out; anything else leaves the bare attribute name.
		if ( is_string( $value ) ) {
			/*
			 * The encoding is passed explicitly, as for the other escaping
			 * calls in this method, so that attribute values are not escaped
			 * according to the `default_charset` ini setting.
			 */
			$html .= '="' . htmlspecialchars( $value, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' ) . '"';
		}

		// Null bytes in what has been written so far become U+FFFD.
		$html = str_replace( "\x00", "\u{FFFD}", $html );
	}

	// The self-closing flag is only preserved outside of the HTML namespace.
	if ( ! $in_html && $this->has_self_closing_flag() ) {
		$html .= ' /';
	}

	$html .= '>';

	// Flush out self-contained elements.
	if ( $in_html && in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP' ), true ) ) {
		$text = $this->get_modifiable_text();

		switch ( $tag_name ) {
			// The contents of these elements are dropped from the output.
			case 'IFRAME':
			case 'NOEMBED':
			case 'NOFRAMES':
				$text = '';
				break;

			// SCRIPT and STYLE contents are written without escaping.
			case 'SCRIPT':
			case 'STYLE':
				break;

			// The remaining elements (TEXTAREA, TITLE, XMP) have their text escaped.
			default:
				$text = htmlspecialchars( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' );
		}

		$html .= "{$text}</{$qualified_name}>";
	}

	return $html;
}

/**
 * Parses next element in the 'initial' insertion mode.
 *
 * This internal function performs the 'initial' insertion mode
 * logic for the generalized WP_HTML_Processor::step() function.
 *
 * @since 6.7.0
 *
 * @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
 *
 * @see https://html.spec.whatwg.org/#the-initial-insertion-mode
 * @see WP_HTML_Processor::step
 *
 * @return bool Whether an element was found.
 */
private function step_initial(): bool {
	$token_name = $this->get_token_name();
	$token_type = $this->get_token_type();
	$op_sigil   = '#tag' === $token_type ? ( parent::is_tag_closer() ? '-' : '+' ) : '';
	$op         = "{$op_sigil}{$token_name}";

	switch ( $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION,
		 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
		 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 *
		 * Parse error: ignore the token.

Changelog

VersionDescription
6.4.0Introduced.

User Contributed Notes

You must log in before being able to contribute a note or feedback.