WP_HTML_Tag_Processor::parse_next_tag(): bool

In this article

This function’s access is marked private. This means it is not intended for use by plugin or theme developers, only in other core functions. It is listed here for completeness.

Parses the next tag.

Description

This will find and start parsing the next tag, including the opening <, the potential closer /, and the tag name. It does not parse the attributes or scan to the closing >; these are left for other methods.

Return

bool Whether a tag was found before the end of the document.

Source

/**
 * Parses the next tag.
 *
 * This will find and start parsing the next tag, including the opening `<`,
 * the potential closer `/`, and the tag name. It does not parse the attributes
 * or scan to the closing `>`; these are left for other methods.
 *
 * Comments, CDATA sections, DOCTYPE declarations, processing-instruction
 * look-alikes, and other non-tag syntax starting with `<` are skipped over.
 *
 * On success this sets `is_closing_tag`, `tag_name_starts_at`,
 * `tag_name_length`, and advances `bytes_already_parsed` to the byte
 * immediately after the tag name.
 *
 * @since 6.2.0
 * @since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
 *
 * @return bool Whether a tag was found before the end of the document.
 */
private function parse_next_tag() {
	// NOTE(review): after_tag() is defined elsewhere; presumably it finishes work on the previously-matched tag.
	$this->after_tag();

	$html       = $this->html;
	$doc_length = strlen( $html );
	$at         = $this->bytes_already_parsed;

	while ( false !== $at && $at < $doc_length ) {
		$at = strpos( $html, '<', $at );
		if ( false === $at ) {
			return false;
		}

		/*
		 * Only look ahead when there is a byte to look at: a `<` in the
		 * final position of the document has nothing following it.
		 *
		 * For a tag closer `$at` is advanced onto the `/`, so from here
		 * on `$at + 1` is always the first byte of the would-be tag name.
		 */
		if ( $at + 1 < $doc_length && '/' === $html[ $at + 1 ] ) {
			$this->is_closing_tag = true;
			++$at;
		} else {
			$this->is_closing_tag = false;
		}

		/*
		 * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
		 * For example, "<3" is rendered as text, not a tag opener. If at least
		 * one letter follows the "<" then _it is_ a tag, but if the following
		 * character is anything else it _is not a tag_.
		 *
		 * It's not uncommon to find non-tags starting with `<` in an HTML
		 * document, so it's good for performance to make this pre-check before
		 * continuing to attempt to parse a tag name.
		 *
		 * Reference:
		 * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
		 * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
		if ( $tag_name_prefix_length > 0 ) {
			++$at;
			$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
			$this->tag_name_starts_at   = $at;
			$this->bytes_already_parsed = $at + $this->tag_name_length;
			return true;
		}

		/*
		 * Abort if no tag is found before the end of
		 * the document. There is nothing left to parse.
		 *
		 * Past this point `$html[ $at + 1 ]` is guaranteed to exist.
		 */
		if ( $at + 1 >= $doc_length ) {
			return false;
		}

		/*
		 * <! transitions to markup declaration open state
		 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
		 *
		 * This only applies to `<!`, never to `</!`: a tag closer whose name
		 * starts with a non-letter is a bogus comment ending at the first `>`
		 * and is handled by the tag-closer branch further below.
		 */
		if ( ! $this->is_closing_tag && '!' === $html[ $at + 1 ] ) {
			/*
			 * <!-- transitions to the comment start state – skip to the nearest -->
			 * https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
			 */
			if (
				$doc_length > $at + 3 &&
				'-' === $html[ $at + 2 ] &&
				'-' === $html[ $at + 3 ]
			) {
				$closer_at = $at + 4;
				// If it's not possible to close the comment then there is nothing more to scan.
				if ( $doc_length <= $closer_at ) {
					return false;
				}

				/*
				 * Abruptly-closed empty comments are a sequence of dashes followed by `>`.
				 * The run of dashes may reach the end of the document, in which case
				 * there is no byte after it to compare against.
				 */
				$span_of_dashes = strspn( $html, '-', $closer_at );
				$after_dashes   = $closer_at + $span_of_dashes;
				if ( $after_dashes < $doc_length && '>' === $html[ $after_dashes ] ) {
					$at = $after_dashes + 1;
					continue;
				}

				/*
				 * Comments may be closed by either a --> or an invalid --!>.
				 * The first occurrence closes the comment.
				 *
				 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
				 */
				--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
				while ( ++$closer_at < $doc_length ) {
					$closer_at = strpos( $html, '--', $closer_at );
					if ( false === $closer_at ) {
						return false;
					}

					if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
						$at = $closer_at + 3;
						continue 2;
					}

					if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
						$at = $closer_at + 4;
						continue 2;
					}
				}
			}

			/*
			 * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]>
			 * The CDATA is case-sensitive.
			 * https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
			 */
			if (
				$doc_length > $at + 8 &&
				'[' === $html[ $at + 2 ] &&
				'C' === $html[ $at + 3 ] &&
				'D' === $html[ $at + 4 ] &&
				'A' === $html[ $at + 5 ] &&
				'T' === $html[ $at + 6 ] &&
				'A' === $html[ $at + 7 ] &&
				'[' === $html[ $at + 8 ]
			) {
				$closer_at = strpos( $html, ']]>', $at + 9 );
				if ( false === $closer_at ) {
					return false;
				}

				$at = $closer_at + 3;
				continue;
			}

			/*
			 * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest >
			 * These are ASCII-case-insensitive.
			 * https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
			 */
			if (
				$doc_length > $at + 8 &&
				( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
				( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
				( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
				( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
				( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
				( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
				( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
			) {
				$closer_at = strpos( $html, '>', $at + 9 );
				if ( false === $closer_at ) {
					return false;
				}

				$at = $closer_at + 1;
				continue;
			}

			/*
			 * Anything else here is an incorrectly-opened comment and transitions
			 * to the bogus comment state - skip to the nearest >.
			 */
			$closer_at = strpos( $html, '>', $at + 1 );
			if ( false === $closer_at ) {
				return false;
			}

			$at = $closer_at + 1;
			continue;
		}

		/*
		 * </> is a missing end tag name, which is ignored.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
		 */
		if ( '>' === $html[ $at + 1 ] ) {
			++$at;
			continue;
		}

		/*
		 * <? transitions to a bogus comment state – skip to the nearest >
		 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		if ( '?' === $html[ $at + 1 ] ) {
			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				return false;
			}

			$at = $closer_at + 1;
			continue;
		}

		/*
		 * If a non-alpha starts the tag name in a tag closer it's a comment.
		 * Find the first `>`, which closes the comment.
		 *
		 * `$at` sits on the `/` and `$at + 1` is already known not to be `>`,
		 * so the earliest byte that can close the comment is `$at + 2`.
		 * Starting any later would skip the closer of a one-character
		 * comment such as `</3>` and could run past the end of the document.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
		 */
		if ( $this->is_closing_tag ) {
			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				return false;
			}

			$at = $closer_at + 1;
			continue;
		}

		++$at;
	}

	return false;
}

Changelog

Version | Description
6.2.1 | Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
6.2.0 | Introduced.

User Contributed Notes

You must log in before being able to contribute a note or feedback.