WP_HTML_Tag_Processor::parse_next_tag(): bool

In this article

This function’s access is marked private. This means it is not intended for use by plugin or theme developers, only in other core functions. It is listed here for completeness.

Parses the next tag.

Description

This will find and start parsing the next tag, including the opening <, the potential closer /, and the tag name. It does not parse the attributes or scan to the closing >; these are left for other methods.

Return

bool Whether a tag was found before the end of the document.

Source

/**
 * Parses the next token in the document.
 *
 * Starting at `$this->bytes_already_parsed`, this finds the next `<` and
 * classifies what follows it: a tag (opening `<`, optional closer `/`, and
 * the tag name), a text node, a comment (including abruptly-closed, bogus,
 * CDATA-lookalike and Processing-Instruction-lookalike comments), a DOCTYPE,
 * a presumptuous tag (`</>`), or a funky comment (`</` followed by a
 * non-letter). For tags it does not parse the attributes or scan to the
 * closing `>`; these are left for other methods.
 *
 * On success the token boundaries are recorded on the instance
 * (`parser_state`, `token_starts_at`, `token_length`, and where relevant
 * `text_starts_at`, `text_length`, `tag_name_starts_at`, `tag_name_length`,
 * `comment_type`, `is_closing_tag`) and `bytes_already_parsed` is advanced.
 * When the document ends in the middle of a token, `parser_state` is set to
 * `STATE_INCOMPLETE_INPUT` and `false` is returned.
 *
 * @since 6.2.0
 * @since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
 *
 * @return bool Whether a token was found before the end of the document.
 */
private function parse_next_tag() {
	$this->after_tag();

	$html       = $this->html;
	$doc_length = strlen( $html );
	$was_at     = $this->bytes_already_parsed;
	$at         = $was_at;

	while ( false !== $at && $at < $doc_length ) {
		$at = strpos( $html, '<', $at );

		/*
		 * This does not imply an incomplete parse; it indicates that there
		 * can be nothing left in the document other than a #text node.
		 */
		if ( false === $at ) {
			$this->parser_state         = self::STATE_TEXT_NODE;
			$this->token_starts_at      = $was_at;
			$this->token_length         = $doc_length - $was_at;
			$this->text_starts_at       = $was_at;
			$this->text_length          = $this->token_length;
			$this->bytes_already_parsed = $doc_length;
			return true;
		}

		if ( $at > $was_at ) {
			/*
			 * A "<" normally starts a new HTML tag or syntax token, but in cases where the
			 * following character can't produce a valid token, the "<" is instead treated
			 * as plaintext and the parser should skip over it. This avoids a problem when
			 * following earlier practices of typing emoji with text, e.g. "<3". This
			 * should be a heart, not a tag. It's supposed to be rendered, not hidden.
			 *
			 * At this point the parser checks if this is one of those cases and if it is
			 * will continue searching for the next "<" in search of a token boundary.
			 *
			 * @see https://html.spec.whatwg.org/#tag-open-state
			 */
			if ( $doc_length > $at + 1 ) {
				$next_character  = $html[ $at + 1 ];
				$at_another_node = (
					'!' === $next_character ||
					'/' === $next_character ||
					'?' === $next_character ||
					( 'A' <= $next_character && $next_character <= 'Z' ) ||
					( 'a' <= $next_character && $next_character <= 'z' )
				);
				if ( ! $at_another_node ) {
					++$at;
					continue;
				}
			}

			// Everything between the starting offset and this "<" is a text node.
			$this->parser_state         = self::STATE_TEXT_NODE;
			$this->token_starts_at      = $was_at;
			$this->token_length         = $at - $was_at;
			$this->text_starts_at       = $was_at;
			$this->text_length          = $this->token_length;
			$this->bytes_already_parsed = $at;
			return true;
		}

		$this->token_starts_at = $at;

		// For a closer, step onto the "/" so that `$at + 1` is the first character after "</".
		if ( $at + 1 < $doc_length && '/' === $html[ $at + 1 ] ) {
			$this->is_closing_tag = true;
			++$at;
		} else {
			$this->is_closing_tag = false;
		}

		/*
		 * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
		 * For example, "<3" is rendered as text, not a tag opener. If at least
		 * one letter follows the "<" then _it is_ a tag, but if the following
		 * character is anything else it _is not a tag_.
		 *
		 * It's not uncommon to find non-tags starting with `<` in an HTML
		 * document, so it's good for performance to make this pre-check before
		 * continuing to attempt to parse a tag name.
		 *
		 * Reference:
		 * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
		 * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
		if ( $tag_name_prefix_length > 0 ) {
			++$at;
			$this->parser_state         = self::STATE_MATCHED_TAG;
			$this->tag_name_starts_at   = $at;
			$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
			$this->bytes_already_parsed = $at + $this->tag_name_length;
			return true;
		}

		/*
		 * Abort if no tag is found before the end of
		 * the document. There is nothing left to parse.
		 */
		if ( $at + 1 >= $doc_length ) {
			$this->parser_state = self::STATE_INCOMPLETE_INPUT;

			return false;
		}

		/*
		 * `<!` transitions to markup declaration open state
		 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
		 */
		if ( ! $this->is_closing_tag && '!' === $html[ $at + 1 ] ) {
			/*
			 * `<!--` transitions to a comment state – apply further comment rules.
			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
			 */
			if (
				$doc_length > $at + 3 &&
				'-' === $html[ $at + 2 ] &&
				'-' === $html[ $at + 3 ]
			) {
				$closer_at = $at + 4;
				// If it's not possible to close the comment then there is nothing more to scan.
				if ( $doc_length <= $closer_at ) {
					$this->parser_state = self::STATE_INCOMPLETE_INPUT;

					return false;
				}

				/*
				 * Abruptly-closed empty comments are a sequence of dashes followed by `>`.
				 *
				 * The span of dashes may run to the very end of the document (e.g. `<!---`),
				 * so the offset must be bounds-checked before reading the byte after it.
				 * In that case the search below reports the input as incomplete.
				 */
				$span_of_dashes = strspn( $html, '-', $closer_at );
				if (
					$closer_at + $span_of_dashes < $doc_length &&
					'>' === $html[ $closer_at + $span_of_dashes ]
				) {
					/*
					 * @todo When implementing `set_modifiable_text()` ensure that updates to this token
					 *       don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment
					 *       and bogus comment syntax, these leave no clear insertion point for text and
					 *       they need to be modified specially in order to contain text. E.g. to store
					 *       `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which
					 *       involves inserting an additional `-` into the token after the modifiable text.
					 */
					$this->parser_state = self::STATE_COMMENT;
					$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
					$this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;

					// Only provide modifiable text if the token is long enough to contain it.
					if ( $span_of_dashes >= 2 ) {
						$this->comment_type   = self::COMMENT_AS_HTML_COMMENT;
						$this->text_starts_at = $this->token_starts_at + 4;
						$this->text_length    = $span_of_dashes - 2;
					}

					$this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
					return true;
				}

				/*
				 * Comments may be closed by either a --> or an invalid --!>.
				 * The first occurrence closes the comment.
				 *
				 * See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
				 */
				--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
				while ( ++$closer_at < $doc_length ) {
					$closer_at = strpos( $html, '--', $closer_at );
					if ( false === $closer_at ) {
						$this->parser_state = self::STATE_INCOMPLETE_INPUT;

						return false;
					}

					if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
						$this->parser_state         = self::STATE_COMMENT;
						$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
						$this->token_length         = $closer_at + 3 - $this->token_starts_at;
						$this->text_starts_at       = $this->token_starts_at + 4;
						$this->text_length          = $closer_at - $this->text_starts_at;
						$this->bytes_already_parsed = $closer_at + 3;
						return true;
					}

					if (
						$closer_at + 3 < $doc_length &&
						'!' === $html[ $closer_at + 2 ] &&
						'>' === $html[ $closer_at + 3 ]
					) {
						$this->parser_state         = self::STATE_COMMENT;
						$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
						$this->token_length         = $closer_at + 4 - $this->token_starts_at;
						$this->text_starts_at       = $this->token_starts_at + 4;
						$this->text_length          = $closer_at - $this->text_starts_at;
						$this->bytes_already_parsed = $closer_at + 4;
						return true;
					}
				}
			}

			/*
			 * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
			 * These are ASCII-case-insensitive.
			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
			 */
			if (
				$doc_length > $at + 8 &&
				( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
				( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
				( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
				( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
				( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
				( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
				( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
			) {
				$closer_at = strpos( $html, '>', $at + 9 );
				if ( false === $closer_at ) {
					$this->parser_state = self::STATE_INCOMPLETE_INPUT;

					return false;
				}

				$this->parser_state         = self::STATE_DOCTYPE;
				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
				$this->text_starts_at       = $this->token_starts_at + 9;
				$this->text_length          = $closer_at - $this->text_starts_at;
				$this->bytes_already_parsed = $closer_at + 1;
				return true;
			}

			/*
			 * Anything else here is an incorrectly-opened comment and transitions
			 * to the bogus comment state - skip to the nearest >. If no closer is
			 * found then the HTML was truncated inside the markup declaration.
			 */
			$closer_at = strpos( $html, '>', $at + 1 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_COMMENT;
			$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;

			/*
			 * Identify nodes that would be CDATA if HTML had CDATA sections.
			 *
			 * This section must occur after identifying the bogus comment end
			 * because in an HTML parser it will span to the nearest `>`, even
			 * if there's no `]]>` as would be required in an XML document. It
			 * is therefore not possible to parse a CDATA section containing
			 * a `>` in the HTML syntax.
			 *
			 * Inside foreign elements there is a discrepancy between browsers
			 * and the specification on this.
			 *
			 * @todo Track whether the Tag Processor is inside a foreign element
			 *       and require the proper closing `]]>` in those cases.
			 */
			if (
				$this->token_length >= 10 &&
				'[' === $html[ $this->token_starts_at + 2 ] &&
				'C' === $html[ $this->token_starts_at + 3 ] &&
				'D' === $html[ $this->token_starts_at + 4 ] &&
				'A' === $html[ $this->token_starts_at + 5 ] &&
				'T' === $html[ $this->token_starts_at + 6 ] &&
				'A' === $html[ $this->token_starts_at + 7 ] &&
				'[' === $html[ $this->token_starts_at + 8 ] &&
				']' === $html[ $closer_at - 1 ] &&
				']' === $html[ $closer_at - 2 ]
			) {
				$this->parser_state    = self::STATE_COMMENT;
				$this->comment_type    = self::COMMENT_AS_CDATA_LOOKALIKE;
				// Trim the leading `[CDATA[` (7 bytes) and the trailing `]]` (2 bytes) from the text span.
				$this->text_starts_at += 7;
				$this->text_length    -= 9;
			}

			return true;
		}

		/*
		 * </> is a missing end tag name, which is ignored.
		 *
		 * This was also known as the "presumptuous empty tag"
		 * in early discussions as it was proposed to close
		 * the nearest previous opening tag.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
		 */
		if ( '>' === $html[ $at + 1 ] ) {
			// `<>` is interpreted as plaintext.
			if ( ! $this->is_closing_tag ) {
				++$at;
				continue;
			}

			$this->parser_state         = self::STATE_PRESUMPTUOUS_TAG;
			$this->token_length         = $at + 2 - $this->token_starts_at;
			$this->bytes_already_parsed = $at + 2;
			return true;
		}

		/*
		 * `<?` transitions to a bogus comment state – skip to the nearest >
		 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
		 */
		if ( ! $this->is_closing_tag && '?' === $html[ $at + 1 ] ) {
			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_COMMENT;
			$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;

			/*
			 * Identify a Processing Instruction node were HTML to have them.
			 *
			 * This section must occur after identifying the bogus comment end
			 * because in an HTML parser it will span to the nearest `>`, even
			 * if there's no `?>` as would be required in an XML document. It
			 * is therefore not possible to parse a Processing Instruction node
			 * containing a `>` in the HTML syntax.
			 *
			 * XML allows for more target names, but this code only identifies
			 * those with ASCII-representable target names. This means that it
			 * may identify some Processing Instruction nodes as bogus comments,
			 * but it will not misinterpret the HTML structure. By limiting the
			 * identification to these target names the Tag Processor can avoid
			 * the need to start parsing UTF-8 sequences.
			 *
			 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
			 *                     [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
			 *                     [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
			 *                     [#x10000-#xEFFFF]
			 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
			 *
			 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
			 */
			if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
				$comment_text     = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
				$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );

				if ( 0 < $pi_target_length ) {
					$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );

					$this->comment_type       = self::COMMENT_AS_PI_NODE_LOOKALIKE;
					$this->tag_name_starts_at = $this->token_starts_at + 2;
					$this->tag_name_length    = $pi_target_length;
					// Exclude the target name from the front and the closing `?` from the back of the text span.
					$this->text_starts_at    += $pi_target_length;
					$this->text_length       -= $pi_target_length + 1;
				}
			}

			return true;
		}

		/*
		 * If a non-alpha starts the tag name in a tag closer it's a comment.
		 * Find the first `>`, which closes the comment.
		 *
		 * This parser classifies these particular comments as special "funky comments"
		 * which are made available for further processing.
		 *
		 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
		 */
		if ( $this->is_closing_tag ) {
			/*
			 * No chance of finding a closer: the document ends right after the
			 * first character following `</`. As with every other truncated
			 * token, report this as incomplete input.
			 */
			if ( $at + 3 > $doc_length ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$closer_at = strpos( $html, '>', $at + 2 );
			if ( false === $closer_at ) {
				$this->parser_state = self::STATE_INCOMPLETE_INPUT;

				return false;
			}

			$this->parser_state         = self::STATE_FUNKY_COMMENT;
			$this->token_length         = $closer_at + 1 - $this->token_starts_at;
			$this->text_starts_at       = $this->token_starts_at + 2;
			$this->text_length          = $closer_at - $this->text_starts_at;
			$this->bytes_already_parsed = $closer_at + 1;
			return true;
		}

		// This "<" does not start any token; keep scanning after it.
		++$at;
	}

	// No token was produced, e.g. parsing started at or beyond the end of the document.
	return false;
}

Changelog

Version | Description
6.2.1 | Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
6.2.0 | Introduced.

User Contributed Notes

You must log in before being able to contribute a note or feedback.