Parses the next tag.
Description
This will find and start parsing the next tag, including the opening `<`, the potential closer `/`, and the tag name. It does not parse the attributes or scan to the closing `>`; these are left for other methods.
Source
/*
* Everything of interest past here starts with "<".
* Check this character and advance position regardless.
*/
if ( '<' !== $html[ $at++ ] ) {
continue;
}
/*
* Unlike with "-->", the "<!--" only transitions
* into the escaped mode if not already there.
*
* Inside the escaped modes it will be ignored; and
* should never break out of the double-escaped
* mode and back into the escaped mode.
*
* While this requires a mode change, it does not
* impact the parsing otherwise, so continue
* parsing after updating the state.
*/
if (
$at + 2 < $doc_length &&
'!' === $html[ $at ] &&
'-' === $html[ $at + 1 ] &&
'-' === $html[ $at + 2 ]
) {
$at += 3;
$state = 'unescaped' === $state ? 'escaped' : $state;
continue;
}
if ( '/' === $html[ $at ] ) {
$closer_potentially_starts_at = $at - 1;
$is_closing = true;
++$at;
} else {
$is_closing = false;
}
/*
* At this point the only remaining state-changes occur with the
* <script> and </script> tags; unless one of these appears next,
* proceed scanning to the next potential token in the text.
*/
if ( ! (
$at + 6 < $doc_length &&
( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) &&
( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) &&
( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] )
) ) {
++$at;
continue;
}
/*
* Ensure that the script tag terminates to avoid matching on
* substrings of a non-match. For example, the sequence
* "<script123" should not end a script region even though
* "<script" is found within the text.
*/
if ( $at + 6 >= $doc_length ) {
continue;
}
$at += 6;
$c = $html[ $at ];
if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
++$at;
continue;
}
if ( 'escaped' === $state && ! $is_closing ) {
$state = 'double-escaped';
continue;
}
if ( 'double-escaped' === $state && $is_closing ) {
$state = 'escaped';
continue;
}
if ( $is_closing ) {
$this->bytes_already_parsed = $closer_potentially_starts_at;
$this->tag_name_starts_at = $closer_potentially_starts_at;
if ( $this->bytes_already_parsed >= $doc_length ) {
return false;
}
while ( $this->parse_next_attribute() ) {
continue;
}
if ( $this->bytes_already_parsed >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
if ( '>' === $html[ $this->bytes_already_parsed ] ) {
++$this->bytes_already_parsed;
return true;
}
}
++$at;
}
return false;
}
/**
* Parses the next tag.
*
* This will find and start parsing the next tag, including
* the opening `<`, the potential closer `/`, and the tag
* name. It does not parse the attributes or scan to the
* closing `>`; these are left for other methods.
*
* @since 6.2.0
* @since 6.2.1 Support abruptly-closed comments, invalid-tag-closer-comments, and empty elements.
*
* @return bool Whether a tag was found before the end of the document.
*/
private function parse_next_tag(): bool {
$this->after_tag();
$html = $this->html;
$doc_length = strlen( $html );
$was_at = $this->bytes_already_parsed;
$at = $was_at;
while ( $at < $doc_length ) {
$at = strpos( $html, '<', $at );
if ( false === $at ) {
break;
}
if ( $at > $was_at ) {
/*
* A "<" normally starts a new HTML tag or syntax token, but in cases where the
* following character can't produce a valid token, the "<" is instead treated
* as plaintext and the parser should skip over it. This avoids a problem when
* following earlier practices of typing emoji with text, e.g. "<3". This
* should be a heart, not a tag. It's supposed to be rendered, not hidden.
*
* At this point the parser checks if this is one of those cases and if it is
* will continue searching for the next "<" in search of a token boundary.
*
* @see https://html.spec.whatwg.org/#tag-open-state
*/
if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
++$at;
continue;
}
$this->parser_state = self::STATE_TEXT_NODE;
$this->token_starts_at = $was_at;
$this->token_length = $at - $was_at;
$this->text_starts_at = $was_at;
$this->text_length = $this->token_length;
$this->bytes_already_parsed = $at;
return true;
}
$this->token_starts_at = $at;
if ( $at + 1 < $doc_length && '/' === $this->html[ $at + 1 ] ) {
$this->is_closing_tag = true;
++$at;
} else {
$this->is_closing_tag = false;
}
/*
* HTML tag names must start with [a-zA-Z] otherwise they are not tags.
* For example, "<3" is rendered as text, not a tag opener. If at least
* one letter follows the "<" then _it is_ a tag, but if the following
* character is anything else it _is not a tag_.
*
* It's not uncommon to find non-tags starting with `<` in an HTML
* document, so it's good for performance to make this pre-check before
* continuing to attempt to parse a tag name.
*
* Reference:
* * https://html.spec.whatwg.org/multipage/parsing.html#data-state
* * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
if ( $tag_name_prefix_length > 0 ) {
++$at;
$this->parser_state = self::STATE_MATCHED_TAG;
$this->tag_name_starts_at = $at;
$this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
$this->bytes_already_parsed = $at + $this->tag_name_length;
return true;
}
/*
* Abort if no tag is found before the end of
* the document. There is nothing left to parse.
*/
if ( $at + 1 >= $doc_length ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
/*
* `<!` transitions to markup declaration open state
* https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
*/
if ( ! $this->is_closing_tag && '!' === $html[ $at + 1 ] ) {
/*
* `<!--` transitions to a comment state – apply further comment rules.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
$closer_at = $at + 4;
// If it's not possible to close the comment then there is nothing more to scan.
if ( $doc_length <= $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
$span_of_dashes = strspn( $html, '-', $closer_at );
if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
/*
* @todo When implementing `set_modifiable_text()` ensure that updates to this token
* don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment
* and bogus comment syntax, these leave no clear insertion point for text and
* they need to be modified specially in order to contain text. E.g. to store
* `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which
* involves inserting an additional `-` into the token after the modifiable text.
*/
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
$this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;
// Only provide modifiable text if the token is long enough to contain it.
if ( $span_of_dashes >= 2 ) {
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
$this->text_starts_at = $this->token_starts_at + 4;
$this->text_length = $span_of_dashes - 2;
}
$this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
return true;
}
/*
* Comments may be closed by either a --> or an invalid --!>.
* The first occurrence closes the comment.
*
* See https://html.spec.whatwg.org/#parse-error-incorrectly-closed-comment
*/
--$closer_at; // Pre-increment inside condition below reduces risk of accidental infinite looping.
while ( ++$closer_at < $doc_length ) {
$closer_at = strpos( $html, '--', $closer_at );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
$this->token_length = $closer_at + 3 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 4;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 3;
return true;
}
if (
$closer_at + 3 < $doc_length &&
'!' === $html[ $closer_at + 2 ] &&
'>' === $html[ $closer_at + 3 ]
) {
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_HTML_COMMENT;
$this->token_length = $closer_at + 4 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 4;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 4;
return true;
}
}
}
/*
* `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
* These are ASCII-case-insensitive.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if (
$doc_length > $at + 8 &&
( 'D' === $html[ $at + 2 ] || 'd' === $html[ $at + 2 ] ) &&
( 'O' === $html[ $at + 3 ] || 'o' === $html[ $at + 3 ] ) &&
( 'C' === $html[ $at + 4 ] || 'c' === $html[ $at + 4 ] ) &&
( 'T' === $html[ $at + 5 ] || 't' === $html[ $at + 5 ] ) &&
( 'Y' === $html[ $at + 6 ] || 'y' === $html[ $at + 6 ] ) &&
( 'P' === $html[ $at + 7 ] || 'p' === $html[ $at + 7 ] ) &&
( 'E' === $html[ $at + 8 ] || 'e' === $html[ $at + 8 ] )
) {
$closer_at = strpos( $html, '>', $at + 9 );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->parser_state = self::STATE_DOCTYPE;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 9;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
return true;
}
if (
'html' !== $this->parsing_namespace &&
strlen( $html ) > $at + 8 &&
'[' === $html[ $at + 2 ] &&
'C' === $html[ $at + 3 ] &&
'D' === $html[ $at + 4 ] &&
'A' === $html[ $at + 5 ] &&
'T' === $html[ $at + 6 ] &&
'A' === $html[ $at + 7 ] &&
'[' === $html[ $at + 8 ]
) {
$closer_at = strpos( $html, ']]>', $at + 9 );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->parser_state = self::STATE_CDATA_NODE;
$this->text_starts_at = $at + 9;
$this->text_length = $closer_at - $this->text_starts_at;
$this->token_length = $closer_at + 3 - $this->token_starts_at;
$this->bytes_already_parsed = $closer_at + 3;
return true;
}
/*
* Anything else here is an incorrectly-opened comment and transitions
* to the bogus comment state - skip to the nearest >. If no closer is
* found then the HTML was truncated inside the markup declaration.
*/
$closer_at = strpos( $html, '>', $at + 1 );
if ( false === $closer_at ) {
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->parser_state = self::STATE_COMMENT;
$this->comment_type = self::COMMENT_AS_INVALID_HTML;
$this->token_length = $closer_at + 1 - $this->token_starts_at;
$this->text_starts_at = $this->token_starts_at + 2;
$this->text_length = $closer_at - $this->text_starts_at;
$this->bytes_already_parsed = $closer_at + 1;
/*
* Identify nodes that would be CDATA if HTML had CDATA sections.
*
* This section must occur after identifying the bogus comment end
* because in an HTML parser it will span to the nearest `>`, even
* if there's no `]]>` as would be required in an XML document. It
* is therefore not possible to parse a CDATA section containing
* a `>` in the HTML syntax.
*
* Inside foreign elements there is a discrepancy between browsers
* and the specification on this.
*
* @todo Track whether the Tag Processor is inside a foreign element
* and require the proper closing `]]>` in those cases.
*/
if (
$this->token_length >= 10 &&
'[' === $html[ $this->token_starts_at + 2 ] &&
'C' === $html[ $this->token_starts_at + 3 ] &&
'D' === $html[ $this->token_starts_at + 4 ] &&
'A' === $html[ $this->token_starts_at + 5 ] &&
'T' === $html[ $this->token_starts_at + 6 ] &&
'A' === $html[ $this->token_starts_at + 7 ] &&
'[' === $html[ $this->token_starts_at + 8 ] &&
']' === $html[ $closer_at - 1 ] &&
']' === $html[ $closer_at - 2 ]
) {
$this->parser_state = self::STATE_COMMENT;
User Contributed Notes
You must log in before being able to contribute a note or feedback.