IdnaEncoder::utf8_to_codepoints( string $input ): array

In this article

Convert a UTF-8 string to a UCS-4 codepoint array

Description

Based on \WpOrg\Requests\Iri::replace_invalid_with_pct_encoding()

Parameters

$input (string, required)
Text to convert.

Return

array Unicode code points

Source

/**
 * Decode a UTF-8 byte string into a list of integer code points.
 *
 * The input is walked one sequence at a time. Decoding stops with an
 * exception at the first sequence that is malformed (bad lead byte,
 * truncated sequence, bad continuation byte), is encoded in a
 * non-shortest form, or decodes to a code point this method rejects
 * (noncharacters, C0/C1 controls and DEL, 0xD800-0xF8FF, and anything
 * above 0xEFFFD).
 *
 * @param string $input Text to convert.
 * @return array Unicode code points, in input order.
 *
 * @throws Exception With type `idna.invalidcodepoint`; the data argument is
 *                   the offending lead byte, or the (possibly partially
 *                   assembled) code point for every other failure.
 */
protected static function utf8_to_codepoints($input) {
	// Smallest code point that genuinely needs a sequence of the given length;
	// anything below it was encoded in a non-shortest (overlong) form.
	$shortest = [
		1 => 0x00,
		2 => 0x80,
		3 => 0x800,
		4 => 0x10000,
	];

	$decoded = [];
	$total   = strlen($input);
	$offset  = 0;

	while ($offset < $total) {
		$lead = ord($input[$offset]);

		// The high bits of the lead byte give the sequence length and the
		// payload bits that seed the code point.
		if (($lead >> 7) === 0x00) {                // 0xxxxxxx
			$size   = 1;
			$scalar = $lead;
		} elseif (($lead >> 5) === 0x06) {          // 110xxxxx
			$size   = 2;
			$scalar = ($lead & 0x1F) << 6;
		} elseif (($lead >> 4) === 0x0E) {          // 1110xxxx
			$size   = 3;
			$scalar = ($lead & 0x0F) << 12;
		} elseif (($lead >> 3) === 0x1E) {          // 11110xxx
			$size   = 4;
			$scalar = ($lead & 0x07) << 18;
		} else {
			// Stray continuation byte or a lead byte of 0xF8 and above.
			throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $lead);
		}

		// The sequence must fit entirely inside the input.
		if ($offset + $size > $total) {
			throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $scalar);
		}

		// Fold in the continuation bytes, most significant first.
		for ($index = 1; $index < $size; $index++) {
			$trail = ord($input[$offset + $index]);

			// Continuation bytes must look like 10xxxxxx.
			if (($trail & 0xC0) !== 0x80) {
				throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $scalar);
			}

			$scalar |= ($trail & 0x3F) << (($size - 1 - $index) * 6);
		}

		$is_overlong = $scalar < $shortest[$size];

		// U+xxFFFE / U+xxFFFF in every plane, plus U+FDD0..U+FDEF.
		$is_noncharacter = ($scalar & 0xFFFE) === 0xFFFE
			|| ($scalar >= 0xFDD0 && $scalar <= 0xFDEF);

		// Remaining code points that fall outside the accepted ranges.
		$is_out_of_range = $scalar < 0x20
			|| ($scalar > 0x7E && $scalar < 0xA0)
			|| ($scalar > 0xD7FF && $scalar < 0xF900)
			|| $scalar > 0xEFFFD;

		if ($is_overlong || $is_noncharacter || $is_out_of_range) {
			throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $scalar);
		}

		$decoded[] = $scalar;
		$offset   += $size;
	}

	return $decoded;
}

User Contributed Notes

You must log in before being able to contribute a note or feedback.