Convert a UTF-8 string to a UCS-4 codepoint array
Description
Based on \WpOrg\Requests\Iri::replace_invalid_with_pct_encoding()
Parameters
$input
stringrequired- Text to convert.
Source
protected static function utf8_to_codepoints($input) {
$codepoints = [];
// Get number of bytes
$strlen = strlen($input);
// phpcs:ignore Generic.CodeAnalysis.JumbledIncrementer -- This is a deliberate choice.
for ($position = 0; $position < $strlen; $position++) {
$value = ord($input[$position]);
if ((~$value & 0x80) === 0x80) { // One byte sequence:
$character = $value;
$length = 1;
$remaining = 0;
} elseif (($value & 0xE0) === 0xC0) { // Two byte sequence:
$character = ($value & 0x1F) << 6;
$length = 2;
$remaining = 1;
} elseif (($value & 0xF0) === 0xE0) { // Three byte sequence:
$character = ($value & 0x0F) << 12;
$length = 3;
$remaining = 2;
} elseif (($value & 0xF8) === 0xF0) { // Four byte sequence:
$character = ($value & 0x07) << 18;
$length = 4;
$remaining = 3;
} else { // Invalid byte:
throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $value);
}
if ($remaining > 0) {
if ($position + $length > $strlen) {
throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
}
for ($position++; $remaining > 0; $position++) {
$value = ord($input[$position]);
// If it is invalid, count the sequence as invalid and reprocess the current byte:
if (($value & 0xC0) !== 0x80) {
throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
}
--$remaining;
$character |= ($value & 0x3F) << ($remaining * 6);
}
$position--;
}
if (// Non-shortest form sequences are invalid
$length > 1 && $character <= 0x7F
|| $length > 2 && $character <= 0x7FF
|| $length > 3 && $character <= 0xFFFF
// Outside of range of ucschar codepoints
// Noncharacters
|| ($character & 0xFFFE) === 0xFFFE
|| $character >= 0xFDD0 && $character <= 0xFDEF
|| (
// Everything else not in ucschar
$character > 0xD7FF && $character < 0xF900
|| $character < 0x20
|| $character > 0x7E && $character < 0xA0
|| $character > 0xEFFFD
)
) {
throw new Exception('Invalid Unicode codepoint', 'idna.invalidcodepoint', $character);
}
$codepoints[] = $character;
}
return $codepoints;
}
User Contributed Notes
You must log in before being able to contribute a note or feedback.