wpdb::strip_invalid_text( array $data ): array|WP_Error

In this article

Strips any invalid characters based on value/charset pairs.

Parameters

$data (array, required)
Array of value arrays. Each value array has the keys 'value', 'charset', and 'length'.
An optional 'ascii' key can be set to false to avoid redundant ASCII checks.

Return

array|WP_Error The $data parameter, with invalid characters removed from each value.
This works as a passthrough: any additional keys such as 'field' are retained in each value array. If we cannot remove invalid characters, a WP_Error object is returned.

Source

/**
 * Strips any invalid characters based on value/charset pairs.
 *
 * Each value is handled locally when possible: non-string values and values
 * without a charset are passed through untouched, latin1 and pure-ASCII values
 * are only truncated, and utf8/utf8mb3/utf8mb4 values are filtered with a regex.
 * Everything else is flagged and converted by the database in a single SELECT.
 *
 * @since 4.2.0
 *
 * @param array $data Array of value arrays. Each value array has the keys 'value', 'charset',
 *                    and 'length'. An optional 'ascii' key can be set to false to avoid
 *                    redundant ASCII checks.
 * @return array|WP_Error The $data parameter, with invalid characters removed from each value.
 *                        This works as a passthrough: any additional keys such as 'field' are
 *                        retained in each value array. If we cannot remove invalid characters,
 *                        a WP_Error object is returned.
 */
protected function strip_invalid_text( $data ) {
	// Set to true as soon as at least one value has to be checked by the database.
	$db_check_string = false;

	foreach ( $data as &$value ) {
		$charset = $value['charset'];

		if ( is_array( $value['length'] ) ) {
			$length                  = $value['length']['length'];
			$truncate_by_byte_length = 'byte' === $value['length']['type'];
		} else {
			$length = false;
			/*
			 * Since we have no length, we'll never truncate. Initialize the variable to false.
			 * True would take us through an unnecessary (for this case) codepath below.
			 */
			$truncate_by_byte_length = false;
		}

		// There's no charset to work with.
		if ( false === $charset ) {
			continue;
		}

		// Column isn't a string.
		if ( ! is_string( $value['value'] ) ) {
			continue;
		}

		$needs_validation = true;
		if (
			// latin1 can store any byte sequence.
			'latin1' === $charset
		||
			// ASCII is always OK. The check is skipped whenever the 'ascii' key is set.
			( ! isset( $value['ascii'] ) && $this->check_ascii( $value['value'] ) )
		) {
			// Nothing to strip, so the only remaining work is a byte-wise truncation.
			$truncate_by_byte_length = true;
			$needs_validation        = false;
		}

		if ( $truncate_by_byte_length ) {
			// Make strlen()/substr() operate on bytes while truncating.
			mbstring_binary_safe_encoding();
			if ( false !== $length && strlen( $value['value'] ) > $length ) {
				$value['value'] = substr( $value['value'], 0, $length );
			}
			reset_mbstring_encoding();

			if ( ! $needs_validation ) {
				continue;
			}
		}

		// utf8 can be handled by regex, which is a bunch faster than a DB lookup.
		if ( ( 'utf8' === $charset || 'utf8mb3' === $charset || 'utf8mb4' === $charset ) && function_exists( 'mb_strlen' ) ) {
			$regex = '/
				(
					(?: [\x00-\x7F]                  # single-byte sequences   0xxxxxxx
					|   [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
					|   \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
					|   [\xE1-\xEC][\x80-\xBF]{2}
					|   \xED[\x80-\x9F][\x80-\xBF]
					|   [\xEE-\xEF][\x80-\xBF]{2}';

			// Four-byte sequences are only accepted for utf8mb4.
			if ( 'utf8mb4' === $charset ) {
				$regex .= '
					|    \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
					|    [\xF1-\xF3][\x80-\xBF]{3}
					|    \xF4[\x80-\x8F][\x80-\xBF]{2}
				';
			}

			$regex .= '){1,40}                          # ...one or more times
				)
				| .                                  # anything else
				/x';

			// Valid runs are kept via $1; any other byte matches the "." branch and is dropped.
			$stripped = preg_replace( $regex, '$1', $value['value'] );

			/*
			 * preg_replace() returns null when PCRE fails. Report that instead of
			 * silently overwriting the value with null.
			 */
			if ( null === $stripped ) {
				return new WP_Error( 'wpdb_strip_invalid_text_failure', __( 'Could not strip invalid text.' ) );
			}

			$value['value'] = $stripped;

			// Character-based truncation of the cleaned value.
			if ( false !== $length && mb_strlen( $value['value'], 'UTF-8' ) > $length ) {
				$value['value'] = mb_substr( $value['value'], 0, $length, 'UTF-8' );
			}
			continue;
		}

		// We couldn't use any local conversions, send it to the DB.
		$value['db']     = true;
		$db_check_string = true;
	}
	unset( $value ); // Remove by reference.

	if ( $db_check_string ) {
		// The connection charset is the same for every column, so resolve it once.
		if ( $this->charset ) {
			$connection_charset = $this->charset;
		} else {
			$connection_charset = mysqli_character_set_name( $this->dbh );
		}

		$queries = array();
		foreach ( $data as $col => $value ) {
			if ( ! empty( $value['db'] ) ) {
				// We're going to need to truncate by characters or bytes, depending on the length value we have.
				if ( isset( $value['length']['type'] ) && 'byte' === $value['length']['type'] ) {
					// Using binary causes LEFT() to truncate by bytes.
					$charset = 'binary';
				} else {
					$charset = $value['charset'];
				}

				if ( is_array( $value['length'] ) ) {
					// Format the length as a plain integer string before embedding it in the SQL.
					$length          = sprintf( '%.0f', $value['length']['length'] );
					$queries[ $col ] = $this->prepare( "CONVERT( LEFT( CONVERT( %s USING $charset ), $length ) USING $connection_charset )", $value['value'] );
				} elseif ( 'binary' !== $charset ) {
					// If we don't have a length, there's no need to convert binary - it will always return the same result.
					$queries[ $col ] = $this->prepare( "CONVERT( CONVERT( %s USING $charset ) USING $connection_charset )", $value['value'] );
				}

				// The 'db' flag is internal bookkeeping; don't leak it to the caller.
				unset( $data[ $col ]['db'] );
			}
		}

		// Alias each expression as x_<key> so results can be mapped back to $data.
		$sql = array();
		foreach ( $queries as $column => $query ) {
			if ( ! $query ) {
				continue;
			}

			$sql[] = $query . " AS x_$column";
		}

		$this->check_current_query = false;
		$row                       = $this->get_row( 'SELECT ' . implode( ', ', $sql ), ARRAY_A );
		if ( ! $row ) {
			return new WP_Error( 'wpdb_strip_invalid_text_failure', __( 'Could not strip invalid text.' ) );
		}

		// Copy the converted values back; entries without a non-null result keep their current value.
		foreach ( array_keys( $data ) as $column ) {
			if ( isset( $row[ "x_$column" ] ) ) {
				$data[ $column ]['value'] = $row[ "x_$column" ];
			}
		}
	}

	return $data;
}

Changelog

Version | Description
4.2.0 | Introduced.

User Contributed Notes

You must log in before being able to contribute a note or feedback.