Quellcodebibliothek Statistik Leitseite products/Sources/formale Sprachen/C/Firefox/third_party/rust/encoding_rs/src/   (Browser von der Mozilla Stiftung Version 136.0.1©)  Datei vom 10.2.2025 mit Größe 128 kB image not shown  

Quelle  mem.rs   Sprache: unbekannt

 
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Functions for converting between different in-RAM representations of text
//! and for quickly checking if the Unicode Bidirectional Algorithm can be
//! avoided.
//!
//! By using slices for output, the functions here seek to enable by-register
//! (ALU register or SIMD register as available) operations in order to
//! outperform iterator-based conversions available in the Rust standard
//! library.
//!
//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
//! in-memory encoding is sometimes used as a storage optimization of text
//! when UTF-16 indexing and length semantics are exposed.
//!
//! The FFI binding for this module are in the
//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).

#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
#[cfg(feature = "alloc")]
use alloc::string::String;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;

use super::in_inclusive_range16;
use super::in_inclusive_range32;
use super::in_inclusive_range8;
use super::in_range16;
use super::in_range32;
use super::DecoderResult;
use crate::ascii::*;
use crate::utf_8::*;

macro_rules! non_fuzz_debug_assert {
    ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); })
}

cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::core::intrinsics::likely;
        use ::core::intrinsics::unlikely;
    } else {
        #[inline(always)]
        fn likely(b: bool) -> bool {
            b
        }
        #[inline(always)]
        fn unlikely(b: bool) -> bool {
            b
        }
    }
}

/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;

#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Safety: the above check lets us perform 4 consecutive reads of
                        // length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size
                        // is the size of the `src` pointer, so this is equal to performing four usize reads.
                        //
                        // This invariant is upheld on all loop iterations
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            // Safety: this check lets us continue to perform the 4 reads earlier
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    while offset <= len_minus_stride {
                        // Safety: the above check lets us perform one usize read.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}

#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Safety: the above check lets us perform 4 consecutive reads of
                        // length SIMD_STRIDE_SIZE / unit_size. SIMD_STRIDE_SIZE is the size of $simd_ty, and unit_size
                        // is the size of the `src` pointer, so this is equal to performing four $simd_ty reads.
                        //
                        // This invariant is upheld on all loop iterations
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            // Safety: this check lets us continue to perform the 4 reads earlier
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // Safety: the above check lets us perform one $simd_ty read.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use core::simd::u8x16;
        use core::simd::u16x8;

        const SIMD_ALIGNMENT: usize = 16;

        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            'outer: loop {
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        return offset + up_to;
                    }
                    if last_valid_low {
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            break 'outer;
                        }
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        if last_valid_low {
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}

/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let len = buffer.len();
    if len == 0 {
        return (0, false);
    }
    let mut offset = 0usize;
    loop {
        let unit = buffer[offset];
        let next = offset + 1;
        let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
        if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
            // Not a surrogate
            offset = next;
            if offset == len {
                return (offset, false);
            }
            continue;
        }
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // high surrogate
            if next < len {
                let second = buffer[next];
                let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
                    // The next code unit is a low surrogate. Advance position.
                    offset = next + 1;
                    if offset == len {
                        return (offset, true);
                    }
                    continue;
                }
                // The next code unit is not a low surrogate. Don't advance
                // position and treat the high surrogate as unpaired.
                // fall through
            }
            // Unpaired, fall through
        }
        // Unpaired surrogate
        return (offset, false);
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK;
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut bytes = buffer.as_bytes();
            let mut total = 0;
            loop {
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}

#[inline(always)]
fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
    let mut bytes = buffer;
    let mut total = 0;
    loop {
        if let Some((byte, offset)) = validate_ascii(bytes) {
            total += offset;
            if in_inclusive_range8(byte, 0xC2, 0xC3) {
                let next = offset + 1;
                if next == bytes.len() {
                    return Some(total);
                }
                if bytes[next] & 0xC0 != 0x80 {
                    return Some(total);
                }
                bytes = &bytes[offset + 2..];
                total += 2;
            } else {
                return Some(total);
            }
        } else {
            return None;
        }
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    } else {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            for &u in buffer {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the aligment is recomputing
                            // but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                                           ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}

/// Checks whether the buffer is all-ASCII.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}

/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}

/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    is_utf8_latin1_impl(buffer).is_none()
}

/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.
pub fn is_str_latin1(buffer: &str) -> bool {
    is_str_latin1_impl(buffer).is_none()
}

/// Checks whether the buffer represents only code point less than or equal
/// to U+00FF.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}

/// Checks whether a potentially-invalid UTF-8 buffer contains code points
/// that trigger right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
/// no RTL characters.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut src = buffer;
    'outer: loop {
        if let Some((mut byte, mut read)) = validate_ascii(src) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            if read + 4 <= src.len() {
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0..=0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            src = &src[read..];
                            continue 'outer;
                        }
                        0xC2..=0xD5 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        0xE1 | 0xE3..=0xEC | 0xEE => {
                            // Three-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                if second == 0xAC {
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1..=0xF4 => {
                            // Four-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            if unlikely(second == 0x90 || second == 0x9E) {
                                let third = src[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    if read + 4 > src.len() {
                        if read == src.len() {
                            return false;
                        }
                        byte = src[read];
                        break 'inner;
                    }
                    byte = src[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            match byte {
                0..=0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    src = &src[read..];
                    continue 'outer;
                }
                0xC2..=0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                0xE1 | 0xE3..=0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
                        if second == 0xAC {
                            if third > 0x9C {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else if second == 0xBB {
                            if third != 0xBF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    return true;
                }
            }
            return false;
        } else {
            return false;
        }
    }
}

/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consist entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unlikely(byte >= 0xD6) {
                            if byte == 0xD6 {
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: write and go back to SIMD.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    let second = bytes[read + 1];
                    if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            return false;
        }
    }
}

/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    is_utf16_bidi_impl(buffer)
}

/// Checks whether a scalar value triggers right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    // Controls:
    // Every control with RIGHT-TO-LEFT in its name in
    // https://www.unicode.org/charts/PDF/U2000.pdf
    // U+200F RLM
    // U+202B RLE
    // U+202E RLO
    // U+2067 RLI
    //
    // BMP RTL:
    // https://www.unicode.org/roadmaps/bmp/
    // U+0590...U+08FF
    // U+FB1D...U+FDFF Hebrew presentation forms and
    //                 Arabic Presentation Forms A
    // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
    //
    // Supplementary RTL:
    // https://www.unicode.org/roadmaps/smp/
    // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
    // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
    let code_point = u32::from(c);
    if code_point < 0x0590 {
        // Below Hebrew
        return false;
    }
    if in_range32(code_point, 0x0900, 0xFB1D) {
        // Above Arabic Extended-A and below Hebrew presentation forms
        if in_inclusive_range32(code_point, 0x200F, 0x2067) {
            // In the range that contains the RTL controls
            return code_point == 0x200F
                || code_point == 0x202B
                || code_point == 0x202E
                || code_point == 0x2067;
        }
        return false;
    }
    if code_point > 0x1EFFF {
        // Above second astral RTL. (Emoji is here.)
        return false;
    }
    if in_range32(code_point, 0x11000, 0x1E800) {
        // Between astral RTL blocks
        return false;
    }
    if in_range32(code_point, 0xFEFF, 0x10800) {
        // Above Arabic Presentations Forms B (excl. BOM) and below first
        // astral RTL
        return false;
    }
    if in_range32(code_point, 0xFE00, 0xFE70) {
        // Between Arabic Presentations Forms
        return false;
    }
    true
}

/// Checks whether a UTF-16 code unit triggers right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Since supplementary-plane right-to-left blocks are identifiable from the
/// high surrogate without examining the low surrogate, this function returns
/// `true` for such high surrogates making the function suitable for handling
/// supplementary-plane text without decoding surrogate pairs to scalar
/// values. Obviously, such high surrogates are then reported as right-to-left
/// even if actually unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    if u < 0x0590 {
        // Below Hebrew
        return false;
    }
    if in_range16(u, 0x0900, 0xD802) {
        // Above Arabic Extended-A and below first RTL surrogate
        if in_inclusive_range16(u, 0x200F, 0x2067) {
            // In the range that contains the RTL controls
            return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
        }
        return false;
    }
    if in_range16(u, 0xD83C, 0xFB1D) {
        // Between astral RTL high surrogates and Hebrew presentation forms
        // (Emoji is here)
        return false;
    }
    if in_range16(u, 0xD804, 0xD83A) {
        // Between RTL high surragates
        return false;
    }
    if u > 0xFEFE {
        // Above Arabic Presentation Forms (excl. BOM)
        return false;
    }
    if in_range16(u, 0xFE00, 0xFE70) {
        // Between Arabic Presentations Forms
        return false;
    }
    true
}

/// Checks whether a potentially invalid UTF-8 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
    if let Some(offset) = is_utf8_latin1_impl(buffer) {
        if is_utf8_bidi(&buffer[offset..]) {
            Latin1Bidi::Bidi
        } else {
            Latin1Bidi::LeftToRight
        }
    } else {
        Latin1Bidi::Latin1
    }
}

/// Checks whether a valid UTF-8 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
    // The transition from the latin1 check to the bidi check isn't
    // optimal but not tweaking it to perfection today.
    if let Some(offset) = is_str_latin1_impl(buffer) {
        if is_str_bidi(&buffer[offset..]) {
            Latin1Bidi::Bidi
        } else {
            Latin1Bidi::LeftToRight
        }
    } else {
        Latin1Bidi::Latin1
    }
}

/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    check_utf16_for_latin1_and_bidi_impl(buffer)
}

/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
/// with the REPLACEMENT CHARACTER.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer _plus one_.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
    // TODO: Can the requirement for dst to be at least one unit longer
    // be eliminated?
    assert!(dst.len() > src.len());
    let mut decoder = Utf8Decoder::new_inner();
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    loop {
        let (result, read, written) =
            decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
        total_read += read;
        total_written += written;
        match result {
            DecoderResult::InputEmpty => {
                return total_written;
            }
            DecoderResult::OutputFull => {
                unreachable!("The assert at the top of the function should have caught this.");
            }
            DecoderResult::Malformed(_, _) => {
                // There should always be space for the U+FFFD, because
                // otherwise we'd have gotten OutputFull already.
                dst[total_written] = 0xFFFD;
                total_written += 1;
            }
        }
    }
}

/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
--> --------------------

--> maximum size reached

--> --------------------

[ zur Elbe Produktseite wechseln0.57Quellennavigators  Analyse erneut starten  ]