Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quelle  mem.rs   Sprache: unbekannt

 
Untersuchungsergebnis.rs Download desUnknown {[0] [0] [0]}zum Wurzelverzeichnis wechseln

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Functions for converting between different in-RAM representations of text
//! and for quickly checking if the Unicode Bidirectional Algorithm can be
//! avoided.
//!
//! By using slices for output, the functions here seek to enable by-register
//! (ALU register or SIMD register as available) operations in order to
//! outperform iterator-based conversions available in the Rust standard
//! library.
//!
//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
//! in-memory encoding is sometimes used as a storage optimization of text
//! when UTF-16 indexing and length semantics are exposed.
//!
//! The FFI binding for this module are in the
//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).

#[cfg(feature = "alloc")]
use alloc::borrow::Cow;
#[cfg(feature = "alloc")]
use alloc::string::String;
#[cfg(feature = "alloc")]
use alloc::vec::Vec;

use super::in_inclusive_range16;
use super::in_inclusive_range32;
use super::in_inclusive_range8;
use super::in_range16;
use super::in_range32;
use super::DecoderResult;
use crate::ascii::*;
use crate::utf_8::*;

macro_rules! non_fuzz_debug_assert {
    ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); })
}

cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::core::intrinsics::likely;
        use ::core::intrinsics::unlikely;
    } else {
        #[inline(always)]
        fn likely(b: bool) -> bool {
            b
        }
        #[inline(always)]
        fn unlikely(b: bool) -> bool {
            b
        }
    }
}

/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;

#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Safety: the above check lets us perform 4 consecutive reads of
                        // length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size
                        // is the size of the `src` pointer, so this is equal to performing four usize reads.
                        //
                        // This invariant is upheld on all loop iterations
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            // Safety: this check lets us continue to perform the 4 reads earlier
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    while offset <= len_minus_stride {
                        // Safety: the above check lets us perform one usize read.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}

#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Safety: the above check lets us perform 4 consecutive reads of
                        // length SIMD_STRIDE_SIZE / unit_size. SIMD_STRIDE_SIZE is the size of $simd_ty, and unit_size
                        // is the size of the `src` pointer, so this is equal to performing four $simd_ty reads.
                        //
                        // This invariant is upheld on all loop iterations
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            // Safety: this check lets us continue to perform the 4 reads earlier
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // Safety: the above check lets us perform one $simd_ty read.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use core::simd::u8x16;
        use core::simd::u16x8;

        const SIMD_ALIGNMENT: usize = 16;

        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            'outer: loop {
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        return offset + up_to;
                    }
                    if last_valid_low {
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            break 'outer;
                        }
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        if last_valid_low {
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}

/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let len = buffer.len();
    if len == 0 {
        return (0, false);
    }
    let mut offset = 0usize;
    loop {
        let unit = buffer[offset];
        let next = offset + 1;
        let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
        if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
            // Not a surrogate
            offset = next;
            if offset == len {
                return (offset, false);
            }
            continue;
        }
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // high surrogate
            if next < len {
                let second = buffer[next];
                let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
                    // The next code unit is a low surrogate. Advance position.
                    offset = next + 1;
                    if offset == len {
                        return (offset, true);
                    }
                    continue;
                }
                // The next code unit is not a low surrogate. Don't advance
                // position and treat the high surrogate as unpaired.
                // fall through
            }
            // Unpaired, fall through
        }
        // Unpaired surrogate
        return (offset, false);
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK;
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut bytes = buffer.as_bytes();
            let mut total = 0;
            loop {
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}

#[inline(always)]
fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
    let mut bytes = buffer;
    let mut total = 0;
    loop {
        if let Some((byte, offset)) = validate_ascii(bytes) {
            total += offset;
            if in_inclusive_range8(byte, 0xC2, 0xC3) {
                let next = offset + 1;
                if next == bytes.len() {
                    return Some(total);
                }
                if bytes[next] & 0xC0 != 0x80 {
                    return Some(total);
                }
                bytes = &bytes[offset + 2..];
                total += 2;
            } else {
                return Some(total);
            }
        } else {
            return None;
        }
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    } else {
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            for &u in buffer {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the aligment is recomputing
                            // but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                                           ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}

/// Checks whether the buffer is all-ASCII.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}

/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}

/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    is_utf8_latin1_impl(buffer).is_none()
}

/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.
pub fn is_str_latin1(buffer: &str) -> bool {
    is_str_latin1_impl(buffer).is_none()
}

/// Checks whether the buffer represents only code point less than or equal
/// to U+00FF.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}

/// Checks whether a potentially-invalid UTF-8 buffer contains code points
/// that trigger right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
/// no RTL characters.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut src = buffer;
    'outer: loop {
        if let Some((mut byte, mut read)) = validate_ascii(src) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            if read + 4 <= src.len() {
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0..=0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            src = &src[read..];
                            continue 'outer;
                        }
                        0xC2..=0xD5 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        0xE1 | 0xE3..=0xEC | 0xEE => {
                            // Three-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                if second == 0xAC {
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1..=0xF4 => {
                            // Four-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            if unlikely(second == 0x90 || second == 0x9E) {
                                let third = src[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    if read + 4 > src.len() {
                        if read == src.len() {
                            return false;
                        }
                        byte = src[read];
                        break 'inner;
                    }
                    byte = src[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            match byte {
                0..=0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    src = &src[read..];
                    continue 'outer;
                }
                0xC2..=0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                0xE1 | 0xE3..=0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
                        if second == 0xAC {
                            if third > 0x9C {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else if second == 0xBB {
                            if third != 0xBF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    return true;
                }
            }
            return false;
        } else {
            return false;
        }
    }
}

/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consist entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unlikely(byte >= 0xD6) {
                            if byte == 0xD6 {
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: write and go back to SIMD.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    let second = bytes[read + 1];
                    if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            return false;
        }
    }
}

/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    is_utf16_bidi_impl(buffer)
}

/// Checks whether a scalar value triggers right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    // Controls:
    // Every control with RIGHT-TO-LEFT in its name in
    // https://www.unicode.org/charts/PDF/U2000.pdf
    // U+200F RLM
    // U+202B RLE
    // U+202E RLO
    // U+2067 RLI
    //
    // BMP RTL:
    // https://www.unicode.org/roadmaps/bmp/
    // U+0590...U+08FF
    // U+FB1D...U+FDFF Hebrew presentation forms and
    //                 Arabic Presentation Forms A
    // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
    //
    // Supplementary RTL:
    // https://www.unicode.org/roadmaps/smp/
    // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
    // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
    let code_point = u32::from(c);
    if code_point < 0x0590 {
        // Below Hebrew
        return false;
    }
    if in_range32(code_point, 0x0900, 0xFB1D) {
        // Above Arabic Extended-A and below Hebrew presentation forms
        if in_inclusive_range32(code_point, 0x200F, 0x2067) {
            // In the range that contains the RTL controls
            return code_point == 0x200F
                || code_point == 0x202B
                || code_point == 0x202E
                || code_point == 0x2067;
        }
        return false;
    }
    if code_point > 0x1EFFF {
        // Above second astral RTL. (Emoji is here.)
        return false;
    }
    if in_range32(code_point, 0x11000, 0x1E800) {
        // Between astral RTL blocks
        return false;
    }
    if in_range32(code_point, 0xFEFF, 0x10800) {
        // Above Arabic Presentations Forms B (excl. BOM) and below first
        // astral RTL
        return false;
    }
    if in_range32(code_point, 0xFE00, 0xFE70) {
        // Between Arabic Presentations Forms
        return false;
    }
    true
}

/// Checks whether a UTF-16 code unit triggers right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Since supplementary-plane right-to-left blocks are identifiable from the
/// high surrogate without examining the low surrogate, this function returns
/// `true` for such high surrogates making the function suitable for handling
/// supplementary-plane text without decoding surrogate pairs to scalar
/// values. Obviously, such high surrogates are then reported as right-to-left
/// even if actually unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    if u < 0x0590 {
        // Below Hebrew
        return false;
    }
    if in_range16(u, 0x0900, 0xD802) {
        // Above Arabic Extended-A and below first RTL surrogate
        if in_inclusive_range16(u, 0x200F, 0x2067) {
            // In the range that contains the RTL controls
            return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
        }
        return false;
    }
    if in_range16(u, 0xD83C, 0xFB1D) {
        // Between astral RTL high surrogates and Hebrew presentation forms
        // (Emoji is here)
        return false;
    }
    if in_range16(u, 0xD804, 0xD83A) {
        // Between RTL high surragates
        return false;
    }
    if u > 0xFEFE {
        // Above Arabic Presentation Forms (excl. BOM)
        return false;
    }
    if in_range16(u, 0xFE00, 0xFE70) {
        // Between Arabic Presentations Forms
        return false;
    }
    true
}

/// Checks whether a potentially invalid UTF-8 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
    if let Some(offset) = is_utf8_latin1_impl(buffer) {
        if is_utf8_bidi(&buffer[offset..]) {
            Latin1Bidi::Bidi
        } else {
            Latin1Bidi::LeftToRight
        }
    } else {
        Latin1Bidi::Latin1
    }
}

/// Checks whether a valid UTF-8 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
    // The transition from the latin1 check to the bidi check isn't
    // optimal but not tweaking it to perfection today.
    if let Some(offset) = is_str_latin1_impl(buffer) {
        if is_str_bidi(&buffer[offset..]) {
            Latin1Bidi::Bidi
        } else {
            Latin1Bidi::LeftToRight
        }
    } else {
        Latin1Bidi::Latin1
    }
}

/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    check_utf16_for_latin1_and_bidi_impl(buffer)
}

/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
/// with the REPLACEMENT CHARACTER.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer _plus one_.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
    // TODO: Can the requirement for dst to be at least one unit longer
    // be eliminated?
    assert!(dst.len() > src.len());
    let mut decoder = Utf8Decoder::new_inner();
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    loop {
        let (result, read, written) =
            decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
        total_read += read;
        total_written += written;
        match result {
            DecoderResult::InputEmpty => {
                return total_written;
            }
            DecoderResult::OutputFull => {
                unreachable!("The assert at the top of the function should have caught this.");
            }
            DecoderResult::Malformed(_, _) => {
                // There should always be space for the U+FFFD, because
                // otherwise we'd have gotten OutputFull already.
                dst[total_written] = 0xFFFD;
                total_written += 1;
            }
        }
    }
}

/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
--> --------------------

--> maximum size reached

--> --------------------

[ zur Elbe Produktseite wechseln0.79Quellennavigators  ]

                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....
    

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge