Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Impressum utf_8.rs   Sprache: unbekannt

 
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::*;
use crate::ascii::ascii_to_basic_latin;
use crate::ascii::basic_latin_to_ascii;
use crate::ascii::validate_ascii;
use crate::handles::*;
use crate::mem::convert_utf16_to_utf8_partial;
use crate::variant::*;

// Branch-prediction hints used throughout this file.
//
// With the `simd-accel` feature enabled, the `likely` / `unlikely` compiler
// intrinsics from `core::intrinsics` are imported. Otherwise identity
// functions with the same names and signatures are defined, so call sites
// compile unchanged and simply carry no hint.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::core::intrinsics::unlikely;
        use ::core::intrinsics::likely;
    } else {
        // Fallback for `unlikely`: returns `b` unchanged.
        #[inline(always)]
        fn unlikely(b: bool) -> bool {
            b
        }
        // Fallback for `likely`: returns `b` unchanged.
        #[inline(always)]
        fn likely(b: bool) -> bool {
            b
        }
    }
}

/// Storage for the lookup table used by the UTF-8 validation code in this
/// file.
///
/// The code in this file indexes the table in two ways:
///
/// * `table[second]`, where `second` is the byte following a lead byte
///   (indices `0..=255`), and
/// * `table[lead + 0x80]`, where `lead` is a non-ASCII lead byte, so that
///   lead bytes `0x80..=0xFF` map to indices `256..=383`.
///
/// The two looked-up values are combined with bitwise AND; a non-zero result
/// makes the validity checks in this file fail.
#[repr(align(64))] // Align to cache lines
pub struct Utf8Data {
    /// 256 second-byte entries followed by 128 lead-byte entries.
    pub table: [u8; 384],
}

// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

/// The UTF-8 validation table (generated; see the surrounding markers).
///
/// Layout, as it can be read off the values below:
///
/// * Indices `0..=255` (looked up by the byte after the lead byte):
///   `252` for `0x00..=0x7F` and `0xC0..=0xFF`, `84` for `0x80..=0x8F`,
///   `148` for `0x90..=0x9F` and `164` for `0xA0..=0xBF`.
/// * Indices `256..=383` (looked up by `lead + 0x80`): `4` for the leads
///   `0x80..=0xC1` and `0xF5..=0xFF`, `16` for `0xE0`, `32` for `0xED`,
///   `64` for `0xF0`, `128` for `0xF4` and `8` for every other lead.
///
/// Every second-byte entry has bit `4` set, so the leads mapped to `4` never
/// produce a zero AND. The bits `16`, `32`, `64` and `128` are set only in
/// the second-byte entries for the ranges that the leads `0xE0`, `0xED`,
/// `0xF0` and `0xF4` respectively must not be followed by.
pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};

// END GENERATED CODE

/// Returns the length in bytes of the longest prefix of `src` that is valid
/// UTF-8.
///
/// The return value is `src.len()` when the whole slice validates. Otherwise
/// it is the index of the first byte of the first sequence that is either
/// malformed or cut off by the end of the slice.
///
/// The function alternates between an ASCII fast path (`validate_ascii`), a
/// main loop that runs while at least four bytes remain (so that any
/// sequence can be read without further length checks), and a bounds-checked
/// tail loop for the last up-to-three bytes.
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
    let mut read = 0;
    'outer: loop {
        // Skip a run of ASCII. As used here, `validate_ascii` returns `None`
        // when the whole remainder was ASCII, or the first non-ASCII byte
        // together with the number of ASCII bytes that preceded it.
        let mut byte = {
            let src_remaining = &src[read..];
            match validate_ascii(src_remaining) {
                None => {
                    return src.len();
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if likely(read + 4 <= src.len()) {
            // Invariant for every iteration of `'inner` (and `'three`):
            // `read + 4 <= src.len()` and `byte == src[read]` is non-ASCII.
            //
            // SAFETY (for all `get_unchecked` calls in this loop): the source
            // indices are at most `read + 3`, which the invariant keeps in
            // bounds. The table index `byte as usize + 0x80` is at most
            // `0xFF + 0x80 == 383`, which is within the 384-entry table.
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that the UTF-8 sequence is valid.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
                    // Two-byte
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    read += 2;

                    // Next lead (manually inlined)
                    if likely(read + 4 <= src.len()) {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // Count the ASCII byte and go back to the ASCII
                            // fast path.
                            read += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if likely(byte < 0xF0) {
                    'three: loop {
                        // Three-byte
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // The AND of the two table entries must be zero (the
                        // second byte is allowed after this lead) and the top
                        // two bits of `third` must be `10`, i.e. `third >> 6`
                        // must be 2. All table bits are >= 4, so any non-zero
                        // AND makes the OR differ from 2. Invalid leads
                        // (0x80..=0xC1) that reach this branch are rejected
                        // here as well, because their table entry never
                        // yields a zero AND.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        read += 3;

                        // Next lead (manually inlined)
                        if likely(read + 4 <= src.len()) {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            // Stay in the three-byte loop for runs of
                            // three-byte sequences.
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if likely(byte < 0x80) {
                                read += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // Same idea as the three-byte check, widened to 16 bits so
                // that the top two bits of `fourth` can be folded in: the
                // result is 0x202 only if the table AND is zero,
                // `third >> 6 == 2` and `fourth & 0xC0 == 0x80`. Leads
                // 0xF5..=0xFF are rejected by their table entry.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                read += 4;

                // Next lead
                if likely(read + 4 <= src.len()) {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        read += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                read += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Sequence is cut off by the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                read += 2;
                continue 'tail;
            }
            // We need to exclude valid four-byte lead bytes explicitly,
            // because the lead-byte part of `UTF8_DATA.table` also has
            // entries for them, so the check below would otherwise accept
            // the first three bytes of a four-byte sequence.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY: `byte as usize + 0x80` is at most 383, which is
                // within the 384-entry table.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                read += 3;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            // A four-byte lead (or an invalid lead >= 0xF0) cannot be
            // completed with fewer than four bytes remaining.
            break 'outer;
        }
    }
    read
}

/// Converts UTF-8 in `src` to UTF-16 in `dst`, stopping at the first
/// malformed or truncated sequence, at the end of `src`, or when the next
/// character no longer fits in `dst`.
///
/// Returns `(read, written)`: the number of bytes consumed from `src` and
/// the number of UTF-16 code units stored in `dst`. Only complete, valid
/// sequences are counted in `read`.
///
/// The structure mirrors `utf8_valid_up_to`: an ASCII fast path
/// (`ascii_to_basic_latin`), a main loop that runs while at least four
/// source bytes remain, and a bounds-checked tail loop.
#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Copy a run of ASCII, limited to whichever of the two remaining
        // buffers is shorter. As used here, `ascii_to_basic_latin` returns
        // `None` when all `length` units were ASCII, or the first non-ASCII
        // byte together with the number of units converted before it.
        let mut byte = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            let length = ::core::cmp::min(src_remaining.len(), dst_remaining.len());
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // Either the input is exhausted or the output is full.
                    read += length;
                    written += length;
                    break 'outer;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if likely(read + 4 <= src.len()) {
            // Invariant for every iteration of `'inner` (and `'three`):
            // `read + 4 <= src.len()`, `written < dst.len()` and
            // `byte == src[read]` is non-ASCII.
            //
            // SAFETY (for the unchecked accesses in this loop): source
            // indices are at most `read + 3`; BMP writes go to `written`,
            // which the invariant keeps in bounds; the astral branch checks
            // for a second slot before writing `written + 1`; the table index
            // `byte as usize + 0x80` is at most 383 (table length 384).
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that a) the UTF-8 sequence is valid and b) that there
                // is output space if it is an astral sequence.
                // We know, thanks to `ascii_to_basic_latin` that there is output
                // space for at least one UTF-16 code unit, so no need to check
                // for output space in the BMP cases.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
                    // Two-byte
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    // 5 payload bits from the lead, 6 from the continuation.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) =
                            ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
                    };
                    read += 2;
                    written += 1;

                    // Next lead (manually inlined)
                    if written == dst.len() {
                        break 'outer;
                    }
                    if likely(read + 4 <= src.len()) {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // Emit the ASCII unit and return to the ASCII
                            // fast path.
                            unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                            read += 1;
                            written += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if likely(byte < 0xF0) {
                    'three: loop {
                        // Three-byte
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // Valid only if the AND of the two table entries is
                        // zero and the top two bits of `third` are `10`
                        // (`third >> 6 == 2`). Invalid leads (0x80..=0xC1)
                        // reaching this branch fail the table check.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        // 4 + 6 + 6 payload bits.
                        let point = ((u16::from(byte) & 0xF) << 12)
                            | ((u16::from(second) & 0x3F) << 6)
                            | (u16::from(third) & 0x3F);
                        unsafe { *(dst.get_unchecked_mut(written)) = point };
                        read += 3;
                        written += 1;

                        // Next lead (manually inlined)
                        if written == dst.len() {
                            break 'outer;
                        }
                        if likely(read + 4 <= src.len()) {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            // Stay in the three-byte loop for runs of
                            // three-byte sequences.
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if likely(byte < 0x80) {
                                unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                                read += 1;
                                written += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                // A surrogate pair needs two output slots; stop if only one
                // is left.
                if written + 1 == dst.len() {
                    break 'outer;
                }
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // The result is 0x202 only if the table AND is zero,
                // `third >> 6 == 2` and `fourth & 0xC0 == 0x80`.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                // 3 + 6 + 6 + 6 payload bits.
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10): the subtraction of
                // 0x10000 from the code point is folded into the constant.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;

                // Next lead
                if written == dst.len() {
                    break 'outer;
                }
                if likely(read + 4 <= src.len()) {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                        read += 1;
                        written += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() || written >= dst.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                dst[written] = u16::from(byte);
                read += 1;
                written += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Sequence is cut off by the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                read += 2;
                written += 1;
                continue 'tail;
            }
            // We need to exclude valid four-byte lead bytes explicitly,
            // because the lead-byte part of `UTF8_DATA.table` also has
            // entries for them, so the check below would otherwise accept
            // the first three bytes of a four-byte sequence.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY: `byte as usize + 0x80` is at most 383, which is
                // within the 384-entry table.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                dst[written] = point;
                read += 3;
                written += 1;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            // A four-byte lead (or an invalid lead >= 0xF0) cannot be
            // completed with fewer than four bytes remaining.
            break 'outer;
        }
    }
    (read, written)
}

/// State of the streaming UTF-8 decoder.
///
/// The fields hold a partially received multi-byte sequence between calls.
/// `bytes_needed == 0` is the neutral state (no sequence pending).
pub struct Utf8Decoder {
    // Payload bits accumulated so far for the pending sequence.
    code_point: u32,
    bytes_seen: usize,   // continuation bytes consumed so far; 0 in the neutral state
    bytes_needed: usize, // continuation bytes expected (1, 2 or 3); 0 in the neutral state
    // Inclusive range accepted for the next continuation byte. Normally
    // 0x80..=0xBF; narrowed for the byte right after the leads 0xE0, 0xED,
    // 0xF0 and 0xF4.
    lower_boundary: u8,
    upper_boundary: u8,
}

impl Utf8Decoder {
    /// Creates a decoder in the neutral state: no pending sequence and the
    /// default continuation range `0x80..=0xBF`.
    pub fn new_inner() -> Utf8Decoder {
        Utf8Decoder {
            code_point: 0,
            bytes_seen: 0,
            bytes_needed: 0,
            lower_boundary: 0x80u8,
            upper_boundary: 0xBFu8,
        }
    }

    /// Creates a neutral-state decoder wrapped in `VariantDecoder::Utf8`.
    pub fn new() -> VariantDecoder {
        VariantDecoder::Utf8(Utf8Decoder::new_inner())
    }

    /// Returns `true` when no multi-byte sequence is pending.
    pub fn in_neutral_state(&self) -> bool {
        self.bytes_needed == 0
    }

    /// Number of already-consumed bytes of a pending sequence: 0 in the
    /// neutral state, otherwise the lead byte plus the continuation bytes
    /// seen so far.
    fn extra_from_state(&self) -> usize {
        if self.bytes_needed == 0 {
            0
        } else {
            self.bytes_seen + 1
        }
    }

    /// Returns `byte_length + 1 + extra_from_state()`, or `None` if that
    /// overflows `usize`.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(1 + self.extra_from_state())
    }

    /// Returns `byte_length + 3 + extra_from_state()`, or `None` if that
    /// overflows `usize`.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(3 + self.extra_from_state())
    }

    /// Computes `3 + 3 * (byte_length + extra_from_state())` through the
    /// `checked_add` / `checked_mul` helpers, which take the intermediate
    /// `Option` (presumably propagating `None` on overflow; see their
    /// definitions).
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_add(
            3,
            checked_mul(3, byte_length.checked_add(self.extra_from_state())),
        )
    }

    // The decode entry points are generated by `decoder_functions!`, whose
    // definition is not in this file. The notes on the individual blocks
    // below describe what each block does; when the macro runs each block is
    // inferred from the code in it. NOTE(review): confirm against the macro.
    decoder_functions!(
        {},
        {
            // This is the fast path. The rest runs only at the
            // start and end for partial sequences.
            if self.bytes_needed == 0 {
                dest.copy_utf8_up_to_invalid_from(&mut source);
            }
        },
        {
            // Presumably the end-of-input handling: a sequence that is still
            // pending is reported as malformed. Its length is the lead byte
            // plus the continuation bytes seen so far.
            if self.bytes_needed != 0 {
                let bad_bytes = (self.bytes_seen + 1) as u8;
                self.code_point = 0;
                self.bytes_needed = 0;
                self.bytes_seen = 0;
                return (
                    DecoderResult::Malformed(bad_bytes, 0),
                    src_consumed,
                    dest.written(),
                );
            }
        },
        {
            // Byte-at-a-time state machine; `b` is the current input byte.
            if self.bytes_needed == 0 {
                // Expecting a lead byte.
                if b < 0x80u8 {
                    destination_handle.write_ascii(b);
                    continue;
                }
                // 0x80..=0xC1: stray continuation byte or overlong lead.
                if b < 0xC2u8 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                if b < 0xE0u8 {
                    // Two-byte lead: 5 payload bits.
                    self.bytes_needed = 1;
                    self.code_point = u32::from(b) & 0x1F;
                    continue;
                }
                if b < 0xF0u8 {
                    // Three-byte lead: 4 payload bits. 0xE0 raises the lower
                    // bound of the next byte (excludes overlong forms); 0xED
                    // lowers the upper bound (excludes surrogates).
                    if b == 0xE0u8 {
                        self.lower_boundary = 0xA0u8;
                    } else if b == 0xEDu8 {
                        self.upper_boundary = 0x9Fu8;
                    }
                    self.bytes_needed = 2;
                    self.code_point = u32::from(b) & 0xF;
                    continue;
                }
                if b < 0xF5u8 {
                    // Four-byte lead: 3 payload bits. 0xF0 raises the lower
                    // bound (excludes overlong forms); 0xF4 lowers the upper
                    // bound (excludes values above U+10FFFF).
                    if b == 0xF0u8 {
                        self.lower_boundary = 0x90u8;
                    } else if b == 0xF4u8 {
                        self.upper_boundary = 0x8Fu8;
                    }
                    self.bytes_needed = 3;
                    self.code_point = u32::from(b) & 0x7;
                    continue;
                }
                // 0xF5..=0xFF never appear in UTF-8.
                return (
                    DecoderResult::Malformed(1, 0),
                    unread_handle.consumed(),
                    destination_handle.written(),
                );
            }
            // self.bytes_needed != 0
            if !(b >= self.lower_boundary && b <= self.upper_boundary) {
                // Not an acceptable continuation byte: report the bytes of
                // the pending sequence as malformed and reset the state.
                // `unread()` is used instead of `consumed()` here, presumably
                // so that `b` is pushed back and processed again as a
                // potential lead byte.
                let bad_bytes = (self.bytes_seen + 1) as u8;
                self.code_point = 0;
                self.bytes_needed = 0;
                self.bytes_seen = 0;
                self.lower_boundary = 0x80u8;
                self.upper_boundary = 0xBFu8;
                return (
                    DecoderResult::Malformed(bad_bytes, 0),
                    unread_handle.unread(),
                    destination_handle.written(),
                );
            }
            // The narrowed range applies only to the first continuation byte.
            self.lower_boundary = 0x80u8;
            self.upper_boundary = 0xBFu8;
            self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
            self.bytes_seen += 1;
            if self.bytes_seen != self.bytes_needed {
                continue;
            }
            // Sequence complete: three continuation bytes mean a four-byte
            // sequence, which is written through `write_astral`.
            if self.bytes_needed == 3 {
                destination_handle.write_astral(self.code_point);
            } else {
                destination_handle.write_bmp_excl_ascii(self.code_point as u16);
            }
            self.code_point = 0;
            self.bytes_needed = 0;
            self.bytes_seen = 0;
            continue;
        },
        self,
        src_consumed,
        dest,
        source,
        b,
        destination_handle,
        unread_handle,
        check_space_astral
    );
}

/// Hot part of the UTF-16 to UTF-8 conversion.
///
/// Converts code units from `src` into `dst` and returns `(read, written)`:
/// the number of code units consumed and the number of bytes stored.
/// Unpaired surrogates are written as U+FFFD (`EF BF BD`).
///
/// Runs of Basic Latin go through `basic_latin_to_ascii`. Every non-ASCII
/// unit is converted only while at least four bytes of output space remain;
/// when fewer remain the function returns early with `read < src.len()`,
/// leaving the rest of the input unconverted.
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
#[inline(never)]
pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Copy a run of ASCII, limited to whichever of the two remaining
        // buffers is shorter. As used here, `basic_latin_to_ascii` returns
        // `None` when all `length` units were ASCII, or the first non-ASCII
        // unit together with the number of units converted before it.
        let mut unit = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            let length = if dst_remaining.len() < src_remaining.len() {
                dst_remaining.len()
            } else {
                src_remaining.len()
            };
            match unsafe {
                basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    read += length;
                    written += length;
                    return (read, written);
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // At the top of `'inner`, `unit == src[read]` is non-ASCII and has
        // not been counted in `read` yet.
        'inner: loop {
            // The following loop is only broken out of as a goto forward.
            loop {
                // Unfortunately, this check isn't enough for the compiler to elide
                // the bound checks on writes to dst, which is why they are manually
                // elided, which makes a measurable difference.
                //
                // SAFETY (for all `get_unchecked_mut` writes in this loop):
                // this check guarantees `written + 4 <= dst.len()`, and each
                // path below writes at most four bytes before leaving the
                // loop.
                if written.checked_add(4).unwrap() > dst.len() {
                    return (read, written);
                }
                read += 1;
                if unit < 0x800 {
                    // Two-byte sequence: 5 + 6 payload bits.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
                        written += 1;
                    }
                    break;
                }
                // Wrapping subtraction turns the surrogate range check into a
                // single unsigned comparison.
                let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
                if likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) {
                    // Non-surrogate BMP unit: three-byte sequence,
                    // 4 + 6 + 6 payload bits.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
                        written += 1;
                    }
                    break;
                }
                if likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) {
                    // high surrogate
                    // read > src.len() is impossible, but using
                    // >= instead of == allows the compiler to elide a bound check.
                    if read >= src.len() {
                        debug_assert_eq!(read, src.len());
                        // Unpaired surrogate at the end of the buffer.
                        // Write U+FFFD.
                        unsafe {
                            *(dst.get_unchecked_mut(written)) = 0xEFu8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = 0xBFu8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = 0xBDu8;
                            written += 1;
                        }
                        return (read, written);
                    }
                    let second = src[read];
                    let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                    if likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) {
                        // The next code unit is a low surrogate. Advance position.
                        read += 1;
                        // Combine the pair into a scalar value. The constant
                        // removes both surrogate bases and adds the 0x10000
                        // offset in a single subtraction.
                        let astral = (u32::from(unit) << 10) + u32::from(second)
                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                        // Four-byte sequence: 3 + 6 + 6 + 6 payload bits.
                        unsafe {
                            *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) =
                                ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) =
                                ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
                            written += 1;
                        }
                        break;
                    }
                    // The next code unit is not a low surrogate. Don't advance
                    // position and treat the high surrogate as unpaired.
                    // Fall through
                }
                // Unpaired low surrogate (or the unpaired high surrogate that
                // fell through from above): write U+FFFD.
                unsafe {
                    *(dst.get_unchecked_mut(written)) = 0xEFu8;
                    written += 1;
                    *(dst.get_unchecked_mut(written)) = 0xBFu8;
                    written += 1;
                    *(dst.get_unchecked_mut(written)) = 0xBDu8;
                    written += 1;
                }
                break;
            }
            // Now see if the next unit is Basic Latin
            // read > src.len() is impossible, but using
            // >= instead of == allows the compiler to elide a bound check.
            if read >= src.len() {
                debug_assert_eq!(read, src.len());
                return (read, written);
            }
            unit = src[read];
            if unlikely(unit < 0x80) {
                // written > dst.len() is impossible, but using
                // >= instead of == allows the compiler to elide a bound check.
                if written >= dst.len() {
                    debug_assert_eq!(written, dst.len());
                    return (read, written);
                }
                dst[written] = unit as u8;
                read += 1;
                written += 1;
                // Mysteriously, adding a punctuation check here makes
                // the expected beneficiary cases *slower*!
                continue 'outer;
            }
            continue 'inner;
        }
    }
}

#[inline(never)]
pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // Everything below is cold code!
    let mut read = 0;
    let mut written = 0;
    let mut unit = src[read];
    // We now have up to 3 output slots, so an astral character
    // will not fit.
    if unit < 0x800 {
        loop {
            if unit < 0x80 {
                if written >= dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = unit as u8;
                written += 1;
            } else if unit < 0x800 {
                if written + 2 > dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = (unit >> 6) as u8 | 0xC0u8;
                written += 1;
                dst[written] = (unit & 0x3F) as u8 | 0x80u8;
                written += 1;
            } else {
                return (read, written);
            }
            // read > src.len() is impossible, but using
            // >= instead of == allows the compiler to elide a bound check.
            if read >= src.len() {
                debug_assert_eq!(read, src.len());
                return (read, written);
            }
            unit = src[read];
        }
    }
    // Could be an unpaired surrogate, but we'll need 3 output
    // slots in any case.
    if written + 3 > dst.len() {
        return (read, written);
    }
    read += 1;
    let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
    if unit_minus_surrogate_start <= (0xDFFF - 0xD800) {
        // Got surrogate
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // Got high surrogate
            if read >= src.len() {
                // Unpaired high surrogate
                unit = 0xFFFD;
            } else {
                let second = src[read];
                if in_inclusive_range16(second, 0xDC00, 0xDFFF) {
                    // Valid surrogate pair, but we know it won't fit.
                    read -= 1;
                    return (read, written);
                }
                // Unpaired high
                unit = 0xFFFD;
            }
        } else {
            // Unpaired low
            unit = 0xFFFD;
        }
    }
    dst[written] = (unit >> 12) as u8 | 0xE0u8;
    written += 1;
    dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
    written += 1;
    dst[written] = (unit & 0x3F) as u8 | 0x80u8;
    written += 1;
    debug_assert_eq!(written, dst.len());
    (read, written)
}

/// UTF-8 encoder. A field-less unit struct: the encoder keeps no state
/// between calls.
pub struct Utf8Encoder;

impl Utf8Encoder {
    /// Builds the generic `Encoder` for `encoding`, backed by the UTF-8
    /// encoder variant.
    pub fn new(encoding: &'static Encoding) -> Encoder {
        let variant = VariantEncoder::Utf8(Utf8Encoder);
        Encoder::new(encoding, variant)
    }

    /// Worst-case output size for `u16_length` UTF-16 code units: three
    /// bytes per code unit. `None` if the multiplication overflows.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        3usize.checked_mul(u16_length)
    }

    /// Worst-case output size for UTF-8 input: the input length itself,
    /// since the bytes are copied through unchanged.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        Option::Some(byte_length)
    }

    /// Encodes UTF-16 `src` into `dst` via `convert_utf16_to_utf8_partial`.
    ///
    /// Returns the result together with the number of code units read and
    /// bytes written. The result is `InputEmpty` when all of `src` was
    /// consumed and `OutputFull` otherwise. `_last` is ignored.
    pub fn encode_from_utf16_raw(
        &mut self,
        src: &[u16],
        dst: &mut [u8],
        _last: bool,
    ) -> (EncoderResult, usize, usize) {
        let (read, written) = convert_utf16_to_utf8_partial(src, dst);
        let result = if read != src.len() {
            EncoderResult::OutputFull
        } else {
            EncoderResult::InputEmpty
        };
        (result, read, written)
    }

    /// Copies UTF-8 `src` into `dst`.
    ///
    /// If everything fits, the whole string is copied and `InputEmpty` is
    /// returned. Otherwise the longest prefix that fits and ends on a
    /// character boundary is copied and `OutputFull` is returned. The second
    /// and third tuple elements are both the number of bytes copied. `_last`
    /// is ignored.
    pub fn encode_from_utf8_raw(
        &mut self,
        src: &str,
        dst: &mut [u8],
        _last: bool,
    ) -> (EncoderResult, usize, usize) {
        let bytes = src.as_bytes();
        let total = bytes.len();
        if total <= dst.len() {
            dst[..total].copy_from_slice(bytes);
            return (EncoderResult::InputEmpty, total, total);
        }
        // Here `dst.len() < total`, so `cut` is a valid index into `src`.
        // Move back until we find a UTF-8 sequence boundary.
        let mut cut = dst.len();
        while !src.is_char_boundary(cut) {
            cut -= 1;
        }
        dst[..cut].copy_from_slice(&bytes[..cut]);
        (EncoderResult::OutputFull, cut, cut)
    }
}

// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    //    fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) {
    //        decode_to_utf16_without_replacement(UTF_8, bytes, expect);
    //    }

    /// Forwards `bytes` and `expect` to the shared `decode_to_utf8` testing
    /// helper with `UTF_8` as the encoding. Presumably that helper asserts
    /// that decoding `bytes` yields `expect` (it lives in the `testing`
    /// module, not shown here).
    fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
        decode_to_utf8(UTF_8, bytes, expect);
    }

    /// Round-trip shorthand for well-formed input: passes the bytes of
    /// `string` as the input and `string` itself as the expected output.
    fn decode_valid_utf8(string: &str) {
        decode_utf8_to_utf8(string.as_bytes(), string);
    }

    /// Forwards UTF-16 `string` and the expected bytes `expect` to the shared
    /// `encode_from_utf16` testing helper with `UTF_8` as the encoding.
    fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
        encode_from_utf16(UTF_8, string, expect);
    }

    /// Forwards UTF-8 `string` and the expected bytes `expect` to the shared
    /// `encode_from_utf8` testing helper with `UTF_8` as the encoding.
    fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
        encode_from_utf8(UTF_8, string, expect);
    }

    fn encode_utf8_from_utf16_with_output_limit(
        string: &[u16],
        expect: &str,
        limit: usize,
        expect_result: EncoderResult,
    ) {
        let mut dst = Vec::new();
        {
            dst.resize(limit, 0u8);
            let mut encoder = UTF_8.new_encoder();
            let (result, read, written) =
                encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
            assert_eq!(result, expect_result);
            if expect_result == EncoderResult::InputEmpty {
                assert_eq!(read, string.len());
            }
            assert_eq!(&dst[..written], expect.as_bytes());
        }
        {
            dst.resize(64, 0u8);
            for (i, elem) in dst.iter_mut().enumerate() {
                *elem = i as u8;
            }
            let mut encoder = UTF_8.new_encoder();
            let (_, _, mut j) =
                encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
            while j < dst.len() {
                assert_eq!(usize::from(dst[j]), j);
                j += 1;
            }
        }
    }

    /// Table of UTF-8 decoding vectors, each pairing raw input bytes with the
    /// expected decoded string.
    ///
    /// Well-formed inputs go through `decode_valid_utf8` and are expected to
    /// come back unchanged. For malformed inputs the expected strings below
    /// show one U+FFFD for a truncated but otherwise valid prefix, and one
    /// U+FFFD per byte for overlong forms, encoded surrogates, values past
    /// U+10FFFF and lone continuation bytes. Most vectors appear twice: once
    /// ending at the sequence under test and once followed by an ASCII `Z`.
    #[test]
    fn test_utf8_decode() {
        // Empty
        decode_valid_utf8("");
        // ASCII
        decode_valid_utf8("ab");
        // Low BMP
        decode_valid_utf8("a\u{E4}Z");
        // High BMP
        decode_valid_utf8("a\u{2603}Z");
        // Astral
        decode_valid_utf8("a\u{1F4A9}Z");
        // Low BMP with last byte missing
        decode_utf8_to_utf8(b"a\xC3Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xC3", "a\u{FFFD}");
        // High BMP with last byte missing
        decode_utf8_to_utf8(b"a\xE2\x98Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xE2\x98", "a\u{FFFD}");
        // Astral with last byte missing
        decode_utf8_to_utf8(b"a\xF0\x9F\x92Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xF0\x9F\x92", "a\u{FFFD}");
        // Lone highest continuation
        decode_utf8_to_utf8(b"a\xBFZ", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xBF", "a\u{FFFD}");
        // Two lone highest continuations
        decode_utf8_to_utf8(b"a\xBF\xBFZ", "a\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xBF\xBF", "a\u{FFFD}\u{FFFD}");
        // Low BMP followed by lowest lone continuation
        decode_utf8_to_utf8(b"a\xC3\xA4\x80Z", "a\u{E4}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xC3\xA4\x80", "a\u{E4}\u{FFFD}");
        // Low BMP followed by highest lone continuation
        decode_utf8_to_utf8(b"a\xC3\xA4\xBFZ", "a\u{E4}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xC3\xA4\xBF", "a\u{E4}\u{FFFD}");
        // High BMP followed by lowest lone continuation
        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80Z", "a\u{2603}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80", "a\u{2603}\u{FFFD}");
        // High BMP followed by highest lone continuation
        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBFZ", "a\u{2603}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBF", "a\u{2603}\u{FFFD}");
        // Astral followed by lowest lone continuation
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80Z", "a\u{1F4A9}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80", "a\u{1F4A9}\u{FFFD}");
        // Astral followed by highest lone continuation
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBFZ", "a\u{1F4A9}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBF", "a\u{1F4A9}\u{FFFD}");

        // Boundary conditions
        // Lowest single-byte
        decode_valid_utf8("Z\x00");
        decode_valid_utf8("Z\x00Z");
        // Lowest single-byte as two-byte overlong sequence
        decode_utf8_to_utf8(b"a\xC0\x80", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xC0\x80Z", "a\u{FFFD}\u{FFFD}Z");
        // Lowest single-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest single-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // One below lowest single-byte (0x00 minus one wraps to the byte 0xFF)
        decode_utf8_to_utf8(b"a\xFF", "a\u{FFFD}");
        decode_utf8_to_utf8(b"a\xFFZ", "a\u{FFFD}Z");
        // Highest single-byte
        decode_valid_utf8("a\x7F");
        decode_valid_utf8("a\x7FZ");
        // Highest single-byte as two-byte overlong sequence
        decode_utf8_to_utf8(b"a\xC1\xBF", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xC1\xBFZ", "a\u{FFFD}\u{FFFD}Z");
        // Highest single-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest single-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // One past highest single byte (also lone continuation)
        decode_utf8_to_utf8(b"a\x80Z", "a\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80", "a\u{FFFD}");
        // Two lone continuations
        decode_utf8_to_utf8(b"a\x80\x80Z", "a\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80\x80", "a\u{FFFD}\u{FFFD}");
        // Three lone continuations
        decode_utf8_to_utf8(b"a\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        // Four lone continuations
        decode_utf8_to_utf8(b"a\x80\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        decode_utf8_to_utf8(b"a\x80\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        // Lowest two-byte
        decode_utf8_to_utf8(b"a\xC2\x80", "a\u{0080}");
        decode_utf8_to_utf8(b"a\xC2\x80Z", "a\u{0080}Z");
        // Lowest two-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest two-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lead one below lowest two-byte
        decode_utf8_to_utf8(b"a\xC1\x80", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xC1\x80Z", "a\u{FFFD}\u{FFFD}Z");
        // Trail one below lowest two-byte
        decode_utf8_to_utf8(b"a\xC2\x7F", "a\u{FFFD}\u{007F}");
        decode_utf8_to_utf8(b"a\xC2\x7FZ", "a\u{FFFD}\u{007F}Z");
        // Highest two-byte
        decode_utf8_to_utf8(b"a\xDF\xBF", "a\u{07FF}");
        decode_utf8_to_utf8(b"a\xDF\xBFZ", "a\u{07FF}Z");
        // Highest two-byte as three-byte overlong sequence
        decode_utf8_to_utf8(b"a\xE0\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xE0\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest two-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest three-byte
        decode_utf8_to_utf8(b"a\xE0\xA0\x80", "a\u{0800}");
        decode_utf8_to_utf8(b"a\xE0\xA0\x80Z", "a\u{0800}Z");
        // Lowest three-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest below surrogates
        decode_utf8_to_utf8(b"a\xED\x9F\xBF", "a\u{D7FF}");
        decode_utf8_to_utf8(b"a\xED\x9F\xBFZ", "a\u{D7FF}Z");
        // Highest below surrogates as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // First surrogate
        decode_utf8_to_utf8(b"a\xED\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xED\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // First surrogate as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Last surrogate
        decode_utf8_to_utf8(b"a\xED\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xED\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Last surrogate as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest above surrogates
        decode_utf8_to_utf8(b"a\xEE\x80\x80", "a\u{E000}");
        decode_utf8_to_utf8(b"a\xEE\x80\x80Z", "a\u{E000}Z");
        // Lowest above surrogates as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Highest three-byte
        decode_utf8_to_utf8(b"a\xEF\xBF\xBF", "a\u{FFFF}");
        decode_utf8_to_utf8(b"a\xEF\xBF\xBFZ", "a\u{FFFF}Z");
        // Highest three-byte as four-byte overlong sequence
        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
        // Lowest four-byte
        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80", "a\u{10000}");
        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80Z", "a\u{10000}Z");
        // Highest four-byte
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBF", "a\u{10FFFF}");
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBFZ", "a\u{10FFFF}Z");
        // One past highest four-byte
        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");

        // Highest four-byte with last byte replaced with 0xFF:
        // one U+FFFD for the truncated F4 8F BF prefix, one for the 0xFF byte.
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFF", "a\u{FFFD}\u{FFFD}");
        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFFZ", "a\u{FFFD}\u{FFFD}Z");
    }

    /// Encoder vectors: empty input from both UTF-16 and UTF-8, then UTF-16
    /// inputs at the edges of each UTF-8 sequence length and around the
    /// surrogate range. The expected bytes are spelled as the UTF-8 bytes of
    /// a `\u{...}` string literal.
    #[test]
    fn test_utf8_encode() {
        // Empty
        encode_utf8_from_utf16(&[], b"");
        encode_utf8_from_utf8("", b"");

        // Lowest and highest code points of the one-byte range
        encode_utf8_from_utf16(&[0x0000], "\u{0000}".as_bytes());
        encode_utf8_from_utf16(&[0x007F], "\u{007F}".as_bytes());
        // Lowest and highest code points of the two-byte range
        encode_utf8_from_utf16(&[0x0080], "\u{0080}".as_bytes());
        encode_utf8_from_utf16(&[0x07FF], "\u{07FF}".as_bytes());
        // Lowest three-byte code point and the last one below the surrogates
        encode_utf8_from_utf16(&[0x0800], "\u{0800}".as_bytes());
        encode_utf8_from_utf16(&[0xD7FF], "\u{D7FF}".as_bytes());
        // Unpaired surrogates (first and last of the range) become U+FFFD
        encode_utf8_from_utf16(&[0xD800], "\u{FFFD}".as_bytes());
        encode_utf8_from_utf16(&[0xD800, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
        encode_utf8_from_utf16(&[0xDFFF], "\u{FFFD}".as_bytes());
        encode_utf8_from_utf16(&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
        // First code point above the surrogates and the highest BMP code point
        encode_utf8_from_utf16(&[0xE000], "\u{E000}".as_bytes());
        encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes());
        // Lowest and highest surrogate pairs
        encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes());
        encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes());
        // Two low surrogates in a row: each is replaced separately
        encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes());
    }

    #[test]
    fn test_encode_utf8_from_utf16_with_output_limit() {
        encode_utf8_from_utf16_with_output_limit(&[0x0062], "\u{62}", 1, EncoderResult::InputEmpty);
        encode_utf8_from_utf16_with_output_limit(&[0x00A7], "\u{A7}", 2, EncoderResult::InputEmpty);
        encode_utf8_from_utf16_with_output_limit(
            &[0x2603],
            "\u{2603}",
            3,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDCA9],
            "\u{1F4A9}",
            4,
            EncoderResult::InputEmpty,
        );

        encode_utf8_from_utf16_with_output_limit(&[0x00A7], "", 1, EncoderResult::OutputFull);
        encode_utf8_from_utf16_with_output_limit(&[0x2603], "", 2, EncoderResult::OutputFull);
        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDCA9],
            "",
            3,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x0062],
            "\u{63}\u{62}",
            2,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00A7],
            "\u{63}\u{A7}",
            3,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x2603],
            "\u{63}\u{2603}",
            4,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0xD83D, 0xDCA9],
            "\u{63}\u{1F4A9}",
            5,
            EncoderResult::InputEmpty,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00A7],
            "\u{63}",
            2,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x2603],
            "\u{63}",
            3,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0xD83D, 0xDCA9],
            "\u{63}",
            4,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x00B6, 0x0062],
            "\u{B6}\u{62}",
            3,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x00B6, 0x00A7],
            "\u{B6}\u{A7}",
            4,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x00B6, 0x2603],
            "\u{B6}\u{2603}",
            5,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x00B6, 0xD83D, 0xDCA9],
            "\u{B6}\u{1F4A9}",
            6,
            EncoderResult::InputEmpty,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x00B6, 0x00A7],
            "\u{B6}",
            3,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x00B6, 0x2603],
            "\u{B6}",
            4,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x00B6, 0xD83D, 0xDCA9],
            "\u{B6}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x0062],
            "\u{263A}\u{62}",
            4,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x00A7],
            "\u{263A}\u{A7}",
            5,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x2603],
            "\u{263A}\u{2603}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0xD83D, 0xDCA9],
            "\u{263A}\u{1F4A9}",
            7,
            EncoderResult::InputEmpty,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x00A7],
            "\u{263A}",
            4,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x2603],
            "\u{263A}",
            5,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0xD83D, 0xDCA9],
            "\u{263A}",
            6,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDE0E, 0x0062],
            "\u{1F60E}\u{62}",
            5,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDE0E, 0x00A7],
            "\u{1F60E}\u{A7}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDE0E, 0x2603],
            "\u{1F60E}\u{2603}",
            7,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
            "\u{1F60E}\u{1F4A9}",
            8,
            EncoderResult::InputEmpty,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDE0E, 0x00A7],
            "\u{1F60E}",
            5,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDE0E, 0x2603],
            "\u{1F60E}",
            6,
            EncoderResult::OutputFull,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
            "\u{1F60E}",
            7,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x0062, 0x0062],
            "\u{63}\u{B6}\u{62}\u{62}",
            5,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x0062, 0x0062],
            "\u{63}\u{B6}\u{62}",
            4,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
            "\u{63}\u{B6}\u{62}\u{62}\u{62}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
            "\u{63}\u{B6}\u{62}\u{62}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x0062, 0x0062],
            "\u{263A}\u{62}\u{62}",
            5,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x0062, 0x0062],
            "\u{263A}\u{62}",
            4,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x0062, 0x0062, 0x0062],
            "\u{263A}\u{62}\u{62}\u{62}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x0062, 0x0062, 0x0062],
            "\u{263A}\u{62}\u{62}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x00A7],
            "\u{63}\u{B6}\u{A7}",
            5,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x00A7],
            "\u{63}\u{B6}",
            4,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x00A7, 0x0062],
            "\u{63}\u{B6}\u{A7}\u{62}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x00A7, 0x0062],
            "\u{63}\u{B6}\u{A7}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x00A7, 0x0062],
            "\u{263A}\u{A7}\u{62}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x00A7, 0x0062],
            "\u{263A}\u{A7}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x0062, 0x00A7],
            "\u{63}\u{B6}\u{62}\u{A7}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x0062, 0x00A7],
            "\u{63}\u{B6}\u{62}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x0062, 0x00A7],
            "\u{263A}\u{62}\u{A7}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x0062, 0x00A7],
            "\u{263A}\u{62}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x2603],
            "\u{63}\u{B6}\u{2603}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0x2603],
            "\u{63}\u{B6}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x2603],
            "\u{263A}\u{2603}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0x2603],
            "\u{263A}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0xD83D],
            "\u{63}\u{B6}\u{FFFD}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0xD83D],
            "\u{63}\u{B6}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0xD83D],
            "\u{263A}\u{FFFD}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0xD83D],
            "\u{263A}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0xDCA9],
            "\u{63}\u{B6}\u{FFFD}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x0063, 0x00B6, 0xDCA9],
            "\u{63}\u{B6}",
            5,
            EncoderResult::OutputFull,
        );

        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0xDCA9],
            "\u{263A}\u{FFFD}",
            6,
            EncoderResult::InputEmpty,
        );
        encode_utf8_from_utf16_with_output_limit(
            &[0x263A, 0xDCA9],
            "\u{263A}",
            5,
            EncoderResult::OutputFull,
        );
    }

    #[test]
    fn test_utf8_max_length_from_utf16() {
        let mut encoder = UTF_8.new_encoder();
        let mut output = [0u8; 13];
        let input = &[0x2C9Fu16, 0x2CA9u16, 0x2CA3u16, 0x2C9Fu16];
        let needed = encoder
            .max_buffer_length_from_utf16_without_replacement(input.len())
            .unwrap();
        let (result, _, _) =
            encoder.encode_from_utf16_without_replacement(input, &mut output[..needed], true);
        assert_eq!(result, EncoderResult::InputEmpty);
    }

    #[test]
    fn test_decode_bom_prefixed_split_byte_triple() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_8.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
        }
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xBF", &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
        }
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xBE", &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 1);
            assert!(!had_errors);
            assert_eq!(output[0], 0xFFFE);
        }
    }

    #[test]
    fn test_decode_bom_prefixed_split_byte_pair() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_8.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
        }
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xBC", &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 1);
            assert!(had_errors);
            assert_eq!(output[0], 0xFFFD);
        }
    }

    #[test]
    fn test_decode_bom_prefix() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_8.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xEF", &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 1);
            assert!(had_errors);
            assert_eq!(output[0], 0xFFFD);
        }
    }

    #[test]
    fn test_tail() {
        let mut output = [0u16; 1];
        let mut decoder = UTF_8.new_decoder_without_bom_handling();
        {
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16("\u{E4}a".as_bytes(), &mut output[..], false);
            assert_eq!(result, CoderResult::OutputFull);
            assert_eq!(read, 2);
            assert_eq!(written, 1);
            assert!(!had_errors);
            assert_eq!(output[0], 0x00E4);
        }
    }
}

[ Seitenstruktur0.93Drucken  etwas mehr zur Ethik  ]

                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....
    

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge