Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quellcode-Bibliothek single_byte.rs   Sprache: unbekannt

 
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::*;
use crate::ascii::*;
use crate::data::position;
use crate::handles::*;
use crate::variant::*;

/// Decoder for single-byte encodings.
///
/// The decode methods copy bytes below 0x80 through as ASCII and look up
/// bytes 0x80..=0xFF in `table`.
pub struct SingleByteDecoder {
    /// Lookup table indexed by `byte - 0x80`. An entry of `0` marks a byte
    /// that the decode methods report as malformed.
    table: &'static [u16; 128],
}

impl SingleByteDecoder {
    pub fn new(data: &'static [u16; 128]) -> VariantDecoder {
        VariantDecoder::SingleByte(SingleByteDecoder { table: data })
    }

    /// Returns the number of UTF-16 code units sufficient for decoding
    /// `byte_length` input bytes.
    ///
    /// Always `Some(byte_length)`: `decode_to_utf16_raw` writes at most one
    /// `u16` per input byte.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        Some(byte_length)
    }

    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_mul(3)
    }

    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_mul(3)
    }

    /// Decodes `src` into UTF-8 written to `dst`.
    ///
    /// Runs of ASCII are copied by `copy_ascii_from_check_space_bmp`; each
    /// non-ASCII byte is mapped through `self.table`. A table entry of `0`
    /// means the byte has no mapping and is reported as
    /// `DecoderResult::Malformed(1, 0)`.
    ///
    /// Returns `(result, bytes consumed from src, bytes written to dst)`.
    /// `_last` is not used by this method.
    pub fn decode_to_utf8_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u8],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        let mut source = ByteSource::new(src);
        let mut dest = Utf8Destination::new(dst);
        // Three nested loops:
        // * 'outermost restarts the accelerated ASCII copy,
        // * 'middle handles one non-ASCII byte,
        // * 'innermost handles ASCII bytes that directly follow a non-ASCII
        //   byte without going back to the accelerated copy.
        'outermost: loop {
            match dest.copy_ascii_from_check_space_bmp(&mut source) {
                CopyAsciiResult::Stop(ret) => return ret,
                CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
                    // Start non-boilerplate
                    //
                    // Since the non-ASCIIness of `non_ascii` is hidden from
                    // the optimizer, it can't figure out that it's OK to
                    // statically omit the bound check when accessing
                    // `[u16; 128]` with an index
                    // `non_ascii as usize - 0x80usize`.
                    //
                    // Safety: `non_ascii` is a u8 byte >=0x80, from the invariants
                    // on Utf8Destination::copy_ascii_from_check_space_bmp()
                    let mapped =
                        unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                    // let mapped = self.table[non_ascii as usize - 0x80usize];
                    if mapped == 0u16 {
                        // Zero is the table's marker for an unmapped byte.
                        return (
                            DecoderResult::Malformed(1, 0),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                    let dest_again = handle.write_bmp_excl_ascii(mapped);
                    // End non-boilerplate
                    match source.check_available() {
                        Space::Full(src_consumed) => {
                            return (
                                DecoderResult::InputEmpty,
                                src_consumed,
                                dest_again.written(),
                            );
                        }
                        Space::Available(source_handle) => {
                            match dest_again.check_space_bmp() {
                                Space::Full(dst_written) => {
                                    return (
                                        DecoderResult::OutputFull,
                                        source_handle.consumed(),
                                        dst_written,
                                    );
                                }
                                Space::Available(mut destination_handle) => {
                                    let (mut b, unread_handle) = source_handle.read();
                                    let source_again = unread_handle.commit();
                                    'innermost: loop {
                                        if b > 127 {
                                            // Non-ASCII again: feed it to the table
                                            // lookup at the top of 'middle.
                                            non_ascii = b;
                                            handle = destination_handle;
                                            continue 'middle;
                                        }
                                        // Testing on Haswell says that we should write the
                                        // byte unconditionally instead of trying to unread it
                                        // to make it part of the next SIMD stride.
                                        let dest_again_again = destination_handle.write_ascii(b);
                                        // 60 is ASCII `<`; bytes below it are treated as
                                        // punctuation and keep the slow path going.
                                        if b < 60 {
                                            // We've got punctuation
                                            match source_again.check_available() {
                                                Space::Full(src_consumed_again) => {
                                                    return (
                                                        DecoderResult::InputEmpty,
                                                        src_consumed_again,
                                                        dest_again_again.written(),
                                                    );
                                                }
                                                Space::Available(source_handle_again) => {
                                                    match dest_again_again.check_space_bmp() {
                                                        Space::Full(dst_written_again) => {
                                                            return (
                                                                DecoderResult::OutputFull,
                                                                source_handle_again.consumed(),
                                                                dst_written_again,
                                                            );
                                                        }
                                                        Space::Available(
                                                            destination_handle_again,
                                                        ) => {
                                                            let (b_again, _unread_handle_again) =
                                                                source_handle_again.read();
                                                            b = b_again;
                                                            destination_handle =
                                                                destination_handle_again;
                                                            continue 'innermost;
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                        // We've got markup or ASCII text
                                        continue 'outermost;
                                    }
                                }
                            }
                        }
                    }
                },
            }
        }
    }

    /// Decodes `src` into UTF-16 code units written to `dst`.
    ///
    /// Each input byte produces exactly one output code unit, so only the
    /// first `min(src.len(), dst.len())` positions are processed. ASCII runs
    /// go through `ascii_to_basic_latin`; non-ASCII bytes are mapped through
    /// `self.table`. A table entry of `0` means the byte has no mapping and
    /// is reported as `DecoderResult::Malformed(1, 0)`.
    ///
    /// Returns `(result, bytes read from src, code units written to dst)`.
    /// The result is `OutputFull` when `dst` is shorter than `src` and the
    /// processed prefix contained no malformed byte, `InputEmpty` when all
    /// of `src` was decoded. `_last` is not used by this method.
    pub fn decode_to_utf16_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u16],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        // `pending` is the result to report once `length` units have been
        // converted; `length` is the shorter of the two buffer lengths.
        let (pending, length) = if dst.len() < src.len() {
            (DecoderResult::OutputFull, dst.len())
        } else {
            (DecoderResult::InputEmpty, src.len())
        };
        // Safety invariant: converted <= length. Quite often we have `converted < length`
        // which will be separately marked.
        let mut converted = 0usize;
        'outermost: loop {
            match unsafe {
                // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
                ascii_to_basic_latin(
                    src.as_ptr().add(converted),
                    dst.as_mut_ptr().add(converted),
                    length - converted,
                )
            } {
                None => {
                    // Everything up to `length` was ASCII and has been copied.
                    return (pending, length, length);
                }
                Some((mut non_ascii, consumed)) => {
                    // Safety invariant: `converted <= length` upheld, since this can only consume
                    // up to `length - converted` bytes.
                    //
                    // Furthermore, in this context,
                    // we can assume `converted < length` since this branch is only ever hit when
                    // ascii_to_basic_latin fails to consume the entire slice
                    converted += consumed;
                    'middle: loop {
                        // `converted` doesn't count the reading of `non_ascii` yet.
                        // Since the non-ASCIIness of `non_ascii` is hidden from
                        // the optimizer, it can't figure out that it's OK to
                        // statically omit the bound check when accessing
                        // `[u16; 128]` with an index
                        // `non_ascii as usize - 0x80usize`.
                        //
                        // Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
                        // the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
                        let mapped =
                            unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                        // let mapped = self.table[non_ascii as usize - 0x80usize];
                        if mapped == 0u16 {
                            // Zero is the table's marker for an unmapped byte.
                            return (
                                DecoderResult::Malformed(1, 0),
                                converted + 1, // +1 for `non_ascii`
                                converted,
                            );
                        }
                        unsafe {
                            // Safety: As mentioned above, `converted < length`
                            *(dst.get_unchecked_mut(converted)) = mapped;
                        }
                        // Safety: `converted <= length` upheld, since `converted < length` before this
                        converted += 1;
                        // Next, handle ASCII punctuation and non-ASCII without
                        // going back to ASCII acceleration. Non-ASCII scripts
                        // use ASCII punctuation, so this avoid going to
                        // acceleration just for punctuation/space and then
                        // failing. This is a significant boost to non-ASCII
                        // scripts.
                        // TODO: Split out Latin converters without this part
                        // this stuff makes Latin script-conversion slower.
                        if converted == length {
                            return (pending, length, length);
                        }
                        // Safety: We are back to `converted < length` because of the == above
                        // and can perform this check.
                        let mut b = unsafe { *(src.get_unchecked(converted)) };
                        // Safety: `converted < length` is upheld for this loop
                        'innermost: loop {
                            if b > 127 {
                                // Non-ASCII again: feed it to the table lookup
                                // at the top of 'middle.
                                non_ascii = b;
                                continue 'middle;
                            }
                            // Testing on Haswell says that we should write the
                            // byte unconditionally instead of trying to unread it
                            // to make it part of the next SIMD stride.
                            unsafe {
                                // Safety: `converted < length` is true for this loop
                                *(dst.get_unchecked_mut(converted)) = u16::from(b);
                            }
                            // Safety: We are now at `converted <= length`. We should *not* `continue`
                            // the loop without reverifying
                            converted += 1;
                            // 60 is ASCII `<`; bytes below it are treated as
                            // punctuation and keep the slow path going.
                            if b < 60 {
                                // We've got punctuation
                                if converted == length {
                                    return (pending, length, length);
                                }
                                // Safety: we're back to `converted < length` because of the == above
                                b = unsafe { *(src.get_unchecked(converted)) };
                                // Safety: The loop continues as `converted < length`
                                continue 'innermost;
                            }
                            // We've got markup or ASCII text
                            continue 'outermost;
                        }
                    }
                }
            }
        }
    }

    pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
        let mut bytes = buffer;
        let mut total = 0;
        loop {
            if let Some((non_ascii, offset)) = validate_ascii(bytes) {
                total += offset;
                // Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
                // the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
                let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                if mapped != u16::from(non_ascii) {
                    return total;
                }
                total += 1;
                bytes = &bytes[offset + 1..];
            } else {
                return total;
            }
        }
    }
}

/// Encoder for single-byte encodings.
///
/// Besides the code-unit table, it stores the description of one "run": a
/// stretch of consecutive code units that occupy consecutive table slots
/// and can therefore be encoded by offset arithmetic instead of a search
/// (see `encode_u16`).
pub struct SingleByteEncoder {
    /// Table of code units indexed by `byte - 0x80`.
    table: &'static [u16; 128],
    /// Code unit value at which the run starts.
    run_bmp_offset: usize,
    /// Table index (i.e. `byte - 0x80`) at which the run starts.
    run_byte_offset: usize,
    /// Number of code units in the run.
    run_length: usize,
}

impl SingleByteEncoder {
    pub fn new(
        encoding: &'static Encoding,
        data: &'static [u16; 128],
        run_bmp_offset: u16,
        run_byte_offset: u8,
        run_length: u8,
    ) -> Encoder {
        Encoder::new(
            encoding,
            VariantEncoder::SingleByte(SingleByteEncoder {
                table: data,
                run_bmp_offset: run_bmp_offset as usize,
                run_byte_offset: run_byte_offset as usize,
                run_length: run_length as usize,
            }),
        )
    }

    /// Returns the number of output bytes sufficient for encoding
    /// `u16_length` UTF-16 code units without replacement.
    ///
    /// Always `Some(u16_length)`: `encode_from_utf16_raw` writes at most one
    /// byte per input code unit.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        Some(u16_length)
    }

    /// Returns the number of output bytes sufficient for encoding
    /// `byte_length` bytes of UTF-8 input without replacement.
    ///
    /// Always `Some(byte_length)`, i.e. the output is never budgeted to be
    /// longer than the UTF-8 input.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        Some(byte_length)
    }

    #[inline(always)]
    fn encode_u16(&self, code_unit: u16) -> Option<u8> {
        // First, we see if the code unit falls into a run of consecutive
        // code units that can be mapped by offset. This is very efficient
        // for most non-Latin encodings as well as Latin1-ish encodings.
        //
        // For encodings that don't fit this pattern, the run (which may
        // have the length of just one) just establishes the starting point
        // for the next rule.
        //
        // Next, we do a forward linear search in the part of the index
        // after the run. Even in non-Latin1-ish Latin encodings (except
        // macintosh), the lower case letters are here.
        //
        // Next, we search the third quadrant up to the start of the run
        // (upper case letters in Latin encodings except macintosh, in
        // Greek and in KOI encodings) and then the second quadrant,
        // except if the run stared before the third quadrant, we search
        // the second quadrant up to the run.
        //
        // Last, we search the first quadrant, which has unused controls
        // or punctuation in most encodings. This is bad for macintosh
        // and IBM866, but those are rare.

        // Run of consecutive units
        let unit_as_usize = code_unit as usize;
        let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
        if offset < self.run_length {
            return Some((128 + self.run_byte_offset + offset) as u8);
        }

        // Search after the run
        let tail_start = self.run_byte_offset + self.run_length;
        if let Some(pos) = position(&self.table[tail_start..], code_unit) {
            return Some((128 + tail_start + pos) as u8);
        }

        if self.run_byte_offset >= 64 {
            // Search third quadrant before the run
            if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) {
                return Some(((128 + 64) + pos) as u8);
            }

            // Search second quadrant
            if let Some(pos) = position(&self.table[32..64], code_unit) {
                return Some(((128 + 32) + pos) as u8);
            }
        } else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) {
            // windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
            // Search second quadrant before the run
            return Some(((128 + 32) + pos) as u8);
        }

        // Search first quadrant
        if let Some(pos) = position(&self.table[..32], code_unit) {
            return Some((128 + pos) as u8);
        }

        None
    }

    // Instantiates `ascii_compatible_bmp_encoder_function!` (defined outside
    // this file). Judging by the arguments (`encode_from_utf8_raw`, `str`,
    // `Utf8Source`), this presumably generates the UTF-8-input counterpart of
    // `encode_from_utf16_raw` -- confirm against the macro definition.
    //
    // The block passed as the first argument is the per-character step: it
    // encodes the code unit bound to `bmp` with `encode_u16`, writes the
    // resulting byte through `handle`, and returns an unmappable result
    // (with the consumed/written counts so far) when there is no such byte.
    ascii_compatible_bmp_encoder_function!(
        {
            match self.encode_u16(bmp) {
                Some(byte) => handle.write_one(byte),
                None => {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_one,
        check_space_one,
        encode_from_utf8_raw,
        str,
        Utf8Source,
        true
    );

    pub fn encode_from_utf16_raw(
        &mut self,
        src: &[u16],
        dst: &mut [u8],
        _last: bool,
    ) -> (EncoderResult, usize, usize) {
        let (pending, length) = if dst.len() < src.len() {
            (EncoderResult::OutputFull, dst.len())
        } else {
            (EncoderResult::InputEmpty, src.len())
        };
        // Safety invariant: converted <= length. Quite often we have `converted < length`
        // which will be separately marked.
        let mut converted = 0usize;
        'outermost: loop {
            match unsafe {
                // Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
                basic_latin_to_ascii(
                    src.as_ptr().add(converted),
                    dst.as_mut_ptr().add(converted),
                    length - converted,
                )
            } {
                None => {
                    return (pending, length, length);
                }
                Some((mut non_ascii, consumed)) => {
                    // Safety invariant: `converted <= length` upheld, since this can only consume
                    // up to `length - converted` bytes.
                    //
                    // Furthermore, in this context,
                    // we can assume `converted < length` since this branch is only ever hit when
                    // ascii_to_basic_latin fails to consume the entire slice
                    converted += consumed;
                    'middle: loop {
                        // `converted` doesn't count the reading of `non_ascii` yet.
                        match self.encode_u16(non_ascii) {
                            Some(byte) => {
                                unsafe {
                                    // Safety: we're allowed this access since `converted < length`
                                    *(dst.get_unchecked_mut(converted)) = byte;
                                }
                                converted += 1;
                                // `converted <= length` now
                            }
                            None => {
                                // At this point, we need to know if we
                                // have a surrogate.
                                let high_bits = non_ascii & 0xFC00u16;
                                if high_bits == 0xD800u16 {
                                    // high surrogate
                                    if converted + 1 == length {
                                        // End of buffer. This surrogate is unpaired.
                                        return (
                                            EncoderResult::Unmappable('\u{FFFD}'),
                                            converted + 1, // +1 `for non_ascii`
                                            converted,
                                        );
                                    }
                                    // Safety: convered < length from outside the match, and `converted + 1 != length`,
                                    // So `converted + 1 < length` as well. We're in bounds
                                    let second =
                                        u32::from(unsafe { *src.get_unchecked(converted + 1) });
                                    if second & 0xFC00u32 != 0xDC00u32 {
                                        return (
                                            EncoderResult::Unmappable('\u{FFFD}'),
                                            converted + 1, // +1 `for non_ascii`
                                            converted,
                                        );
                                    }
                                    // The next code unit is a low surrogate.
                                    let astral: char = unsafe {
                                        // Safety: We can rely on non_ascii being 0xD800-0xDBFF since the high bits are 0xD800
                                        // Then, (non_ascii << 10 - 0xD800 << 10) becomes between (0 to 0x3FF) << 10, which is between
                                        // 0x400 to 0xffc00. Adding the 0x10000 gives a range of 0x10400 to 0x10fc00. Subtracting the 0xDC00
                                        // gives 0x2800 to 0x102000
                                        // The second term is between 0xDC00 and 0xDFFF from the check above. This gives a maximum
                                        // possible range of (0x10400 + 0xDC00) to (0x102000 + 0xDFFF) which is 0x1E000 to 0x10ffff.
                                        // This is in range.
                                        //
                                        // From a Unicode principles perspective this can also be verified as we have checked that `non_ascii` is a high surrogate
                                        // (0xD800..=0xDBFF), and that `second` is a low surrogate (`0xDC00..=0xDFFF`), and we are applying reverse of the UTC16 transformation
                                        // algorithm <https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF>, by applying the high surrogate - 0xD800 to the
                                        // high ten bits, and the low surrogate - 0xDc00 to the low ten bits, and then adding 0x10000
                                        ::core::char::from_u32_unchecked(
                                            (u32::from(non_ascii) << 10) + second
                                                - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
                                        )
                                    };
                                    return (
                                        EncoderResult::Unmappable(astral),
                                        converted + 2, // +2 `for non_ascii` and `second`
                                        converted,
                                    );
                                }
                                if high_bits == 0xDC00u16 {
                                    // Unpaired low surrogate
                                    return (
                                        EncoderResult::Unmappable('\u{FFFD}'),
                                        converted + 1, // +1 `for non_ascii`
                                        converted,
                                    );
                                }
                                return (
                                    EncoderResult::unmappable_from_bmp(non_ascii),
                                    converted + 1, // +1 `for non_ascii`
                                    converted,
                                );
                                // Safety: This branch diverges, so no need to uphold invariants on `converted`
                            }
                        }
                        // Next, handle ASCII punctuation and non-ASCII without
                        // going back to ASCII acceleration. Non-ASCII scripts
                        // use ASCII punctuation, so this avoid going to
                        // acceleration just for punctuation/space and then
                        // failing. This is a significant boost to non-ASCII
                        // scripts.
                        // TODO: Split out Latin converters without this part
                        // this stuff makes Latin script-conversion slower.
                        if converted == length {
                            return (pending, length, length);
                        }
                        // Safety: we're back to `converted < length` due to the == above and can perform
                        // the unchecked read
                        let mut unit = unsafe { *(src.get_unchecked(converted)) };
                        'innermost: loop {
                            // Safety: This loop always begins with `converted < length`, see
                            // the invariant outside and the comment on the continue below
                            if unit > 127 {
                                non_ascii = unit;
                                continue 'middle;
                            }
                            // Testing on Haswell says that we should write the
                            // byte unconditionally instead of trying to unread it
                            // to make it part of the next SIMD stride.
                            unsafe {
                                // Safety: Can rely on converted < length
                                *(dst.get_unchecked_mut(converted)) = unit as u8;
                            }
                            converted += 1;
                            // `converted <= length` here
                            if unit < 60 {
                                // We've got punctuation
                                if converted == length {
                                    return (pending, length, length);
                                }
                                // Safety: `converted < length` due to the == above. The read is safe.
                                unit = unsafe { *(src.get_unchecked(converted)) };
                                // Safety: This only happens if `converted < length`, maintaining it
                                continue 'innermost;
                            }
                            // We've got markup or ASCII text
                            continue 'outermost;
                            // Safety: All other routes to here diverge so the continue is the only
                            // way to run the innermost loop.
                        }
                    }
                }
            }
        }
    }
}

// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    #[test]
    fn test_windows_1255_ca() {
        decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
        encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
    }

    #[test]
    fn test_ascii_punctuation() {
        let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
        let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
                          \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                          \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
        decode(WINDOWS_1253, bytes, characters);
        encode(WINDOWS_1253, characters, bytes);
    }

    #[test]
    fn test_decode_malformed() {
        decode(
            WINDOWS_1253,
            b"\xC1\xF5\xD2\xF4\xFC",
            "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
        );
    }

    #[test]
    fn test_encode_unmappables() {
        encode(
            WINDOWS_1253,
            "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
            b"\xC1\xF5☃\xF4\xFC",
        );
        encode(
            WINDOWS_1253,
            "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
            b"\xC1\xF5💩\xF4\xFC",
        );
    }

    #[test]
    fn test_encode_unpaired_surrogates() {
        encode_from_utf16(
            WINDOWS_1253,
            &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
            b"\xC1\xF5�\xF4\xFC",
        );
        encode_from_utf16(
            WINDOWS_1253,
            &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
            b"\xC1\xF5�\xF4\xFC",
        );
        encode_from_utf16(
            WINDOWS_1253,
            &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
            b"\xC1\xF5\xF4\xFC�",
        );
    }

    pub const HIGH_BYTES: &'static [u8; 128] = &[
        0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E,
        0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
        0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
        0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB,
        0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA,
        0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
        0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8,
        0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
        0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    ];

    fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
        let mut with_replacement = [0u16; 128];
        let mut it = data.iter().enumerate();
        loop {
            match it.next() {
                Some((i, code_point)) => {
                    if *code_point == 0 {
                        with_replacement[i] = 0xFFFD;
                    } else {
                        with_replacement[i] = *code_point;
                    }
                }
                None => {
                    break;
                }
            }
        }

        decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
    }

    fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) {
        let mut with_zeros = [0u8; 128];
        let mut it = data.iter().enumerate();
        loop {
            match it.next() {
                Some((i, code_point)) => {
                    if *code_point == 0 {
                        with_zeros[i] = 0;
                    } else {
                        with_zeros[i] = HIGH_BYTES[i];
                    }
                }
                None => {
                    break;
                }
            }
        }

        encode_from_utf16(encoding, data, &with_zeros[..]);
    }

    #[test]
    fn test_single_byte_from_two_low_surrogates() {
        let expectation = b"��";
        let mut output = [0u8; 40];
        let mut encoder = WINDOWS_1253.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }

    // These tests are so self-referential that they are pretty useless.

    // BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
    // Instead, please regenerate using generate-encoding-data.py

    /// Runs `decode_single_byte` for each single-byte encoding against its
    /// table in `data::SINGLE_BYTE_DATA`.
    #[test]
    fn test_single_byte_decode() {
        decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow, so under Miri only the two encodings above
            // are checked.
            return;
        }
        decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }

    /// Runs `encode_single_byte` for each single-byte encoding against its
    /// table in `data::SINGLE_BYTE_DATA`.
    #[test]
    fn test_single_byte_encode() {
        encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow, so under Miri only the two encodings above
            // are checked.
            return;
        }
        encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
    // END GENERATED CODE
}

[ 0.66Quellennavigators  Projekt   ]

                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....
    

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge