Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quelle  gb18030.rs   Sprache: unbekannt

 
// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::*;
use crate::data::*;
use crate::gb18030_2022::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range16;
use super::in_range16;

/// Bytes held in the decoder's `pending` slot.
///
/// Each variant carries as many `u8` values as its name says (zero to three).
enum Gb18030Pending {
    None,
    One(u8),
    Two(u8, u8),
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` exactly when no bytes are held.
    fn is_none(&self) -> bool {
        self.count() == 0
    }

    /// Returns the number of bytes carried by the current variant.
    fn count(&self) -> usize {
        match self {
            Gb18030Pending::None => 0,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::Three(..) => 3,
        }
    }
}

/// State of the gb18030 decoder.
///
/// The decoder is in its neutral state when every field below is empty (see
/// `in_neutral_state`); each occupied slot counts as one extra input byte in
/// the buffer-length calculations (see `extra_from_state`).
pub struct Gb18030Decoder {
    // Byte slots read and written by the `gb18030_decoder_functions!`
    // expansion. NOTE(review): presumably the already-consumed bytes of an
    // incomplete sequence (minus their range offsets) - confirm against the
    // macro, which is defined elsewhere.
    first: Option<u8>,
    second: Option<u8>,
    third: Option<u8>,
    // Bytes queued for reprocessing; set to `One(third_minus_offset)` when
    // the fourth byte of a four-byte sequence is invalid.
    pending: Gb18030Pending,
    // ASCII byte recovered from the second position of a failed four-byte
    // sequence; stored when the third or fourth byte turns out to be invalid.
    pending_ascii: Option<u8>,
}

impl Gb18030Decoder {
    pub fn new() -> VariantDecoder {
        VariantDecoder::Gb18030(Gb18030Decoder {
            first: None,
            second: None,
            third: None,
            pending: Gb18030Pending::None,
            pending_ascii: None,
        })
    }

    pub fn in_neutral_state(&self) -> bool {
        self.first.is_none()
            && self.second.is_none()
            && self.third.is_none()
            && self.pending.is_none()
            && self.pending_ascii.is_none()
    }

    fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(
            self.pending.count()
                + match self.first {
                    None => 0,
                    Some(_) => 1,
                }
                + match self.second {
                    None => 0,
                    Some(_) => 1,
                }
                + match self.third {
                    None => 0,
                    Some(_) => 1,
                }
                + match self.pending_ascii {
                    None => 0,
                    Some(_) => 1,
                },
        )
    }

    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        // ASCII: 1 to 1 (worst case)
        // gbk: 2 to 1
        // ranges: 4 to 1 or 4 to 2
        checked_add(1, self.extra_from_state(byte_length))
    }

    /// Upper bound on the UTF-8 bytes needed to decode `byte_length` input
    /// bytes without replacement.
    ///
    /// Delegates to `max_utf8_buffer_length`, so both methods return the same
    /// bound.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // Input bytes to output bytes:
        // ASCII: 1 to 1
        // gbk: 2 to 2 or 2 to 3
        // ranges: 4 to 2, 4 to 3 or 4 to 4
        // 0x80: 1 to 3 (worst case)
        self.max_utf8_buffer_length(byte_length)
    }

    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
    }

    // The decode methods themselves are generated by
    // `gb18030_decoder_functions!` (defined elsewhere). This invocation passes
    // four code blocks followed by the identifiers those blocks use:
    //   1. lead byte: validates `non_ascii` and yields `first_minus_offset`;
    //   2. second byte of a two-byte sequence: writes the decoded character
    //      or returns a `Malformed` result;
    //   3. third byte of a four-byte sequence: yields `third_minus_offset`;
    //   4. fourth byte: computes the ranges pointer and writes the character.
    // NOTE(review): how the macro decides between the two-byte block and the
    // four-byte path (and how it derives `second_minus_offset`) is not
    // visible in this file.
    gb18030_decoder_functions!(
        {
            // If first is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
            if non_ascii_minus_offset > (0xFE - 0x81) {
                // Outside the lead range: a lone 0x80 decodes to U+20AC
                // (euro sign); anything else is malformed.
                if non_ascii == 0x80 {
                    handle.write_upper_bmp(0x20ACu16);
                    continue 'outermost;
                }
                return (DecoderResult::Malformed(1, 0),
                        source.consumed(),
                        handle.written());
            }
            non_ascii_minus_offset
        },
        {
            // Two-byte (or error)
            if first_minus_offset >= 0x20 {
                // Not the gbk ideograph range above GB2312
                let trail_minus_offset = second.wrapping_sub(0xA1);
                if trail_minus_offset <= (0xFE - 0xA1) {
                    // GB2312
                    let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
                    if hanzi_lead < (0x77 - 0x2F) {
                        // Level 1 Hanzi, Level 2 Hanzi
                        // or one of the 5 PUA code
                        // points in between.
                        let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
                        let upper_bmp = GB2312_HANZI[hanzi_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    } else if first_minus_offset == 0x20 {
                        // Symbols (starting with ideographic space)
                        let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
                        handle.write_bmp_excl_ascii(bmp)
                    } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
                        handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
                    } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
                        handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
                    } else if first_minus_offset > 0x76 {
                        // Bottom PUA
                        let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
                        handle.write_upper_bmp(pua)
                    } else {
                        let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
                        handle.write_bmp_excl_ascii(bmp)
                    }
                } else {
                    // gbk range on the left
                    let mut trail_minus_offset = second.wrapping_sub(0x40);
                    if trail_minus_offset > (0x7E - 0x40) {
                        let trail_minus_range_start = second.wrapping_sub(0x80);
                        if trail_minus_range_start > (0xA0 - 0x80) {
                            // Invalid trail. A trail below 0x80 is unread
                            // rather than consumed.
                            if second < 0x80 {
                                return (DecoderResult::Malformed(1, 0),
                                        unread_handle_second.unread(),
                                        handle.written());
                            }
                            return (DecoderResult::Malformed(2, 0),
                                    unread_handle_second.consumed(),
                                    handle.written());
                        }
                        // Trails from 0x80 up use offset 0x41, which skips
                        // the unused 0x7F position.
                        trail_minus_offset = second - 0x41;
                    }
                    // Zero-base lead
                    let left_lead = first_minus_offset - 0x20;
                    let left_pointer = left_lead as usize * (190 - 94) +
                                       trail_minus_offset as usize;
                    let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
                    if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
                        let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
                        handle.write_upper_bmp(upper_bmp)
                    } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
                        let bmp = gbk_other_decode(left_pointer as u16);
                        handle.write_bmp_excl_ascii(bmp)
                    } else {
                        let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
                        let upper_bmp = GBK_BOTTOM[bottom_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    }
                }
            } else {
                // gbk ideograph range above GB2312
                let mut trail_minus_offset = second.wrapping_sub(0x40);
                if trail_minus_offset > (0x7E - 0x40) {
                    let trail_minus_range_start = second.wrapping_sub(0x80);
                    if trail_minus_range_start > (0xFE - 0x80) {
                        // Invalid trail. A trail below 0x80 is unread rather
                        // than consumed.
                        if second < 0x80 {
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_second.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_second.consumed(),
                                handle.written());
                    }
                    // Trails from 0x80 up use offset 0x41, which skips the
                    // unused 0x7F position.
                    trail_minus_offset = second - 0x41;
                }
                let pointer = first_minus_offset as usize * 190usize +
                              trail_minus_offset as usize;
                let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
                handle.write_upper_bmp(upper_bmp)
            }
        },
        {
            // If third is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            let third_minus_offset = third.wrapping_sub(0x81);
            if third_minus_offset > (0xFE - 0x81) {
                // We have an error. Let's inline what's going
                // to happen when `second` is
                // reprocessed. (`third` gets unread.)
                // `second` is guaranteed ASCII, so let's
                // put it in `pending_ascii`. Recompute
                // `second` from `second_minus_offset`.
                self.pending_ascii = Some(second_minus_offset + 0x30);
                // Now unread `third` and designate the previous
                // `first` as being in error.
                return (DecoderResult::Malformed(1, 1),
                        unread_handle_third.unread(),
                        handle.written());
            }
            third_minus_offset
        },
        {
            // If fourth is between 0x30 and 0x39, inclusive,
            // subtract offset 0x30.
            //
            // If we have an error, we'll inline what's going
            // to happen when `second` and `third` are
            // reprocessed. (`fourth` gets unread.)
            // `second` is guaranteed ASCII, so let's
            // put it in `pending_ascii`. Recompute
            // `second` from `second_minus_offset` to
            // make this block reusable when `second`
            // is not in scope.
            //
            // `third` is guaranteed to be in the range
            // that makes it become the new `self.first`.
            //
            // `fourth` gets unread and the previous
            // `first` gets designated as being in error.
            let fourth_minus_offset = fourth.wrapping_sub(0x30);
            if fourth_minus_offset > (0x39 - 0x30) {
                self.pending_ascii = Some(second_minus_offset + 0x30);
                self.pending = Gb18030Pending::One(third_minus_offset);
                return (DecoderResult::Malformed(1, 2),
                        unread_handle_fourth.unread(),
                        handle.written());
            }
            // Linear pointer over the four zero-based bytes: 10 values for
            // the second and fourth bytes, 126 for the third.
            let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
                          (second_minus_offset as usize * (10 * 126)) +
                          (third_minus_offset as usize * 10) +
                          fourth_minus_offset as usize;
            if pointer <= 39419 {
                // BMP
                if pointer == 7457 {
                    // Special case: this pointer is written as U+E7C7
                    // directly instead of going through the ranges lookup.
                    handle.write_upper_bmp(0xE7C7)
                } else {
                    handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
                }
            } else if pointer >= 189_000 && pointer <= 1_237_575 {
                // Astral
                // Pointer 189_000 maps to U+10000 and 1_237_575 to U+10FFFF.
                handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
            } else {
                return (DecoderResult::Malformed(4, 0),
                        unread_handle_fourth.consumed(),
                        handle.written());
            }
        },
        self,
        non_ascii,
        first_minus_offset,
        second,
        second_minus_offset,
        unread_handle_second,
        third,
        third_minus_offset,
        unread_handle_third,
        fourth,
        fourth_minus_offset,
        unread_handle_fourth,
        source,
        handle,
        'outermost);
}

// XXX Experiment with inline directives
/// Looks up the two-byte GBK encoding of a BMP code point that was not
/// handled by the encoder's CJK Unified Ideographs branch.
///
/// Returns `Some((lead, trail))` with both bytes as `usize` values (the
/// caller narrows them to `u8`), or `None` when no two-byte mapping is found.
///
/// The checks are ordered so that likely inputs are resolved early and
/// ranges known to be unmappable bail out before the more expensive
/// searches; the order matters because several ranges overlap.
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
    // Try ideographic punctuation first as it's the most likely case.
    // Throwing in the check for full-width currencies and tilde is probably
    // more size-efficient here than elsewhere.
    if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
        if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
            return Some((0xA1, pos + 0xA1));
        }
    }
    // Ext A
    if in_range16(bmp, 0x3400, 0x4E00) {
        return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
            (
                0xFE,
                // Trail bytes skip 0x7F, hence the two different offsets.
                pos + if pos < (0x3F - 16) {
                    0x40 + 16
                } else {
                    0x41 + 16
                },
            )
        });
    }
    // Compatibility ideographs
    if in_range16(bmp, 0xF900, 0xFB00) {
        return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
            if pos < 5 {
                // end of second to last row
                (0xFD, pos + (190 - 94 - 5 + 0x41))
            } else {
                // last row
                (0xFE, pos + (0x40 - 5))
            }
        });
    }
    // Handle everything below U+02CA, which is in GBK_OTHER.
    if bmp < 0x02CA {
        if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
            // Pinyin except U+1E3F
            if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
                return Some((0xA8, pos + 0xA1));
            }
        } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
            || in_inclusive_range16(bmp, 0x02C7, 0x02C9)
        {
            // Diacritics and Latin 1 symbols
            if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
                return Some((0xA1, pos + 0xA1 + 3));
            }
        }
        return None;
    }

    if in_inclusive_range16(bmp, 0xE78D, 0xE864) {
        // The array is sorted but short, so let's do linear search.
        if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) {
            let pair = &GB18030_2022_OVERRIDE_BYTES[pos];
            return Some((pair[0].into(), pair[1].into()));
        }
    } else if bmp >= 0xFE17 {
        // Various brackets, all in full-width regions
        if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
            return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
        }
    } else if bmp == 0x1E3F {
        // The one Pinyin placed elsewhere on the BMP
        return Some((0xA8, 0x7B - 0x60 + 0xA1));
    } else if in_range16(bmp, 0xA000, 0xD800) {
        // Since Korean has usage in China, let's spend a branch to fast-track
        // Hangul.
        return None;
    }
    // GB2312 other (except bottom PUA and PUA between Hanzi levels).
    if let Some(other_pointer) = gb2312_other_encode(bmp) {
        let other_lead = other_pointer as usize / 94;
        let other_trail = other_pointer as usize % 94;
        return Some((0xA2 + other_lead, 0xA1 + other_trail));
    }
    // At this point, we've handled all mappable characters above U+02D9 but
    // below U+2010. Let's check for that range in order to let lower BMP
    // characters used for minority languages in China avoid the subsequent
    // search that deals mainly with various symbols.
    if in_range16(bmp, 0x02DA, 0x2010) {
        return None;
    }
    // GBK other (except radicals and PUA in GBK_BOTTOM).
    if let Some(other_pointer) = gbk_other_encode(bmp) {
        let other_lead = other_pointer as usize / (190 - 94);
        let other_trail = other_pointer as usize % (190 - 94);
        // Trail bytes skip 0x7F.
        let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
        return Some((other_lead + (0x81 + 0x20), other_trail + offset));
    }
    // CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM
    if in_inclusive_range16(bmp, 0x2E81, 0x2ECA)
        || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB)
        || in_inclusive_range16(bmp, 0xE816, 0xE855)
    {
        if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
            let trail = pos + 16;
            // Trail bytes skip 0x7F.
            let offset = if trail < 0x3F { 0x40 } else { 0x41 };
            return Some((0xFE, trail + offset));
        }
    }
    // GB2312 bottom PUA
    // The wrapping subtraction turns the range test into a single unsigned
    // comparison.
    let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
    if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
        let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
        let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
        return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
    }
    // PUA between Hanzi Levels
    let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
    if bmp_minus_pua_between_hanzi < 5 {
        return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
    }
    None
}

/// Encodes a CJK Unified Ideograph as a two-byte `(lead, trail)` pair when
/// the `fast-gb-hanzi-encode` feature is disabled.
///
/// The second parameter (the code point minus the start of the unified block)
/// is unused by this variant.
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
#[inline(always)]
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
    // GB2312 Level 1 Hanzi: the helper already yields the finished byte pair.
    if let Some(pair) = gb2312_level1_hanzi_encode(bmp) {
        return pair;
    }
    // GB2312 Level 2 Hanzi: 94 trails per lead, with leads starting at 0xD8.
    if let Some(level2_pointer) = gb2312_level2_hanzi_encode(bmp) {
        let level2_lead = (level2_pointer / 94) + (0xD8);
        let level2_trail = (level2_pointer % 94) + 0xA1;
        return (level2_lead as u8, level2_trail as u8);
    }
    // Everything else is in one of the two GBK ideograph areas.
    let (lead, gbk_trail) = if bmp >= 0x72DC {
        // To the left of GB2312
        let left_pointer = gbk_left_ideograph_encode(bmp) as usize;
        (
            (left_pointer / (190 - 94)) + (0x81 + 0x29),
            left_pointer % (190 - 94),
        )
    } else {
        // Above GB2312
        let top_pointer = gbk_top_ideograph_encode(bmp) as usize;
        ((top_pointer / 190) + 0x81, top_pointer % 190)
    };
    // GBK trail bytes skip 0x7F.
    let trail_base = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
    (lead as u8, (gbk_trail + trail_base) as u8)
}

/// Encodes a CJK Unified Ideograph as a two-byte `(lead, trail)` pair when
/// the `fast-gb-hanzi-encode` feature is enabled.
///
/// Only the second parameter (the code point minus 0x4E00, as computed by
/// the caller) is used; it is passed straight to `gbk_hanzi_encode`.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    gbk_hanzi_encode(bmp_minus_unified_start)
}

/// Encoder shared by gb18030 and GBK.
pub struct Gb18030Encoder {
    // `true`: characters without a two-byte mapping are written as four-byte
    // sequences and U+20AC takes the regular two-byte path.
    // `false`: such characters are reported as unmappable and U+20AC is
    // written as the single byte 0x80.
    extended: bool,
}

impl Gb18030Encoder {
    pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
        Encoder::new(
            encoding,
            VariantEncoder::Gb18030(Gb18030Encoder {
                extended: extended_range,
            }),
        )
    }

    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        if self.extended {
            u16_length.checked_mul(4)
        } else {
            // Need to add, because space check is done with the four-byte
            // assumption.
            checked_add(2, u16_length.checked_mul(2))
        }
    }

    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        if self.extended {
            // 1 to 1
            // 2 to 2
            // 3 to 2
            // 2 to 4 (worst)
            // 3 to 4
            // 4 to 4
            checked_add(2, byte_length.checked_mul(2))
        } else {
            // 1 to 1
            // 2 to 2
            // 3 to 2
            // Need to add, because space check is done with the four-byte
            // assumption.
            byte_length.checked_add(3)
        }
    }

    // The encode methods themselves are generated by
    // `ascii_compatible_encoder_functions!` (defined elsewhere). This
    // invocation passes two code blocks - the first handles a non-ASCII BMP
    // code point (`bmp`), the second an astral code point (`astral`) -
    // followed by the identifiers the blocks use.
    // NOTE(review): the meaning of the trailing `copy_ascii_to_check_space_four`,
    // `check_space_four` and `false` arguments is not visible in this file;
    // presumably they select the four-byte output space check.
    ascii_compatible_encoder_functions!(
        {
            // The wrapping subtraction turns the range test into a single
            // unsigned comparison.
            let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
            if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
                // CJK Unified Ideographs
                // Can't fail now, since all are
                // mapped.
                let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
                handle.write_two(lead, trail)
            } else if bmp == 0xE5E5 {
                // It's not optimal to check for the unmappable
                // and for euro at this stage, but getting
                // them out of the way makes the rest of the
                // code less messy.
                return (
                    EncoderResult::unmappable_from_bmp(bmp),
                    source.consumed(),
                    handle.written(),
                );
            } else if bmp == 0x20AC && !self.extended {
                // Without the extended range the euro sign is the single
                // byte 0x80.
                handle.write_one(0x80u8)
            } else {
                match gbk_encode_non_unified(bmp) {
                    Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
                    None => {
                        // No two-byte mapping: unmappable unless four-byte
                        // sequences are enabled.
                        if !self.extended {
                            return (
                                EncoderResult::unmappable_from_bmp(bmp),
                                source.consumed(),
                                handle.written(),
                            );
                        }
                        // Split the ranges pointer into the four bytes: 10
                        // values for the second and fourth bytes (from 0x30),
                        // 126 for the third (from 0x81).
                        let range_pointer = gb18030_range_encode(bmp);
                        let first = range_pointer / (10 * 126 * 10);
                        let rem_first = range_pointer % (10 * 126 * 10);
                        let second = rem_first / (10 * 126);
                        let rem_second = rem_first % (10 * 126);
                        let third = rem_second / 10;
                        let fourth = rem_second % 10;
                        handle.write_four(
                            (first + 0x81) as u8,
                            (second + 0x30) as u8,
                            (third + 0x81) as u8,
                            (fourth + 0x30) as u8,
                        )
                    }
                }
            }
        },
        {
            // Astral code points only have four-byte encodings.
            if !self.extended {
                return (
                    EncoderResult::Unmappable(astral),
                    source.consumed(),
                    handle.written(),
                );
            }
            // U+10000 corresponds to ranges pointer 189_000; the split into
            // bytes is the same as for the BMP four-byte case.
            let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
            let first = range_pointer / (10 * 126 * 10);
            let rem_first = range_pointer % (10 * 126 * 10);
            let second = rem_first / (10 * 126);
            let rem_second = rem_first % (10 * 126);
            let third = rem_second / 10;
            let fourth = rem_second % 10;
            handle.write_four(
                (first + 0x81) as u8,
                (second + 0x30) as u8,
                (third + 0x81) as u8,
                (fourth + 0x30) as u8,
            )
        },
        bmp,
        astral,
        self,
        source,
        handle,
        copy_ascii_to_check_space_four,
        check_space_four,
        false
    );
}

// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper with the `GB18030` encoding.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Runs the shared `encode` test helper with the `GB18030` encoding.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Runs the shared `encode` test helper with the `GBK` encoding.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", &"");

        // ASCII
        decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");

        // euro
        decode_gb18030(b"\x80", "\u{20AC}");
        decode_gb18030(b"\xA2\xE3", "\u{20AC}");

        // two bytes
        decode_gb18030(b"\x81\x40", "\u{4E02}");
        decode_gb18030(b"\x81\x7E", "\u{4E8A}");
        decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\x81\x80", "\u{4E90}");
        decode_gb18030(b"\x81\xFE", "\u{4FA2}");
        decode_gb18030(b"\xFE\x40", "\u{FA0C}");
        decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\xFE\x80", "\u{4723}");
        decode_gb18030(b"\xFE\xFE", "\u{E4C5}");

        // Changes between GB18030-2005 and GB18030-2022
        decode_gb18030(b"\xFE\x7E", "\u{9FB9}");
        decode_gb18030(b"\xA6\xDD", "\u{FE14}");

        // These mappings remain in place the GB18030-2005 way despite GB18030-2022
        decode_gb18030(b"\x82\x35\x91\x32", "\u{9FB9}");
        decode_gb18030(b"\x84\x31\x83\x30", "\u{FE14}");

        // The difference from the original GB18030
        decode_gb18030(b"\xA3\xA0", "\u{3000}");
        decode_gb18030(b"\xA1\xA1", "\u{3000}");

        // 0xFF
        decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
        decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
        decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
        decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
        decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
        decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
        decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gb18030("\u{20AC}", b"\xA2\xE3");

        // two bytes
        encode_gb18030("\u{4E02}", b"\x81\x40");
        encode_gb18030("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{4E90}", b"\x81\x80");
            encode_gb18030("\u{4FA2}", b"\x81\xFE");
            encode_gb18030("\u{FA0C}", b"\xFE\x40");
            encode_gb18030("\u{E843}", b"\xFE\x7E");
            encode_gb18030("\u{4723}", b"\xFE\x80");
            encode_gb18030("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original GB18030
        encode_gb18030("\u{E5E5}", b"");
        encode_gb18030("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
        encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
            encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
            encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022
        encode_gb18030("\u{9FB9}", b"\xFE\x7E");
        encode_gb18030("\u{FE14}", b"\xA6\xDD");
        encode_gb18030("\u{E843}", b"\xFE\x7E");
        encode_gb18030("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gb18030("\u{E817}", b"\xFE\x52");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gbk("\u{20AC}", b"\x80");

        // two bytes
        encode_gbk("\u{4E02}", b"\x81\x40");
        encode_gbk("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{4E90}", b"\x81\x80");
            encode_gbk("\u{4FA2}", b"\x81\xFE");
            encode_gbk("\u{FA0C}", b"\xFE\x40");
            encode_gbk("\u{E843}", b"\xFE\x7E");
            encode_gbk("\u{4723}", b"\xFE\x80");
            encode_gbk("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original gb18030
        encode_gbk("\u{E5E5}", b"");
        encode_gbk("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gbk("\u{0080}", b"€");
        encode_gbk("\u{E7C7}", b"");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{2603}", b"☃");
            encode_gbk("\u{1F4A9}", b"💩");
            encode_gbk("\u{10FFFF}", b"􏿿");
        }

        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022
        encode_gb18030("\u{9FB9}", b"\xFE\x7E");
        encode_gb18030("\u{FE14}", b"\xA6\xDD");
        encode_gb18030("\u{E843}", b"\xFE\x7E");
        encode_gb18030("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gb18030("\u{E817}", b"\xFE\x52");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}

[ zur Elbe Produktseite wechseln0.59Quellennavigators  Analyse erneut starten  ]

                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....
    

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge