Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quelle  lexer.rs   Sprache: unbekannt

 
//! Contains simple lexer for XML documents.
//!
//! This module is for internal use. Use `xml::pull` module to do parsing.

use std::fmt;
use std::collections::VecDeque;
use std::io::Read;
use std::result;
use std::borrow::Cow;

use common::{Position, TextPosition, is_whitespace_char, is_name_char};
use reader::Error;
use util;

/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
///
/// Every variant except `Character` and `Whitespace` has a fixed textual
/// form, available through `as_static_str`.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// A chunk of characters, used for errors recovery.
    Chunk(&'static str),
    /// Any non-special character except whitespace.
    Character(char),
    /// Whitespace character.
    Whitespace(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
}

impl fmt::Display for Token {
    /// Writes the exact source text this token stands for.
    ///
    /// Fixed-text tokens and chunks reuse the table in `as_static_str`, so
    /// the literal spellings live in exactly one place; single-character
    /// tokens write their character.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.as_static_str() {
            Some(text) => write!(f, "{}", text),
            None => match *self {
                Token::Character(c) | Token::Whitespace(c) => write!(f, "{}", c),
                // `as_static_str` matches exhaustively and returns `None`
                // only for the two character-carrying variants above.
                _ => unreachable!("as_static_str returns None only for character tokens")
            }
        }
    }
}

impl Token {
    /// Returns the fixed textual form of this token, or `None` for the
    /// `Character` and `Whitespace` variants, whose text is a single `char`
    /// and therefore cannot be handed out as a `&'static str`.
    ///
    /// The match is intentionally exhaustive (no `_` arm) so that adding a
    /// new variant forces a decision here at compile time.
    pub fn as_static_str(&self) -> Option<&'static str> {
        match *self {
            Token::OpeningTagStart            => Some("<"),
            Token::ProcessingInstructionStart => Some("<?"),
            Token::DoctypeStart               => Some("<!DOCTYPE"),
            Token::ClosingTagStart            => Some("</"),
            Token::CommentStart               => Some("<!--"),
            Token::CDataStart                 => Some("<![CDATA["),
            Token::TagEnd                     => Some(">"),
            Token::EmptyTagEnd                => Some("/>"),
            Token::ProcessingInstructionEnd   => Some("?>"),
            Token::CommentEnd                 => Some("-->"),
            Token::CDataEnd                   => Some("]]>"),
            Token::ReferenceStart             => Some("&"),
            Token::ReferenceEnd               => Some(";"),
            Token::EqualsSign                 => Some("="),
            Token::SingleQuote                => Some("'"),
            Token::DoubleQuote                => Some("\""),
            Token::Chunk(s)                   => Some(s),
            Token::Character(_) |
            Token::Whitespace(_)              => None
        }
    }

    /// Appends the textual form of this token to `target`.
    ///
    /// This bypasses the `fmt` machinery on purpose:
    /// using String.push_str(token.to_string()) is simply way too slow
    pub fn push_to_string(&self, target: &mut String) {
        match self.as_static_str() {
            Some(s) => target.push_str(s),
            None => match *self {
                Token::Character(c) | Token::Whitespace(c) => target.push(c),
                // See `as_static_str`: only character tokens yield `None`.
                _ => unreachable!("as_static_str returns None only for character tokens")
            }
        }
    }

    /// Returns `true` if this token contains data that can be interpreted
    /// as a part of the text.
    ///
    /// Besides plain characters, whitespace and chunks, this surprisingly
    /// also covers `>`, `/>`, `?>`, `-->`, `]]>`, `=`, `"` and `'`.
    /// The opening delimiters, `&` and `;` are not character data.
    #[inline]
    pub fn contains_char_data(&self) -> bool {
        match *self {
            Token::Whitespace(_) | Token::Chunk(_) | Token::Character(_) |
            Token::CommentEnd | Token::TagEnd | Token::EqualsSign |
            Token::DoubleQuote | Token::SingleQuote | Token::CDataEnd |
            Token::ProcessingInstructionEnd | Token::EmptyTagEnd => true,

            Token::ProcessingInstructionStart | Token::DoctypeStart |
            Token::OpeningTagStart | Token::ClosingTagStart |
            Token::CommentStart | Token::CDataStart |
            Token::ReferenceStart | Token::ReferenceEnd => false
        }
    }

    /// Returns `true` if this token corresponds to a white space character.
    #[inline]
    pub fn is_whitespace(&self) -> bool {
        match *self {
            Token::Whitespace(_) => true,
            _ => false
        }
    }
}

/// State of the lexer's character-level state machine.
///
/// Each variant records which multi-character lexeme prefix has been
/// consumed so far; `Normal` means no partial lexeme is pending.
enum State {
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Triggered after DoctypeStarted to handle sub elements.
    ///
    /// The payload is a nesting counter: it starts at 1 once `<!DOCTYPE` has
    /// been read, is incremented on every '<' and decremented on every '>';
    /// the '>' seen at depth 1 ends the doctype.
    DoctypeFinishing(u8),
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]'
    CDataClosing(ClosingSubstate),
    /// Default state
    Normal
}

/// Progress through a two-character closing prefix (`--` or `]]`):
/// `First` after one character has been seen, `Second` after both.
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}

/// Progress through the `DOCTYPE` keyword after `<!`; each variant is named
/// after the letters consumed so far.
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}

/// Progress through the `CDATA` keyword after `<![`.
///
/// `E` is the state entered right after `<![`, before any keyword letter has
/// been read (presumably "empty" — confirm); the remaining variants are named
/// after the letters consumed so far.
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}

/// `Result` represents lexing result. It is either a token or an error message.
///
/// `Ok(Some(token))` carries a completed token. `Ok(None)` is returned by the
/// internal state handlers when more characters are needed, and by
/// `Lexer::next_token` when the end of the stream has been reached.
pub type Result = result::Result<Option<Token>, Error>;

/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// Arguments:
/// * `$_self` - the lexer; `move_to` and `handle_error` are called on it;
/// * `$s` - the current substate being matched on;
/// * `$c` - the character just read;
/// * `$is` - constructor wrapping a substate into a `State`;
/// * one or more `substate ; expected char ; next substate ; chunk` rows,
///   separated by commas: on the expected char the lexer moves to the next
///   substate, otherwise `handle_error(chunk, c)` is invoked;
/// * a final `substate ; expected char ; chunk ; expression` row whose
///   expression is evaluated when the last expected char is seen.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _  => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _      => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);

/// `Lexer` is a lexer for XML documents, which implements pull API.
///
/// Main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s.
/// When it is not set, errors will be reported as `Err` objects with a string message.
/// By default this flag is not set. Use `enable_errors` and `disable_errors` methods
/// to toggle the behavior.
pub struct Lexer {
    // Position reported by `Position::position`; snapshotted from `head_pos`
    // when a new token is started and again at end of stream.
    pos: TextPosition,
    // Running position, advanced per processed character in `read_next_token`
    // (only while `char_queue` is empty).
    head_pos: TextPosition,
    // Characters that were read but pushed back; drained before the reader
    // is consulted again.
    char_queue: VecDeque<char>,
    // Current state of the state machine.
    st: State,
    // When true, invalid lexemes are returned as `Token::Chunk` instead of `Err`.
    skip_errors: bool,
    // When true, only a stray `--` is treated as an error candidate.
    inside_comment: bool,
    // True between the start of a token and the moment it is returned.
    inside_token: bool,
    // Set once end of stream has been processed; `next_token` then yields `Ok(None)`.
    eof_handled: bool
}

impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer
    /// (the value of `pos`, which `next_token` copies from the running
    /// head position when it starts a token and at end of stream).
    fn position(&self) -> TextPosition { self.pos }
}

impl Lexer {
    /// Returns a new lexer with default state.
    pub fn new() -> Lexer {
        Lexer {
            pos: TextPosition::new(),
            head_pos: TextPosition::new(),
            char_queue: VecDeque::with_capacity(4),  // TODO: check size
            st: State::Normal,
            skip_errors: false,
            inside_comment: false,
            inside_token: false,
            eof_handled: false
        }
    }

    /// Enables error handling so `next_token` will return `Err(..)`
    /// upon invalid lexeme. This is the default.
    #[inline]
    pub fn enable_errors(&mut self) { self.skip_errors = false; }

    /// Disables error handling so `next_token` will return
    /// `Ok(Some(Token::Chunk(..)))` upon invalid lexeme, with the chunk
    /// holding the text of the incomplete lexeme.
    #[inline]
    pub fn disable_errors(&mut self) { self.skip_errors = true; }

    /// Enables special handling of some lexemes which should be done when we're parsing comment
    /// internals.
    ///
    /// While enabled, incomplete lexemes other than `--` are returned as
    /// `Chunk`s even when errors are enabled, and a `--` that is not followed
    /// by `>` goes through the error path instead of being passed along.
    #[inline]
    pub fn inside_comment(&mut self) { self.inside_comment = true; }

    /// Disables the effect of `inside_comment()` method, restoring the
    /// regular handling of incomplete lexemes and of `--`.
    #[inline]
    pub fn outside_comment(&mut self) { self.inside_comment = false; }

    /// Reset the eof handled flag of the lexer, so that the next call to
    /// `next_token` attempts to read from the stream again instead of
    /// returning `Ok(None)` immediately.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }

    /// Tries to read the next token from the buffer.
    ///
    /// It is possible to pass different instaces of `BufReader` each time
    /// this method is called, but the resulting behavior is undefined in this case.
    ///
    /// Return value:
    /// * `Err(reason) where reason: reader::Error` - when an error occurs;
    /// * `Ok(None)` - upon end of stream is reached;
    /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream.
    pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
        // Already reached end of buffer
        if self.eof_handled {
            return Ok(None);
        }

        if !self.inside_token {
            self.pos = self.head_pos;
            self.inside_token = true;
        }

        // Check if we have saved a char or two for ourselves
        while let Some(c) = self.char_queue.pop_front() {
            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {}  // continue
            }
        }

        loop {
            // TODO: this should handle multiple encodings
            let c = match try!(util::next_char_from(b)) {
                Some(c) => c,   // got next char
                None => break,  // nothing to read left
            };

            match try!(self.read_next_token(c)) {
                Some(t) => {
                    self.inside_token = false;
                    return Ok(Some(t));
                }
                None => {
                    // continue
                }
            }
        }

        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::DoctypeFinishing(_) =>
                Err(self.error("Unexpected end of stream")),
            State::ProcessingInstructionClosing =>
                Ok(Some(Token::Character('?'))),
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::CDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            State::CDataClosing(ClosingSubstate::Second) =>
                Ok(Some(Token::Chunk("]]"))),
            State::Normal =>
                Ok(None)
        }
    }

    /// Builds a `reader::Error` from this lexer and the message `msg`, via
    /// the `Into<Error>` conversion of the `(&Lexer, M)` pair.
    // NOTE(review): the conversion is defined outside this file; presumably it
    // records the lexer's current position — confirm in `reader::Error`.
    #[inline]
    fn error<M: Into<Cow<'static, str>>>(&self, msg: M) -> Error {
        (self, msg).into()
    }

    #[inline]
    fn read_next_token(&mut self, c: char) -> Result {
        let res = self.dispatch_char(c);
        if self.char_queue.is_empty() {
            if c == '\n' {
                self.head_pos.new_line();
            } else {
                self.head_pos.advance(1);
            }
        }
        res
    }

    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal                         => self.normal(c),
            State::TagStarted                     => self.tag_opened(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::CommentStarted                 => self.comment_started(c),
            State::CDataStarted(s)                => self.cdata_started(c, s),
            State::DoctypeStarted(s)              => self.doctype_started(c, s),
            State::DoctypeFinishing(d)            => self.doctype_finishing(c, d),
            State::ProcessingInstructionClosing   => self.processing_instruction_closing(c),
            State::EmptyTagClosing                => self.empty_element_closing(c),
            State::CommentClosing(s)              => self.comment_closing(c, s),
            State::CDataClosing(s)                => self.cdata_closing(c, s)
        }
    }

    /// Switches to state `st` without producing a token (`Ok(None)`), i.e.
    /// more characters are needed.
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }

    /// Switches to state `st` and produces `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }

    #[inline]
    fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
        self.char_queue.extend(cs.iter().cloned());
        self.move_to_with(st, token)
    }

    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        self.char_queue.push_back(c);
        if self.skip_errors || (self.inside_comment && chunk != "--") {  // FIXME: looks hacky
            self.move_to_with(State::Normal, Token::Chunk(chunk))
        } else {
            Err(self.error(format!("Unexpected token '{}' before '{}'", chunk, c)))
        }
    }

    /// Encountered a char
    fn normal(&mut self, c: char) -> Result {
        match c {
            '<'                        => self.move_to(State::TagStarted),
            '>'                        => Ok(Some(Token::TagEnd)),
            '/'                        => self.move_to(State::EmptyTagClosing),
            '='                        => Ok(Some(Token::EqualsSign)),
            '"'                        => Ok(Some(Token::DoubleQuote)),
            '\''                       => Ok(Some(Token::SingleQuote)),
            '?'                        => self.move_to(State::ProcessingInstructionClosing),
            '-'                        => self.move_to(State::CommentClosing(ClosingSubstate::First)),
            ']'                        => self.move_to(State::CDataClosing(ClosingSubstate::First)),
            '&'                        => Ok(Some(Token::ReferenceStart)),
            ';'                        => Ok(Some(Token::ReferenceEnd)),
            _ if is_whitespace_char(c) => Ok(Some(Token::Whitespace(c))),
            _                          => Ok(Some(Token::Character(c)))
        }
    }

    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?'                        => self.move_to_with(State::Normal, Token::ProcessingInstructionStart),
            '/'                        => self.move_to_with(State::Normal, Token::ClosingTagStart),
            '!'                        => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            _ if is_whitespace_char(c) => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _ if is_name_char(c)       => self.move_to_with_unread(State::Normal, &[c], Token::OpeningTagStart),
            _                          => self.handle_error("<", c)
        }
    }

    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            _   => self.handle_error("<!", c)
        }
    }

    /// Encountered '<!-'
    fn comment_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to_with(State::Normal, Token::CommentStart),
            _   => self.handle_error("<!-", c)
        }
    }

    /// Encountered '<!['
    ///
    /// Walks through the letters of `CDATA` one character at a time. Each row
    /// below reads: current substate ; expected char ; next substate ; text
    /// consumed so far (passed to `handle_error` on a mismatch). The last row
    /// emits `Token::CDataStart` once the final '[' is seen.
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{E, C, CD, CDA, CDAT, CDATA};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::Normal, Token::CDataStart)
        )
    }

    /// Encountered '<!D'
    ///
    /// Walks through the remaining letters of `DOCTYPE` one character at a
    /// time. Each row below reads: current substate ; expected char ; next
    /// substate ; text consumed so far (passed to `handle_error` on a
    /// mismatch). The last row emits `Token::DoctypeStart` and enters
    /// `DoctypeFinishing` with a nesting depth of 1.
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_with(State::DoctypeFinishing(1), Token::DoctypeStart)
        )
    }

    /// State used while awaiting the closing bracket for the <!DOCTYPE tag
    fn doctype_finishing(&mut self, c: char, d: u8) -> Result {
        match c {
            '<' => self.move_to(State::DoctypeFinishing(d + 1)),
            '>' if d == 1 => self.move_to_with(State::Normal, Token::TagEnd),
            '>' => self.move_to(State::DoctypeFinishing(d - 1)),
            _ => Ok(None),
        }
    }

    /// Encountered '?'
    fn processing_instruction_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::ProcessingInstructionEnd),
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('?')),
        }
    }

    /// Encountered '/'
    fn empty_element_closing(&mut self, c: char) -> Result {
        match c {
            '>' => self.move_to_with(State::Normal, Token::EmptyTagEnd),
            _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('/')),
        }
    }

    /// Encountered '-'
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character('-'))
            },
            ClosingSubstate::Second => match c {
                '>'                      => self.move_to_with(State::Normal, Token::CommentEnd),
                // double dash not followed by a greater-than is a hard error inside comment
                _ if self.inside_comment => self.handle_error("--", c),
                // nothing else except comment closing starts with a double dash, and comment
                // closing can never be after another dash, and also we're outside of a comment,
                // therefore it is safe to push only the last read character to the list of unread
                // characters and pass the double dash directly to the output
                _                        => self.move_to_with_unread(State::Normal, &[c], Token::Chunk("--"))
            }
        }
    }

    /// Encountered ']'
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                _   => self.move_to_with_unread(State::Normal, &[c], Token::Character(']'))
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                _   => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']'))
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use common::{Position};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    /// Asserts that successive `next_token` calls on `$lex`/`$buf` yield
    /// `Ok(Some(..))` for each of the listed tokens, in order.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
             )+
        })
    );

    /// Asserts that the next `next_token` call fails with an error located at
    /// row `$r`, column `$c` and carrying the message `$s`.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
            assert_eq!($s, err.msg());
        })
    );

    /// Asserts that the next `next_token` call reports end of stream (`Ok(None)`).
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf));
        )
    );

    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }

    #[test]
    fn simple_lexer_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c -->  "#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Whitespace(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Whitespace(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Whitespace('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Whitespace(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Whitespace(' ')
            Token::ProcessingInstructionEnd
            Token::Whitespace(' ')
            Token::CommentStart
            Token::Whitespace(' ')
            Token::Character('a')
            Token::Whitespace(' ')
            Token::Character('c')
            Token::Whitespace(' ')
            Token::CommentEnd
            Token::Whitespace(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }

    /// Characters that may begin a closing sequence ('?', '/', '-', ']') are
    /// emitted as plain characters when the sequence is not completed; a
    /// trailing `]]` at end of stream is emitted as a chunk.
    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Whitespace(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Whitespace(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Whitespace(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Chunk("]]")
        );
        assert_none!(for lex and buf);
    }

    /// A CDATA section inside an element is delimited by `CDataStart` and
    /// `CDataEnd`, with its content lexed character by character.
    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Whitespace(' ')
            Token::Character('y')
            Token::Whitespace(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Whitespace(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }

    /// The body of a DOCTYPE declaration is skipped: only `DoctypeStart` and
    /// the closing `TagEnd` are produced.
    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    /// A DOCTYPE with a nested `<!ELEMENT ...>` declaration is skipped as a
    /// whole: the nested '>' does not terminate the doctype.
    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::TagEnd
            Token::Whitespace(' ')
        );
        assert_none!(for lex and buf)
    }

    /// Inputs ending in the middle of a closing sequence that is still valid
    /// text: the pending characters are flushed as one token, then the
    /// stream ends.
    #[test]
    fn end_of_stream_handling_ok() {
        macro_rules! eof_check(
            ($data:expr ; $token:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_oks!(for lex and buf ; $token);
                assert_none!(for lex and buf);
            })
        );
        eof_check!("?"  ; Token::Character('?'));
        eof_check!("/"  ; Token::Character('/'));
        eof_check!("-"  ; Token::Character('-'));
        eof_check!("]"  ; Token::Character(']'));
        eof_check!("]]" ; Token::Chunk("]]"));
    }

    /// Inputs ending inside an opening sequence (or after `--`) produce an
    /// "Unexpected end of stream" error positioned at the end of the input,
    /// followed by end of stream.
    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<"        ; 0, 1);
        eof_check!("<!"       ; 0, 2);
        eof_check!("<!-"      ; 0, 3);
        eof_check!("<!["      ; 0, 3);
        eof_check!("<![C"     ; 0, 4);
        eof_check!("<![CD"    ; 0, 5);
        eof_check!("<![CDA"   ; 0, 6);
        eof_check!("<![CDAT"  ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
        eof_check!("--"       ; 0, 2);
    }

    /// `<!` followed by an unexpected character: an error by default, a
    /// `Chunk("<!")` followed by the character when errors are disabled.
    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!")
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }

    /// `<!-` followed by something other than '-': an error by default, a
    /// `Chunk("<!-")` followed by the character when errors are disabled.
    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Chunk("<!-")
            Token::Whitespace('\t')
        );
        assert_none!(for lex and buf);
    }

    /// `--` not followed by '>' is an error inside a comment, but outside of
    /// one it is passed through as `Chunk("--")`.
    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        lex.inside_comment();
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Chunk("--")
            Token::Character('x')
        );
    }

    /// Runs the same input `$data` twice: first with errors enabled,
    /// expecting an error at row `$r`, column `$c` with message `$s`; then
    /// with errors disabled, expecting `Chunk($chunk)` followed by
    /// `Character($app)` and end of stream.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            assert_oks!(for lex and buf ;
                Token::Chunk($chunk)
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );

    /// Every prefix of `<![CDATA[` followed by a wrong character is reported
    /// with the text consumed so far.
    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }

    /// Every prefix of `<!DOCTYPE` followed by a wrong character is reported
    /// with the text consumed so far.
    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }



    /// Regression test: CDATA content ending in ']' directly before the
    /// `]]>` terminator (`]]]>`) yields one `Character(']')` and then
    /// `CDataEnd`.
    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Whitespace(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
}

[ Dauer der Verarbeitung: 0.33 Sekunden  (vorverarbeitet)  ]

                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge