Quelle svparser.cxx

Sprache: C

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* This file incorporates work covered by the following license notice:
*
*   Licensed to the Apache Software Foundation (ASF) under one or more
*   contributor license agreements. See the NOTICE file distributed
*   with this work for additional information regarding copyright
*   ownership. The ASF licenses this file to you under the Apache
*   License, Version 2.0 (the "License"); you may not use this file
*   except in compliance with the License. You may obtain a copy of
*   the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/

#include <svtools/svparser.hxx>
#include <svtools/htmltokn.h>
#include <tools/stream.hxx>
#include <tools/debug.hxx>
#include <rtl/textcvt.h>
#include <rtl/tencinfo.h>
#include <rtl/character.hxx>
#include <sal/log.hxx>
#include <unicode/ucsdet.h>
#include <comphelper/configuration.hxx>

#include <vector>

// structure to store the actual data
template<typename T>
struct SvParser_Impl
{
    OUString        aToken;             // parsed token
    sal_uInt64      nFilePos;           // actual position in stream
    sal_uInt32      nlLineNr;           // actual line number
    sal_uInt32      nlLinePos;          // actual column number
    tools::Long            nTokenValue;        // extra value (RTF)
    bool            bTokenHasValue;     // indicates whether nTokenValue is valid
    T               nToken;             // actual Token
    sal_uInt32      nNextCh;            // actual character
    T               nSaveToken;         // the token from Continue

    rtl_TextToUnicodeConverter hConv;
    rtl_TextToUnicodeContext   hContext;

    SvParser_Impl()
        : nFilePos(0)
        , nlLineNr(0)
        , nlLinePos(0)
        , nTokenValue(0)
        , bTokenHasValue(false)
        , nToken(static_cast<T>(0))
        , nNextCh(0)
        , nSaveToken(static_cast<T>(0))
        , hConv( nullptr )
        , hContext( reinterpret_cast<rtl_TextToUnicodeContext>(1) )
    {
    }

};

template<typename T>
SvParser<T>::TokenStackType::TokenStackType()
  : nTokenValue(0)
    , bTokenHasValue(false)
    , nTokenId(static_cast<T>(0))
{
}

// Constructor
template<typename T>
SvParser<T>::SvParser( SvStream& rIn, sal_uInt8 nStackSize )
    : rInput( rIn )
    , nlLineNr( 1 )
    , nlLinePos( 1 )
    , nConversionErrors( 0 )
    , pImplData( nullptr )
    , m_nTokenIndex(0)
    , nTokenValue( 0 )
    , bTokenHasValue( false )
    , bFuzzing(comphelper::IsFuzzing())
    , eState( SvParserState::NotStarted )
    , eSrcEnc( RTL_TEXTENCODING_DONTKNOW )
    , nNextChPos(0)
    , nNextCh(0)
    , bSwitchToUCS2(false)
    , bRTF_InTextRead(false)
    , nTokenStackSize( nStackSize )
    , nTokenStackPos( 0 )
{
    eState = SvParserState::NotStarted;
    if( nTokenStackSize < 3 )
        nTokenStackSize = 3;
    pTokenStack.reset(new TokenStackType[ nTokenStackSize ]);
    pTokenStackPos = pTokenStack.get();
}

template<typename T>
SvParser<T>::~SvParser()
{
    if( pImplData && pImplData->hConv )
    {
        rtl_destroyTextToUnicodeContext( pImplData->hConv,
                                         pImplData->hContext );
        rtl_destroyTextToUnicodeConverter( pImplData->hConv );
    }

    pTokenStack.reset();
}

template<typename T> SvParserState SvParser<T>::GetStatus() const { return eState; }
template<typename T> sal_uInt32 SvParser<T>::GetLineNr() const       { return nlLineNr; }
template<typename T> sal_uInt32 SvParser<T>::GetLinePos() const      { return nlLinePos; }
template<typename T> void       SvParser<T>::IncLineNr()             { ++nlLineNr; }
template<typename T> sal_uInt32 SvParser<T>::IncLinePos()            { return ++nlLinePos; }
template<typename T> void       SvParser<T>::SetLineNr( sal_uInt32 nlNum ) { nlLineNr = nlNum; }
template<typename T> void       SvParser<T>::SetLinePos( sal_uInt32 nlPos ) {   nlLinePos = nlPos; }
template<typename T> bool       SvParser<T>::IsParserWorking() const { return SvParserState::Working == eState; }
template<typename T> rtl_TextEncoding SvParser<T>::GetSrcEncoding() const { return eSrcEnc; }
template<typename T> void       SvParser<T>::SetSwitchToUCS2( bool bSet ) { bSwitchToUCS2 = bSet; }
template<typename T> bool       SvParser<T>::IsSwitchToUCS2() const { return bSwitchToUCS2; }
template<typename T> sal_uInt16 SvParser<T>::GetCharSize() const { return (RTL_TEXTENCODING_UCS2 == eSrcEnc) ? 2 : 1; }
template<typename T> Link<LinkParamNone*,void> SvParser<T>::GetAsynchCallLink() const
{
    return LINK( const_cast<SvParser*>(this), SvParser, NewDataRead );
}

template<typename T>
void SvParser<T>::ClearTxtConvContext()
{
    if( pImplData && pImplData->hConv )
        rtl_resetTextToUnicodeContext( pImplData->hConv, pImplData->hContext );
}

template<typename T>
void SvParser<T>::SetSrcEncoding( rtl_TextEncoding eEnc )
{
    if( eEnc == eSrcEnc )
        return;

    if( pImplData && pImplData->hConv )
    {
        rtl_destroyTextToUnicodeContext( pImplData->hConv,
                                         pImplData->hContext );
        rtl_destroyTextToUnicodeConverter( pImplData->hConv );
        pImplData->hConv = nullptr;
        pImplData->hContext = reinterpret_cast<rtl_TextToUnicodeContext>(1);
    }

    if( rtl_isOctetTextEncoding(eEnc) ||
        RTL_TEXTENCODING_UCS2 == eEnc  )
    {
        eSrcEnc = eEnc;
        if( !pImplData )
            pImplData.reset(new SvParser_Impl<T>);
        pImplData->hConv = rtl_createTextToUnicodeConverter( eSrcEnc );
        DBG_ASSERT( pImplData->hConv,
                    "SvParser::SetSrcEncoding: no converter for source encoding" );
        if( !pImplData->hConv )
            eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
        else
            pImplData->hContext =
                rtl_createTextToUnicodeContext( pImplData->hConv );
    }
    else
    {
        SAL_WARN( "svtools",
                    "SvParser::SetSrcEncoding: invalid source encoding" );
        eSrcEnc = RTL_TEXTENCODING_DONTKNOW;
    }
}

template<typename T>
void SvParser<T>::RereadLookahead()
{
    rInput.Seek(nNextChPos);
    nNextCh = GetNextChar();
}

template<typename T>
sal_uInt32 SvParser<T>::GetNextChar()
{
    sal_uInt32 c = 0U;

    // When reading multiple bytes, we don't have to care about the file
    // position when we run into the pending state. The file position is
    // maintained by SaveState/RestoreState.
    if( bSwitchToUCS2 && 0 == rInput.Tell() )
    {
        rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW);
        if (rInput.good())
        {
            sal_uInt64 nPos = rInput.Tell();
            if (nPos == 2)
                eSrcEnc = RTL_TEXTENCODING_UCS2;
            else if (nPos == 3)
                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
            else // Try to detect encoding without BOM
            {
                std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer
                const size_t nSize = rInput.ReadBytes(buf.data(), buf.size());
                rInput.Seek(0);
                if (nSize > 0)
                {
                    UErrorCode uerr = U_ZERO_ERROR;
                    UCharsetDetector* ucd = ucsdet_open(&uerr);
                    ucsdet_setText(ucd, buf.data(), nSize, &uerr);
                    if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
                    {
                        const char* pEncodingName = ucsdet_getName(match, &uerr);

                        if (U_SUCCESS(uerr))
                        {
                            if (strcmp("UTF-8", pEncodingName) == 0)
                            {
                                SetSrcEncoding(RTL_TEXTENCODING_UTF8);
                            }
                            else if (strcmp("UTF-16LE", pEncodingName) == 0)
                            {
                                eSrcEnc = RTL_TEXTENCODING_UCS2;
                                rInput.SetEndian(SvStreamEndian::LITTLE);
                            }
                            else if (strcmp("UTF-16BE", pEncodingName) == 0)
                            {
                                eSrcEnc = RTL_TEXTENCODING_UCS2;
                                rInput.SetEndian(SvStreamEndian::BIG);
                            }
                        }
                    }

                    ucsdet_close(ucd);
                }
            }
        }
        bSwitchToUCS2 = false;
    }

    bool bErr;
    nNextChPos = rInput.Tell();

    if( RTL_TEXTENCODING_UCS2 == eSrcEnc )
    {
        sal_Unicode cUC;
        rInput.ReadUtf16(cUC);
        bErr = !rInput.good();
        if( !bErr )
        {
            c = cUC;
            if (rtl::isHighSurrogate(cUC))
            {
                const sal_uInt64 nPos = rInput.Tell();
                rInput.ReadUtf16(cUC);
                if (rtl::isLowSurrogate(cUC)) // can only be true when ReadUtf16 succeeded
                    c = rtl::combineSurrogates(c, cUC);
                else
                    rInput.Seek(nPos); // process lone high surrogate
            }
        }
    }
    else
    {
        sal_Size nChars = 0;
        do
        {
            char c1;    // signed, that's the text converter expects
            rInput.ReadChar( c1 );
            bErr = !rInput.good();
            if( !bErr )
            {
                if (
                     RTL_TEXTENCODING_DONTKNOW == eSrcEnc ||
                     RTL_TEXTENCODING_SYMBOL == eSrcEnc
                   )
                {
                    // no conversion shall take place
                    c = reinterpret_cast<unsigned char&>( c1 );
                    nChars = 1;
                }
                else
                {
                    assert(pImplData && pImplData->hConv && "no text converter!");

                    sal_Unicode cUC;
                    sal_uInt32 nInfo = 0;
                    sal_Size nCvtBytes;
                    nChars = rtl_convertTextToUnicode(
                                pImplData->hConv, pImplData->hContext,
                                &c1, 1, &cUC, 1,
                                RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
                                &nInfo, &nCvtBytes);
                    if( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
                    {
                        // The conversion wasn't successful because we haven't
                        // read enough characters.
                        if( pImplData->hContext != reinterpret_cast<rtl_TextToUnicodeContext>(1) )
                        {
                            sal_Unicode sCh[2];
                            while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 )
                            {
                                rInput.ReadChar( c1 );
                                bErr = !rInput.good();
                                if( bErr )
                                    break;

                                nChars = rtl_convertTextToUnicode(
                                            pImplData->hConv, pImplData->hContext,
                                            &c1, 1, sCh , 2,
                                            RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
                                            &nInfo, &nCvtBytes);
                            }
                            if( !bErr )
                            {
                                if( 1 == nChars && 0 == nInfo )
                                {
                                    c = sal_uInt32( sCh[0] );
                                }
                                else if( 2 == nChars && 0 == nInfo )
                                {
                                    c = rtl::combineSurrogates( sCh[0], sCh[1] );
                                }
                                else if( 0 != nChars || 0 != nInfo )
                                {
                                    DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
                                        "source buffer is too small" );
                                    DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
                                         "there is a conversion error" );
                                    DBG_ASSERT( 0 == nChars,
                                       "there is a converted character, but an error" );
                                    // There are still errors, but nothing we can
                                    // do
                                    c = '?';
                                    nChars = 1;
                                    ++nConversionErrors;
                                }
                            }
                        }
                        else
                        {
                            char sBuffer[10];
                            sBuffer[0] = c1;
                            sal_uInt16 nLen = 1;
                            while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 &&
                                    nLen < 10 )
                            {
                                rInput.ReadChar( c1 );
                                bErr = !rInput.good();
                                if( bErr )
                                    break;

                                sBuffer[nLen++] = c1;
                                nChars = rtl_convertTextToUnicode(
                                            pImplData->hConv, nullptr, sBuffer, nLen, &cUC, 1,
                                            RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
                                            RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
                                            &nInfo, &nCvtBytes);
                            }
                            if( !bErr )
                            {
                                if( 1 == nChars && 0 == nInfo )
                                {
                                    DBG_ASSERT( nCvtBytes == nLen,
                                                "no all bytes have been converted!" );
                                    c = cUC;
                                }
                                else
                                {
                                    DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0,
                                        "source buffer is too small" );
                                    DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0,
                                         "there is a conversion error" );
                                    DBG_ASSERT( 0 == nChars,
                                       "there is a converted character, but an error" );

                                    // There are still errors, so we use the first
                                    // character and restart after that.
                                    c = reinterpret_cast<unsigned char&>( sBuffer[0] );
                                    rInput.SeekRel( -(nLen-1) );
                                    nChars = 1;
                                    ++nConversionErrors;
                                }
                            }
                        }
                    }
                    else if( 1 == nChars && 0 == nInfo )
                    {
                        // The conversion was successful
                        DBG_ASSERT( nCvtBytes == 1,
                                    "no all bytes have been converted!" );
                        c = cUC;
                    }
                    else if( 0 != nChars || 0 != nInfo )
                    {
                        DBG_ASSERT( 0 == nChars,
                                "there is a converted character, but an error" );
                        DBG_ASSERT( 0 != nInfo,
                                "there is no converted character and no error" );
                        // #73398#: If the character could not be converted,
                        // because a conversion is not available, do no conversion at all.
                        c = reinterpret_cast<unsigned char&>( c1 );
                        nChars = 1;
                        ++nConversionErrors;
                    }
                }
            }
        }
        while( 0 == nChars  && !bErr );
    }

    if ( ! rtl::isUnicodeScalarValue( c ) )
        c = '?' ;

    if (bFuzzing && nConversionErrors > 128)
    {
        SAL_WARN("svtools", "SvParser::GetNextChar too many conversion errors while fuzzing, abandoning for performance");
        bErr = true;
    }

    if( bErr )
    {
        if( ERRCODE_IO_PENDING == rInput.GetError() )
        {
            eState = SvParserState::Pending;
            return c;
        }
        else
            return sal_Unicode(EOF);
    }

    if( c == '\n' )
    {
        IncLineNr();
        SetLinePos( 1 );
    }
    else
        IncLinePos();

    return c;
}

template<typename T>
T SvParser<T>::GetNextToken()
{
    T nRet = static_cast<T>(0);

    if( !nTokenStackPos )
    {
        aToken.setLength( 0 );     // empty token buffer
        nTokenValue = -1;   // marker for no value read
        bTokenHasValue = false;

        nRet = GetNextToken_();
        if( SvParserState::Pending == eState )
            return nRet;
    }

    ++pTokenStackPos;
    if( pTokenStackPos == pTokenStack.get() + nTokenStackSize )
        pTokenStackPos = pTokenStack.get();

    // pop from stack ??
    if( nTokenStackPos )
    {
        --nTokenStackPos;
        nTokenValue = pTokenStackPos->nTokenValue;
        bTokenHasValue = pTokenStackPos->bTokenHasValue;
        aToken = pTokenStackPos->sToken;
        nRet = pTokenStackPos->nTokenId;
        ++m_nTokenIndex;
    }
    // no, now push actual value on stack
    else if( SvParserState::Working == eState )
    {
        pTokenStackPos->sToken = aToken;
        pTokenStackPos->nTokenValue = nTokenValue;
        pTokenStackPos->bTokenHasValue = bTokenHasValue;
        pTokenStackPos->nTokenId = nRet;
        ++m_nTokenIndex;
    }
    else if( SvParserState::Accepted != eState && SvParserState::Pending != eState )
        eState = SvParserState::Error;       // an error occurred

    return nRet;
}

template<typename T>
T SvParser<T>::SkipToken( short nCnt )       // "skip" n Tokens backward
{
    pTokenStackPos = GetStackPtr( nCnt );
    short nTmp = nTokenStackPos - nCnt;
    if( nTmp < 0 )
        nTmp = 0;
    else if( nTmp > nTokenStackSize )
        nTmp = nTokenStackSize;
    nTokenStackPos = sal_uInt8(nTmp);

    m_nTokenIndex -= nTmp;

    // restore values
    aToken = pTokenStackPos->sToken;
    nTokenValue = pTokenStackPos->nTokenValue;
    bTokenHasValue = pTokenStackPos->bTokenHasValue;

    return pTokenStackPos->nTokenId;
}

template<typename T>
typename SvParser<T>::TokenStackType* SvParser<T>::GetStackPtr( short nCnt )
{
    sal_uInt8 nCurrentPos = sal_uInt8(pTokenStackPos - pTokenStack.get());
    if( nCnt > 0 )
    {
        if( nCnt >= nTokenStackSize )
            nCnt = (nTokenStackSize-1);
        if( nCurrentPos + nCnt < nTokenStackSize )
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
        else
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(
                nCurrentPos + (nCnt - nTokenStackSize));
    }
    else if( nCnt < 0 )
    {
        if( -nCnt >= nTokenStackSize )
            nCnt = -nTokenStackSize+1;
        if( -nCnt <= nCurrentPos )
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(nCurrentPos + nCnt);
        else
            nCurrentPos = sal::static_int_cast< sal_uInt8 >(
                nCurrentPos + (nCnt + nTokenStackSize));
    }
    return pTokenStack.get() + nCurrentPos;
}

// to read asynchronous from SvStream

template<typename T>
T SvParser<T>::GetSaveToken() const
{
    return pImplData ? pImplData->nSaveToken : static_cast<T>(0);
}

template<typename T>
void SvParser<T>::SaveState( T nToken )
{
    // save actual status
    if( !pImplData )
    {
        pImplData.reset(new SvParser_Impl<T>);
        pImplData->nSaveToken = static_cast<T>(0);
    }

    pImplData->nFilePos = rInput.Tell();
    pImplData->nToken = nToken;

    pImplData->aToken = aToken;
    pImplData->nlLineNr = nlLineNr;
    pImplData->nlLinePos = nlLinePos;
    pImplData->nTokenValue= nTokenValue;
    pImplData->bTokenHasValue = bTokenHasValue;
    pImplData->nNextCh = nNextCh;
}

template<typename T>
void SvParser<T>::RestoreState()
{
    // restore old status
    if( !pImplData )
        return;

    if( ERRCODE_IO_PENDING == rInput.GetError() )
        rInput.ResetError();
    aToken = pImplData->aToken;
    nlLineNr = pImplData->nlLineNr;
    nlLinePos = pImplData->nlLinePos;
    nTokenValue= pImplData->nTokenValue;
    bTokenHasValue=pImplData->bTokenHasValue;
    nNextCh = pImplData->nNextCh;

    pImplData->nSaveToken = pImplData->nToken;

    rInput.Seek( pImplData->nFilePos );
}

template<typename T>
void SvParser<T>::Continue( T )
{
}

// expanded out version of
//   IMPL_LINK_NOARG( SvParser, NewDataRead, LinkParamNone*, void )
// since it can't cope with template methods
template<typename T>
void SvParser<T>::LinkStubNewDataRead(void * instance, LinkParamNone* data) {
    return static_cast<SvParser<T> *>(instance)->NewDataRead(data);
}
template<typename T>
void SvParser<T>::NewDataRead(SAL_UNUSED_PARAMETER LinkParamNone*)
{
    switch( eState )
    {
    case SvParserState::Pending:
        eState = SvParserState::Working;
        RestoreState();

        Continue( pImplData->nToken );

        if( ERRCODE_IO_PENDING == rInput.GetError() )
            rInput.ResetError();

        if( SvParserState::Pending != eState )
            ReleaseRef();                    // ready otherwise!
        break;

    case SvParserState::NotStarted:
    case SvParserState::Working:
        break;

    default:
        ReleaseRef();                    // ready otherwise!
        break;
    }
}

template class SVT_DLLPUBLIC SvParser<int>;
template class SVT_DLLPUBLIC SvParser<HtmlTokenId>;

/*========================================================================
*
* SvKeyValueIterator.
*
*======================================================================*/

typedef std::vector<SvKeyValue> SvKeyValueList_Impl;

struct SvKeyValueIterator::Impl
{
    SvKeyValueList_Impl maList;
    sal_uInt16 mnPos;

    Impl() : mnPos(0) {}
};

SvKeyValueIterator::SvKeyValueIterator() : mpImpl(new Impl) {}

SvKeyValueIterator::~SvKeyValueIterator() = default;

bool SvKeyValueIterator::GetFirst (SvKeyValue &rKeyVal)
{
    mpImpl->mnPos = mpImpl->maList.size();
    return GetNext (rKeyVal);
}

bool SvKeyValueIterator::GetNext (SvKeyValue &rKeyVal)
{
    if (mpImpl->mnPos > 0)
    {
        rKeyVal = mpImpl->maList[--mpImpl->mnPos];
        return true;
    }
    else
    {
        // Nothing to do.
        return false;
    }
}

void SvKeyValueIterator::Append (const SvKeyValue &rKeyVal)
{
    mpImpl->maList.push_back(rKeyVal);
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.15 Sekunden (vorverarbeitet am 2026-05-08) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.