/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
// structure to store the actual data template<typename T> struct SvParser_Impl
{
OUString aToken; // parsed token
sal_uInt64 nFilePos; // actual position in stream
sal_uInt32 nlLineNr; // actual line number
sal_uInt32 nlLinePos; // actual column number
tools::Long nTokenValue; // extra value (RTF) bool bTokenHasValue; // indicates whether nTokenValue is valid
T nToken; // actual Token
sal_uInt32 nNextCh; // actual character
T nSaveToken; // the token from Continue
template<typename T>
sal_uInt32 SvParser<T>::GetNextChar()
{
sal_uInt32 c = 0U;
// When reading multiple bytes, we don't have to care about the file // position when we run into the pending state. The file position is // maintained by SaveState/RestoreState. if( bSwitchToUCS2 && 0 == rInput.Tell() )
{
rInput.StartReadingUnicodeText(RTL_TEXTENCODING_DONTKNOW); if (rInput.good())
{
sal_uInt64 nPos = rInput.Tell(); if (nPos == 2)
eSrcEnc = RTL_TEXTENCODING_UCS2; elseif (nPos == 3)
SetSrcEncoding(RTL_TEXTENCODING_UTF8); else// Try to detect encoding without BOM
{
std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer const size_t nSize = rInput.ReadBytes(buf.data(), buf.size());
rInput.Seek(0); if (nSize > 0)
{
UErrorCode uerr = U_ZERO_ERROR;
UCharsetDetector* ucd = ucsdet_open(&uerr);
ucsdet_setText(ucd, buf.data(), nSize, &uerr); if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr))
{ constchar* pEncodingName = ucsdet_getName(match, &uerr);
nChars = rtl_convertTextToUnicode(
pImplData->hConv, pImplData->hContext,
&c1, 1, sCh , 2,
RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
&nInfo, &nCvtBytes);
} if( !bErr )
{ if( 1 == nChars && 0 == nInfo )
{
c = sal_uInt32( sCh[0] );
} elseif( 2 == nChars && 0 == nInfo )
{
c = rtl::combineSurrogates( sCh[0], sCh[1] );
} elseif( 0 != nChars || 0 != nInfo )
{
DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0, "source buffer is too small" );
DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0, "there is a conversion error" );
DBG_ASSERT( 0 == nChars, "there is a converted character, but an error" ); // There are still errors, but nothing we can // do
c = '?';
nChars = 1;
++nConversionErrors;
}
}
} else
{ char sBuffer[10];
sBuffer[0] = c1;
sal_uInt16 nLen = 1; while( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) != 0 &&
nLen < 10 )
{
rInput.ReadChar( c1 );
bErr = !rInput.good(); if( bErr ) break;
sBuffer[nLen++] = c1;
nChars = rtl_convertTextToUnicode(
pImplData->hConv, nullptr, sBuffer, nLen, &cUC, 1,
RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR|
RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR,
&nInfo, &nCvtBytes);
} if( !bErr )
{ if( 1 == nChars && 0 == nInfo )
{
DBG_ASSERT( nCvtBytes == nLen, "no all bytes have been converted!" );
c = cUC;
} else
{
DBG_ASSERT( (nInfo&RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL) == 0, "source buffer is too small" );
DBG_ASSERT( (nInfo&~(RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOOSMALL)) == 0, "there is a conversion error" );
DBG_ASSERT( 0 == nChars, "there is a converted character, but an error" );
// There are still errors, so we use the first // character and restart after that.
c = reinterpret_cast<unsignedchar&>( sBuffer[0] );
rInput.SeekRel( -(nLen-1) );
nChars = 1;
++nConversionErrors;
}
}
}
} elseif( 1 == nChars && 0 == nInfo )
{ // The conversion was successful
DBG_ASSERT( nCvtBytes == 1, "no all bytes have been converted!" );
c = cUC;
} elseif( 0 != nChars || 0 != nInfo )
{
DBG_ASSERT( 0 == nChars, "there is a converted character, but an error" );
DBG_ASSERT( 0 != nInfo, "there is no converted character and no error" ); // #73398#: If the character could not be converted, // because a conversion is not available, do no conversion at all.
c = reinterpret_cast<unsignedchar&>( c1 );
nChars = 1;
++nConversionErrors;
}
}
}
} while( 0 == nChars && !bErr );
}
if ( ! rtl::isUnicodeScalarValue( c ) )
c = '?' ;
if (bFuzzing && nConversionErrors > 128)
{
SAL_WARN("svtools", "SvParser::GetNextChar too many conversion errors while fuzzing, abandoning for performance");
bErr = true;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.