/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
// We don't want to end up with 2GB read in one line just because of malformed
// multiline fields, so chop it _somewhere_, which is twice supported columns
// times arbitrary maximum cell content length, 2*1024*64K=128M, and because
// it's sal_Unicode that's 256MB. If it's 2GB of data without LF we're out of
// luck anyway.
// NOTE(review): the 2*1024*64K=128M figure assumes MAXCOLCOUNT is 1024;
// confirm against the current definition of MAXCOLCOUNT.

// Arbitrary maximum cell content length used to derive the line limit below.
constexpr sal_Int32 nArbitraryCellLengthLimit = SAL_MAX_UINT16;
// Arbitrary maximum line length: twice the supported columns times the cell limit.
constexpr sal_Int32 nArbitraryLineLengthLimit = 2 * MAXCOLCOUNT * nArbitraryCellLengthLimit;
namespace
{
/** Character sequence (ESC, blank, colon) that lcl_UnescapeSylk() replaces
    with a line feed when reading SYLK strings. */
const char SYLK_LF[] = "\x1b :";
}
namespace {
/** Generation of the application that wrote a SYLK file.

    The enumerators are ordered so that versions can be compared with
    relational operators, e.g. (eVersion >= SylkVersion::OOO32).
 */
enum class SylkVersion
{
    SCALC3,     // Wrote wrongly quoted strings and unescaped semicolons.
    OOO32,      // Correct strings, plus multiline content.
    OWN,        // Place our new versions, if any, before this value.
    OTHER       // Assume that aliens wrote correct strings.
};
/** Determine if *p is a quote that ends a quoted field.

    Precondition: we are parsing a quoted field already and *p is a quote.
    The characters following *p are inspected up to the next non-blank
    character, so the data must be terminated (a NUL stops the scan).

    @param p
           Position of the quote to examine.
    @param pSeps
           The field separators in use.
    @param rcDetectSep
           In/out. If 0 on entry, separator detection is enabled: when one of
           ',' '\t' ';' or a single blank follows the quote it is stored here
           and the quote is treated as field end. Any non-zero value on entry
           disables detection and is left unchanged.

    @return
        FIELDEND_QUOTE  if end of field quote
        DONTKNOW_QUOTE  anything else
 */
static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* pSeps,
        sal_Unicode& rcDetectSep )
{
    // Due to broken CSV generators that don't double embedded quotes check if
    // a field separator immediately or with trailing spaces follows the quote,
    // only then end the field, or at end of string.
    constexpr sal_Unicode cBlank = ' ';
    if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
        return FIELDEND_QUOTE;

    // Detect a possible blank separator if it's not already in the list (which
    // was checked right above for p[1]==cBlank).
    // This must be evaluated before the blanks are skipped below.
    const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank);

    while (p[1] == cBlank)
        ++p;

    if (lcl_isFieldEnd( p[1], pSeps))
        return FIELDEND_QUOTE;

    // Extended separator detection after a closing quote (with or without
    // blanks). Note that nQuotes is incremented *after* the call so is not yet
    // even here, and that with separator detection we reach here only if
    // lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or
    // SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have
    // to be checked.
    if (!rcDetectSep)
    {
        static constexpr sal_Unicode vSep[] = { ',', '\t', ';' };
        for (const sal_Unicode c : vSep)
        {
            if (p[1] == c)
            {
                rcDetectSep = c;
                return FIELDEND_QUOTE;
            }
        }
    }

    // Blank separator is least significant, after others.
    if (bBlankSep)
    {
        rcDetectSep = cBlank;
        return FIELDEND_QUOTE;
    }

    return DONTKNOW_QUOTE;
}
/** Classify the quote at *p: one half of a doubled (escaped) quote, the
    quote closing a field, or something else.

    Precondition: *p is a quote. For an even nQuotes the character before *p
    is inspected, so p must not be the first character of the buffer then.

    @param nQuotes
           Quote characters encountered so far. Odd (after opening quote)
           means either no embedded quotes or only quote pairs so far. Even
           means either not in a quoted field or already one quote
           encountered, the first of a pair.
    @param p
           Position of the quote to classify.
    @param pSeps
           The field separators in use, handed on to lcl_isFieldEndQuote().
    @param cStr
           The quote character.
    @param rcDetectSep
           Separator detection state, handed on to lcl_isFieldEndQuote().

    @return
        FIELDSTART_QUOTE    if first quote in a field, either starting
                            content or embedded so caller should check
                            beforehand.
        FIRST_QUOTE         if first of a doubled quote
        SECOND_QUOTE        if second of a doubled quote
        FIELDEND_QUOTE      if end of field quote
        DONTKNOW_QUOTE      if an unescaped quote we don't consider as end of
                            field, do not increment nQuotes in caller then!
 */
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
        const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
{
    const bool bOddQuoteCount = (nQuotes & 1) != 0;
    if (bOddQuoteCount)
    {
        // Inside a quoted field with all previous quotes paired up: a quote
        // directly following makes this the first of a pair, otherwise let
        // the field end detection decide.
        return (p[1] == cStr) ? FIRST_QUOTE : lcl_isFieldEndQuote( p, pSeps, rcDetectSep);
    }

    // Even count: a directly preceding quote makes this the second of a pair.
    if (p[-1] == cStr)
        return SECOND_QUOTE;

    SAL_WARN( "sc", "lcl_isEscapedOrFieldEndQuote: really want a FIELDSTART_QUOTE?");
    return FIELDSTART_QUOTE;
}
/** How lcl_ScanString() treats a doubled quote character inside a string. */
enum class DoubledQuoteMode
{
    KEEP_ALL,   // both are taken, additionally start and end quote are included in string
    ESCAPE,     // escaped quote, one is taken, one ignored
};
}
/** Scan for a quoted string.
Precondition: initial current position *p is a cStr quote.
For DoubledQuoteMode::ESCAPE, if after the closing quote there is a field end (with or without trailing blanks and as determined by lcl_isFieldEndQuote()), then the content is appended to rField with quotes processed and removed. Else if no field end after the quoted string was detected, nothing is appended and processing continues and is repeated until the next quote. If no closing quote at a field end was found at all, nothing is appended and the initial position is returned and caller has to decide, usually just taking all as literal data.
For DoubledQuoteMode::KEEP_ALL, the string up to and including the closing quote is appended to rField and the next position returned, regardless whether there is a field separator following or not.
*/ staticconst sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rField, const sal_Unicode* pSeps, sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
{
OUString aString; bool bClosingQuote = (eMode == DoubledQuoteMode::KEEP_ALL); const sal_Unicode* const pStart = p; if (eMode != DoubledQuoteMode::KEEP_ALL)
p++; //! jump over opening quote bool bCont; do
{
bCont = false; const sal_Unicode* p0 = p; for( ;; )
{ if (!*p)
{ // Encountering end of data after an opening quote is not a // quoted string, ReadCsvLine() concatenated lines with '\n' // for a properly quoted embedded linefeed. if (eMode == DoubledQuoteMode::KEEP_ALL) // Caller would append that data anyway, so we can do it // already here. break;
return pStart;
}
if( *p == cStr )
{ if ( *++p != cStr )
{ // break or continue for loop if (eMode == DoubledQuoteMode::ESCAPE)
{
sal_Unicode cDetectSep = 0xffff; // No separator detection here. if (lcl_isFieldEndQuote( p-1, pSeps, cDetectSep) == FIELDEND_QUOTE)
{
bClosingQuote = true; break;
} else continue;
} else break;
} // doubled quote char switch ( eMode )
{ case DoubledQuoteMode::KEEP_ALL :
p++; // both for us (not breaking for-loop) break; case DoubledQuoteMode::ESCAPE :
p++; // one for us (breaking for-loop)
bCont = true; // and more break;
} if ( eMode == DoubledQuoteMode::ESCAPE ) break;
} else
p++;
} if ( p0 < p )
{ if (!lcl_appendLineData( aString, p0, ((eMode != DoubledQuoteMode::KEEP_ALL && (*p || *(p-1) == cStr)) ? p-1 : p)))
rbOverflowCell = true;
}
} while ( bCont );
if (!bClosingQuote) return pStart;
if (!aString.isEmpty())
rField += aString;
return p;
}
staticvoid lcl_UnescapeSylk( OUString & rString, SylkVersion eVersion )
{ // Older versions didn't escape the semicolon. // Older versions quoted the string and doubled embedded quotes, but not // the semicolons, which was plain wrong. if (eVersion >= SylkVersion::OOO32)
rString = rString.replaceAll(";;", ";"); else
rString = rString.replaceAll("\"\"", "\"");
rString = rString.replaceAll(SYLK_LF, "\n");
}
staticconst sal_Unicode* lcl_ScanSylkString( const sal_Unicode* p,
OUString& rString, SylkVersion eVersion )
{ const sal_Unicode* pStartQuote = p; const sal_Unicode* pEndQuote = nullptr; while( *(++p) )
{ if( *p == '"' )
{
pEndQuote = p; if (eVersion >= SylkVersion::OOO32)
{ if (*(p+1) == ';')
{ if (*(p+2) == ';')
{
p += 2; // escaped ';'
pEndQuote = nullptr;
} else break; // end field
}
} else
{ if (*(p+1) == '"')
{
++p; // escaped '"'
pEndQuote = nullptr;
} elseif (*(p+1) == ';') break; // end field
}
}
} if (!pEndQuote)
pEndQuote = p; // Take all data as string.
rString += std::u16string_view(pStartQuote + 1, pEndQuote - pStartQuote - 1 );
lcl_UnescapeSylk( rString, eVersion); return p;
}
staticconst sal_Unicode* lcl_ScanSylkFormula( const sal_Unicode* p,
OUString& rString, SylkVersion eVersion )
{ const sal_Unicode* pStart = p; if (eVersion >= SylkVersion::OOO32)
{ while (*p)
{ if (*p == ';')
{ if (*(p+1) == ';')
++p; // escaped ';' else break; // end field
}
++p;
}
rString += std::u16string_view( pStart, p - pStart);
lcl_UnescapeSylk( rString, eVersion);
} else
{ // Nasty. If in old versions the formula contained a semicolon, it was // quoted and embedded quotes were doubled, but semicolons were not. If // there was no semicolon, it could still contain quotes and doubled // embedded quotes if it was something like ="a""b", which was saved as // E"a""b" as is and has to be preserved, even if older versions // couldn't even load it correctly. However, theoretically another // field might follow and thus the line contain a semicolon again, such // as ...;E"a""b";... bool bQuoted = false; if (*p == '"')
{ // May be a quoted expression or just a string constant expression // with quotes. while (*(++p))
{ if (*p == '"')
{ if (*(p+1) == '"')
++p; // escaped '"' else break; // closing '"', had no ';' yet
} elseif (*p == ';')
{
bQuoted = true; // ';' within quoted expression break;
}
}
p = pStart;
} if (bQuoted)
p = lcl_ScanSylkString( p, rString, eVersion); else
{ while (*p && *p != ';')
++p;
rString += std::u16string_view( pStart, p - pStart);
}
} return p;
}
staticvoid lcl_WriteString( SvStream& rStrm, OUString& rString, sal_Unicode cQuote, sal_Unicode cEsc )
{ if (cEsc)
{ // the goal is to replace cStr by cStr+cStr
OUString strFrom(cEsc);
OUString strTo = strFrom + strFrom;
rString = rString.replaceAll(strFrom, strTo);
}
sal_uInt16 nFound = 0; bool bInNum = false; for (sal_Int32 nPos = 0; nPos < nLen && (bInNum || nFound < nMaxNumberParts); ++nPos)
{ bool bLetter = false; if (rtl::isAsciiDigit(rStr[nPos]) ||
(((!bInNum && nFound==nMP) || (bInNum && nFound==nMP+1))
&& (bLetter = ScGlobal::getCharClass().isLetterNumeric( rStr, nPos))))
{ if (!bInNum)
{
bInNum = true;
nStart[nFound] = nPos;
++nFound;
}
nEnd[nFound-1] = nPos; if (bIso && (bLetter || (2 <= nFound && nFound <= 6 && nPos > nStart[nFound-1] + 1))) // Each M,D,h,m,s at most 2 digits.
bIso = false;
} else
{
bInNum = false; if (bIso)
{ // ([+-])YYYY-MM-DD([T ]hh:mm(:ss(.fff)))(([+-])TZ) // XXX NOTE: timezone is accepted here, but number // formatter parser will not, so the end result will be // type Text to preserve timezone information. switch (rStr[nPos])
{ case'+': if (nFound >= 5 && nPos == nEnd[nFound-1] + 1) // Accept timezone offset.
; elseif (nPos > 0) // Accept one leading sign.
bIso = false; break; case'-': if (nFound >= 5 && nPos == nEnd[nFound-1] + 1) // Accept timezone offset.
; elseif (nFound == 0 && nPos > 0) // Accept one leading sign.
bIso = false; elseif (nFound < 1 || 2 < nFound || nPos != nEnd[nFound-1] + 1) // Not immediately after 1 or 1-2
bIso = false; break; case'T': case' ': if (nFound != 3 || nPos != nEnd[nFound-1] + 1) // Not immediately after 1-2-3
bIso = false; break; case':': if (nFound < 4 || 5 < nFound || nPos != nEnd[nFound-1] + 1) // Not at 1-2-3T4:5:
bIso = false; break; case'.': case',': if (nFound != 6 || nPos != nEnd[nFound-1] + 1) // Not at 1-2-3T4:5:6.
bIso = false; break; case'Z': if (nFound >= 5 && nPos == nEnd[nFound-1] + 1) // Accept Zero timezone.
; else
bIso = false; break; default:
bIso = false;
}
}
}
}
if (nFound < 3)
bIso = false;
if (bIso)
{ // Leave conversion and detection of various possible number // formats to the number formatter. ISO is recognized in any locale // so we can directly use the document's formatter.
sal_uInt32 nFormat = 0; double fVal = 0.0;
SvNumberFormatter* pDocFormatter = rDoc.GetFormatTable(); if (pDocFormatter->IsNumberFormat( rStr, nFormat, fVal))
{ if (pDocFormatter->GetType(nFormat) & SvNumFormatType::DATE)
{
ScAddress aPos(nCol,nRow,nTab); if (bUseDocImport)
rDocImport.setNumericCell(aPos, fVal); else
rDoc.SetValue(aPos, fVal);
rDoc.SetNumberFormat(aPos, nFormat);
return bMultiLine; // success
}
} // If we reach here it is type Text (e.g. timezone or trailing // characters). Handled below.
}
if ( nFound == 1 )
{ // try to break one number (without separators) into date fields
if ( nDateLen >= 5 && nDateLen <= 8 &&
ScGlobal::getCharClass().isNumeric( rStr.copy( nDateStart, nDateLen ) ) )
{ // 6 digits: 2 each for day, month, year // 8 digits: 4 for year, 2 each for day and month // 5 or 7 digits: first field is shortened by 1
CalendarWrapper* pCalendar = (bSecondCal ? pSecondCalendar : &rCalendar);
sal_Int16 nNumMonths = pCalendar->getNumberOfMonthsInYear(); if ( nDay && nMonth && nDay<=31 && nMonth<=nNumMonths )
{
--nMonth;
pCalendar->setValue( i18n::CalendarFieldIndex::DAY_OF_MONTH, nDay );
pCalendar->setValue( i18n::CalendarFieldIndex::MONTH, nMonth );
pCalendar->setValue( i18n::CalendarFieldIndex::YEAR, nYear );
sal_Int16 nHour, nMinute, nSecond; // #i14974# The imported value should have no fractional value, so set the // time fields to zero (ICU calendar instance defaults to current date/time)
nHour = nMinute = nSecond = 0; if (nFound > 3)
nHour = static_cast<sal_Int16>(o3tl::toInt32(rStr.subView( nStart[3], nEnd[3]+1-nStart[3]))); if (nFound > 4)
nMinute = static_cast<sal_Int16>(o3tl::toInt32(rStr.subView( nStart[4], nEnd[4]+1-nStart[4]))); if (nFound > 5)
nSecond = static_cast<sal_Int16>(o3tl::toInt32(rStr.subView( nStart[5], nEnd[5]+1-nStart[5]))); // do not use calendar's milliseconds, to avoid fractional part truncation double fFrac = 0.0; if (nFound > 6)
{
sal_Unicode cDec = '.';
OUString aT = OUStringChar(cDec) + rStr.subView( nStart[6], nEnd[6]+1-nStart[6]);
rtl_math_ConversionStatus eStatus; double fV = rtl::math::stringToDouble( aT, cDec, 0, &eStatus ); if (eStatus == rtl_math_ConversionStatus_Ok)
fFrac = fV / 86400.0;
}
sal_Int32 nPos; if (nFound > 3 && 1 <= nHour && nHour <= 12 // nHour 0 and >=13 can't be AM/PM
&& (nPos = nEnd[nFound-1] + 1) < nLen)
{ // Dreaded AM/PM may be following. while (nPos < nLen && rStr[nPos] == ' ')
++nPos; if (nPos < nLen)
{
sal_Int32 nStop = nPos; while (nStop < nLen && rStr[nStop] != ' ')
++nStop;
OUString aAmPm = rStr.copy( nPos, nStop - nPos); // For AM only 12 needs to be treated, whereas for PM // it must not. Check both, locale and second/English // strings. if (nHour == 12 &&
(rTransliteration.isEqual( aAmPm, pFormatter->GetLocaleData()->getTimeAM()) ||
(pSecondTransliteration && pSecondTransliteration->isEqual( aAmPm, u"AM"_ustr))))
{
nHour = 0;
} elseif (nHour < 12 &&
(rTransliteration.isEqual( aAmPm, pFormatter->GetLocaleData()->getTimePM()) ||
(pSecondTransliteration && pSecondTransliteration->isEqual( aAmPm, u"PM"_ustr))))
{
nHour += 12;
}
}
}
pCalendar->setValue( i18n::CalendarFieldIndex::HOUR, nHour );
pCalendar->setValue( i18n::CalendarFieldIndex::MINUTE, nMinute );
pCalendar->setValue( i18n::CalendarFieldIndex::SECOND, nSecond );
pCalendar->setValue( i18n::CalendarFieldIndex::MILLISECOND, 0 ); if ( pCalendar->isValid() )
{ // Whole days diff. double fDiff = DateTime::Sub( DateTime(pDocFormatter->GetNullDate()),
pCalendar->getEpochStart()); // #i14974# must use getLocalDateTime to get the same // date values as set above double fDays = pCalendar->getLocalDateTime() + fFrac;
fDays -= fDiff;
LanguageType eLatin, eCjk, eCtl;
rDoc.GetLanguage( eLatin, eCjk, eCtl );
LanguageType eDocLang = eLatin; //! which language for date formats?
SvNumFormatType nType = (nFound > 3 ? SvNumFormatType::DATETIME : SvNumFormatType::DATE);
sal_uLong nFormat = pDocFormatter->GetStandardFormat( nType, eDocLang ); // maybe there is a special format including seconds or milliseconds if (nFound > 5)
nFormat = pDocFormatter->GetStandardFormat( fDays, nFormat, nType, eDocLang);
while(--nSkipLines>0)
{
aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep); // content is ignored if ( rStrm.eof() ) break;
}
// Determine range for Undo. // We don't need this during import of a file to a new sheet or document... bool bDetermineRange = bUndo; bool bColumnsAreDetermined = false;
// Row heights don't need to be adjusted on the fly if EndPaste() is called // afterwards, which happens only if bDetermineRange. This variable also // survives the toggle of bDetermineRange down at the end of the do{} loop. bool bRangeIsDetermined = bDetermineRange;
ScDocumentImport aDocImport(rDoc); do
{ const SCCOL nLastCol = nEndCol; // tdf#129701 preserve value of nEndCol for( ;; )
{
aLine = ReadCsvLine(rStrm, !bFixed, aSeps, cStr, cDetectSep); if ( rStrm.eof() && aLine.isEmpty() ) break;
assert(pSeps == aSeps.getStr());
if ( nRow > rDoc.MaxRow() )
{
bOverflowRow = true; // display warning on import break; // for
}
if (!bDetermineRange)
EmbeddedNullTreatment( aLine);
sal_Int32 nLineLen = aLine.getLength();
SCCOL nCol = nStartCol; bool bMultiLine = false; if ( bFixed ) // Fixed line length
{ if (bDetermineRange)
{ if (!bColumnsAreDetermined)
{ // Yes, the check is nCol<=rDoc.MaxCol()+1, +1 because it // is only an overflow if there is really data following to // be put behind the last column, which doesn't happen if // info is SC_COL_SKIP. for (i=0; i < nInfoCount && nCol <= rDoc.MaxCol()+1; ++i)
{ const sal_uInt8 nFmt = pColFormat[i]; if (nFmt != SC_COL_SKIP) // otherwise don't increment nCol either
{ if (nCol > rDoc.MaxCol())
bOverflowCol = true; // display warning on import
++nCol;
}
}
bColumnsAreDetermined = true;
}
} else
{
sal_Int32 nStartIdx = 0; // Same maxcol+1 check reason as above. for (i=0; i < nInfoCount && nCol <= rDoc.MaxCol()+1; ++i)
{
sal_Int32 nNextIdx = nStartIdx; if (i + 1 < nInfoCount)
CountVisualWidth( aLine, nNextIdx, pColStart[i+1] - pColStart[i] ); else
nNextIdx = nLineLen;
sal_uInt8 nFmt = pColFormat[i]; if (nFmt != SC_COL_SKIP) // otherwise don't increment nCol either
{ if (nCol > rDoc.MaxCol())
bOverflowCol = true; // display warning on import else
{ bool bIsQuoted = false;
aCell = lcl_GetFixed( aLine, nStartIdx, nNextIdx, bIsQuoted, bOverflowCell ); if (bIsQuoted && bQuotedAsText)
nFmt = SC_COL_TEXT;
bMultiLine |= lcl_PutString(
aDocImport, !mbOverwriting, nCol, nRow, nTab, aCell, nFmt,
&aNumFormatter, bDetectNumFormat, bDetectSciNumFormat, bEvaluateFormulas, bSkipEmptyCells,
aTransliteration, aCalendar,
pEnglishTransliteration.get(), pEnglishCalendar.get());
}
++nCol;
}
nStartIdx = nNextIdx;
}
}
} else// Search for the separator
{
SCCOL nSourceCol = 0;
sal_uInt16 nInfoStart = 0; const sal_Unicode* p = aLine.getStr(); // tdf#129701 if there is only one column, and user wants to treat empty cells, // we need to detect *p = null bool bIsLastColEmpty = !(*p) && !bSkipEmptyCells && !bDetermineRange; // Yes, the check is nCol<=rDoc.MaxCol()+1, +1 because it is only an // overflow if there is really data following to be put behind // the last column, which doesn't happen if info is // SC_COL_SKIP. while ( (*p || bIsLastColEmpty) && nCol <= rDoc.MaxCol()+1)
{ bool bIsQuoted = false;
p = ScImportExport::ScanNextFieldFromString( p, aCell,
cStr, pSeps, bMerge, bIsQuoted, bOverflowCell, bRemoveSpace );
sal_uInt8 nFmt = SC_COL_STANDARD; for ( i=nInfoStart; i<nInfoCount; i++ )
{ if ( pColStart[i] == nSourceCol + 1 ) // pColStart is 1-based
{
nFmt = pColFormat[i];
nInfoStart = i + 1; // ColInfos are in succession break; // for
}
} if ( nFmt != SC_COL_SKIP )
{ if (nCol > rDoc.MaxCol())
bOverflowCol = true; // display warning on import elseif (!bDetermineRange)
{ if (bIsQuoted && bQuotedAsText)
nFmt = SC_COL_TEXT;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.