/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/ #include <string.h>
if( nRead + seqStart.getLength())
{ // if nRead is 0, the file is already eof. if( ! m_bStarted && nRead )
{ // ensure that enough data is available to parse encoding if( seqStart.hasElements() )
{ // prefix with what we had so far.
sal_Int32 nLength = seq.getLength();
seq.realloc( seqStart.getLength() + nLength );
if( ! m_bStarted )
{ // it must now be ensured, that no encoding attribute exist anymore // ( otherwise the expat-Parser will crash ) // This must be done after decoding ! // ( e.g. Files decoded in ucs-4 cannot be read properly )
m_bStarted = true;
removeEncoding( seq );
}
nRead = seq.getLength();
}
// Checks, if enough data has been accumulated to recognize the encoding bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
{ const sal_Int8 *pSource = seq.getConstArray(); bool bCheckIfFirstClosingBracketExists = false;
if( seq.getLength() < 8 ) { // no recognition possible, when less than 8 bytes are available returnfalse;
}
if( seq.getLength() < 4 ) { // no recognition possible, when less than 4 bytes are available returnfalse;
}
// first level : detect possible file formats if (seq.getLength() >= 5 && !strncmp(reinterpret_cast<constchar *>(pSource), ", 5)) { // scan for encoding
OString str( reinterpret_cast<constchar *>(pSource), seq.getLength() );
// cut sequence to first line break //find first line break; int nMax = str.indexOf( 10 ); if( nMax >= 0 )
{
str = str.copy( 0 , nMax );
}
int nFound = str.indexOf( " encoding" ); if( nFound >= 0 ) { int nStop; int nStart = str.indexOf( "\"" , nFound ); if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
{
nStart = str.indexOf( "'" , nFound );
nStop = str.indexOf( "'" , nStart +1 );
} else
{
nStop = str.indexOf( "\"" , nStart +1);
} if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
{ // encoding found finally
m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
}
}
} elseif( 0xFE == pSource[0] &&
0xFF == pSource[1] ) { // UTF-16 big endian // conversion is done so that encoding information can be easily extracted
m_sEncoding = "utf-16"_ostr;
} elseif( 0xFF == pSource[0] &&
0xFE == pSource[1] ) { // UTF-16 little endian // conversion is done so that encoding information can be easily extracted
m_sEncoding = "utf-16"_ostr;
} elseif( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) // The byte order mark is simply added
// simply add the byte order mark !
seq.realloc( seq.getLength() + 2 );
memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); reinterpret_cast<sal_uInt8*>(seq.getArray())[0] = 0xFE; reinterpret_cast<sal_uInt8*>(seq.getArray())[1] = 0xFF;
m_sEncoding = "utf-16"_ostr;
} elseif( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) // The byte order mark is simply added
if( m_seqSource.hasElements() ) { // For surrogates ! // put old rest and new byte sequence into one array // In general when surrogates are used, they should be rarely // cut off between two convert()-calls. So this code is used // rarely and the extra copy is acceptable.
puTempMem.reset(new sal_Unicode[ nSourceSize + m_seqSource.getLength()]);
memcpy( puTempMem.get() ,
m_seqSource.getConstArray() ,
m_seqSource.getLength() * sizeof( sal_Unicode ) );
memcpy(
&(puTempMem[ m_seqSource.getLength() ]) ,
puSource ,
nSourceSize*sizeof( sal_Unicode ) );
puSource = puTempMem.get();
nSourceSize += m_seqSource.getLength();
// take nSourceSize * 3 as preference // this is an upper boundary for converting to utf8, // which most often used as the target.
sal_Int32 nSeqSize = nSourceSize * 3;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.