/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
// Entity binds all information needed for a single file | single call of parseStream struct Entity : public ParserData
{ // Amount of work producer sends to consumer in one iteration: staticconst size_t mnEventListSize = 1000;
// unique for each Entity instance:
// Number of valid events in mxProducedEvents:
size_t mnProducedEventsSize;
std::optional<EventList> mxProducedEvents;
std::queue<EventList> maPendingEvents;
std::queue<EventList> maUsedEvents;
std::mutex maEventProtector;
staticconst size_t mnEventLowWater = 4; staticconst size_t mnEventHighWater = 8;
osl::Condition maConsumeResume;
osl::Condition maProduceResume; // Event we use to store data if threading is disabled:
Event maSharedEvent;
// copied in copy constructor:
// Allow to disable threading for small documents: bool mbEnableThreads;
css::xml::sax::InputSource maStructSource;
xmlParserCtxtPtr mpParser;
::sax_expatwrap::XMLFile2UTFConverter maConverter;
// Exceptions cannot be thrown through the C-XmlParser (possible // resource leaks), therefore any exception thrown by a UNO callback // must be saved somewhere until the C-XmlParser is stopped.
css::uno::Any maSavedException;
std::mutex maSavedExceptionMutex; void saveException( const Any & e ); // Thread-safe check if maSavedException has value bool hasException(); void throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator, bool mbDuringParse );
std::stack< NameWithToken, std::vector<NameWithToken> > maNamespaceStack; /* Context for main thread consuming events. * startElement() stores the data, which characters() and endElement() uses
*/
std::stack< SaxContext, std::vector<SaxContext> > maContextStack; // Determines which elements of maNamespaceDefines are valid in current context
std::stack< sal_uInt32, std::vector<sal_uInt32> > maNamespaceCount;
std::vector< NamespaceDefine > maNamespaceDefines;
ParserData maData; /// Cached parser configuration for next call of parseStream().
Entity *mpTop; /// std::stack::top() is amazingly slow => cache this.
std::stack< Entity > maEntities; /// Entity stack for each call of parseStream().
std::vector<char> pendingCharacters; /// Data from characters() callback that needs to be sent.
};
// throw an exception, but avoid callback if // during a threaded produce void Entity::throwException( const ::rtl::Reference< FastLocatorImpl > &xDocumentLocator, bool mbDuringParse )
{ // Error during parsing !
Any savedException;
{
std::scoped_lock g(maSavedExceptionMutex); if (maSavedException.hasValue())
{
savedException.setValue(&maSavedException, cppu::UnoType<decltype(maSavedException)>::get());
}
}
SAXParseException aExcept(
lclGetErrorMessage( mpParser,
xDocumentLocator->getSystemId(),
xDocumentLocator->getLineNumber() ),
Reference< XInterface >(),
savedException,
xDocumentLocator->getPublicId(),
xDocumentLocator->getSystemId(),
xDocumentLocator->getLineNumber(),
xDocumentLocator->getColumnNumber()
);
// error handler is set, it may throw the exception if( !mbDuringParse || !mbEnableThreads )
{ if (mxErrorHandler.is() )
mxErrorHandler->fatalError( Any( aExcept ) );
}
// error handler has not thrown, but parsing must stop => throw ourselves throw aExcept;
}
// In the single threaded case we emit events via our C // callbacks, so any exception caught must be queued up until // we can safely re-throw it from our C++ parent of parse()
// If multi-threaded, we need to push an EXCEPTION event, at // which point we transfer ownership of maSavedException to // the consuming thread. void Entity::saveException( const Any & e )
{ // fdo#81214 - allow the parser to run on after an exception, // unexpectedly some 'startElements' produce a UNO_QUERY_THROW // for XComponent; and yet expect to continue parsing.
SAL_WARN("sax", "Unexpected exception from XML parser " << exceptionToString(e));
std::scoped_lock g(maSavedExceptionMutex); if (maSavedException.hasValue())
{
SAL_INFO("sax.fastparser", "discarding exception, already have one");
} else
{
maSavedException = e;
}
}
#ifdef EMSCRIPTEN
rEntity.mbEnableThreads = false; #else if (!getenv("SAX_DISABLE_THREADS") && !m_bDisableThreadedParser)
{
Reference<css::io::XSeekable> xSeekable(rEntity.maStructSource.aInputStream, UNO_QUERY); // available() is not __really__ relevant here, but leave it in as a heuristic for non-seekable streams
rEntity.mbEnableThreads = (xSeekable.is() && xSeekable->getLength() > 10000)
|| (rEntity.maStructSource.aInputStream->available() > 10000);
} #endif
if (rEntity.mbEnableThreads)
{
rtl::Reference<ParserThread> xParser = new ParserThread(this);
xParser->launch();
aEnsureFree.setThread(xParser); bool done = false; do {
rEntity.maConsumeResume.wait();
rEntity.maConsumeResume.reset();
std::unique_lock aGuard(rEntity.maEventProtector); while (!rEntity.maPendingEvents.empty())
{ if (rEntity.maPendingEvents.size() <= Entity::mnEventLowWater)
rEntity.maProduceResume.set(); // start producer again
if ( rEntity.maPendingEvents.size() <= Entity::mnEventLowWater )
{
aGuard.unlock(); for (auto& rEvent : aEventList.maEvents)
{ if (rEvent.mxAttributes.is())
{
rEvent.mxAttributes->clear(); if( rEntity.mxNamespaceHandler.is() )
rEvent.mxDeclAttributes->clear();
}
aEventList.mbIsAttributesEmpty = true;
}
aGuard.lock();
}
rEntity.maUsedEvents.push(std::move(aEventList));
}
} while (!done);
aEnsureFree.joinThread();
deleteUsedEvents();
// callbacks used inside XML_Parse may have caught an exception // No need to lock maSavedExceptionMutex here because parser // thread is joined. if( rEntity.maSavedException.hasValue() )
rEntity.throwException( mxDocumentLocator, true );
} else
{
parse();
}
while (!rEntity.maUsedEvents.empty())
{
{ // the block makes sure that aEventList is destructed outside the lock
EventList aEventList = std::move(rEntity.maUsedEvents.front());
rEntity.maUsedEvents.pop();
while (rEntity.maPendingEvents.size() >= Entity::mnEventHighWater)
{ // pause parsing for a bit
aGuard.unlock(); // unlock
rEntity.maProduceResume.wait();
rEntity.maProduceResume.reset();
aGuard.lock(); // lock
}
// Tell libxml2 parser to decode entities in attribute values. // Also allow XML attribute values which are larger than 10MB, because this used to work // with expat. // coverity[unsafe_xml_parse_config] - entity support is required
xmlCtxtUseOptions(rEntity.mpParser, XML_PARSE_NOENT | XML_PARSE_HUGE);
} else
{
bContinue = xmlParseChunk( rEntity.mpParser, reinterpret_cast<constchar*>(seqOut.getConstArray()), nRead, 0 )
== XML_ERR_OK;
}
// callbacks used inside XML_Parse may have caught an exception if (!bContinue)
{
rEntity.throwException( mxDocumentLocator, true );
} if (rEntity.hasException())
{
rEntity.throwException( mxDocumentLocator, true );
}
} while( nRead > 0 );
rEntity.getEvent( CallbackType::DONE ); if( rEntity.mbEnableThreads )
produce( true );
}
try
{ /* #158414# Each element may define new namespaces, also for attributes. First, process all namespaces, second, process the attributes after namespaces
have been initialized. */
std::string_view sPrefix; // convert to string_view so we only do strlen() once. if (prefix != nullptr)
sPrefix = XML_CAST(prefix); // #158414# first: get namespaces for (int i = 0; i < numNamespaces * 2; i += 2)
{ // namespaces[] is (prefix/URI) if( namespaces[ i ] != nullptr )
{
OString aPrefix( XML_CAST( namespaces[ i ] ));
OUString namespaceURL( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
NormalizeURI( namespaceURL );
DefineNamespace(aPrefix, namespaceURL); if( rEntity.mxNamespaceHandler.is() )
rEvent.mxDeclAttributes->addUnknown( OString( XML_CAST( namespaces[ i ] ) ), OString( XML_CAST( namespaces[ i + 1 ] ) ) );
} else
{ // default namespace
sNamespace = OUString( XML_CAST( namespaces[ i + 1 ] ), strlen( XML_CAST( namespaces[ i + 1 ] )), RTL_TEXTENCODING_UTF8 );
NormalizeURI( sNamespace );
nNamespaceToken = GetNamespaceToken( sNamespace ); if( rEntity.mxNamespaceHandler.is() )
rEvent.mxDeclAttributes->addUnknown( ""_ostr, OString( XML_CAST( namespaces[ i + 1 ] ) ) );
}
}
if ( rEntity.mxTokenHandler.is() )
{ // #158414# second: fill attribute list with other attributes
rEvent.mxAttributes->reserve( numAttributes ); for (int i = 0; i < numAttributes * 5; i += 5)
{ // attributes[] is ( localname / prefix / nsURI / valueBegin / valueEnd ) if( attributes[ i + 1 ] != nullptr )
{
sal_Int32 nAttributeToken = GetTokenWithPrefix(XML_CAST(attributes[ i + 1 ]), attributes[ i ]); if( nAttributeToken != FastToken::DONTKNOW )
rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) ); else
addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes);
} else
{
sal_Int32 nAttributeToken = GetToken(attributes[ i ]); if( nAttributeToken != FastToken::DONTKNOW )
rEvent.mxAttributes->add( nAttributeToken, std::string_view(XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ]) ); else
{
SAL_WARN("xmloff", "unknown attribute " << XML_CAST( attributes[ i ] ) << "=" <<
OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
}
}
}
if( !sPrefix.empty() )
rEvent.mnElementToken = GetTokenWithPrefix(sPrefix, localName); elseif( !sNamespace.isEmpty() )
rEvent.mnElementToken = GetTokenWithContextNamespace(nNamespaceToken, localName); else
rEvent.mnElementToken = GetToken(localName);
} else
{ for (int i = 0; i < numAttributes * 5; i += 5)
{ if( attributes[ i + 1 ] != nullptr )
addUnknownElementWithPrefix(attributes, i, rEvent.mxAttributes); else
rEvent.mxAttributes->addUnknown( XML_CAST( attributes[ i ] ),
OString( XML_CAST( attributes[ i + 3 ] ), attributes[ i + 4 ] - attributes[ i + 3 ] ));
}
rEntity.getEvent( CallbackType::END_ELEMENT ); if (rEntity.mbEnableThreads)
produce(); else
rEntity.endElement();
}
// Append one chunk of character data to pendingCharacters.
//
// s:    start of the character data for this chunk (not NUL-terminated use;
//       exactly nLen bytes are copied).
// nLen: number of bytes to copy; chunks with nLen <= 0 are ignored.
void FastSaxParserImpl::callbackCharacters( const xmlChar* s, int nLen )
{
    // SAX interface allows that the characters callback splits content of one XML node
    // (e.g. because there's an entity that needs decoding), however for consumers it's
    // simpler if FastSaxParser's character callback provides the whole string at once,
    // so merge data from possible multiple calls and send them at once (before the element
    // ends or another one starts).
    //
    // We use a std::vector<char> to avoid calling into the OUString constructor more than once when
    // we have multiple callbackCharacters() calls that we have to merge, which happens surprisingly
    // often in writer documents.

    // Nothing to append; also keeps memcpy away from a null source pointer
    // and from a negative length converted to a huge size.
    if (s == nullptr || nLen <= 0)
        return;

    // Keep the offset in the container's own size type instead of narrowing to int.
    const size_t nOriginalLen = pendingCharacters.size();
    const size_t nAppendLen = static_cast<size_t>(nLen);
    pendingCharacters.resize(nOriginalLen + nAppendLen);
    memcpy(pendingCharacters.data() + nOriginalLen, s, nAppendLen);
}
// This event is very rare, so no need to waste extra space for this // Using namespace and element strings to be target and data in that order.
rEvent.msNamespace = OUString( XML_CAST( target ), strlen( XML_CAST( target ) ), RTL_TEXTENCODING_UTF8 ); if ( data != nullptr )
rEvent.msElementName = OUString( XML_CAST( data ), strlen( XML_CAST( data ) ), RTL_TEXTENCODING_UTF8 ); else
rEvent.msElementName.clear();
if (rEntity.mbEnableThreads)
produce(); else
rEntity.processingInstruction( rEvent.msNamespace, rEvent.msElementName );
}
// ---------------------------------------------------------- // copy of the code in xmloff/source/core/namespace.cxx, which adds namespace aliases // for various dodgy namespace decls in the wild.
staticbool NormalizeW3URI( OUString& rName )
{ // check if URI matches: // http://www.w3.org/[0-9]*/[:letter:]* // (year)/(WG name) // For the following WG/standards names: // - xforms
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.