/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
// Opens the file named by fileUrl for reading, maps its whole content into
// memory and initializes the scanner to start at the first mapped byte.
//
// Throws css::container::NoSuchElementException when osl_openFile reports
// osl_File_E_NOENT, and css::uno::RuntimeException for any other failure to
// open, size or map the file.  If sizing or mapping fails, the already opened
// handle is closed again before the exception is thrown.
XmlReader::XmlReader(OUString fileUrl)
    : fileUrl_(std::move(fileUrl))
    , fileHandle_(nullptr)
{
    oslFileError status = osl_openFile(
        fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
    if (status == osl_File_E_NOENT)
    {
        throw css::container::NoSuchElementException( fileUrl_ );
    }
    if (status != osl_File_E_None)
    {
        throw css::uno::RuntimeException(
            "cannot open " + fileUrl_ + ": " + OUString::number(status));
    }

    // The mapping is only attempted once the file size is known.
    status = osl_getFileSize(fileHandle_, &fileSize_);
    if (status == osl_File_E_None)
    {
        status = osl_mapFile(
            fileHandle_, &fileAddress_, fileSize_, 0,
            osl_File_MapFlag_WillNeed);
    }
    if (status != osl_File_E_None)
    {
        // Best-effort cleanup; a failing close is only logged so that the
        // original error is the one reported to the caller.
        oslFileError closeStatus = osl_closeFile(fileHandle_);
        if (closeStatus != osl_File_E_None)
        {
            SAL_WARN(
                "xmlreader",
                "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +closeStatus);
        }
        throw css::uno::RuntimeException(
            "cannot mmap " + fileUrl_ + " (" + OUString::number(status) + ")" );
    }

    // The "xml" prefix is pre-registered with its fixed namespace IRI.
    namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
    namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);

    pos_ = static_cast< char * >(fileAddress_);
    end_ = pos_ + fileSize_;
    state_ = State::Content;
    firstAttribute_ = true;
}
// Unmaps the file content and closes the file handle, if there is a handle.
// Failures of either step are logged via SAL_WARN and otherwise ignored.
XmlReader::~XmlReader()
{
    if (fileHandle_)
    {
        oslFileError const unmapStatus
            = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
        if (unmapStatus != osl_File_E_None)
        {
            SAL_WARN(
                "xmlreader",
                "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +unmapStatus);
        }

        oslFileError const closeStatus = osl_closeFile(fileHandle_);
        if (closeStatus != osl_File_E_None)
        {
            SAL_WARN(
                "xmlreader",
                "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +closeStatus);
        }
    }
}
// Appends iri to the list of known namespace IRIs and returns the namespace
// id derived (via toNamespaceId) from the position it was stored at.
int XmlReader::registerNamespaceIri(Span const & iri)
{
    int const assignedId = toNamespaceId(namespaceIris_.size());
    namespaceIris_.push_back(iri);

    if (iri == "http://www.w3.org/2001/XMLSchema-instance")
    {
        // Old user layer .xcu files used the xsi namespace prefix without
        // declaring a corresponding namespace binding, see issue 77174;
        // reading those files during migration would fail without this hack
        // that can be removed once migration is no longer relevant (see
        // configmgr::Components::parseModificationLayer):
        namespaces_.emplace_back(Span("xsi"), assignedId);
    }

    return assignedId;
}
// Advances the reader by one item, dispatching on the current scanner state.
//
// In State::Content the reportText argument selects which text handler runs
// (skipped, raw or normalized); data and nsId are passed through to the
// handlers, which fill them in as appropriate for the item found.
// In State::EmptyElementTag the pending element is closed and Result::End is
// returned.  In the remaining state (State::Done) Result::Done is returned.
XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
{
    switch (state_) {
    case State::Content:
        switch (reportText) {
        case Text::NONE:
            return handleSkippedText(data, nsId);
        case Text::Raw:
            return handleRawText(data);
        default: // Text::Normalized
            return handleNormalizedText(data);
        }
    case State::StartTag:
        return handleStartTag(nsId, data);
    case State::EndTag:
        return handleEndTag();
    case State::EmptyElementTag:
        handleElementEnd();
        return Result::End;
    default: // State::Done
        return Result::Done;
    }
}
// Looks up the namespace id bound to prefix.  The bindings are searched from
// the most recently added one backwards, so a later binding of the same
// prefix takes precedence over an earlier one.  Returns NAMESPACE_UNKNOWN if
// no binding matches.
int XmlReader::getNamespaceId(Span const & prefix) const
{
    for (auto binding = namespaces_.crbegin(); binding != namespaces_.crend(); ++binding)
    {
        if (prefix == binding->prefix)
        {
            return binding->nsId;
        }
    }
    return NAMESPACE_UNKNOWN;
}
// Appends text to pad_ with line ends normalized: every CR LF pair and every
// lone CR is replaced by a single LF.  pad_ is not cleared here; the output
// is added after whatever it already holds.
void XmlReader::normalizeLineEnds(Span const & text)
{
    char const * p = text.begin;
    sal_Int32 n = text.length;
    for (;;) {
        sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
        if (i < 0) {
            break;
        }
        // Copy everything before the CR, then step over the CR itself.
        pad_.add(p, i);
        p += i + 1;
        n -= i + 1;
        // A CR directly followed by LF contributes nothing; the LF is copied
        // with the next chunk.  A lone CR (or one at the very end) becomes LF.
        if (n == 0 || *p != '\x0A') {
            pad_.add("\x0A");
        }
    }
    pad_.add(p, n);
}
// Advances pos_ for as long as the character reported by peek() satisfies
// isSpace().
void XmlReader::skipSpace()
{
    for (; isSpace(peek()); ++pos_)
    {
    }
}
// Skips a comment whose leading "--" is expected at the current position
// (presumably the caller has already consumed "<!" -- confirm at call sites).
//
// Returns false, without consuming anything, if the input at pos_ does not
// start with "--".  Otherwise consumes everything up to and including the
// closing "-->" and returns true.
//
// Throws css::uno::RuntimeException if no further "--" is found before the end
// of the input, or if the next "--" is not directly followed by '>'.
bool XmlReader::skipComment()
{
    if (rtl_str_shortenedCompare_WithLength(
            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
            RTL_CONSTASCII_LENGTH("--")) !=
        0)
    {
        return false;
    }
    pos_ += RTL_CONSTASCII_LENGTH("--");
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
    if (i < 0) {
        throw css::uno::RuntimeException(
            "premature end (within comment) of " + fileUrl_ );
    }
    pos_ += i + RTL_CONSTASCII_LENGTH("--");
    if (read() != '>') {
        throw css::uno::RuntimeException(
            "illegal \"--\" within comment in " + fileUrl_ );
    }
    return true;
}
// Skips the remainder of a processing instruction: advances pos_ to just
// behind the next "?>" found at or after the current position.
//
// Throws css::uno::RuntimeException if no "?>" is found before the end of the
// input.
void XmlReader::skipProcessingInstruction()
{
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
    if (i < 0) {
        // The message names the construct that was opened but never closed.
        throw css::uno::RuntimeException(
            "bad '<?' in " + fileUrl_ );
    }
    pos_ += i + RTL_CONSTASCII_LENGTH("?>");
}
void XmlReader::skipDocumentTypeDeclaration() { // Neither is it checked that the doctypedecl is at the correct position in // the document, nor that it is well-formed: for (;;) { char c = read(); switch (c) { case'\0': // i.e., EOF throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); case'"': case'\'':
{
sal_Int32 i = rtl_str_indexOfChar_WithLength(
pos_, end_ - pos_, c); if (i < 0) { throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ );
}
pos_ += i + 1;
} break; case'>': return; case'[': for (;;) {
c = read(); switch (c) { case'\0': // i.e., EOF throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); case'"': case'\'':
{
sal_Int32 i = rtl_str_indexOfChar_WithLength(
pos_, end_ - pos_, c); if (i < 0) { throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ );
}
pos_ += i + 1;
} break; case'<': switch (read()) { case'\0': // i.e., EOF throw css::uno::RuntimeException( "premature end (within DTD) of " + fileUrl_ ); case'!':
skipComment(); break; case'?':
skipProcessingInstruction(); break; default: break;
} break; case']':
skipSpace(); if (read() != '>') { throw css::uno::RuntimeException( "missing \">\" of DTD in " + fileUrl_ );
} return; default: break;
}
} default: break;
}
}
}
// Scans a CDATA section whose "[CDATA[" is expected at the current position
// (presumably the caller has already consumed "<!" -- confirm at call sites).
//
// Returns a default-constructed Span, without consuming anything, if the
// input at pos_ does not start with "[CDATA[".  Otherwise returns a Span
// covering the section's content (between "[CDATA[" and "]]>") and leaves
// pos_ just behind the closing "]]>".
//
// Throws css::uno::RuntimeException if no "]]>" is found before the end of
// the input.
Span XmlReader::scanCdataSection()
{
    if (rtl_str_shortenedCompare_WithLength(
            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
            RTL_CONSTASCII_LENGTH("[CDATA[")) !=
        0)
    {
        return Span();
    }
    pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
    char const * begin = pos_;
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
    if (i < 0) {
        throw css::uno::RuntimeException(
            "premature end (within CDATA section) of " + fileUrl_ );
    }
    pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
    return Span(begin, i);
}
int XmlReader::scanNamespaceIri(charconst * begin, charconst * end) {
assert(begin != nullptr && begin <= end);
Span iri(handleAttributeValue(begin, end, false)); for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { if (namespaceIris_[i] == iri) { return toNamespaceId(i);
}
} return XmlReader::NAMESPACE_UNKNOWN;
}
charconst * XmlReader::handleReference(charconst * position, charconst * end)
{
assert(position != nullptr && *position == '&' && position < end);
++position; if (*position == '#') {
++position;
sal_uInt32 val = 0; charconst * p; if (*position == 'x') {
++position;
p = position; for (;; ++position)
{
val = o3tl::convertToHex<sal_uInt32>(*position); if (val >= 16) break;
if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow throw css::uno::RuntimeException( "'...' too large in " + fileUrl_ );
}
}
} else {
p = position; for (;; ++position) { char c = *position; if (c >= '0' && c <= '9') {
val = 10 * val + (c - '0');
} else { break;
} if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow throw css::uno::RuntimeException( "'...' too large in " + fileUrl_ );
}
}
} if (position == p || *position++ != ';') { throw css::uno::RuntimeException( "'...' missing ';' in " + fileUrl_ );
}
assert(rtl::isUnicodeCodePoint(val)); if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
(val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
{ throw css::uno::RuntimeException( "character reference denoting invalid character in " + fileUrl_ );
} char buf[4];
sal_Int32 len; if (val < 0x80) {
buf[0] = static_cast< char >(val);
len = 1;
} elseif (val < 0x800) {
buf[0] = static_cast< char >((val >> 6) | 0xC0);
buf[1] = static_cast< char >((val & 0x3F) | 0x80);
len = 2;
} elseif (val < 0x10000) {
buf[0] = static_cast< char >((val >> 12) | 0xE0);
buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
buf[2] = static_cast< char >((val & 0x3F) | 0x80);
len = 3;
} else {
buf[0] = static_cast< char >((val >> 18) | 0xF0);
buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
buf[3] = static_cast< char >((val & 0x3F) | 0x80);
len = 4;
}
pad_.addEphemeral(buf, len); return position;
} else { struct EntityRef { charconst * inBegin;
sal_Int32 const inLength; charconst * outBegin;
sal_Int32 const outLength;
}; static EntityRef const refs[] = {
{ RTL_CONSTASCII_STRINGPARAM("amp;"),
RTL_CONSTASCII_STRINGPARAM("&") },
{ RTL_CONSTASCII_STRINGPARAM("lt;"),
RTL_CONSTASCII_STRINGPARAM("<") },
{ RTL_CONSTASCII_STRINGPARAM("gt;"),
RTL_CONSTASCII_STRINGPARAM(">") },
{ RTL_CONSTASCII_STRINGPARAM("apos;"),
RTL_CONSTASCII_STRINGPARAM("'") },
{ RTL_CONSTASCII_STRINGPARAM("quot;"),
RTL_CONSTASCII_STRINGPARAM("\"") } }; for (constauto & ref : refs) { if (rtl_str_shortenedCompare_WithLength(
position, end - position, ref.inBegin, ref.inLength,
ref.inLength) ==
0)
{
position += ref.inLength;
pad_.add(ref.outBegin, ref.outLength); return position;
}
} throw css::uno::RuntimeException( "unknown entity reference in " + fileUrl_ );
}
}
// Builds the normalized form of the attribute value in [begin, end) in pad_
// (which is cleared first) and returns pad_.get().
//
// References ("&...;") are expanded via handleReference in both modes.
//
// With fullyNormalize set, leading and trailing white space (as per isSpace)
// is dropped and every inner run of white space is collapsed to one space.
// Otherwise each TAB, LF, lone CR and CR LF pair is replaced by one space and
// everything else is copied unchanged.
Span XmlReader::handleAttributeValue(
    char const * begin, char const * end, bool fullyNormalize)
{
    pad_.clear();
    if (fullyNormalize) {
        while (begin != end && isSpace(*begin)) {
            ++begin;
        }
        while (end != begin && isSpace(end[-1])) {
            --end;
        }
        char const * p = begin;
        enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
            // a single true space character can go into the current span,
            // everything else breaks the span
        Space space = SPACE_NONE;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
            case '\x0D':
                switch (space) {
                case SPACE_NONE:
                    pad_.add(begin, p - begin);
                    pad_.add(" ");
                    space = SPACE_BREAK;
                    break;
                case SPACE_SPAN:
                    // The pending span already ends in the one space to keep.
                    pad_.add(begin, p - begin);
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    break;
                }
                begin = ++p;
                break;
            case ' ':
                switch (space) {
                case SPACE_NONE:
                    ++p;
                    space = SPACE_SPAN;
                    break;
                case SPACE_SPAN:
                    pad_.add(begin, p - begin);
                    begin = ++p;
                    space = SPACE_BREAK;
                    break;
                case SPACE_BREAK:
                    begin = ++p;
                    break;
                }
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                space = SPACE_NONE;
                break;
            default:
                ++p;
                space = SPACE_NONE;
                break;
            }
        }
        pad_.add(begin, p - begin);
    } else {
        char const * p = begin;
        while (p != end) {
            switch (*p) {
            case '\x09':
            case '\x0A':
                pad_.add(begin, p - begin);
                begin = ++p;
                pad_.add(" ");
                break;
            case '\x0D':
                pad_.add(begin, p - begin);
                ++p;
                // Treat CR LF as one line end.  The character to inspect is
                // the next one of the value being scanned (*p); peek() takes
                // no position argument and so would consult the reader's own
                // position instead of the CR's successor.
                if (p != end && *p == '\x0A') {
                    ++p;
                }
                begin = p;
                pad_.add(" ");
                break;
            case '&':
                pad_.add(begin, p - begin);
                p = handleReference(p, end);
                begin = p;
                break;
            default:
                ++p;
                break;
            }
        }
        pad_.add(begin, p - begin);
    }
    return pad_.get();
}
/*
 * Disclaimer carried over from the web page this listing was taken from (not
 * part of the program):
 *
 * The information on this web page has been carefully compiled to the best of
 * our knowledge. However, neither completeness, nor correctness, nor quality
 * of the information provided is guaranteed.
 *
 * Note: The colored syntax highlighting is still experimental.
 */