/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* * This file is part of the LibreOffice project. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * This file incorporates work covered by the following license notice: * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at http://www.apache.org/licenses/LICENSE-2.0 .
*/
// To improve performance, assume that if for any prefix URL of a given
// hierarchical URL either a UCB content cannot be created, or the UCB content
// does not support the getCasePreservingURL command, then this will hold for
// any other prefix URL of the given URL, too:
//
// Outcome of a prefix normalization attempt, as consumed by normalize():
//   Success         - the normalized URL is used as the result.
//   GeneralFailure  - normalize() gives up and returns its input unchanged.
//   SpecificFailure - normalize() retries with successively shorter prefixes.
enum Result { Success, GeneralFailure, SpecificFailure };
// Normalizes the longest possible prefix of a URI reference.
//
// First the whole reference (without its fragment) is handed to
// normalizePrefix.  If that reports SpecificFailure, the reference is parsed
// and ever shorter path prefixes are tried; the first prefix that does not
// yield SpecificFailure is combined with the remaining, untouched path
// segments, query and fragment.
//
// @param broker        passed through to normalizePrefix
// @param uriFactory    used to parse uriReference and the normalized prefix
// @param uriReference  the URI reference to normalize
// @return the (partially) normalized URI reference, or uriReference itself if
//         nothing could be normalized
OUString normalize(
    css::uno::Reference< css::ucb::XUniversalContentBroker > const & broker,
    css::uno::Reference< css::uri::XUriReferenceFactory > const & uriFactory,
    OUString const & uriReference)
{
    // normalizePrefix can potentially fail (a typical example being a file
    // URL that denotes a non-existing resource); in such a case, try to
    // normalize as long a prefix of the given URL as possible (i.e., normalize
    // all the existing directories within the path):
    OUString normalized;
    // Strip the fragment (if any) before the first attempt; it is re-appended
    // verbatim on success.
    sal_Int32 n = uriReference.indexOf('#');
    normalized = n == -1 ? uriReference : uriReference.copy(0, n);
    switch (normalizePrefix(broker, normalized, &normalized)) {
    case Success:
        return n == -1 ? normalized : normalized + uriReference.subView(n);
    case GeneralFailure:
        return uriReference;
    case SpecificFailure:
    default:
        break;
    }
    css::uno::Reference< css::uri::XUriReference > ref(
        uriFactory->parse(uriReference));
    if (!isAbsoluteHierarchicalUriReference(ref)) {
        return uriReference;
    }
    // With fewer than two path segments there is no proper prefix to try.
    sal_Int32 count = ref->getPathSegmentCount();
    if (count < 2) {
        return uriReference;
    }
    // "scheme:" plus optional "//authority", shared by all prefix candidates.
    OUStringBuffer head(ref->getScheme());
    head.append(':');
    if (ref->hasAuthority()) {
        head.append("//" + ref->getAuthority());
    }
    // Try prefixes from the longest (count - 1 segments) down to one segment.
    for (sal_Int32 i = count - 1; i > 0; --i) {
        OUStringBuffer buf(head);
        for (sal_Int32 j = 0; j < i; ++j) {
            buf.append('/');
            buf.append(ref->getPathSegment(j));
        }
        normalized = buf.makeStringAndClear();
        if (normalizePrefix(broker, normalized, &normalized) != SpecificFailure)
        {
            // buf is empty again here; rebuild the result starting with the
            // normalized prefix.
            buf.append(normalized);
            css::uno::Reference< css::uri::XUriReference > preRef(
                uriFactory->parse(normalized));
            if (!isAbsoluteHierarchicalUriReference(preRef)) {
                // This could only happen if something is inconsistent:
                break;
            }
            sal_Int32 preCount = preRef->getPathSegmentCount();
            // normalizePrefix may have added or removed a final slash:
            if (preCount != i) {
                if (preCount == i - 1) {
                    buf.append('/');
                } else if (preCount - 1 == i && !buf.isEmpty()
                           && buf[buf.getLength() - 1] == '/')
                {
                    buf.setLength(buf.getLength() - 1);
                } else {
                    // This could only happen if something is inconsistent:
                    break;
                }
            }
            // Re-attach the path segments that were not part of the prefix,
            // followed by the original query and fragment.
            for (sal_Int32 j = i; j < count; ++j) {
                buf.append('/');
                buf.append(ref->getPathSegment(j));
            }
            if (ref->hasQuery()) {
                buf.append('?');
                buf.append(ref->getQuery());
            }
            if (ref->hasFragment()) {
                buf.append('#');
                buf.append(ref->getFragment());
            }
            return buf.makeStringAndClear();
        }
    }
    return uriReference;
}
case 6: // closing bracket
++(*pPos); if(nullptr != pMatchingBracketDepth && *pMatchingBracketDepth > 0)
{
--(*pMatchingBracketDepth); // tdf#145381 When there was an opening bracket, detect this closing bracket // as part of the uri
*pEnd = *pPos;
} returntrue;
// Finds the first URL-like substring in rText[rBegin..rEnd[.
//
// @param rText       the text to search
// @param rBegin      in: start of the search range; out: start of the match,
//                    or rEnd if the (valid) range contains no match
// @param rEnd        in: end (exclusive) of the search range; out: end
//                    (exclusive) of the match; unchanged if there is no match
// @param rCharClass  used to classify letters/digits for the boundary and
//                    wchar checks
// @param eMechanism  encode mechanism handed to INetURLObject for the
//                    scheme-based and www./ftp. productions
// @param eCharset    character set handed to INetURLObject for the same
//                    productions
// @return the main URL (DecodeMechanism::ToIUri) of the first match, or an
//         empty string if nothing matches or the range is invalid (in the
//         latter case rBegin and rEnd are left untouched)
OUString URIHelper::FindFirstURLInText(OUString const & rText,
                                       sal_Int32 & rBegin,
                                       sal_Int32 & rEnd,
                                       CharClass const & rCharClass,
                                       INetURLObject::EncodeMechanism eMechanism,
                                       rtl_TextEncoding eCharset)
{
    // Reject inconsistent ranges up front; a negative rBegin would otherwise
    // be used to index rText below.
    if (rBegin < 0 || rBegin > rEnd || rEnd > rText.getLength())
        return OUString();

    // Search for the first substring of [rBegin..rEnd[ that matches any of the
    // following productions (for which the appropriate style bit is set in
    // eStyle, if applicable).
    // NOTE(review): this function has no eStyle parameter; the references to
    // eStyle in these comments look historical.

    // 1st Production (known scheme):
    //    \B1 <one of the known schemes, except file> ":" 1*wchar ["#" 1*wchar]
    //        \B1

    // (Productions 2 to 7 are marked at the corresponding branches in the
    // code below.)

    // 8th Production (Unix-like DOS file):
    //    \B1 ALPHA ":/" *(wchar / "\") \B1

    // The productions use the following auxiliary rules.

    //    local-part = atom *("." atom)
    //    atom = 1*(alphanum / "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+"
    //              / "-" / "/" / "=" / "?" / "^" / "_" / "`" / "{" / "|" / "}"
    //              / "~")
    //    domain = label *("." label)
    //    label = alphanum [*(alphanum / "-") alphanum]
    //    alphanum = ALPHA / DIGIT
    //    wchar = <any uric character (ignoring the escaped rule), or "%", or
    //             a letter or digit (according to rCharClass)>

    // "\B1" (boundary 1) stands for the beginning or end of the block of text,
    // or a character that is neither (a) a letter or digit (according to
    // rCharClass), nor (b) any of "$", "%", "&", "-", "/", "@", or "\".
    // (FIXME:  What was the rationale for this set of punctuation characters?)

    // "\B2" (boundary 2) stands for the beginning or end of the block of text,
    // or a character that is neither (a) a letter or digit (according to
    // rCharClass), nor (b) any of "!", "#", "$", "%", "&", "'", "*", "+", "-",
    // "/", "=", "?", "@", "^", "_", "`", "{", "|", "}", or "~" (i.e., an RFC
    // 822 <atom> character, or "@" from \B1's set above).

    // Productions 1--4, and 6--8 try to find a maximum-length match, but they
    // stop at the first <wchar> character that is a "\B1" character which is
    // only followed by "\B1" characters (taking "\" and "|" characters into
    // account appropriately).  Production 5 simply tries to find a maximum-
    // length match.

    // Productions 1--4 use the given eMechanism and eCharset.  Productions 5--9
    // use EncodeMechanism::All.

    // Productions 6--9 are only applicable if the FSysStyle::Dos bit is set in
    // eStyle.

    // tdf#145381: In addition to the productions I added a mechanism to detect
    // matching brackets. The task presents the case of an url that ends on a
    // closing bracket. This needs to be detected as part of the uri in the case
    // that a matching opening bracket exists.

    // Whether the character *before* nPos is a \B1 / \B2 boundary; the start
    // of the range always counts as a boundary.
    bool bBoundary1 = true;
    bool bBoundary2 = true;
    for (sal_Int32 nPos = rBegin; nPos != rEnd; nPos = nextChar(rText, nPos))
    {
        sal_Unicode c = rText[nPos];
        if (bBoundary1)
        {
            if (rtl::isAsciiAlpha(c))
            {
                sal_Int32 i = nPos;
                INetProtocol eScheme = INetURLObject::CompareProtocolScheme(rText.subView(i, rEnd - i));
                if (eScheme == INetProtocol::File) // 2nd
                {
                    // Skip past the scheme's ':' (presumably guaranteed to be
                    // present once CompareProtocolScheme recognized a scheme).
                    while (rText[i++] != ':') ;
                    sal_Int32 nPrefixEnd = i;
                    sal_Int32 nUriEnd = i;
                    // NOTE(review): the two trailing 'true' flags presumably
                    // make checkWChar also accept "\" and "|" -- confirm
                    // against checkWChar's declaration.
                    while (i != rEnd
                           && checkWChar(rCharClass, rText, &i, &nUriEnd, nullptr, true, true)) ;
                    if (i != nPrefixEnd && i != rEnd && rText[i] == '#')
                    {
                        ++i;
                        while (i != rEnd
                               && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
                    }
                    if (nUriEnd != nPrefixEnd
                        && isBoundary1(rCharClass, rText, nUriEnd, rEnd))
                    {
                        INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
                                           INetProtocol::File, eMechanism, eCharset,
                                           FSysStyle::Detect);
                        if (!aUri.HasError())
                        {
                            rBegin = nPos;
                            rEnd = nUriEnd;
                            return
                                aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
                        }
                    }
                }
                else if (eScheme != INetProtocol::NotValid) // 1st
                {
                    while (rText[i++] != ':') ;
                    sal_Int32 nPrefixEnd = i;
                    sal_Int32 nUriEnd = i;
                    // Only this production tracks bracket nesting (tdf#145381).
                    sal_Int32 nMatchingBracketDepth = 0;
                    while (i != rEnd
                           && checkWChar(rCharClass, rText, &i, &nUriEnd,
                                         &nMatchingBracketDepth)) ;
                    if (i != nPrefixEnd && i != rEnd && rText[i] == '#')
                    {
                        ++i;
                        while (i != rEnd
                               && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
                    }
                    if (nUriEnd != nPrefixEnd
                        && (isBoundary1(rCharClass, rText, nUriEnd, rEnd)
                            || rText[nUriEnd] == '\\'))
                    {
                        INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
                                           INetProtocol::Http, eMechanism,
                                           eCharset);
                        if (!aUri.HasError())
                        {
                            rBegin = nPos;
                            rEnd = nUriEnd;
                            return
                                aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
                        }
                    }
                }

                // 3rd, 4th:
                // A domain of at least three labels whose first label is
                // "www" or "ftp" (case-insensitive), optionally followed by
                // "/" path and "#" fragment.
                i = nPos;
                sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
                if (nLabels >= 3
                    && rText[nPos + 3] == '.'
                    && (((rText[nPos] == 'w'
                          || rText[nPos] == 'W')
                         && (rText[nPos + 1] == 'w'
                             || rText[nPos + 1] == 'W')
                         && (rText[nPos + 2] == 'w'
                             || rText[nPos + 2] == 'W'))
                        || ((rText[nPos] == 'f'
                             || rText[nPos] == 'F')
                            && (rText[nPos + 1] == 't'
                                || rText[nPos + 1] == 'T')
                            && (rText[nPos + 2] == 'p'
                                || rText[nPos + 2] == 'P'))))
                    // (note that rText.GetChar(nPos + 3) is guaranteed to be
                    // valid)
                {
                    sal_Int32 nUriEnd = i;
                    if (i != rEnd && rText[i] == '/')
                    {
                        nUriEnd = ++i;
                        while (i != rEnd
                               && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
                    }
                    if (i != rEnd && rText[i] == '#')
                    {
                        ++i;
                        while (i != rEnd
                               && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
                    }
                    if (isBoundary1(rCharClass, rText, nUriEnd, rEnd)
                        || rText[nUriEnd] == '\\')
                    {
                        INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
                                           INetProtocol::Http, eMechanism,
                                           eCharset);
                        if (!aUri.HasError())
                        {
                            rBegin = nPos;
                            rEnd = nUriEnd;
                            return
                                aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
                        }
                    }
                }

                if (rEnd - nPos >= 3
                    && rText[nPos + 1] == ':'
                    && (rText[nPos + 2] == '/'
                        || rText[nPos + 2] == '\\')) // 7th, 8th
                {
                    // ALPHA ":" followed by "/" or "\": a DOS-style path.
                    i = nPos + 3;
                    sal_Int32 nUriEnd = i;
                    while (i != rEnd
                           && checkWChar(rCharClass, rText, &i, &nUriEnd)) ;
                    if (isBoundary1(rCharClass, rText, nUriEnd, rEnd))
                    {
                        INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
                                           INetProtocol::File,
                                           INetURLObject::EncodeMechanism::All,
                                           RTL_TEXTENCODING_UTF8,
                                           FSysStyle::Dos);
                        if (!aUri.HasError())
                        {
                            rBegin = nPos;
                            rEnd = nUriEnd;
                            return
                                aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
                        }
                    }
                }
            }
            else if (rEnd - nPos >= 2
                     && rText[nPos] == '\\'
                     && rText[nPos + 1] == '\\') // 6th
            {
                // "\\" domain "\" ...: a UNC-style path.
                sal_Int32 i = nPos + 2;
                sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
                if (nLabels >= 1 && i != rEnd && rText[i] == '\\')
                {
                    sal_Int32 nUriEnd = ++i;
                    while (i != rEnd
                           && checkWChar(rCharClass, rText, &i, &nUriEnd,
                                         nullptr, true)) ;
                    if (isBoundary1(rCharClass, rText, nUriEnd, rEnd))
                    {
                        INetURLObject aUri(rText.subView(nPos, nUriEnd - nPos),
                                           INetProtocol::File,
                                           INetURLObject::EncodeMechanism::All,
                                           RTL_TEXTENCODING_UTF8,
                                           FSysStyle::Dos);
                        if (!aUri.HasError())
                        {
                            rBegin = nPos;
                            rEnd = nUriEnd;
                            return
                                aUri.GetMainURL(INetURLObject::DecodeMechanism::ToIUri);
                        }
                    }
                }
            }
        }
        if (bBoundary2 && INetMIME::isAtomChar(c)) // 5th
        {
            // local-part "@" domain: scan atoms separated by single dots
            // until the '@' is found; anything else ends the attempt.
            bool bDot = false;
            for (sal_Int32 i = nPos + 1; i != rEnd; ++i)
            {
                sal_Unicode c2 = rText[i];
                if (INetMIME::isAtomChar(c2))
                    bDot = false;
                else if (bDot)
                    break;
                else if (c2 == '.')
                    bDot = true;
                else
                {
                    if (c2 == '@')
                    {
                        ++i;
                        sal_uInt32 nLabels = scanDomain(rText, &i, rEnd);
                        if (nLabels >= 1
                            && isBoundary1(rCharClass, rText, i, rEnd))
                        {
                            INetURLObject aUri(rText.subView(nPos, i - nPos),
                                               INetProtocol::Mailto,
                                               INetURLObject::EncodeMechanism::All);
                            if (!aUri.HasError())
                            {
                                rBegin = nPos;
                                rEnd = i;
                                return aUri.GetMainURL(
                                    INetURLObject::DecodeMechanism::ToIUri);
                            }
                        }
                    }
                    break;
                }
            }
        }
        // Remember whether the current character is a boundary for the
        // candidate starting at the next position.
        bBoundary1 = isBoundary1(rCharClass, rText, nPos, rEnd);
        bBoundary2 = isBoundary2(rCharClass, rText, nPos, rEnd);
    }
    // No match: signal an empty result range.
    rBegin = rEnd;
    return OUString();
}
// Checks whether rText[rBegin..rEnd[ is a DOI of the form "doi:10.NNNN/suffix"
// and, if so, converts it to a https://doi.org/ URL.
//
// @param rText       the text containing the candidate
// @param rBegin      in: start of the candidate; out: set to rEnd if the
//                    (valid) range is not a DOI, otherwise unchanged
// @param rEnd        end (exclusive) of the candidate
// @param rCharClass  used to classify digits and alphanumeric characters
// @return "https://doi.org/" followed by the candidate without its "doi:"
//         prefix, or an empty string if the candidate is not a DOI or the
//         range is invalid (in the latter case rBegin is left untouched)
OUString URIHelper::FindFirstDOIInText(std::u16string_view rText,
                                       sal_Int32 & rBegin,
                                       const sal_Int32 & rEnd,
                                       CharClass const & rCharClass)
{
    // Reject inconsistent ranges up front; a negative rBegin would otherwise
    // be converted to a huge unsigned offset for substr below.
    if (rBegin < 0 || rBegin > rEnd || rEnd > static_cast<sal_Int32>(rText.size()))
        return OUString();

    // Index of the first character after the "doi:10." prefix.
    const sal_Int32 start = 7;
    sal_Int32 count = rEnd-rBegin;
    OUString candidate(rText.substr(rBegin, count));
    // Match with regex "doi:10\.\d{4,9}\/[-._;()\/:a-zA-Z0-9]+"
    // NOTE(review): besides the characters named in the regex, the suffix
    // check below also accepts a backslash, and it rejects a '/' in the very
    // last position.
    if (candidate.startsWithIgnoreAsciiCase("doi:10."))
    {
        bool flag = true;
        // Number of registrant digits seen so far; -1 once the separating
        // slash has been consumed and the suffix is being scanned.
        sal_Int32 digit = 0;
        for (sal_Int32 i=start; i<count; i++)
        {
            sal_Unicode c = candidate[i];
            // Match 4 to 9 digits before slash
            if (digit >= 0)
            {
                if (digit>9)
                {
                    flag = false;
                    break;
                }

                if ( rCharClass.isDigit(candidate,i) )
                {
                    digit++;
                }
                else if (c=='/' && digit>=4 && i<count-1)
                {
                    // The slash must be followed by at least one character.
                    digit=-1;
                }
                else
                {
                    flag = false;
                    break;
                }
            }
            // Match [-._;()\/:a-zA-Z0-9] after slash
            else if (!( rCharClass.isAlphaNumeric(candidate, i) || c == '.' || c == '-' || c=='_' ||
                      c==';' || c=='(' || c==')' || c=='\\' || (c=='/' && i<count-1) || c==':'))
            {
                flag = false;
                break;
            }
        }
        // Only a candidate that reached the suffix part is a DOI.
        if (flag && digit==-1)
        {
            return OUString::Concat("https://doi.org/")+candidate.subView(4);
        }
    }
    rBegin = rEnd;
    return OUString();
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.