Quelle Utf8.h Sprache: C

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
* UTF-8-related functionality, including a type-safe structure representing a
* UTF-8 code unit.
*/

#ifndef mozilla_Utf8_h
#define mozilla_Utf8_h

#include "mozilla/Casting.h"    // for mozilla::AssertedCast
#include "mozilla/Likely.h"     // for MOZ_UNLIKELY
#include "mozilla/Maybe.h"      // for mozilla::Maybe
#include "mozilla/Span.h"       // for mozilla::Span
#include "mozilla/TextUtils.h"  // for mozilla::IsAscii and via Latin1.h for
                                // encoding_rs_mem.h and MOZ_HAS_JSRUST.
#include "mozilla/Types.h"      // for MFBT_API

#include <limits>    // for std::numeric_limits
#include <limits.h>  // for CHAR_BIT
#include <stddef.h>  // for size_t
#include <stdint.h>  // for uint8_t

#if MOZ_HAS_JSRUST()
// Can't include mozilla/Encoding.h here.
extern "C" {
// Declared as uint8_t instead of char to match declaration in another header.
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
}
#else
namespace mozilla {
namespace detail {
extern MFBT_API bool IsValidUtf8(const void* aCodeUnits, size_t aCount);
};  // namespace detail
};  // namespace mozilla
#endif  // MOZ_HAS_JSRUST

namespace mozilla {

union Utf8Unit;

static_assert(CHAR_BIT == 8,
              "Utf8Unit won't work so well with non-octet chars");

/**
* A code unit within a UTF-8 encoded string.  (A code unit is the smallest
* unit within the Unicode encoding of a string.  For UTF-8 this is an 8-bit
* number; for UTF-16 it would be a 16-bit number.)
*
* This is *not* the same as a single code point: in UTF-8, non-ASCII code
* points are constituted by multiple code units.
*/
union Utf8Unit {
private:
  // Utf8Unit is a union wrapping a raw |char|.  The C++ object model and C++
  // requirements as to how objects may be accessed with respect to their actual
  // types (almost?) uniquely compel this choice.
  //
  // Our requirements for a UTF-8 code unit representation are:
  //
  //   1. It must be "compatible" with C++ character/string literals that use
  //      the UTF-8 encoding.  Given a properly encoded C++ literal, you should
  //      be able to use |Utf8Unit| and friends to access it; given |Utf8Unit|
  //      and friends (particularly UnicodeData), you should be able to access
  //      C++ character types for their contents.
  //   2. |Utf8Unit| and friends must convert to/from |char| and |char*| only by
  //      explicit operation.
  //   3. |Utf8Unit| must participate in overload resolution and template type
  //      equivalence (that is, given |template<class> class X|, when |X<T>| and
  //      |X<U>| are the same type) distinctly from the C++ character types.
  //
  // And a few nice-to-haves (at least for the moment):
  //
  //   4. The representation should use unsigned numbers, to avoid undefined
  //      behavior that can arise with signed types, and because Unicode code
  //      points and code units are unsigned.
  //   5. |Utf8Unit| and friends should be convertible to/from |unsigned char|
  //      and |unsigned char*|, for APIs that (because of #4 above) use those
  //      types as the "natural" choice for UTF-8 data.
  //
  // #1 requires that |Utf8Unit| "incorporate" a C++ character type: one of
  // |{,{un,}signed} char|.[0]  |uint8_t| won't work because it might not be a
  // C++ character type.
  //
  // #2 and #3 mean that |Utf8Unit| can't *be* such a type (or a typedef to one:
  // typedefs don't generate *new* types, just type aliases).  This requires a
  // compound type.
  //
  // The ultimate representation (and character type in it) is constrained by
  // C++14 [basic.lval]p10 that defines how objects may be accessed, with
  // respect to the dynamic type in memory and the actual type used to access
  // them.  It reads:
  //
  //     If a program attempts to access the stored value of an object
  //     through a glvalue of other than one of the following types the
  //     behavior is undefined:
  //
  //       1. the dynamic type of the object,
  //       2. a cv-qualified version of the dynamic type of the object,
  //       ...other types irrelevant here...
  //       3. an aggregate or union type that includes one of the
  //          aforementioned types among its elements or non-static data
  //          members (including, recursively, an element or non-static
  //          data member of a subaggregate or contained union),
  //       ...more irrelevant types...
  //       4. a char or unsigned char type.
  //
  // Accessing (wrapped) UTF-8 data as |char|/|unsigned char| is allowed no
  // matter the representation by #4.  (Briefly set aside what values are seen.)
  // (And #2 allows |const| on either the dynamic type or the accessing type.)
  // (|signed char| is really only useful for small signed numbers, not
  // characters, so we ignore it.)
  //
  // If we interpret contents as |char|/|unsigned char| contrary to the actual
  // type stored there, what happens?  C++14 [basic.fundamental]p1 requires
  // character types be identically aligned/sized; C++14 [basic.fundamental]p3
  // requires |signed char| and |unsigned char| have the same value
  // representation.  C++ doesn't require identical bitwise representation, tho.
  // Practically we could assume it, but this verges on C++ spec bits best not
  // *relied* on for correctness, if possible.
  //
  // So we don't expose |Utf8Unit|'s contents as |unsigned char*|: only |char|
  // and |char*|.  Instead we safely expose |unsigned char| by fully-defined
  // *integral conversion* (C++14 [conv.integral]p2).  Integral conversion from
  // |unsigned char| → |char| has only implementation-defined behavior.  It'd be
  // better not to depend on that, but given twos-complement won, it should be
  // okay.  (Also |unsigned char*| is awkward enough to work with for strings
  // that it probably doesn't appear in string manipulation much anyway, only in
  // places that should really use |Utf8Unit| directly.)
  //
  // The opposite direction -- interpreting |char| or |char*| data through
  // |Utf8Unit| -- isn't tricky as long as |Utf8Unit| contains a |char| as
  // decided above, using #3.  An "aggregate or union" will work that contains a
  // |char|.  Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says
  // aggregates must have "no private or protected non-static data members", and
  // we want to keep the inner |char| hidden.  So a |struct| is out, and only
  // |union| remains.
  //
  // (Enums are not "an aggregate or union type", so [maybe surprisingly] we
  // can't make |Utf8Unit| an enum class with |char| underlying type, because we
  // are given no license to treat |char| memory as such an |enum|'s memory.)
  //
  // Therefore |Utf8Unit| is a union type with a |char| non-static data member.
  // This satisfies all our requirements.  It also supports the nice-to-haves of
  // creating a |Utf8Unit| from an |unsigned char|, and being convertible to
  // |unsigned char|.  It doesn't satisfy the nice-to-haves of using an
  // |unsigned char| internally, nor of letting us wrap an existing
  // |unsigned char| or pointer to one.  We probably *could* do these, if we
  // were willing to rely harder on implementation-defined behaviors, but for
  // now we privilege C++'s main character type over some conceptual purity.
  //
  // 0. There's a proposal for a UTF-8 character type distinct from the existing
  //    C++ narrow character types:
  //
  //      http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html
  //
  //    but it hasn't been standardized (and might never be), and none of the
  //    compilers we really care about have implemented it.  Maybe someday we
  //    can change our implementation to it without too much trouble, if we're
  //    lucky...
  char mValue = '\0';

public:
  Utf8Unit() = default;

  explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {}

  explicit constexpr Utf8Unit(unsigned char aUnit)
      : mValue(static_cast<char>(aUnit)) {
    // Per the above comment, the prior cast is integral conversion with
    // implementation-defined semantics, and we regretfully but unavoidably
    // assume the conversion does what we want it to.
  }

#ifdef __cpp_char8_t
  explicit constexpr Utf8Unit(char8_t aUnit)
      : mValue(static_cast<char>(aUnit)) {}
#endif

  constexpr bool operator==(const Utf8Unit& aOther) const {
    return mValue == aOther.mValue;
  }

  constexpr bool operator!=(const Utf8Unit& aOther) const {
    return !(*this == aOther);
  }

  /** Convert a UTF-8 code unit to a raw char. */
  constexpr char toChar() const {
    // Only a |char| is ever permitted to be written into this location, so this
    // is both permissible and returns the desired value.
    return mValue;
  }

  /** Convert a UTF-8 code unit to a raw unsigned char. */
  constexpr unsigned char toUnsignedChar() const {
    // Per the above comment, this is well-defined integral conversion.
    return static_cast<unsigned char>(mValue);
  }

  /** Convert a UTF-8 code unit to a uint8_t. */
  constexpr uint8_t toUint8() const {
    // Per the above comment, this is well-defined integral conversion.
    return static_cast<uint8_t>(mValue);
  }

  // We currently don't expose |&mValue|.  |UnicodeData| sort of does, but
  // that's a somewhat separate concern, justified in different comments in
  // that other code.
};

/**
* Reinterpret the address of a UTF-8 code unit as |const unsigned char*|.
*
* Assuming proper backing has been set up, the resulting |const unsigned char*|
* may validly be dereferenced.
*
* No access is provided to mutate this underlying memory as |unsigned char|.
* Presently memory inside |Utf8Unit| is *only* stored as |char|, and we are
* loath to offer a way to write non-|char| data until absolutely necessary.
*/
inline const unsigned char* Utf8AsUnsignedChars(const Utf8Unit* aUnits) {
  static_assert(sizeof(Utf8Unit) == sizeof(unsigned char),
                "sizes must match to permissibly reinterpret_cast<>");
  static_assert(alignof(Utf8Unit) == alignof(unsigned char),
                "alignment must match to permissibly reinterpret_cast<>");

  // The static_asserts above only enable the reinterpret_cast<> to occur.
  //
  // Dereferencing the resulting pointer is a separate question.  Any object's
  // memory may be interpreted as |unsigned char| per C++11 [basic.lval]p10, but
  // this doesn't guarantee what values will be observed.  If |char| is
  // implemented to act like |unsigned char|, we're good to go: memory for the
  // |char| in |Utf8Unit| acts as we need.  But if |char| is implemented to act
  // like |signed char|, dereferencing produces the right value only if the
  // |char| types all use two's-complement representation.  Every modern
  // compiler does this, and there's a C++ proposal to standardize it.
  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0907r0.html   So
  // *technically* this is implementation-defined -- but everyone does it and
  // this behavior is being standardized.
  return reinterpret_cast<const unsigned char*>(aUnits);
}

/** Returns true iff |aUnit| is an ASCII value. */
constexpr bool IsAscii(Utf8Unit aUnit) {
  return IsAscii(aUnit.toUnsignedChar());
}

/**
* Return true if the given span of memory consists of a valid UTF-8
* string and false otherwise.
*
* The string *may* contain U+0000 NULL code points.
*/
inline bool IsUtf8(mozilla::Span<const char> aString) {
#if MOZ_HAS_JSRUST()
  size_t length = aString.Length();
  const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.Elements());
  // For short strings, the function call is a pessimization, and the SIMD
  // code won't have a chance to kick in anyway.
  if (length < 16) {
    for (size_t i = 0; i < length; i++) {
      if (ptr[i] >= 0x80U) {
        ptr += i;
        length -= i;
        goto end;
      }
    }
    return true;
  }
end:
  return length == encoding_utf8_valid_up_to(ptr, length);
#else
  return detail::IsValidUtf8(aString.Elements(), aString.Length());
#endif
}

#if MOZ_HAS_JSRUST()

// See Latin1.h for conversions between Latin1 and UTF-8.

/**
* Returns the index of the start of the first malformed byte
* sequence or the length of the string if there are none.
*/
inline size_t Utf8ValidUpTo(mozilla::Span<const char> aString) {
  return encoding_utf8_valid_up_to(
      reinterpret_cast<const uint8_t*>(aString.Elements()), aString.Length());
}

/**
* Converts potentially-invalid UTF-16 to UTF-8 replacing lone surrogates
* with the REPLACEMENT CHARACTER.
*
* The length of aDest must be at least the length of aSource times three.
*
* Returns the number of code units written.
*/
inline size_t ConvertUtf16toUtf8(mozilla::Span<const char16_t> aSource,
                                 mozilla::Span<char> aDest) {
  return encoding_mem_convert_utf16_to_utf8(
      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
}

/**
* Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
* sequences with the REPLACEMENT CHARACTER with potentially insufficient
* output space.
*
* Returns the number of code units read and the number of bytes written.
*
* If the output isn't large enough, not all input is consumed.
*
* The conversion is guaranteed to be complete if the length of aDest is
* at least the length of aSource times three.
*
* The output is always valid UTF-8 ending on scalar value boundary
* even in the case of partial conversion.
*
* The semantics of this function match the semantics of
* TextEncoder.encodeInto.
* https://encoding.spec.whatwg.org/#dom-textencoder-encodeinto
*/
inline std::tuple<size_t, size_t> ConvertUtf16toUtf8Partial(
    mozilla::Span<const char16_t> aSource, mozilla::Span<char> aDest) {
  size_t srcLen = aSource.Length();
  size_t dstLen = aDest.Length();
  encoding_mem_convert_utf16_to_utf8_partial(aSource.Elements(), &srcLen,
                                             aDest.Elements(), &dstLen);
  return std::make_tuple(srcLen, dstLen);
}

/**
* Converts potentially-invalid UTF-8 to UTF-16 replacing malformed byte
* sequences with the REPLACEMENT CHARACTER.
*
* Returns the number of code units written.
*
* The length of aDest must be at least one greater than the length of aSource
* even though the last slot isn't written to.
*
* If you know that the input is valid for sure, use
* UnsafeConvertValidUtf8toUtf16() instead.
*/
inline size_t ConvertUtf8toUtf16(mozilla::Span<const char> aSource,
                                 mozilla::Span<char16_t> aDest) {
  return encoding_mem_convert_utf8_to_utf16(
      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
}

/**
* Converts known-valid UTF-8 to UTF-16. If the input might be invalid,
* use ConvertUtf8toUtf16() or ConvertUtf8toUtf16WithoutReplacement() instead.
*
* Returns the number of code units written.
*
* The length of aDest must be at least the length of aSource.
*/
inline size_t UnsafeConvertValidUtf8toUtf16(mozilla::Span<const char> aSource,
                                            mozilla::Span<char16_t> aDest) {
  return encoding_mem_convert_str_to_utf16(aSource.Elements(), aSource.Length(),
                                           aDest.Elements(), aDest.Length());
}

/**
* Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
*
* Returns the number of code units written or `mozilla::Nothing` if the
* input was invalid.
*
* The length of the destination buffer must be at least the length of the
* source buffer.
*
* When the input was invalid, some output may have been written.
*
* If you know that the input is valid for sure, use
* UnsafeConvertValidUtf8toUtf16() instead.
*/
inline mozilla::Maybe<size_t> ConvertUtf8toUtf16WithoutReplacement(
    mozilla::Span<const char> aSource, mozilla::Span<char16_t> aDest) {
  size_t written = encoding_mem_convert_utf8_to_utf16_without_replacement(
      aSource.Elements(), aSource.Length(), aDest.Elements(), aDest.Length());
  if (MOZ_UNLIKELY(written == std::numeric_limits<size_t>::max())) {
    return mozilla::Nothing();
  }
  return mozilla::Some(written);
}

#endif  // MOZ_HAS_JSRUST

/**
* Returns true iff |aUnit| is a UTF-8 trailing code unit matching the pattern
* 0b10xx'xxxx.
*/
inline bool IsTrailingUnit(Utf8Unit aUnit) {
  return (aUnit.toUint8() & 0b1100'0000) == 0b1000'0000;
}

/**
* Given |aLeadUnit| that is a non-ASCII code unit, a pointer to an |Iter aIter|
* that (initially) itself points one unit past |aLeadUnit|, and
* |const EndIter& aEnd| that denotes the end of the UTF-8 data when compared
* against |*aIter| using |aEnd - *aIter|:
*
* If |aLeadUnit| and subsequent code units computed using |*aIter| (up to
* |aEnd|) encode a valid code point -- not exceeding Unicode's range, not a
* surrogate, in shortest form -- then return Some(that code point) and advance
* |*aIter| past those code units.
*
* Otherwise decrement |*aIter| (so that it points at |aLeadUnit|) and return
* Nothing().
*
* |Iter| and |EndIter| are generalized concepts most easily understood as if
* they were |const char*|, |const unsigned char*|, or |const Utf8Unit*|:
* iterators that when dereferenced can be used to construct a |Utf8Unit| and
* that can be compared and modified in certain limited ways.  (Carefully note
* that this function mutates |*aIter|.)  |Iter| and |EndIter| are template
* parameters to support more-complicated adaptor iterators.
*
* The template parameters after |Iter| allow users to implement custom handling
* for various forms of invalid UTF-8.  A version of this function that defaults
* all such handling to no-ops is defined below this function.  To learn how to
* define your own custom handling, consult the implementation of that function,
* which documents exactly how custom handler functors are invoked.
*
* This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
* of this function without the "Inline" suffix on the name.
*/
template <typename Iter, typename EndIter, class OnBadLeadUnit,
          class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
          class OnNotShortestForm>
MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
    const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
    OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
    OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
    OnNotShortestForm aOnNotShortestForm) {
  MOZ_ASSERT(Utf8Unit((*aIter)[-1]) == aLeadUnit);

  char32_t n = aLeadUnit.toUint8();
  MOZ_ASSERT(!IsAscii(n));

  // |aLeadUnit| determines the number of trailing code units in the code point
  // and the bits of |aLeadUnit| that contribute to the code point's value.
  uint8_t remaining;
  uint32_t min;
  if ((n & 0b1110'0000) == 0b1100'0000) {
    remaining = 1;
    min = 0x80;
    n &= 0b0001'1111;
  } else if ((n & 0b1111'0000) == 0b1110'0000) {
    remaining = 2;
    min = 0x800;
    n &= 0b0000'1111;
  } else if ((n & 0b1111'1000) == 0b1111'0000) {
    remaining = 3;
    min = 0x10000;
    n &= 0b0000'0111;
  } else {
    *aIter -= 1;
    aOnBadLeadUnit();
    return Nothing();
  }

  // If the code point would require more code units than remain, the encoding
  // is invalid.
  auto actual = aEnd - *aIter;
  if (MOZ_UNLIKELY(actual < remaining)) {
    *aIter -= 1;
    aOnNotEnoughUnits(AssertedCast<uint8_t>(actual + 1), remaining + 1);
    return Nothing();
  }

  for (uint8_t i = 0; i < remaining; i++) {
    const Utf8Unit unit(*(*aIter)++);

    // Every non-leading code unit in properly encoded UTF-8 has its high
    // bit set and the next-highest bit unset.
    if (MOZ_UNLIKELY(!IsTrailingUnit(unit))) {
      uint8_t unitsObserved = i + 1 + 1;
      *aIter -= unitsObserved;
      aOnBadTrailingUnit(unitsObserved);
      return Nothing();
    }

    // The code point being encoded is the concatenation of all the
    // unconstrained bits.
    n = (n << 6) | (unit.toUint8() & 0b0011'1111);
  }

  // UTF-16 surrogates and values outside the Unicode range are invalid.
  if (MOZ_UNLIKELY(n > 0x10FFFF || (0xD800 <= n && n <= 0xDFFF))) {
    uint8_t unitsObserved = remaining + 1;
    *aIter -= unitsObserved;
    aOnBadCodePoint(n, unitsObserved);
    return Nothing();
  }

  // Overlong code points are also invalid.
  if (MOZ_UNLIKELY(n < min)) {
    uint8_t unitsObserved = remaining + 1;
    *aIter -= unitsObserved;
    aOnNotShortestForm(n, unitsObserved);
    return Nothing();
  }

  return Some(n);
}

/**
* Identical to the above function, but not forced to be instantiated inline --
* the compiler is permitted to common up separate invocations if it chooses.
*/
template <typename Iter, typename EndIter, class OnBadLeadUnit,
          class OnNotEnoughUnits, class OnBadTrailingUnit, class OnBadCodePoint,
          class OnNotShortestForm>
inline Maybe<char32_t> DecodeOneUtf8CodePoint(
    const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd,
    OnBadLeadUnit aOnBadLeadUnit, OnNotEnoughUnits aOnNotEnoughUnits,
    OnBadTrailingUnit aOnBadTrailingUnit, OnBadCodePoint aOnBadCodePoint,
    OnNotShortestForm aOnNotShortestForm) {
  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, aOnBadLeadUnit,
                                      aOnNotEnoughUnits, aOnBadTrailingUnit,
                                      aOnBadCodePoint, aOnNotShortestForm);
}

/**
* Like the always-inlined function above, but with no-op behavior from all
* trailing if-invalid notifier functors.
*
* This function is MOZ_ALWAYS_INLINE: if you don't need that, use the version
* of this function without the "Inline" suffix on the name.
*/
template <typename Iter, typename EndIter>
MOZ_ALWAYS_INLINE Maybe<char32_t> DecodeOneUtf8CodePointInline(
    const Utf8Unit aLeadUnit, Iter* aIter, const EndIter& aEnd) {
  // aOnBadLeadUnit is called when |aLeadUnit| itself is an invalid lead unit in
  // a multi-unit code point.  It is passed no arguments: the caller already has
  // |aLeadUnit| on hand, so no need to provide it again.
  auto onBadLeadUnit = []() {};

  // aOnNotEnoughUnits is called when |aLeadUnit| properly indicates a code
  // point length, but there aren't enough units from |*aIter| to |aEnd| to
  // satisfy that length.  It is passed the number of code units actually
  // available (according to |aEnd - *aIter|) and the number of code units that
  // |aLeadUnit| indicates are needed.  Both numbers include the contribution
  // of |aLeadUnit| itself: so |aUnitsAvailable <= 3|, |aUnitsNeeded <= 4|, and
  // |aUnitsAvailable < aUnitsNeeded|.  As above, it also is not passed the lead
  // code unit.
  auto onNotEnoughUnits = [](uint8_t aUnitsAvailable, uint8_t aUnitsNeeded) {};

  // aOnBadTrailingUnit is called when one of the trailing code units implied by
  // |aLeadUnit| doesn't match the 0b10xx'xxxx bit pattern that all UTF-8
  // trailing code units must satisfy.  It is passed the total count of units
  // observed (including |aLeadUnit|).  The bad trailing code unit will
  // conceptually be at |(*aIter)[aUnitsObserved - 1]| if this functor is
  // called, and so |aUnitsObserved <= 4|.
  auto onBadTrailingUnit = [](uint8_t aUnitsObserved) {};

  // aOnBadCodePoint is called when a structurally-correct code point encoding
  // is found, but the *value* that is encoded is not a valid code point: either
  // because it exceeded the U+10FFFF Unicode maximum code point, or because it
  // was a UTF-16 surrogate.  It is passed the non-code point value and the
  // number of code units used to encode it.
  auto onBadCodePoint = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {};

  // aOnNotShortestForm is called when structurally-correct encoding is found,
  // but the encoded value should have been encoded in fewer code units (e.g.
  // mis-encoding U+0000 as 0b1100'0000 0b1000'0000 in two code units instead of
  // as 0b0000'0000).  It is passed the mis-encoded code point (which will be
  // valid and not a surrogate) and the count of code units that mis-encoded it.
  auto onNotShortestForm = [](char32_t aBadCodePoint, uint8_t aUnitsObserved) {
  };

  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd, onBadLeadUnit,
                                      onNotEnoughUnits, onBadTrailingUnit,
                                      onBadCodePoint, onNotShortestForm);
}

/**
* Identical to the above function, but not forced to be instantiated inline --
* the compiler/linker are allowed to common up separate invocations.
*/
template <typename Iter, typename EndIter>
inline Maybe<char32_t> DecodeOneUtf8CodePoint(const Utf8Unit aLeadUnit,
                                              Iter* aIter,
                                              const EndIter& aEnd) {
  return DecodeOneUtf8CodePointInline(aLeadUnit, aIter, aEnd);
}

}  // namespace mozilla

#endif /* mozilla_Utf8_h */

quality97%

¤ Dauer der Verarbeitung: 0.5 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung ist noch experimentell.