Quelle ICU4XWordSegmenter.hpp Sprache: C

#ifndef ICU4XWordSegmenter_HPP
#define ICU4XWordSegmenter_HPP
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <algorithm>
#include <memory>
#include <variant>
#include <optional>
#include "diplomat_runtime.hpp"

#include "ICU4XWordSegmenter.h"

class ICU4XDataProvider;
class ICU4XWordSegmenter;
#include "ICU4XError.hpp"
class ICU4XWordBreakIteratorUtf8;
class ICU4XWordBreakIteratorUtf16;
class ICU4XWordBreakIteratorLatin1;

/**
* A destruction policy for using ICU4XWordSegmenter with std::unique_ptr.
*/
struct ICU4XWordSegmenterDeleter {
  void operator()(capi::ICU4XWordSegmenter* l) const noexcept {
    capi::ICU4XWordSegmenter_destroy(l);
  }
};

/**
* An ICU4X word-break segmenter, capable of finding word breakpoints in strings.
*
* See the [Rust documentation for `WordSegmenter`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html) for more information.
*/
class ICU4XWordSegmenter {
public:

  /**
   * Construct an [`ICU4XWordSegmenter`] with automatically selecting the best available LSTM
   * or dictionary payload data.
   *
   * Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
   * Khmer, Lao, and Thai.
   *
   * See the [Rust documentation for `new_auto`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_auto) for more information.
   */
  static diplomat::result<ICU4XWordSegmenter, ICU4XError> create_auto(const ICU4XDataProvider& provider);

  /**
   * Construct an [`ICU4XWordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
   * Thai.
   *
   * Warning: [`ICU4XWordSegmenter`] created by this function doesn't handle Chinese or
   * Japanese.
   *
   * See the [Rust documentation for `new_lstm`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_lstm) for more information.
   */
  static diplomat::result<ICU4XWordSegmenter, ICU4XError> create_lstm(const ICU4XDataProvider& provider);

  /**
   * Construct an [`ICU4XWordSegmenter`] with dictionary payload data for Chinese, Japanese,
   * Burmese, Khmer, Lao, and Thai.
   *
   * See the [Rust documentation for `new_dictionary`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.new_dictionary) for more information.
   */
  static diplomat::result<ICU4XWordSegmenter, ICU4XError> create_dictionary(const ICU4XDataProvider& provider);

  /**
   * Segments a string.
   *
   * Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
   * to the WHATWG Encoding Standard.
   *
   * See the [Rust documentation for `segment_utf8`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.segment_utf8) for more information.
   *
   * Lifetimes: `this`, `input` must live at least as long as the output.
   */
  ICU4XWordBreakIteratorUtf8 segment_utf8(const std::string_view input) const;

  /**
   * Segments a string.
   *
   * Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
   * to the WHATWG Encoding Standard.
   *
   * See the [Rust documentation for `segment_utf16`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.segment_utf16) for more information.
   *
   * Lifetimes: `this`, `input` must live at least as long as the output.
   */
  ICU4XWordBreakIteratorUtf16 segment_utf16(const std::u16string_view input) const;

  /**
   * Segments a Latin-1 string.
   *
   * See the [Rust documentation for `segment_latin1`](https://docs.rs/icu/latest/icu/segmenter/struct.WordSegmenter.html#method.segment_latin1) for more information.
   *
   * Lifetimes: `this`, `input` must live at least as long as the output.
   */
  ICU4XWordBreakIteratorLatin1 segment_latin1(const diplomat::span<const uint8_t> input) const;
  inline const capi::ICU4XWordSegmenter* AsFFI() const { return this->inner.get(); }
  inline capi::ICU4XWordSegmenter* AsFFIMut() { return this->inner.get(); }
  inline explicit ICU4XWordSegmenter(capi::ICU4XWordSegmenter* i) : inner(i) {}
  ICU4XWordSegmenter() = default;
  ICU4XWordSegmenter(ICU4XWordSegmenter&&) noexcept = default;
  ICU4XWordSegmenter& operator=(ICU4XWordSegmenter&& other) noexcept = default;
private:
  std::unique_ptr<capi::ICU4XWordSegmenter, ICU4XWordSegmenterDeleter> inner;
};

#include "ICU4XDataProvider.hpp"
#include "ICU4XWordBreakIteratorUtf8.hpp"
#include "ICU4XWordBreakIteratorUtf16.hpp"
#include "ICU4XWordBreakIteratorLatin1.hpp"

inline diplomat::result<ICU4XWordSegmenter, ICU4XError> ICU4XWordSegmenter::create_auto(const ICU4XDataProvider& provider) {
  auto diplomat_result_raw_out_value = capi::ICU4XWordSegmenter_create_auto(provider.AsFFI());
  diplomat::result<ICU4XWordSegmenter, ICU4XError> diplomat_result_out_value;
  if (diplomat_result_raw_out_value.is_ok) {
    diplomat_result_out_value = diplomat::Ok<ICU4XWordSegmenter>(ICU4XWordSegmenter(diplomat_result_raw_out_value.ok));
  } else {
    diplomat_result_out_value = diplomat::Err<ICU4XError>(static_cast<ICU4XError>(diplomat_result_raw_out_value.err));
  }
  return diplomat_result_out_value;
}
inline diplomat::result<ICU4XWordSegmenter, ICU4XError> ICU4XWordSegmenter::create_lstm(const ICU4XDataProvider& provider) {
  auto diplomat_result_raw_out_value = capi::ICU4XWordSegmenter_create_lstm(provider.AsFFI());
  diplomat::result<ICU4XWordSegmenter, ICU4XError> diplomat_result_out_value;
  if (diplomat_result_raw_out_value.is_ok) {
    diplomat_result_out_value = diplomat::Ok<ICU4XWordSegmenter>(ICU4XWordSegmenter(diplomat_result_raw_out_value.ok));
  } else {
    diplomat_result_out_value = diplomat::Err<ICU4XError>(static_cast<ICU4XError>(diplomat_result_raw_out_value.err));
  }
  return diplomat_result_out_value;
}
inline diplomat::result<ICU4XWordSegmenter, ICU4XError> ICU4XWordSegmenter::create_dictionary(const ICU4XDataProvider& provider) {
  auto diplomat_result_raw_out_value = capi::ICU4XWordSegmenter_create_dictionary(provider.AsFFI());
  diplomat::result<ICU4XWordSegmenter, ICU4XError> diplomat_result_out_value;
  if (diplomat_result_raw_out_value.is_ok) {
    diplomat_result_out_value = diplomat::Ok<ICU4XWordSegmenter>(ICU4XWordSegmenter(diplomat_result_raw_out_value.ok));
  } else {
    diplomat_result_out_value = diplomat::Err<ICU4XError>(static_cast<ICU4XError>(diplomat_result_raw_out_value.err));
  }
  return diplomat_result_out_value;
}
inline ICU4XWordBreakIteratorUtf8 ICU4XWordSegmenter::segment_utf8(const std::string_view input) const {
  return ICU4XWordBreakIteratorUtf8(capi::ICU4XWordSegmenter_segment_utf8(this->inner.get(), input.data(), input.size()));
}
inline ICU4XWordBreakIteratorUtf16 ICU4XWordSegmenter::segment_utf16(const std::u16string_view input) const {
  return ICU4XWordBreakIteratorUtf16(capi::ICU4XWordSegmenter_segment_utf16(this->inner.get(), input.data(), input.size()));
}
inline ICU4XWordBreakIteratorLatin1 ICU4XWordSegmenter::segment_latin1(const diplomat::span<const uint8_t> input) const {
  return ICU4XWordBreakIteratorLatin1(capi::ICU4XWordSegmenter_segment_latin1(this->inner.get(), input.data(), input.size()));
}
#endif

Messung V0.5

¤ Dauer der Verarbeitung: 0.14 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.