// // libsemigroups - C++ library for semigroups and monoids // Copyright (C) 2021-2023 James D. Mitchell + Maria Tsalakou // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program. If not, see <http://www.gnu.org/licenses/>. //
// This file contains the implementation of the Ukkonen class.
#include"libsemigroups/ukkonen.hpp"
#include <algorithm> // for lower_bound, sort, max #include <array> // for array #include <cstddef> // for size_t #include <cstdint> // for uint64_t #include <numeric> // for accumulate #include <string> // for operator+, char_traits, to_st...
#include"libsemigroups/constants.hpp"// for UNDEFINED #include"libsemigroups/debug.hpp"// for LIBSEMIGROUPS_ASSERT #include"libsemigroups/exception.hpp"// for LIBSEMIGROUPS_EXCEPTION #include"libsemigroups/string.hpp"// for to_string #include"libsemigroups/types.hpp"// for word_type
namespace libsemigroups {
using State = Ukkonen::State; using Node = Ukkonen::Node;
// Return type is node_index_type
size_t& Node::child(letter_type c) { if (children.count(c) == 0) {
children[c] = UNDEFINED;
} return children[c];
}
// Return type is node_index_type
size_t Node::child(letter_type c) const { if (children.count(c) == 0) { return UNDEFINED;
} return children[c];
}
//////////////////////////////////////////////////////////////////////// // Ukkonen - constructors - public ////////////////////////////////////////////////////////////////////////
// TODO init all data members
Ukkonen::Ukkonen()
: _max_word_length(0),
_multiplicity(),
_next_unique_letter(static_cast<unique_letter_type>(-1)),
_nodes(1),
_ptr(0, 0),
_word_begin({0}),
_word() {}
//////////////////////////////////////////////////////////////////////// // Ukkonen - destructors - public ////////////////////////////////////////////////////////////////////////
Ukkonen::~Ukkonen() = default;
//////////////////////////////////////////////////////////////////////// // Ukkonen - initialisation - public ////////////////////////////////////////////////////////////////////////
void Ukkonen::add_word_no_checks(const_iterator first, const_iterator last) { if (first >= last) { return;
}
size_t const ndx = index(first, last); if (ndx != UNDEFINED) { // Duplicate word, do nothing
_multiplicity[ndx]++; return;
}
_multiplicity.push_back(1);
size_t const n = std::distance(first, last);
// TODO(later) could do some more caching here, i.e. only check the children // if we have added more words since the value was last computed. bool Ukkonen::is_real_suffix(Node const& n) const { if (n.is_real_suffix) { returntrue;
} for (autoconst& child : n.children) { if (is_unique_letter(child.first)) {
n.is_real_suffix = true; break;
}
} return n.is_real_suffix;
}
size_t Ukkonen::distance_from_root(Node const& n) const {
size_t result = 0;
Node const* m = &n; while (m->parent != UNDEFINED) {
result += m->length();
m = &_nodes[m->parent];
} return result;
}
//////////////////////////////////////////////////////////////////////// // The following functions go, split, get_link, and tree_extend are // minimally adapted from: // // https://cp-algorithms.com/string/suffix-tree-ukkonen.html ////////////////////////////////////////////////////////////////////////
// Follow the path in the tree starting at the position described by // State st, and corresponding to the range [l, r) in _word. void Ukkonen::go(State& st, index_type l, index_type r) const {
LIBSEMIGROUPS_ASSERT(l < _word.size());
LIBSEMIGROUPS_ASSERT(r <= _word.size());
LIBSEMIGROUPS_ASSERT(l <= r); while (l < r) { if (st.pos == _nodes[st.v].length()) {
st = State(_nodes[st.v].child(_word[l]), 0); if (st.v == UNDEFINED) { return;
}
} else { if (_word[_nodes[st.v].l + st.pos] != _word[l]) {
st.v = UNDEFINED;
st.pos = UNDEFINED; return;
} if (r - l < _nodes[st.v].length() - st.pos) {
st.pos += r - l; return;
}
l += _nodes[st.v].length() - st.pos;
st.pos = _nodes[st.v].length();
}
}
}
// Split the node _nodes[st.v] into two nodes, the new node // with edge corresponding to // // [_nodes[st.v].l, _nodes[st.v].l + st.pos) // // and the old node with edge corresponding to // // [_nodes[st.v].l + st.pos, _nodes[st.v].r)
Ukkonen::node_index_type Ukkonen::split(State const& st) {
LIBSEMIGROUPS_ASSERT(st.v != UNDEFINED);
LIBSEMIGROUPS_ASSERT(st.v < _nodes.size());
LIBSEMIGROUPS_ASSERT(st.pos <= _nodes[st.v].length());
// Get the suffix link of a node by index
Ukkonen::node_index_type Ukkonen::get_link(node_index_type v) {
LIBSEMIGROUPS_ASSERT(v < _nodes.size()); if (_nodes[v].link != UNDEFINED) { return _nodes[v].link;
} elseif (_nodes[v].parent == UNDEFINED) { return 0;
} auto to = get_link(_nodes[v].parent);
LIBSEMIGROUPS_ASSERT(to < _nodes.size());
State st(to, _nodes[to].length());
go(st, _nodes[v].l + (_nodes[v].parent == 0), _nodes[v].r);
LIBSEMIGROUPS_ASSERT(st.v != UNDEFINED);
LIBSEMIGROUPS_ASSERT(st.pos <= _nodes[st.v].length()); // WARNING: the assignment of xxx to split(st) in the next line looks // redundant, but isn't! Without this the test fail when compiling // with gcc (9, 10, 11, at least)! auto xxx = split(st);
_nodes[v].link = xxx;
LIBSEMIGROUPS_ASSERT(_nodes[v].link < _nodes.size()); return _nodes[v].link;
}
// Perform the phase starting with the pos letter of the word. void Ukkonen::tree_extend(index_type pos) {
LIBSEMIGROUPS_ASSERT(pos < _word.size()); for (;;) {
State nptr(_ptr);
go(nptr, pos, pos + 1); if (nptr.v != UNDEFINED) {
_ptr = nptr; return;
}
auto mid = split(_ptr); auto leaf = _nodes.size();
_nodes.emplace_back(pos, _word.size(), mid);
_nodes[mid].child(_word[pos]) = leaf;
void GreedyReduceHelper::pre_order(Ukkonen const& st, size_t v) { autoconst& nodes = st.nodes(); // This is a tree so we've never seen v before! if (!nodes[v].is_root()) {
_distance_from_root[v]
= _distance_from_root[nodes[v].parent] + nodes[v].length();
} if (nodes[v].is_leaf()) {
_num_leafs[v]++; // Starting index of the suffix that the leaf corresponds to
_suffix_index.push_back(nodes[v].r - _distance_from_root[v]);
}
}
for (autoconst& child : nodes[v].children) {
_num_leafs[v] += _num_leafs[child.second];
}
_scratch.assign(_suffix_index.cend() - _num_leafs[v],
_suffix_index.cend());
std::sort(_scratch.begin(), _scratch.end()); // number of non-overlapping subwords corresponding to the node v
size_t num_non_overlap = st.multiplicity(st.word_index(_scratch[0]));
// Only try greedily matching non-overlapping subwords from left to // right auto subword_begin = _scratch[0]; auto it = _scratch.cbegin(); do { auto subword_end = subword_begin + _distance_from_root[v];
it = std::lower_bound(it, _scratch.cend(), subword_end); if (it != _scratch.cend()) {
subword_begin = *it;
num_non_overlap += st.multiplicity(st.word_index(subword_begin));
}
} while (it != _scratch.cend()); int goodness = (_distance_from_root[v] * num_non_overlap)
- num_non_overlap - (_distance_from_root[v] + 1); if (goodness > _best_goodness) {
_best = v;
_best_goodness = goodness;
}
}
using const_iterator = typename Ukkonen::const_iterator;
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.14Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.