|
|
|
|
Quelle dfa.rs
Sprache: unbekannt
|
|
/*!
Types and routines specific to lazy DFAs.
This module is the home of [`hybrid::dfa::DFA`](DFA).
This module also contains a [`hybrid::dfa::Builder`](Builder) and a
[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
*/
use core::{iter, mem::size_of};
use alloc::vec::Vec;
use crate::{
hybrid::{
error::{BuildError, CacheError},
id::{LazyStateID, LazyStateIDError},
search,
},
nfa::thompson,
util::{
alphabet::{self, ByteClasses, ByteSet},
determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
empty,
prefilter::Prefilter,
primitives::{PatternID, StateID as NFAStateID},
search::{
Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
},
sparse_set::SparseSets,
start::{Start, StartByteMap},
},
};
/// The minimum number of states that a lazy DFA's cache size must support.
///
/// This is checked at time of construction to ensure that at least some small
/// number of states can fit in the given capacity allotment. If we can't fit
/// at least this number of states, then the thinking is that it's pretty
/// senseless to use the lazy DFA. More to the point, parts of the code do
/// assume that the cache can fit at least some small number of states.
const MIN_STATES: usize = SENTINEL_STATES + 2;
/// The number of "sentinel" states that get added to every lazy DFA.
///
/// These are special states indicating status conditions of a search: unknown,
/// dead and quit. These states in particular also use zero NFA states, so
/// their memory usage is quite small. This is relevant for computing the
/// minimum memory needed for a lazy DFA cache.
const SENTINEL_STATES: usize = 3;
/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
///
/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
/// Indeed, both support precisely the same regex features with precisely the
/// same semantics.
///
/// Where as a `dense::DFA` must be completely built to handle any input before
/// it may be used for search, a lazy DFA starts off effectively empty. During
/// a search, a lazy DFA will build itself depending on whether it has already
/// computed the next transition or not. If it has, then it looks a lot like
/// a `dense::DFA` internally: it does a very fast table based access to find
/// the next transition. Otherwise, if the state hasn't been computed, then it
/// does determinization _for that specific transition_ to compute the next DFA
/// state.
///
/// The main selling point of a lazy DFA is that, in practice, it has
/// the performance profile of a `dense::DFA` without the weakness of it
/// taking worst case exponential time to build. Indeed, for each byte of
/// input, the lazy DFA will construct as most one new DFA state. Thus, a
/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
/// pattern.len()` and `n ~ haystack.len()`).
///
/// The main downsides of a lazy DFA are:
///
/// 1. It requires mutable "cache" space during search. This is where the
/// transition table, among other things, is stored.
/// 2. In pathological cases (e.g., if the cache is too small), it will run
/// out of room and either require a bigger cache capacity or will repeatedly
/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
/// will tend to be slower than a typical NFA simulation.
///
/// # Capabilities
///
/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
/// operations:
///
/// 1. Detection of a match.
/// 2. Location of the end of a match.
/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
/// is reported as well.
///
/// A notable absence from the above list of capabilities is the location of
/// the *start* of a match. In order to provide both the start and end of
/// a match, *two* lazy DFAs are required. This functionality is provided by a
/// [`Regex`](crate::hybrid::regex::Regex).
///
/// # Example
///
/// This shows how to build a lazy DFA with the default configuration and
/// execute a search. Notice how, in contrast to a `dense::DFA`, we must create
/// a cache and pass it to our search routine.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(
/// &mut cache, &Input::new("foo12345"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct DFA {
config: Config,
nfa: thompson::NFA,
stride2: usize,
start_map: StartByteMap,
classes: ByteClasses,
quitset: ByteSet,
cache_capacity: usize,
}
impl DFA {
/// Parse the given regular expression using a default configuration and
/// return the corresponding lazy DFA.
///
/// If you want a non-default configuration, then use the [`Builder`] to
/// set your own configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+bar")?;
/// let mut cache = dfa.create_cache();
///
/// let expected = HalfMatch::must(0, 11);
/// assert_eq!(
/// Some(expected),
/// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn new(pattern: &str) -> Result<DFA, BuildError> {
DFA::builder().build(pattern)
}
/// Parse the given regular expressions using a default configuration and
/// return the corresponding lazy multi-DFA.
///
/// If you want a non-default configuration, then use the [`Builder`] to
/// set your own configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
/// let mut cache = dfa.create_cache();
///
/// let expected = HalfMatch::must(1, 3);
/// assert_eq!(
/// Some(expected),
/// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
DFA::builder().build_many(patterns)
}
/// Create a new lazy DFA that matches every input.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::always_match()?;
/// let mut cache = dfa.create_cache();
///
/// let expected = HalfMatch::must(0, 0);
/// assert_eq!(Some(expected), dfa.try_search_fwd(
/// &mut cache, &Input::new(""))?,
/// );
/// assert_eq!(Some(expected), dfa.try_search_fwd(
/// &mut cache, &Input::new("foo"))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn always_match() -> Result<DFA, BuildError> {
let nfa = thompson::NFA::always_match();
Builder::new().build_from_nfa(nfa)
}
/// Create a new lazy DFA that never matches any input.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::never_match()?;
/// let mut cache = dfa.create_cache();
///
/// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
/// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn never_match() -> Result<DFA, BuildError> {
let nfa = thompson::NFA::never_match();
Builder::new().build_from_nfa(nfa)
}
/// Return a default configuration for a `DFA`.
///
/// This is a convenience routine to avoid needing to import the [`Config`]
/// type when customizing the construction of a lazy DFA.
///
/// # Example
///
/// This example shows how to build a lazy DFA that heuristically supports
/// Unicode word boundaries.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
///
/// let re = DFA::builder()
/// .configure(DFA::config().unicode_word_boundary(true))
/// .build(r"\b\w+\b")?;
/// let mut cache = re.create_cache();
///
/// // Since our haystack is all ASCII, the DFA search sees then and knows
/// // it is legal to interpret Unicode word boundaries as ASCII word
/// // boundaries.
/// let input = Input::new("!!foo!!");
/// let expected = HalfMatch::must(0, 5);
/// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
///
/// // But if our haystack contains non-ASCII, then the search will fail
/// // with an error.
/// let input = Input::new("!!βββ!!");
/// let expected = MatchError::quit(b'\xCE', 2);
/// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn config() -> Config {
Config::new()
}
/// Return a builder for configuring the construction of a `Regex`.
///
/// This is a convenience routine to avoid needing to import the
/// [`Builder`] type in common cases.
///
/// # Example
///
/// This example shows how to use the builder to disable UTF-8 mode
/// everywhere for lazy DFAs.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
///
/// let re = DFA::builder()
/// .syntax(syntax::Config::new().utf8(false))
/// .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
/// let expected = Some(HalfMatch::must(0, 9));
/// let got = re.try_search_fwd(&mut cache, &input)?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn builder() -> Builder {
Builder::new()
}
/// Create a new cache for this lazy DFA.
///
/// The cache returned should only be used for searches for this
/// lazy DFA. If you want to reuse the cache for another DFA, then
/// you must call [`Cache::reset`] with that DFA (or, equivalently,
/// [`DFA::reset_cache`]).
pub fn create_cache(&self) -> Cache {
Cache::new(self)
}
/// Reset the given cache such that it can be used for searching with the
/// this lazy DFA (and only this DFA).
///
/// A cache reset permits reusing memory already allocated in this cache
/// with a different lazy DFA.
///
/// Resetting a cache sets its "clear count" to 0. This is relevant if the
/// lazy DFA has been configured to "give up" after it has cleared the
/// cache a certain number of times.
///
/// Any lazy state ID generated by the cache prior to resetting it is
/// invalid after the reset.
///
/// # Example
///
/// This shows how to re-purpose a cache for use with a different DFA.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa1 = DFA::new(r"\w")?;
/// let dfa2 = DFA::new(r"\W")?;
///
/// let mut cache = dfa1.create_cache();
/// assert_eq!(
/// Some(HalfMatch::must(0, 2)),
/// dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
/// );
///
/// // Using 'cache' with dfa2 is not allowed. It may result in panics or
/// // incorrect results. In order to re-purpose the cache, we must reset
/// // it with the DFA we'd like to use it with.
/// //
/// // Similarly, after this reset, using the cache with 'dfa1' is also not
/// // allowed.
/// dfa2.reset_cache(&mut cache);
/// assert_eq!(
/// Some(HalfMatch::must(0, 3)),
/// dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn reset_cache(&self, cache: &mut Cache) {
Lazy::new(self, cache).reset_cache()
}
/// Returns the total number of patterns compiled into this lazy DFA.
///
/// In the case of a DFA that contains no patterns, this returns `0`.
///
/// # Example
///
/// This example shows the pattern length for a DFA that never matches:
///
/// ```
/// use regex_automata::hybrid::dfa::DFA;
///
/// let dfa = DFA::never_match()?;
/// assert_eq!(dfa.pattern_len(), 0);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// And another example for a DFA that matches at every position:
///
/// ```
/// use regex_automata::hybrid::dfa::DFA;
///
/// let dfa = DFA::always_match()?;
/// assert_eq!(dfa.pattern_len(), 1);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// And finally, a DFA that was constructed from multiple patterns:
///
/// ```
/// use regex_automata::hybrid::dfa::DFA;
///
/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
/// assert_eq!(dfa.pattern_len(), 3);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn pattern_len(&self) -> usize {
self.nfa.pattern_len()
}
/// Returns the equivalence classes that make up the alphabet for this DFA.
///
/// Unless [`Config::byte_classes`] was disabled, it is possible that
/// multiple distinct bytes are grouped into the same equivalence class
/// if it is impossible for them to discriminate between a match and a
/// non-match. This has the effect of reducing the overall alphabet size
/// and in turn potentially substantially reducing the size of the DFA's
/// transition table.
///
/// The downside of using equivalence classes like this is that every state
/// transition will automatically use this map to convert an arbitrary
/// byte to its corresponding equivalence class. In practice this has a
/// negligible impact on performance.
pub fn byte_classes(&self) -> &ByteClasses {
&self.classes
}
/// Returns this lazy DFA's configuration.
pub fn get_config(&self) -> &Config {
&self.config
}
/// Returns a reference to the underlying NFA.
pub fn get_nfa(&self) -> &thompson::NFA {
&self.nfa
}
/// Returns the stride, as a base-2 exponent, required for these
/// equivalence classes.
///
/// The stride is always the smallest power of 2 that is greater than or
/// equal to the alphabet length. This is done so that converting between
/// state IDs and indices can be done with shifts alone, which is much
/// faster than integer division.
fn stride2(&self) -> usize {
self.stride2
}
/// Returns the total stride for every state in this lazy DFA. This
/// corresponds to the total number of transitions used by each state in
/// this DFA's transition table.
fn stride(&self) -> usize {
1 << self.stride2()
}
/// Returns the memory usage, in bytes, of this lazy DFA.
///
/// This does **not** include the stack size used up by this lazy DFA. To
/// compute that, use `std::mem::size_of::<DFA>()`. This also does not
/// include the size of the `Cache` used.
///
/// This also does not include any heap memory used by the NFA inside of
/// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
/// thus not owned by this hybrid NFA/DFA. More practically, several regex
/// engines in this crate embed an NFA, and reporting the NFA's memory
/// usage in all of them would likely result in reporting higher heap
/// memory than is actually used.
pub fn memory_usage(&self) -> usize {
// The only thing that uses heap memory in a DFA is the NFA. But the
// NFA has shared ownership, so reporting its memory as part of the
// hybrid DFA is likely to lead to double-counting the NFA memory
// somehow. In particular, this DFA does not really own an NFA, so
// including it in the DFA's memory usage doesn't seem semantically
// correct.
0
}
}
impl DFA {
/// Executes a forward search and returns the end position of the leftmost
/// match that is found. If no match exists, then `None` is returned.
///
/// In particular, this method continues searching even after it enters
/// a match state. The search only terminates once it has reached the
/// end of the input or when it has entered a dead or quit state. Upon
/// termination, the position of the last byte seen while still in a match
/// state is returned.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example
///
/// This example shows how to run a basic search.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
/// let expected = HalfMatch::must(0, 8);
/// assert_eq!(Some(expected), dfa.try_search_fwd(
/// &mut cache, &Input::new("foo12345"))?,
/// );
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the leftmost first match semantics demand that we find the earliest
/// // match that prefers earlier parts of the pattern over later parts.
/// let dfa = DFA::new("abc|a")?;
/// let mut cache = dfa.create_cache();
/// let expected = HalfMatch::must(0, 3);
/// assert_eq!(Some(expected), dfa.try_search_fwd(
/// &mut cache, &Input::new("abc"))?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: specific pattern search
///
/// This example shows how to build a lazy multi-DFA that permits searching
/// for specific patterns.
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::DFA,
/// Anchored, HalfMatch, PatternID, Input,
/// };
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().starts_for_each_pattern(true))
/// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
/// let mut cache = dfa.create_cache();
/// let haystack = "foo123";
///
/// // Since we are using the default leftmost-first match and both
/// // patterns match at the same starting position, only the first pattern
/// // will be returned in this case when doing a search for any of the
/// // patterns.
/// let expected = Some(HalfMatch::must(0, 6));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// // But if we want to check whether some other pattern matches, then we
/// // can provide its pattern ID.
/// let expected = Some(HalfMatch::must(1, 6));
/// let input = Input::new(haystack)
/// .anchored(Anchored::Pattern(PatternID::must(1)));
/// let got = dfa.try_search_fwd(&mut cache, &input)?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: specifying the bounds of a search
///
/// This example shows how providing the bounds of a search can produce
/// different results than simply sub-slicing the haystack.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// // N.B. We disable Unicode here so that we use a simple ASCII word
/// // boundary. Alternatively, we could enable heuristic support for
/// // Unicode word boundaries since our haystack is pure ASCII.
/// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "foo123bar";
///
/// // Since we sub-slice the haystack, the search doesn't know about the
/// // larger context and assumes that `123` is surrounded by word
/// // boundaries. And of course, the match position is reported relative
/// // to the sub-slice as well, which means we get `3` instead of `6`.
/// let expected = Some(HalfMatch::must(0, 3));
/// let got = dfa.try_search_fwd(
/// &mut cache,
/// &Input::new(&haystack[3..6]),
/// )?;
/// assert_eq!(expected, got);
///
/// // But if we provide the bounds of the search within the context of the
/// // entire haystack, then the search can take the surrounding context
/// // into account. (And if we did find a match, it would be reported
/// // as a valid offset into `haystack` instead of its sub-slice.)
/// let expected = None;
/// let got = dfa.try_search_fwd(
/// &mut cache,
/// &Input::new(haystack).range(3..6),
/// )?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn try_search_fwd(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
let hm = match search::find_fwd(self, cache, input)? {
None => return Ok(None),
Some(hm) if !utf8empty => return Ok(Some(hm)),
Some(hm) => hm,
};
// We get to this point when we know our DFA can match the empty string
// AND when UTF-8 mode is enabled. In this case, we skip any matches
// whose offset splits a codepoint. Such a match is necessarily a
// zero-width match, because UTF-8 mode requires the underlying NFA
// to be built such that all non-empty matches span valid UTF-8.
// Therefore, any match that ends in the middle of a codepoint cannot
// be part of a span of valid UTF-8 and thus must be an empty match.
// In such cases, we skip it, so as not to report matches that split a
// codepoint.
//
// Note that this is not a checked assumption. Callers *can* provide an
// NFA with UTF-8 mode enabled but produces non-empty matches that span
// invalid UTF-8. But doing so is documented to result in unspecified
// behavior.
empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
let got = search::find_fwd(self, cache, input)?;
Ok(got.map(|hm| (hm, hm.offset())))
})
}
/// Executes a reverse search and returns the start of the position of the
/// leftmost match that is found. If no match exists, then `None` is
/// returned.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example
///
/// This routine is principally useful when used in
/// conjunction with the
/// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse)
/// configuration. In general, it's unlikely to be correct to use both
/// `try_search_fwd` and `try_search_rev` with the same DFA since any
/// particular DFA will only support searching in one direction with
/// respect to the pattern.
///
/// ```
/// use regex_automata::{
/// nfa::thompson,
/// hybrid::dfa::DFA,
/// HalfMatch, Input,
/// };
///
/// let dfa = DFA::builder()
/// .thompson(thompson::Config::new().reverse(true))
/// .build("foo[0-9]+")?;
/// let mut cache = dfa.create_cache();
/// let expected = HalfMatch::must(0, 0);
/// assert_eq!(
/// Some(expected),
/// dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
/// );
///
/// // Even though a match is found after reading the last byte (`c`),
/// // the leftmost first match semantics demand that we find the earliest
/// // match that prefers earlier parts of the pattern over latter parts.
/// let dfa = DFA::builder()
/// .thompson(thompson::Config::new().reverse(true))
/// .build("abc|c")?;
/// let mut cache = dfa.create_cache();
/// let expected = HalfMatch::must(0, 0);
/// assert_eq!(Some(expected), dfa.try_search_rev(
/// &mut cache, &Input::new("abc"))?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: UTF-8 mode
///
/// This examples demonstrates that UTF-8 mode applies to reverse
/// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
/// matches reported must correspond to valid UTF-8 spans. This includes
/// prohibiting zero-width matches that split a codepoint.
///
/// UTF-8 mode is enabled by default. Notice below how the only zero-width
/// matches reported are those at UTF-8 boundaries:
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::DFA,
/// nfa::thompson,
/// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
/// .thompson(thompson::Config::new().reverse(true))
/// .build(r"")?;
/// let mut cache = dfa.create_cache();
///
/// // Run the reverse DFA to collect all matches.
/// let mut input = Input::new("☃");
/// let mut matches = vec![];
/// loop {
/// match dfa.try_search_rev(&mut cache, &input)? {
/// None => break,
/// Some(hm) => {
/// matches.push(hm);
/// if hm.offset() == 0 || input.end() == 0 {
/// break;
/// } else if hm.offset() < input.end() {
/// input.set_end(hm.offset());
/// } else {
/// // This is only necessary to handle zero-width
/// // matches, which of course occur in this example.
/// // Without this, the search would never advance
/// // backwards beyond the initial match.
/// input.set_end(input.end() - 1);
/// }
/// }
/// }
/// }
///
/// // No matches split a codepoint.
/// let expected = vec![
/// HalfMatch::must(0, 3),
/// HalfMatch::must(0, 0),
/// ];
/// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Now let's look at the same example, but with UTF-8 mode on the
/// underlying NFA disabled:
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::DFA,
/// nfa::thompson,
/// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
/// .thompson(thompson::Config::new().reverse(true).utf8(false))
/// .build(r"")?;
/// let mut cache = dfa.create_cache();
///
/// // Run the reverse DFA to collect all matches.
/// let mut input = Input::new("☃");
/// let mut matches = vec![];
/// loop {
/// match dfa.try_search_rev(&mut cache, &input)? {
/// None => break,
/// Some(hm) => {
/// matches.push(hm);
/// if hm.offset() == 0 || input.end() == 0 {
/// break;
/// } else if hm.offset() < input.end() {
/// input.set_end(hm.offset());
/// } else {
/// // This is only necessary to handle zero-width
/// // matches, which of course occur in this example.
/// // Without this, the search would never advance
/// // backwards beyond the initial match.
/// input.set_end(input.end() - 1);
/// }
/// }
/// }
/// }
///
/// // No matches split a codepoint.
/// let expected = vec![
/// HalfMatch::must(0, 3),
/// HalfMatch::must(0, 2),
/// HalfMatch::must(0, 1),
/// HalfMatch::must(0, 0),
/// ];
/// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn try_search_rev(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
let hm = match search::find_rev(self, cache, input)? {
None => return Ok(None),
Some(hm) if !utf8empty => return Ok(Some(hm)),
Some(hm) => hm,
};
empty::skip_splits_rev(input, hm, hm.offset(), |input| {
let got = search::find_rev(self, cache, input)?;
Ok(got.map(|hm| (hm, hm.offset())))
})
}
/// Executes an overlapping forward search and returns the end position of
/// matches as they are found. If no match exists, then `None` is returned.
///
/// This routine is principally only useful when searching for multiple
/// patterns on inputs where multiple patterns may match the same regions
/// of text. In particular, callers must preserve the automaton's search
/// state from prior calls so that the implementation knows where the last
/// match occurred.
///
/// When using this routine to implement an iterator of overlapping
/// matches, the `start` of the search should remain invariant throughout
/// iteration. The `OverlappingState` given to the search will keep track
/// of the current position of the search. (This is because multiple
/// matches may be reported at the same position, so only the search
/// implementation itself knows when to advance the position.)
///
/// If for some reason you want the search to forget about its previous
/// state and restart the search at a particular position, then setting the
/// state to [`OverlappingState::start`] will accomplish that.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example
///
/// This example shows how to run a basic overlapping search. Notice
/// that we build the automaton with a `MatchKind::All` configuration.
/// Overlapping searches are unlikely to work as one would expect when
/// using the default `MatchKind::LeftmostFirst` match semantics, since
/// leftmost-first matching is fundamentally incompatible with overlapping
/// searches. Namely, overlapping searches need to report matches as they
/// are seen, where as leftmost-first searches will continue searching even
/// after a match has been observed in order to find the conventional end
/// position of the match. More concretely, leftmost-first searches use
/// dead states to terminate a search after a specific match can no longer
/// be extended. Overlapping searches instead do the opposite by continuing
/// the search to find totally new matches (potentially of other patterns).
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// hybrid::dfa::{DFA, OverlappingState},
/// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().match_kind(MatchKind::All))
/// .build_many(&[r"\w+$", r"\S+$"])?;
/// let mut cache = dfa.create_cache();
///
/// let haystack = "@foo";
/// let mut state = OverlappingState::start();
///
/// let expected = Some(HalfMatch::must(1, 4));
/// dfa.try_search_overlapping_fwd(
/// &mut cache, &Input::new(haystack), &mut state,
/// )?;
/// assert_eq!(expected, state.get_match());
///
/// // The first pattern also matches at the same position, so re-running
/// // the search will yield another match. Notice also that the first
/// // pattern is returned after the second. This is because the second
/// // pattern begins its match before the first, is therefore an earlier
/// // match and is thus reported first.
/// let expected = Some(HalfMatch::must(0, 4));
/// dfa.try_search_overlapping_fwd(
/// &mut cache, &Input::new(haystack), &mut state,
/// )?;
/// assert_eq!(expected, state.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn try_search_overlapping_fwd(
&self,
cache: &mut Cache,
input: &Input<'_>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
search::find_overlapping_fwd(self, cache, input, state)?;
match state.get_match() {
None => Ok(()),
Some(_) if !utf8empty => Ok(()),
Some(_) => skip_empty_utf8_splits_overlapping(
input,
state,
|input, state| {
search::find_overlapping_fwd(self, cache, input, state)
},
),
}
}
/// Executes a reverse overlapping search and returns the start of the
/// position of the leftmost match that is found. If no match exists, then
/// `None` is returned.
///
/// When using this routine to implement an iterator of overlapping
/// matches, the `start` of the search should remain invariant throughout
/// iteration. The `OverlappingState` given to the search will keep track
/// of the current position of the search. (This is because multiple
/// matches may be reported at the same position, so only the search
/// implementation itself knows when to advance the position.)
///
/// If for some reason you want the search to forget about its previous
/// state and restart the search at a particular position, then setting the
/// state to [`OverlappingState::start`] will accomplish that.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example: UTF-8 mode
///
/// This examples demonstrates that UTF-8 mode applies to reverse
/// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
/// matches reported must correspond to valid UTF-8 spans. This includes
/// prohibiting zero-width matches that split a codepoint.
///
/// UTF-8 mode is enabled by default. Notice below how the only zero-width
/// matches reported are those at UTF-8 boundaries:
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::{DFA, OverlappingState},
/// nfa::thompson,
/// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().match_kind(MatchKind::All))
/// .thompson(thompson::Config::new().reverse(true))
/// .build_many(&[r"", r"☃"])?;
/// let mut cache = dfa.create_cache();
///
/// // Run the reverse DFA to collect all matches.
/// let input = Input::new("☃");
/// let mut state = OverlappingState::start();
/// let mut matches = vec![];
/// loop {
/// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
/// match state.get_match() {
/// None => break,
/// Some(hm) => matches.push(hm),
/// }
/// }
///
/// // No matches split a codepoint.
/// let expected = vec![
/// HalfMatch::must(0, 3),
/// HalfMatch::must(1, 0),
/// HalfMatch::must(0, 0),
/// ];
/// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Now let's look at the same example, but with UTF-8 mode on the
/// underlying NFA disabled:
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::{DFA, OverlappingState},
/// nfa::thompson,
/// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().match_kind(MatchKind::All))
/// .thompson(thompson::Config::new().reverse(true).utf8(false))
/// .build_many(&[r"", r"☃"])?;
/// let mut cache = dfa.create_cache();
///
/// // Run the reverse DFA to collect all matches.
/// let input = Input::new("☃");
/// let mut state = OverlappingState::start();
/// let mut matches = vec![];
/// loop {
/// dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
/// match state.get_match() {
/// None => break,
/// Some(hm) => matches.push(hm),
/// }
/// }
///
/// // Now *all* positions match, even within a codepoint,
/// // because we lifted the requirement that matches
/// // correspond to valid UTF-8 spans.
/// let expected = vec![
/// HalfMatch::must(0, 3),
/// HalfMatch::must(0, 2),
/// HalfMatch::must(0, 1),
/// HalfMatch::must(1, 0),
/// HalfMatch::must(0, 0),
/// ];
/// assert_eq!(expected, matches);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn try_search_overlapping_rev(
&self,
cache: &mut Cache,
input: &Input<'_>,
state: &mut OverlappingState,
) -> Result<(), MatchError> {
let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
search::find_overlapping_rev(self, cache, input, state)?;
match state.get_match() {
None => Ok(()),
Some(_) if !utf8empty => Ok(()),
Some(_) => skip_empty_utf8_splits_overlapping(
input,
state,
|input, state| {
search::find_overlapping_rev(self, cache, input, state)
},
),
}
}
/// Writes the set of patterns that match anywhere in the given search
/// configuration to `patset`. If multiple patterns match at the same
/// position and the underlying DFA supports overlapping matches, then all
/// matching patterns are written to the given set.
///
/// Unless all of the patterns in this DFA are anchored, then generally
/// speaking, this will visit every byte in the haystack.
///
/// This search routine *does not* clear the pattern set. This gives some
/// flexibility to the caller (e.g., running multiple searches with the
/// same pattern set), but does make the API bug-prone if you're reusing
/// the same pattern set for multiple searches but intended them to be
/// independent.
///
/// If a pattern ID matched but the given `PatternSet` does not have
/// sufficient capacity to store it, then it is not inserted and silently
/// dropped.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
///
/// # Example
///
/// This example shows how to find all matching patterns in a haystack,
/// even when some patterns match at the same position as other patterns.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// hybrid::dfa::DFA,
/// Input, MatchKind, PatternSet,
/// };
///
/// let patterns = &[
/// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
/// ];
/// let dfa = DFA::builder()
/// .configure(DFA::config().match_kind(MatchKind::All))
/// .build_many(patterns)?;
/// let mut cache = dfa.create_cache();
///
/// let input = Input::new("foobar");
/// let mut patset = PatternSet::new(dfa.pattern_len());
/// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
/// let expected = vec![0, 2, 3, 4, 6];
/// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn try_which_overlapping_matches(
&self,
cache: &mut Cache,
input: &Input<'_>,
patset: &mut PatternSet,
) -> Result<(), MatchError> {
let mut state = OverlappingState::start();
while let Some(m) = {
self.try_search_overlapping_fwd(cache, input, &mut state)?;
state.get_match()
} {
let _ = patset.try_insert(m.pattern());
// There's nothing left to find, so we can stop. Or the caller
// asked us to.
if patset.is_full() || input.get_earliest() {
break;
}
}
Ok(())
}
}
impl DFA {
/// Transitions from the current state to the next state, given the next
/// byte of input.
///
/// The given cache is used to either reuse pre-computed state
/// transitions, or to store this newly computed transition for future
/// reuse. Thus, this routine guarantees that it will never return a state
/// ID that has an "unknown" tag.
///
/// # State identifier validity
///
/// The only valid value for `current` is the lazy state ID returned
/// by the most recent call to `next_state`, `next_state_untagged`,
/// `next_state_untagged_unchecked`, `start_state_forward` or
/// `state_state_reverse` for the given `cache`. Any state ID returned from
/// prior calls to these routines (with the same `cache`) is considered
/// invalid (even if it gives an appearance of working). State IDs returned
/// from _any_ prior call for different `cache` values are also always
/// invalid.
///
/// The returned ID is always a valid ID when `current` refers to a valid
/// ID. Moreover, this routine is defined for all possible values of
/// `input`.
///
/// These validity rules are not checked, even in debug mode. Callers are
/// required to uphold these rules themselves.
///
/// Violating these state ID validity rules will not sacrifice memory
/// safety, but _may_ produce an incorrect result or a panic.
///
/// # Panics
///
/// If the given ID does not refer to a valid state, then this routine
/// may panic but it also may not panic and instead return an invalid or
/// incorrect ID.
///
/// # Example
///
/// This shows a simplistic example for walking a lazy DFA for a given
/// haystack by using the `next_state` method.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::new(r"[a-z]+r")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "bar".as_bytes();
///
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut sid = dfa.start_state_forward(
/// &mut cache, &Input::new(haystack),
/// )?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
/// sid = dfa.next_state(&mut cache, sid, b)?;
/// }
/// // Matches are always delayed by 1 byte, so we must explicitly walk the
/// // special "EOI" transition at the end of the search.
/// sid = dfa.next_eoi_state(&mut cache, sid)?;
/// assert!(sid.is_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn next_state(
&self,
cache: &mut Cache,
current: LazyStateID,
input: u8,
) -> Result<LazyStateID, CacheError> {
let class = usize::from(self.classes.get(input));
let offset = current.as_usize_untagged() + class;
let sid = cache.trans[offset];
if !sid.is_unknown() {
return Ok(sid);
}
let unit = alphabet::Unit::u8(input);
Lazy::new(self, cache).cache_next_state(current, unit)
}
/// Transitions from the current state to the next state, given the next
/// byte of input and a state ID that is not tagged.
///
/// The only reason to use this routine is performance. In particular, the
/// `next_state` method needs to do some additional checks, among them is
/// to account for identifiers to states that are not yet computed. In
/// such a case, the transition is computed on the fly. However, if it is
/// known that the `current` state ID is untagged, then these checks can be
/// omitted.
///
/// Since this routine does not compute states on the fly, it does not
/// modify the cache and thus cannot return an error. Consequently, `cache`
/// does not need to be mutable and it is possible for this routine to
/// return a state ID corresponding to the special "unknown" state. In
/// this case, it is the caller's responsibility to use the prior state
/// ID and `input` with `next_state` in order to force the computation of
/// the unknown transition. Otherwise, trying to use the "unknown" state
/// ID will just result in transitioning back to itself, and thus never
/// terminating. (This is technically a special exemption to the state ID
/// validity rules, but is permissible since this routine is guarateed to
/// never mutate the given `cache`, and thus the identifier is guaranteed
/// to remain valid.)
///
/// See [`LazyStateID`] for more details on what it means for a state ID
/// to be tagged. Also, see
/// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
/// for this same idea, but with bounds checks forcefully elided.
///
/// # State identifier validity
///
/// The only valid value for `current` is an **untagged** lazy
/// state ID returned by the most recent call to `next_state`,
/// `next_state_untagged`, `next_state_untagged_unchecked`,
/// `start_state_forward` or `state_state_reverse` for the given `cache`.
/// Any state ID returned from prior calls to these routines (with the
/// same `cache`) is considered invalid (even if it gives an appearance
/// of working). State IDs returned from _any_ prior call for different
/// `cache` values are also always invalid.
///
/// The returned ID is always a valid ID when `current` refers to a valid
/// ID, although it may be tagged. Moreover, this routine is defined for
/// all possible values of `input`.
///
/// Not all validity rules are checked, even in debug mode. Callers are
/// required to uphold these rules themselves.
///
/// Violating these state ID validity rules will not sacrifice memory
/// safety, but _may_ produce an incorrect result or a panic.
///
/// # Panics
///
/// If the given ID does not refer to a valid state, then this routine
/// may panic but it also may not panic and instead return an invalid or
/// incorrect ID.
///
/// # Example
///
/// This shows a simplistic example for walking a lazy DFA for a given
/// haystack by using the `next_state_untagged` method where possible.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::new(r"[a-z]+r")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "bar".as_bytes();
///
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut sid = dfa.start_state_forward(
/// &mut cache, &Input::new(haystack),
/// )?;
/// // Walk all the bytes in the haystack.
/// let mut at = 0;
/// while at < haystack.len() {
/// if sid.is_tagged() {
/// sid = dfa.next_state(&mut cache, sid, haystack[at])?;
/// } else {
/// let mut prev_sid = sid;
/// // We attempt to chew through as much as we can while moving
/// // through untagged state IDs. Thus, the transition function
/// // does less work on average per byte. (Unrolling this loop
/// // may help even more.)
/// while at < haystack.len() {
/// prev_sid = sid;
/// sid = dfa.next_state_untagged(
/// &mut cache, sid, haystack[at],
/// );
/// at += 1;
/// if sid.is_tagged() {
/// break;
/// }
/// }
/// // We must ensure that we never proceed to the next iteration
/// // with an unknown state ID. If we don't account for this
/// // case, then search isn't guaranteed to terminate since all
/// // transitions on unknown states loop back to itself.
/// if sid.is_unknown() {
/// sid = dfa.next_state(
/// &mut cache, prev_sid, haystack[at - 1],
/// )?;
/// }
/// }
/// }
/// // Matches are always delayed by 1 byte, so we must explicitly walk the
/// // special "EOI" transition at the end of the search.
/// sid = dfa.next_eoi_state(&mut cache, sid)?;
/// assert!(sid.is_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn next_state_untagged(
&self,
cache: &Cache,
current: LazyStateID,
input: u8,
) -> LazyStateID {
debug_assert!(!current.is_tagged());
let class = usize::from(self.classes.get(input));
let offset = current.as_usize_unchecked() + class;
cache.trans[offset]
}
/// Transitions from the current state to the next state, eliding bounds
/// checks, given the next byte of input and a state ID that is not tagged.
///
/// The only reason to use this routine is performance. In particular, the
/// `next_state` method needs to do some additional checks, among them is
/// to account for identifiers to states that are not yet computed. In
/// such a case, the transition is computed on the fly. However, if it is
/// known that the `current` state ID is untagged, then these checks can be
/// omitted.
///
/// Since this routine does not compute states on the fly, it does not
/// modify the cache and thus cannot return an error. Consequently, `cache`
/// does not need to be mutable and it is possible for this routine to
/// return a state ID corresponding to the special "unknown" state. In
/// this case, it is the caller's responsibility to use the prior state
/// ID and `input` with `next_state` in order to force the computation of
/// the unknown transition. Otherwise, trying to use the "unknown" state
/// ID will just result in transitioning back to itself, and thus never
/// terminating. (This is technically a special exemption to the state ID
/// validity rules, but is permissible since this routine is guarateed to
/// never mutate the given `cache`, and thus the identifier is guaranteed
/// to remain valid.)
///
/// See [`LazyStateID`] for more details on what it means for a state ID
/// to be tagged. Also, see
/// [`next_state_untagged`](DFA::next_state_untagged)
/// for this same idea, but with memory safety guaranteed by retaining
/// bounds checks.
///
/// # State identifier validity
///
/// The only valid value for `current` is an **untagged** lazy
/// state ID returned by the most recent call to `next_state`,
/// `next_state_untagged`, `next_state_untagged_unchecked`,
/// `start_state_forward` or `state_state_reverse` for the given `cache`.
/// Any state ID returned from prior calls to these routines (with the
/// same `cache`) is considered invalid (even if it gives an appearance
/// of working). State IDs returned from _any_ prior call for different
/// `cache` values are also always invalid.
///
/// The returned ID is always a valid ID when `current` refers to a valid
/// ID, although it may be tagged. Moreover, this routine is defined for
/// all possible values of `input`.
///
/// Not all validity rules are checked, even in debug mode. Callers are
/// required to uphold these rules themselves.
///
/// Violating these state ID validity rules will not sacrifice memory
/// safety, but _may_ produce an incorrect result or a panic.
///
/// # Safety
///
/// Callers of this method must guarantee that `current` refers to a valid
/// state ID according to the rules described above. If `current` is not a
/// valid state ID for this automaton, then calling this routine may result
/// in undefined behavior.
///
/// If `current` is valid, then the ID returned is valid for all possible
/// values of `input`.
#[inline]
pub unsafe fn next_state_untagged_unchecked(
&self,
cache: &Cache,
current: LazyStateID,
input: u8,
) -> LazyStateID {
debug_assert!(!current.is_tagged());
let class = usize::from(self.classes.get(input));
let offset = current.as_usize_unchecked() + class;
*cache.trans.get_unchecked(offset)
}
/// Transitions from the current state to the next state for the special
/// EOI symbol.
///
/// The given cache is used to either reuse pre-computed state
/// transitions, or to store this newly computed transition for future
/// reuse. Thus, this routine guarantees that it will never return a state
/// ID that has an "unknown" tag.
///
/// This routine must be called at the end of every search in a correct
/// implementation of search. Namely, lazy DFAs in this crate delay matches
/// by one byte in order to support look-around operators. Thus, after
/// reaching the end of a haystack, a search implementation must follow one
/// last EOI transition.
///
/// It is best to think of EOI as an additional symbol in the alphabet of a
/// DFA that is distinct from every other symbol. That is, the alphabet of
/// lazy DFAs in this crate has a logical size of 257 instead of 256, where
/// 256 corresponds to every possible inhabitant of `u8`. (In practice, the
/// physical alphabet size may be smaller because of alphabet compression
/// via equivalence classes, but EOI is always represented somehow in the
/// alphabet.)
///
/// # State identifier validity
///
/// The only valid value for `current` is the lazy state ID returned
/// by the most recent call to `next_state`, `next_state_untagged`,
/// `next_state_untagged_unchecked`, `start_state_forward` or
/// `state_state_reverse` for the given `cache`. Any state ID returned from
/// prior calls to these routines (with the same `cache`) is considered
/// invalid (even if it gives an appearance of working). State IDs returned
/// from _any_ prior call for different `cache` values are also always
/// invalid.
///
/// The returned ID is always a valid ID when `current` refers to a valid
/// ID.
///
/// These validity rules are not checked, even in debug mode. Callers are
/// required to uphold these rules themselves.
///
/// Violating these state ID validity rules will not sacrifice memory
/// safety, but _may_ produce an incorrect result or a panic.
///
/// # Panics
///
/// If the given ID does not refer to a valid state, then this routine
/// may panic but it also may not panic and instead return an invalid or
/// incorrect ID.
///
/// # Example
///
/// This shows a simplistic example for walking a DFA for a given haystack,
/// and then finishing the search with the final EOI transition.
///
/// ```
/// use regex_automata::{hybrid::dfa::DFA, Input};
///
/// let dfa = DFA::new(r"[a-z]+r")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "bar".as_bytes();
///
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut sid = dfa.start_state_forward(
/// &mut cache, &Input::new(haystack),
/// )?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
/// sid = dfa.next_state(&mut cache, sid, b)?;
/// }
/// // Matches are always delayed by 1 byte, so we must explicitly walk
/// // the special "EOI" transition at the end of the search. Without this
/// // final transition, the assert below will fail since the DFA will not
/// // have entered a match state yet!
/// sid = dfa.next_eoi_state(&mut cache, sid)?;
/// assert!(sid.is_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn next_eoi_state(
&self,
cache: &mut Cache,
current: LazyStateID,
) -> Result<LazyStateID, CacheError> {
let eoi = self.classes.eoi().as_usize();
let offset = current.as_usize_untagged() + eoi;
let sid = cache.trans[offset];
if !sid.is_unknown() {
return Ok(sid);
}
let unit = self.classes.eoi();
Lazy::new(self, cache).cache_next_state(current, unit)
}
/// Return the ID of the start state for this lazy DFA when executing a
/// forward search.
///
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
///
/// * The [`Anchored`] mode of the search. Unanchored, anchored and
/// anchored searches for a specific [`PatternID`] all use different start
/// states.
/// * The position at which the search begins, via [`Input::start`]. This
/// and the byte immediately preceding the start of the search (if one
/// exists) influence which look-behind assertions are true at the start
/// of the search. This in turn influences which start state is selected.
/// * Whether the search is a forward or reverse search. This routine can
/// only be used for forward searches.
///
/// # Errors
///
/// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
/// needs to give up when determining the start state (for example, if
/// it sees a "quit" byte or if the cache has been cleared too many
/// times). This can also return an error if the given `Input` contains an
/// unsupported [`Anchored`] configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_forward(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
if !self.quitset.is_empty() && input.start() > 0 {
let offset = input.start() - 1;
let byte = input.haystack()[offset];
if self.quitset.contains(byte) {
return Err(MatchError::quit(byte, offset));
}
}
let start_type = self.start_map.fwd(input);
let start = LazyRef::new(self, cache)
.get_cached_start_id(input, start_type)?;
if !start.is_unknown() {
return Ok(start);
}
Lazy::new(self, cache).cache_start_group(input, start_type)
}
/// Return the ID of the start state for this lazy DFA when executing a
/// reverse search.
///
/// Unlike typical DFA implementations, the start state for DFAs in this
/// crate is dependent on a few different factors:
///
/// * The [`Anchored`] mode of the search. Unanchored, anchored and
/// anchored searches for a specific [`PatternID`] all use different start
/// states.
/// * The position at which the search begins, via [`Input::start`]. This
/// and the byte immediately preceding the start of the search (if one
/// exists) influence which look-behind assertions are true at the start
/// of the search. This in turn influences which start state is selected.
/// * Whether the search is a forward or reverse search. This routine can
/// only be used for reverse searches.
///
/// # Errors
///
/// This may return a [`MatchError`] (not a [`CacheError`]!) if the search
/// needs to give up when determining the start state (for example, if
/// it sees a "quit" byte or if the cache has been cleared too many
/// times). This can also return an error if the given `Input` contains an
/// unsupported [`Anchored`] configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub fn start_state_reverse(
&self,
cache: &mut Cache,
input: &Input<'_>,
) -> Result<LazyStateID, MatchError> {
--> --------------------
--> maximum size reached
--> --------------------
[ Dauer der Verarbeitung: 0.34 Sekunden
(vorverarbeitet)
]
|
2026-04-04
|
|
|
|
|