|
|
|
|
Quelle dense.rs
Sprache: unbekannt
|
|
/*!
Types and routines specific to dense DFAs.
This module is the home of [`dense::DFA`](DFA).
This module also contains a [`dense::Builder`](Builder) and a
[`dense::Config`](Config) for building and configuring a dense DFA.
*/
#[cfg(feature = "dfa-build")]
use core::cmp;
use core::{convert::TryFrom, fmt, iter, mem::size_of, slice};
#[cfg(feature = "dfa-build")]
use alloc::{
collections::{BTreeMap, BTreeSet},
vec,
vec::Vec,
};
#[cfg(feature = "dfa-build")]
use crate::{
dfa::{
accel::Accel, determinize, minimize::Minimizer, remapper::Remapper,
sparse,
},
nfa::thompson,
util::{look::LookMatcher, search::MatchKind},
};
use crate::{
dfa::{
accel::Accels,
automaton::{fmt_state_indicator, Automaton},
special::Special,
start::StartKind,
DEAD,
},
util::{
alphabet::{self, ByteClasses, ByteSet},
int::{Pointer, Usize},
prefilter::Prefilter,
primitives::{PatternID, StateID},
search::{Anchored, Input, MatchError},
start::{Start, StartByteMap},
wire::{self, DeserializeError, Endian, SerializeError},
},
};
/// The label that is pre-pended to a serialized DFA.
const LABEL: &str = "rust-regex-automata-dfa-dense";
/// The format version of dense regexes. This version gets incremented when a
/// change occurs. A change may not necessarily be a breaking change, but the
/// version does permit good error messages in the case where a breaking change
/// is made.
const VERSION: u32 = 2;
/// The configuration used for compiling a dense DFA.
///
/// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The
/// advantage of the former is that it often lets you avoid importing the
/// `Config` type directly.
///
/// A dense DFA configuration is a simple data object that is typically used
/// with [`dense::Builder::configure`](self::Builder::configure).
///
/// The default configuration guarantees that a search will never return
/// a "quit" error, although it is possible for a search to fail if
/// [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by
/// default) and an [`Anchored::Pattern`] mode is requested via [`Input`].
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug, Default)]
pub struct Config {
// As with other configuration types in this crate, we put all our knobs
// in options so that we can distinguish between "default" and "not set."
// This makes it possible to easily combine multiple configurations
// without default values overwriting explicitly specified values. See the
// 'overwrite' method.
//
// For docs on the fields below, see the corresponding method setters.
accelerate: Option<bool>,
pre: Option<Option<Prefilter>>,
minimize: Option<bool>,
match_kind: Option<MatchKind>,
start_kind: Option<StartKind>,
starts_for_each_pattern: Option<bool>,
byte_classes: Option<bool>,
unicode_word_boundary: Option<bool>,
quitset: Option<ByteSet>,
specialize_start_states: Option<bool>,
dfa_size_limit: Option<Option<usize>>,
determinize_size_limit: Option<Option<usize>>,
}
#[cfg(feature = "dfa-build")]
impl Config {
/// Return a new default dense DFA compiler configuration.
pub fn new() -> Config {
Config::default()
}
/// Enable state acceleration.
///
/// When enabled, DFA construction will analyze each state to determine
/// whether it is eligible for simple acceleration. Acceleration typically
/// occurs when most of a state's transitions loop back to itself, leaving
/// only a select few bytes that will exit the state. When this occurs,
/// other routines like `memchr` can be used to look for those bytes which
/// may be much faster than traversing the DFA.
///
/// Callers may elect to disable this if consistent performance is more
/// desirable than variable performance. Namely, acceleration can sometimes
/// make searching slower than it otherwise would be if the transitions
/// that leave accelerated states are traversed frequently.
///
/// See [`Automaton::accelerator`](crate::dfa::Automaton::accelerator) for
/// an example.
///
/// This is enabled by default.
pub fn accelerate(mut self, yes: bool) -> Config {
self.accelerate = Some(yes);
self
}
/// Set a prefilter to be used whenever a start state is entered.
///
/// A [`Prefilter`] in this context is meant to accelerate searches by
/// looking for literal prefixes that every match for the corresponding
/// pattern (or patterns) must start with. Once a prefilter produces a
/// match, the underlying search routine continues on to try and confirm
/// the match.
///
/// Be warned that setting a prefilter does not guarantee that the search
/// will be faster. While it's usually a good bet, if the prefilter
/// produces a lot of false positive candidates (i.e., positions matched
/// by the prefilter but not by the regex), then the overall result can
/// be slower than if you had just executed the regex engine without any
/// prefilters.
///
/// Note that unless [`Config::specialize_start_states`] has been
/// explicitly set, then setting this will also enable (when `pre` is
/// `Some`) or disable (when `pre` is `None`) start state specialization.
/// This occurs because without start state specialization, a prefilter
/// is likely to be less effective. And without a prefilter, start state
/// specialization is usually pointless.
///
/// **WARNING:** Note that prefilters are not preserved as part of
/// serialization. Serializing a DFA will drop its prefilter.
///
/// By default no prefilter is set.
///
/// # Example
///
/// ```
/// use regex_automata::{
/// dfa::{dense::DFA, Automaton},
/// util::prefilter::Prefilter,
/// Input, HalfMatch, MatchKind,
/// };
///
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
/// let re = DFA::builder()
/// .configure(DFA::config().prefilter(pre))
/// .build(r"(foo|bar)[a-z]+")?;
/// let input = Input::new("foo1 barfox bar");
/// assert_eq!(
/// Some(HalfMatch::must(0, 11)),
/// re.try_search_fwd(&input)?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Be warned though that an incorrect prefilter can lead to incorrect
/// results!
///
/// ```
/// use regex_automata::{
/// dfa::{dense::DFA, Automaton},
/// util::prefilter::Prefilter,
/// Input, HalfMatch, MatchKind,
/// };
///
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
/// let re = DFA::builder()
/// .configure(DFA::config().prefilter(pre))
/// .build(r"(foo|bar)[a-z]+")?;
/// let input = Input::new("foo1 barfox bar");
/// assert_eq!(
/// // No match reported even though there clearly is one!
/// None,
/// re.try_search_fwd(&input)?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
self.pre = Some(pre);
if self.specialize_start_states.is_none() {
self.specialize_start_states =
Some(self.get_prefilter().is_some());
}
self
}
/// Minimize the DFA.
///
/// When enabled, the DFA built will be minimized such that it is as small
/// as possible.
///
/// Whether one enables minimization or not depends on the types of costs
/// you're willing to pay and how much you care about its benefits. In
/// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
/// space, where `n` is the number of DFA states and `k` is the alphabet
/// size. In practice, minimization can be quite costly in terms of both
/// space and time, so it should only be done if you're willing to wait
/// longer to produce a DFA. In general, you might want a minimal DFA in
/// the following circumstances:
///
/// 1. You would like to optimize for the size of the automaton. This can
/// manifest in one of two ways. Firstly, if you're converting the
/// DFA into Rust code (or a table embedded in the code), then a minimal
/// DFA will translate into a corresponding reduction in code size, and
/// thus, also the final compiled binary size. Secondly, if you are
/// building many DFAs and putting them on the heap, you'll be able to
/// fit more if they are smaller. Note though that building a minimal
/// DFA itself requires additional space; you only realize the space
/// savings once the minimal DFA is constructed (at which point, the
/// space used for minimization is freed).
/// 2. You've observed that a smaller DFA results in faster match
/// performance. Naively, this isn't guaranteed since there is no
/// inherent difference between matching with a bigger-than-minimal
/// DFA and a minimal DFA. However, a smaller DFA may make use of your
/// CPU's cache more efficiently.
/// 3. You are trying to establish an equivalence between regular
/// languages. The standard method for this is to build a minimal DFA
/// for each language and then compare them. If the DFAs are equivalent
/// (up to state renaming), then the languages are equivalent.
///
/// Typically, minimization only makes sense as an offline process. That
/// is, one might minimize a DFA before serializing it to persistent
/// storage. In practical terms, minimization can take around an order of
/// magnitude more time than compiling the initial DFA via determinization.
///
/// This option is disabled by default.
pub fn minimize(mut self, yes: bool) -> Config {
self.minimize = Some(yes);
self
}
/// Set the desired match semantics.
///
/// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
/// match semantics of Perl-like regex engines. That is, when multiple
/// patterns would match at the same leftmost position, the pattern that
/// appears first in the concrete syntax is chosen.
///
/// Currently, the only other kind of match semantics supported is
/// [`MatchKind::All`]. This corresponds to classical DFA construction
/// where all possible matches are added to the DFA.
///
/// Typically, `All` is used when one wants to execute an overlapping
/// search and `LeftmostFirst` otherwise. In particular, it rarely makes
/// sense to use `All` with the various "leftmost" find routines, since the
/// leftmost routines depend on the `LeftmostFirst` automata construction
/// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA
/// as a way to terminate the search and report a match. `LeftmostFirst`
/// also supports non-greedy matches using this strategy where as `All`
/// does not.
///
/// # Example: overlapping search
///
/// This example shows the typical use of `MatchKind::All`, which is to
/// report overlapping matches.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
/// dfa::{Automaton, OverlappingState, dense},
/// HalfMatch, Input, MatchKind,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().match_kind(MatchKind::All))
/// .build_many(&[r"\w+$", r"\S+$"])?;
/// let input = Input::new("@foo");
/// let mut state = OverlappingState::start();
///
/// let expected = Some(HalfMatch::must(1, 4));
/// dfa.try_search_overlapping_fwd(&input, &mut state)?;
/// assert_eq!(expected, state.get_match());
///
/// // The first pattern also matches at the same position, so re-running
/// // the search will yield another match. Notice also that the first
/// // pattern is returned after the second. This is because the second
/// // pattern begins its match before the first, is therefore an earlier
/// // match and is thus reported first.
/// let expected = Some(HalfMatch::must(0, 4));
/// dfa.try_search_overlapping_fwd(&input, &mut state)?;
/// assert_eq!(expected, state.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: reverse automaton to find start of match
///
/// Another example for using `MatchKind::All` is for constructing a
/// reverse automaton to find the start of a match. `All` semantics are
/// used for this in order to find the longest possible match, which
/// corresponds to the leftmost starting position.
///
/// Note that if you need the starting position then
/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for
/// you, so it's usually not necessary to do this yourself.
///
/// ```
/// use regex_automata::{
/// dfa::{dense, Automaton, StartKind},
/// nfa::thompson::NFA,
/// Anchored, HalfMatch, Input, MatchKind,
/// };
///
/// let haystack = "123foobar456".as_bytes();
/// let pattern = r"[a-z]+r";
///
/// let dfa_fwd = dense::DFA::new(pattern)?;
/// let dfa_rev = dense::Builder::new()
/// .thompson(NFA::config().reverse(true))
/// .configure(dense::Config::new()
/// // This isn't strictly necessary since both anchored and
/// // unanchored searches are supported by default. But since
/// // finding the start-of-match only requires anchored searches,
/// // we can get rid of the unanchored configuration and possibly
/// // slim down our DFA considerably.
/// .start_kind(StartKind::Anchored)
/// .match_kind(MatchKind::All)
/// )
/// .build(pattern)?;
/// let expected_fwd = HalfMatch::must(0, 9);
/// let expected_rev = HalfMatch::must(0, 3);
/// let got_fwd = dfa_fwd.try_search_fwd(&Input::new(haystack))?.unwrap();
/// // Here we don't specify the pattern to search for since there's only
/// // one pattern and we're doing a leftmost search. But if this were an
/// // overlapping search, you'd need to specify the pattern that matched
/// // in the forward direction. (Otherwise, you might wind up finding the
/// // starting position of a match of some other pattern.) That in turn
/// // requires building the reverse automaton with starts_for_each_pattern
/// // enabled. Indeed, this is what Regex does internally.
/// let input = Input::new(haystack)
/// .range(..got_fwd.offset())
/// .anchored(Anchored::Yes);
/// let got_rev = dfa_rev.try_search_rev(&input)?.unwrap();
/// assert_eq!(expected_fwd, got_fwd);
/// assert_eq!(expected_rev, got_rev);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn match_kind(mut self, kind: MatchKind) -> Config {
self.match_kind = Some(kind);
self
}
/// The type of starting state configuration to use for a DFA.
///
/// By default, the starting state configuration is [`StartKind::Both`].
///
/// # Example
///
/// ```
/// use regex_automata::{
/// dfa::{dense::DFA, Automaton, StartKind},
/// Anchored, HalfMatch, Input,
/// };
///
/// let haystack = "quux foo123";
/// let expected = HalfMatch::must(0, 11);
///
/// // By default, DFAs support both anchored and unanchored searches.
/// let dfa = DFA::new(r"[0-9]+")?;
/// let input = Input::new(haystack);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
///
/// // But if we only need anchored searches, then we can build a DFA
/// // that only supports anchored searches. This leads to a smaller DFA
/// // (potentially significantly smaller in some cases), but a DFA that
/// // will panic if you try to use it with an unanchored search.
/// let dfa = DFA::builder()
/// .configure(DFA::config().start_kind(StartKind::Anchored))
/// .build(r"[0-9]+")?;
/// let input = Input::new(haystack)
/// .range(8..)
/// .anchored(Anchored::Yes);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn start_kind(mut self, kind: StartKind) -> Config {
self.start_kind = Some(kind);
self
}
/// Whether to compile a separate start state for each pattern in the
/// automaton.
///
/// When enabled, a separate **anchored** start state is added for each
/// pattern in the DFA. When this start state is used, then the DFA will
/// only search for matches for the pattern specified, even if there are
/// other patterns in the DFA.
///
/// The main downside of this option is that it can potentially increase
/// the size of the DFA and/or increase the time it takes to build the DFA.
///
/// There are a few reasons one might want to enable this (it's disabled
/// by default):
///
/// 1. When looking for the start of an overlapping match (using a
/// reverse DFA), doing it correctly requires starting the reverse search
/// using the starting state of the pattern that matched in the forward
/// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex),
/// it will automatically enable this option when building the reverse DFA
/// internally.
/// 2. When you want to use a DFA with multiple patterns to both search
/// for matches of any pattern or to search for anchored matches of one
/// particular pattern while using the same DFA. (Otherwise, you would need
/// to compile a new DFA for each pattern.)
/// 3. Since the start states added for each pattern are anchored, if you
/// compile an unanchored DFA with one pattern while also enabling this
/// option, then you can use the same DFA to perform anchored or unanchored
/// searches. The latter you get with the standard search APIs. The former
/// you get from the various `_at` search methods that allow you specify a
/// pattern ID to search for.
///
/// By default this is disabled.
///
/// # Example
///
/// This example shows how to use this option to permit the same DFA to
/// run both anchored and unanchored searches for a single pattern.
///
/// ```
/// use regex_automata::{
/// dfa::{dense, Automaton},
/// Anchored, HalfMatch, PatternID, Input,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().starts_for_each_pattern(true))
/// .build(r"foo[0-9]+")?;
/// let haystack = "quux foo123";
///
/// // Here's a normal unanchored search. Notice that we use 'None' for the
/// // pattern ID. Since the DFA was built as an unanchored machine, it
/// // use its default unanchored starting state.
/// let expected = HalfMatch::must(0, 11);
/// let input = Input::new(haystack);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
/// // But now if we explicitly specify the pattern to search ('0' being
/// // the only pattern in the DFA), then it will use the starting state
/// // for that specific pattern which is always anchored. Since the
/// // pattern doesn't have a match at the beginning of the haystack, we
/// // find nothing.
/// let input = Input::new(haystack)
/// .anchored(Anchored::Pattern(PatternID::must(0)));
/// assert_eq!(None, dfa.try_search_fwd(&input)?);
/// // And finally, an anchored search is not the same as putting a '^' at
/// // beginning of the pattern. An anchored search can only match at the
/// // beginning of the *search*, which we can change:
/// let input = Input::new(haystack)
/// .anchored(Anchored::Pattern(PatternID::must(0)))
/// .range(5..);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
self.starts_for_each_pattern = Some(yes);
self
}
/// Whether to attempt to shrink the size of the DFA's alphabet or not.
///
/// This option is enabled by default and should never be disabled unless
/// one is debugging a generated DFA.
///
/// When enabled, the DFA will use a map from all possible bytes to their
/// corresponding equivalence class. Each equivalence class represents a
/// set of bytes that does not discriminate between a match and a non-match
/// in the DFA. For example, the pattern `[ab]+` has at least two
/// equivalence classes: a set containing `a` and `b` and a set containing
/// every byte except for `a` and `b`. `a` and `b` are in the same
/// equivalence class because they never discriminate between a match and a
/// non-match.
///
/// The advantage of this map is that the size of the transition table
/// can be reduced drastically from `#states * 256 * sizeof(StateID)` to
/// `#states * k * sizeof(StateID)` where `k` is the number of equivalence
/// classes (rounded up to the nearest power of 2). As a result, total
/// space usage can decrease substantially. Moreover, since a smaller
/// alphabet is used, DFA compilation becomes faster as well.
///
/// **WARNING:** This is only useful for debugging DFAs. Disabling this
/// does not yield any speed advantages. Namely, even when this is
/// disabled, a byte class map is still used while searching. The only
/// difference is that every byte will be forced into its own distinct
/// equivalence class. This is useful for debugging the actual generated
/// transitions because it lets one see the transitions defined on actual
/// bytes instead of the equivalence classes.
pub fn byte_classes(mut self, yes: bool) -> Config {
self.byte_classes = Some(yes);
self
}
/// Heuristically enable Unicode word boundaries.
///
/// When set, this will attempt to implement Unicode word boundaries as if
/// they were ASCII word boundaries. This only works when the search input
/// is ASCII only. If a non-ASCII byte is observed while searching, then a
/// [`MatchError::quit`](crate::MatchError::quit) error is returned.
///
/// A possible alternative to enabling this option is to simply use an
/// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this
/// option is if you absolutely need Unicode support. This option lets one
/// use a fast search implementation (a DFA) for some potentially very
/// common cases, while providing the option to fall back to some other
/// regex engine to handle the general case when an error is returned.
///
/// If the pattern provided has no Unicode word boundary in it, then this
/// option has no effect. (That is, quitting on a non-ASCII byte only
/// occurs when this option is enabled _and_ a Unicode word boundary is
/// present in the pattern.)
///
/// This is almost equivalent to setting all non-ASCII bytes to be quit
/// bytes. The only difference is that this will cause non-ASCII bytes to
/// be quit bytes _only_ when a Unicode word boundary is present in the
/// pattern.
///
/// When enabling this option, callers _must_ be prepared to handle
/// a [`MatchError`](crate::MatchError) error during search.
/// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds
/// to using the `try_` suite of methods. Alternatively, if
/// callers can guarantee that their input is ASCII only, then a
/// [`MatchError::quit`](crate::MatchError::quit) error will never be
/// returned while searching.
///
/// This is disabled by default.
///
/// # Example
///
/// This example shows how to heuristically enable Unicode word boundaries
/// in a pattern. It also shows what happens when a search comes across a
/// non-ASCII byte.
///
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
/// HalfMatch, Input, MatchError,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().unicode_word_boundary(true))
/// .build(r"\b[0-9]+\b")?;
///
/// // The match occurs before the search ever observes the snowman
/// // character, so no error occurs.
/// let haystack = "foo 123 ☃".as_bytes();
/// let expected = Some(HalfMatch::must(0, 7));
/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// // Notice that this search fails, even though the snowman character
/// // occurs after the ending match offset. This is because search
/// // routines read one byte past the end of the search to account for
/// // look-around, and indeed, this is required here to determine whether
/// // the trailing \b matches.
/// let haystack = "foo 123 ☃".as_bytes();
/// let expected = MatchError::quit(0xE2, 8);
/// let got = dfa.try_search_fwd(&Input::new(haystack));
/// assert_eq!(Err(expected), got);
///
/// // Another example is executing a search where the span of the haystack
/// // we specify is all ASCII, but there is non-ASCII just before it. This
/// // correctly also reports an error.
/// let input = Input::new("β123").range(2..);
/// let expected = MatchError::quit(0xB2, 1);
/// let got = dfa.try_search_fwd(&input);
/// assert_eq!(Err(expected), got);
///
/// // And similarly for the trailing word boundary.
/// let input = Input::new("123β").range(..3);
/// let expected = MatchError::quit(0xCE, 3);
/// let got = dfa.try_search_fwd(&input);
/// assert_eq!(Err(expected), got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn unicode_word_boundary(mut self, yes: bool) -> Config {
// We have a separate option for this instead of just setting the
// appropriate quit bytes here because we don't want to set quit bytes
// for every regex. We only want to set them when the regex contains a
// Unicode word boundary.
self.unicode_word_boundary = Some(yes);
self
}
/// Add a "quit" byte to the DFA.
///
/// When a quit byte is seen during search time, then search will return
/// a [`MatchError::quit`](crate::MatchError::quit) error indicating the
/// offset at which the search stopped.
///
/// A quit byte will always overrule any other aspects of a regex. For
/// example, if the `x` byte is added as a quit byte and the regex `\w` is
/// used, then observing `x` will cause the search to quit immediately
/// despite the fact that `x` is in the `\w` class.
///
/// This mechanism is primarily useful for heuristically enabling certain
/// features like Unicode word boundaries in a DFA. Namely, if the input
/// to search is ASCII, then a Unicode word boundary can be implemented
/// via an ASCII word boundary with no change in semantics. Thus, a DFA
/// can attempt to match a Unicode word boundary but give up as soon as it
/// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes
/// to be quit bytes, then Unicode word boundaries will be permitted when
/// building DFAs. Of course, callers should enable
/// [`Config::unicode_word_boundary`] if they want this behavior instead.
/// (The advantage being that non-ASCII quit bytes will only be added if a
/// Unicode word boundary is in the pattern.)
///
/// When enabling this option, callers _must_ be prepared to handle a
/// [`MatchError`](crate::MatchError) error during search. When using a
/// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the
/// `try_` suite of methods.
///
/// By default, there are no quit bytes set.
///
/// # Panics
///
/// This panics if heuristic Unicode word boundaries are enabled and any
/// non-ASCII byte is removed from the set of quit bytes. Namely, enabling
/// Unicode word boundaries requires setting every non-ASCII byte to a quit
/// byte. So if the caller attempts to undo any of that, then this will
/// panic.
///
/// # Example
///
/// This example shows how to cause a search to terminate if it sees a
/// `\n` byte. This could be useful if, for example, you wanted to prevent
/// a user supplied pattern from matching across a line boundary.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{Automaton, dense}, Input, MatchError};
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().quit(b'\n', true))
/// .build(r"foo\p{any}+bar")?;
///
/// let haystack = "foo\nbar".as_bytes();
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
/// let expected = MatchError::quit(b'\n', 3);
/// let got = dfa.try_search_fwd(&Input::new(haystack)).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn quit(mut self, byte: u8, yes: bool) -> Config {
if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
panic!(
"cannot set non-ASCII byte to be non-quit when \
Unicode word boundaries are enabled"
);
}
if self.quitset.is_none() {
self.quitset = Some(ByteSet::empty());
}
if yes {
self.quitset.as_mut().unwrap().add(byte);
} else {
self.quitset.as_mut().unwrap().remove(byte);
}
self
}
/// Enable specializing start states in the DFA.
///
/// When start states are specialized, an implementor of a search routine
/// using a lazy DFA can tell when the search has entered a starting state.
/// When start states aren't specialized, then it is impossible to know
/// whether the search has entered a start state.
///
/// Ideally, this option wouldn't need to exist and we could always
/// specialize start states. The problem is that start states can be quite
/// active. This in turn means that an efficient search routine is likely
/// to ping-pong between a heavily optimized hot loop that handles most
/// states and to a less optimized specialized handling of start states.
/// This causes branches to get heavily mispredicted and overall can
/// materially decrease throughput. Therefore, specializing start states
/// should only be enabled when it is needed.
///
/// Knowing whether a search is in a start state is typically useful when a
/// prefilter is active for the search. A prefilter is typically only run
/// when in a start state and a prefilter can greatly accelerate a search.
/// Therefore, the possible cost of specializing start states is worth it
/// in this case. Otherwise, if you have no prefilter, there is likely no
/// reason to specialize start states.
///
/// This is disabled by default, but note that it is automatically
/// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless
/// `specialize_start_states` has already been set, [`Config::prefilter`]
/// will automatically enable or disable it based on whether a prefilter
/// is present or not, respectively. This is done because a prefilter's
/// effectiveness is rooted in being executed whenever the DFA is in a
/// start state, and that's only possible to do when they are specialized.
///
/// Note that it is plausibly reasonable to _disable_ this option
/// explicitly while _enabling_ a prefilter. In that case, a prefilter
/// will still be run at the beginning of a search, but never again. This
/// in theory could strike a good balance if you're in a situation where a
/// prefilter is likely to produce many false positive candidates.
///
/// # Example
///
/// This example shows how to enable start state specialization and then
/// shows how to check whether a state is a start state or not.
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
///
/// let dfa = DFA::builder()
/// .configure(DFA::config().specialize_start_states(true))
/// .build(r"[a-z]+")?;
///
/// let haystack = "123 foobar 4567".as_bytes();
/// let sid = dfa.start_state_forward(&Input::new(haystack))?;
/// // The ID returned by 'start_state_forward' will always be tagged as
/// // a start state when start state specialization is enabled.
/// assert!(dfa.is_special_state(sid));
/// assert!(dfa.is_start_state(sid));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Compare the above with the default DFA configuration where start states
/// are _not_ specialized. In this case, the start state is not tagged at
/// all:
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
///
/// let dfa = DFA::new(r"[a-z]+")?;
///
/// let haystack = "123 foobar 4567";
/// let sid = dfa.start_state_forward(&Input::new(haystack))?;
/// // Start states are not special in the default configuration!
/// assert!(!dfa.is_special_state(sid));
/// assert!(!dfa.is_start_state(sid));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn specialize_start_states(mut self, yes: bool) -> Config {
self.specialize_start_states = Some(yes);
self
}
/// Set a size limit on the total heap used by a DFA.
///
/// This size limit is expressed in bytes and is applied during
/// determinization of an NFA into a DFA. If the DFA's heap usage, and only
/// the DFA, exceeds this configured limit, then determinization is stopped
/// and an error is returned.
///
/// This limit does not apply to auxiliary storage used during
/// determinization that isn't part of the generated DFA.
///
/// This limit is only applied during determinization. Currently, there is
/// no way to post-pone this check to after minimization if minimization
/// was enabled.
///
/// The total limit on heap used during determinization is the sum of the
/// DFA and determinization size limits.
///
/// The default is no limit.
///
/// # Example
///
/// This example shows a DFA that fails to build because of a configured
/// size limit. This particular example also serves as a cautionary tale
/// demonstrating just how big DFAs with large Unicode character classes
/// can get.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{dense, Automaton}, Input};
///
/// // 6MB isn't enough!
/// dense::Builder::new()
/// .configure(dense::Config::new().dfa_size_limit(Some(6_000_000)))
/// .build(r"\w{20}")
/// .unwrap_err();
///
/// // ... but 7MB probably is!
/// // (Note that DFA sizes aren't necessarily stable between releases.)
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().dfa_size_limit(Some(7_000_000)))
/// .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// While one needs a little more than 6MB to represent `\w{20}`, it
/// turns out that you only need a little more than 6KB to represent
/// `(?-u:\w{20})`. So only use Unicode if you need it!
///
/// As with [`Config::determinize_size_limit`], the size of a DFA is
/// influenced by other factors, such as what start state configurations
/// to support. For example, if you only need unanchored searches and not
/// anchored searches, then configuring the DFA to only support unanchored
/// searches can reduce its size. By default, DFAs support both unanchored
/// and anchored searches.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{dense, Automaton, StartKind}, Input};
///
/// // 3MB isn't enough!
/// dense::Builder::new()
/// .configure(dense::Config::new()
/// .dfa_size_limit(Some(3_000_000))
/// .start_kind(StartKind::Unanchored)
/// )
/// .build(r"\w{20}")
/// .unwrap_err();
///
/// // ... but 4MB probably is!
/// // (Note that DFA sizes aren't necessarily stable between releases.)
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new()
/// .dfa_size_limit(Some(4_000_000))
/// .start_kind(StartKind::Unanchored)
/// )
/// .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
self.dfa_size_limit = Some(bytes);
self
}
/// Set a size limit on the total heap used by determinization.
///
/// This size limit is expressed in bytes and is applied during
/// determinization of an NFA into a DFA. If the heap used for auxiliary
/// storage during determinization (memory that is not in the DFA but
/// necessary for building the DFA) exceeds this configured limit, then
/// determinization is stopped and an error is returned.
///
/// This limit does not apply to heap used by the DFA itself.
///
/// The total limit on heap used during determinization is the sum of the
/// DFA and determinization size limits.
///
/// The default is no limit.
///
/// # Example
///
/// This example shows a DFA that fails to build because of a
/// configured size limit on the amount of heap space used by
/// determinization. This particular example complements the example for
/// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
/// potentially make DFAs themselves big, but it also results in more
/// auxiliary storage during determinization. (Although, auxiliary storage
/// is still not as much as the DFA itself.)
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
/// use regex_automata::{dfa::{dense, Automaton}, Input};
///
/// // 600KB isn't enough!
/// dense::Builder::new()
/// .configure(dense::Config::new()
/// .determinize_size_limit(Some(600_000))
/// )
/// .build(r"\w{20}")
/// .unwrap_err();
///
/// // ... but 700KB probably is!
/// // (Note that auxiliary storage sizes aren't necessarily stable between
/// // releases.)
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new()
/// .determinize_size_limit(Some(700_000))
/// )
/// .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Note that some parts of the configuration on a DFA can have a
/// big impact on how big the DFA is, and thus, how much memory is
/// used. For example, the default setting for [`Config::start_kind`] is
/// [`StartKind::Both`]. But if you only need an anchored search, for
/// example, then it can be much cheaper to build a DFA that only supports
/// anchored searches. (Running an unanchored search with it would panic.)
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
/// use regex_automata::{
/// dfa::{dense, Automaton, StartKind},
/// Anchored, Input,
/// };
///
/// // 200KB isn't enough!
/// dense::Builder::new()
/// .configure(dense::Config::new()
/// .determinize_size_limit(Some(200_000))
/// .start_kind(StartKind::Anchored)
/// )
/// .build(r"\w{20}")
/// .unwrap_err();
///
/// // ... but 300KB probably is!
/// // (Note that auxiliary storage sizes aren't necessarily stable between
/// // releases.)
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new()
/// .determinize_size_limit(Some(300_000))
/// .start_kind(StartKind::Anchored)
/// )
/// .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
/// let input = Input::new(&haystack).anchored(Anchored::Yes);
/// assert!(dfa.try_search_fwd(&input)?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn determinize_size_limit(mut self, bytes: Option<usize>) -> Config {
self.determinize_size_limit = Some(bytes);
self
}
/// Returns whether this configuration has enabled simple state
/// acceleration.
pub fn get_accelerate(&self) -> bool {
self.accelerate.unwrap_or(true)
}
/// Returns the prefilter attached to this configuration, if any.
pub fn get_prefilter(&self) -> Option<&Prefilter> {
self.pre.as_ref().unwrap_or(&None).as_ref()
}
/// Returns whether this configuration has enabled the expensive process
/// of minimizing a DFA.
pub fn get_minimize(&self) -> bool {
self.minimize.unwrap_or(false)
}
/// Returns the match semantics set in this configuration.
pub fn get_match_kind(&self) -> MatchKind {
self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
}
/// Returns the starting state configuration for a DFA.
pub fn get_starts(&self) -> StartKind {
self.start_kind.unwrap_or(StartKind::Both)
}
/// Returns whether this configuration has enabled anchored starting states
/// for every pattern in the DFA.
pub fn get_starts_for_each_pattern(&self) -> bool {
self.starts_for_each_pattern.unwrap_or(false)
}
/// Returns whether this configuration has enabled byte classes or not.
/// This is typically a debugging oriented option, as disabling it confers
/// no speed benefit.
pub fn get_byte_classes(&self) -> bool {
self.byte_classes.unwrap_or(true)
}
/// Returns whether this configuration has enabled heuristic Unicode word
/// boundary support. When enabled, it is possible for a search to return
/// an error.
pub fn get_unicode_word_boundary(&self) -> bool {
self.unicode_word_boundary.unwrap_or(false)
}
/// Returns whether this configuration will instruct the DFA to enter a
/// quit state whenever the given byte is seen during a search. When at
/// least one byte has this enabled, it is possible for a search to return
/// an error.
pub fn get_quit(&self, byte: u8) -> bool {
self.quitset.map_or(false, |q| q.contains(byte))
}
/// Returns whether this configuration will instruct the DFA to
/// "specialize" start states. When enabled, the DFA will mark start states
/// as "special" so that search routines using the DFA can detect when
/// it's in a start state and do some kind of optimization (like run a
/// prefilter).
pub fn get_specialize_start_states(&self) -> bool {
self.specialize_start_states.unwrap_or(false)
}
/// Returns the DFA size limit of this configuration if one was set.
/// The size limit is total number of bytes on the heap that a DFA is
/// permitted to use. If the DFA exceeds this limit during construction,
/// then construction is stopped and an error is returned.
pub fn get_dfa_size_limit(&self) -> Option<usize> {
self.dfa_size_limit.unwrap_or(None)
}
/// Returns the determinization size limit of this configuration if one
/// was set. The size limit is total number of bytes on the heap that
/// determinization is permitted to use. If determinization exceeds this
/// limit during construction, then construction is stopped and an error is
/// returned.
///
/// This is different from the DFA size limit in that this only applies to
/// the auxiliary storage used during determinization. Once determinization
/// is complete, this memory is freed.
///
/// The limit on the total heap memory used is the sum of the DFA and
/// determinization size limits.
pub fn get_determinize_size_limit(&self) -> Option<usize> {
self.determinize_size_limit.unwrap_or(None)
}
/// Overwrite the default configuration such that the options in `o` are
/// always used. If an option in `o` is not set, then the corresponding
/// option in `self` is used. If it's not set in `self` either, then it
/// remains not set.
pub(crate) fn overwrite(&self, o: Config) -> Config {
Config {
accelerate: o.accelerate.or(self.accelerate),
pre: o.pre.or_else(|| self.pre.clone()),
minimize: o.minimize.or(self.minimize),
match_kind: o.match_kind.or(self.match_kind),
start_kind: o.start_kind.or(self.start_kind),
starts_for_each_pattern: o
.starts_for_each_pattern
.or(self.starts_for_each_pattern),
byte_classes: o.byte_classes.or(self.byte_classes),
unicode_word_boundary: o
.unicode_word_boundary
.or(self.unicode_word_boundary),
quitset: o.quitset.or(self.quitset),
specialize_start_states: o
.specialize_start_states
.or(self.specialize_start_states),
dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
determinize_size_limit: o
.determinize_size_limit
.or(self.determinize_size_limit),
}
}
}
/// A builder for constructing a deterministic finite automaton from regular
/// expressions.
///
/// This builder provides two main things:
///
/// 1. It provides a few different `build` routines for actually constructing
/// a DFA from different kinds of inputs. The most convenient is
/// [`Builder::build`], which builds a DFA directly from a pattern string. The
/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
/// from an NFA.
/// 2. The builder permits configuring a number of things.
/// [`Builder::configure`] is used with [`Config`] to configure aspects of
/// the DFA and the construction process itself. [`Builder::syntax`] and
/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
/// construction, respectively. The syntax and thompson configurations only
/// apply when building from a pattern string.
///
/// This builder always constructs a *single* DFA. As such, this builder
/// can only be used to construct regexes that either detect the presence
/// of a match or find the end location of a match. A single DFA cannot
/// produce both the start and end of a match. For that information, use a
/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
/// use a DFA directly is if the end location of a match is enough for your use
/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
/// second reverse DFA is needed to find the start of a match.
///
/// Note that if one wants to build a sparse DFA, you must first build a dense
/// DFA and convert that to a sparse DFA. There is no way to build a sparse
/// DFA without first building a dense DFA.
///
/// # Example
///
/// This example shows how to build a minimized DFA that completely disables
/// Unicode. That is:
///
/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
/// and `\b` are ASCII-only while `.` matches any byte except for `\n`
/// (instead of any UTF-8 encoding of a Unicode scalar value except for
/// `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
/// * The pattern itself is permitted to match invalid UTF-8. For example,
/// things like `[^a]` that match any byte except for `a` are permitted.
///
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
/// util::syntax,
/// HalfMatch, Input,
/// };
///
/// let dfa = dense::Builder::new()
/// .configure(dense::Config::new().minimize(false))
/// .syntax(syntax::Config::new().unicode(false).utf8(false))
/// .build(r"foo[^b]ar.*")?;
///
/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
/// let expected = Some(HalfMatch::must(0, 10));
/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug)]
pub struct Builder {
config: Config,
#[cfg(feature = "syntax")]
thompson: thompson::Compiler,
}
#[cfg(feature = "dfa-build")]
impl Builder {
/// Create a new dense DFA builder with the default configuration.
pub fn new() -> Builder {
Builder {
config: Config::default(),
#[cfg(feature = "syntax")]
thompson: thompson::Compiler::new(),
}
}
/// Build a DFA from the given pattern.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
#[cfg(feature = "syntax")]
pub fn build(&self, pattern: &str) -> Result<OwnedDFA, BuildError> {
self.build_many(&[pattern])
}
/// Build a DFA from the given patterns.
///
/// When matches are returned, the pattern ID corresponds to the index of
/// the pattern in the slice given.
#[cfg(feature = "syntax")]
pub fn build_many<P: AsRef<str>>(
&self,
patterns: &[P],
) -> Result<OwnedDFA, BuildError> {
let nfa = self
.thompson
.clone()
// We can always forcefully disable captures because DFAs do not
// support them.
.configure(
thompson::Config::new()
.which_captures(thompson::WhichCaptures::None),
)
.build_many(patterns)
.map_err(BuildError::nfa)?;
self.build_from_nfa(&nfa)
}
/// Build a DFA from the given NFA.
///
/// # Example
///
/// This example shows how to build a DFA if you already have an NFA in
/// hand.
///
/// ```
/// use regex_automata::{
/// dfa::{Automaton, dense},
/// nfa::thompson::NFA,
/// HalfMatch, Input,
/// };
///
/// let haystack = "foo123bar".as_bytes();
///
/// // This shows how to set non-default options for building an NFA.
/// let nfa = NFA::compiler()
/// .configure(NFA::config().shrink(true))
/// .build(r"[0-9]+")?;
/// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
/// let expected = Some(HalfMatch::must(0, 6));
/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn build_from_nfa(
&self,
nfa: &thompson::NFA,
) -> Result<OwnedDFA, BuildError> {
let mut quitset = self.config.quitset.unwrap_or(ByteSet::empty());
if self.config.get_unicode_word_boundary()
&& nfa.look_set_any().contains_word_unicode()
{
for b in 0x80..=0xFF {
quitset.add(b);
}
}
let classes = if !self.config.get_byte_classes() {
// DFAs will always use the equivalence class map, but enabling
// this option is useful for debugging. Namely, this will cause all
// transitions to be defined over their actual bytes instead of an
// opaque equivalence class identifier. The former is much easier
// to grok as a human.
ByteClasses::singletons()
} else {
let mut set = nfa.byte_class_set().clone();
// It is important to distinguish any "quit" bytes from all other
// bytes. Otherwise, a non-quit byte may end up in the same class
// as a quit byte, and thus cause the DFA stop when it shouldn't.
//
// Test case:
//
// regex-cli find hybrid regex -w @conn.json.1000x.log \
// '^#' '\b10\.55\.182\.100\b'
if !quitset.is_empty() {
set.add_set(&quitset);
}
set.byte_classes()
};
let mut dfa = DFA::initial(
classes,
nfa.pattern_len(),
self.config.get_starts(),
nfa.look_matcher(),
self.config.get_starts_for_each_pattern(),
self.config.get_prefilter().map(|p| p.clone()),
quitset,
Flags::from_nfa(&nfa),
)?;
determinize::Config::new()
.match_kind(self.config.get_match_kind())
.quit(quitset)
.dfa_size_limit(self.config.get_dfa_size_limit())
.determinize_size_limit(self.config.get_determinize_size_limit())
.run(nfa, &mut dfa)?;
if self.config.get_minimize() {
dfa.minimize();
}
if self.config.get_accelerate() {
dfa.accelerate();
}
// The state shuffling done before this point always assumes that start
// states should be marked as "special," even though it isn't the
// default configuration. State shuffling is complex enough as it is,
// so it's simpler to just "fix" our special state ID ranges to not
// include starting states after-the-fact.
if !self.config.get_specialize_start_states() {
dfa.special.set_no_special_start_states();
}
// Look for and set the universal starting states.
dfa.set_universal_starts();
Ok(dfa)
}
/// Apply the given dense DFA configuration options to this builder.
pub fn configure(&mut self, config: Config) -> &mut Builder {
self.config = self.config.overwrite(config);
self
}
/// Set the syntax configuration for this builder using
/// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
///
/// These settings only apply when constructing a DFA directly from a
/// pattern.
#[cfg(feature = "syntax")]
pub fn syntax(
&mut self,
config: crate::util::syntax::Config,
) -> &mut Builder {
self.thompson.syntax(config);
self
}
/// Set the Thompson NFA configuration for this builder using
/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
///
/// This permits setting things like whether the DFA should match the regex
/// in reverse or if additional time should be spent shrinking the size of
/// the NFA.
///
/// These settings only apply when constructing a DFA directly from a
/// pattern.
#[cfg(feature = "syntax")]
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
self.thompson.configure(config);
self
}
}
#[cfg(feature = "dfa-build")]
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}
/// A convenience alias for an owned DFA. We use this particular instantiation
/// a lot in this crate, so it's worth giving it a name. This instantiation
/// is commonly used for mutable APIs on the DFA while building it. The main
/// reason for making DFAs generic is no_std support, and more generally,
/// making it possible to load a DFA from an arbitrary slice of bytes.
#[cfg(feature = "alloc")]
pub(crate) type OwnedDFA = DFA<alloc::vec::Vec<u32>>;
/// A dense table-based deterministic finite automaton (DFA).
///
/// All dense DFAs have one or more start states, zero or more match states
/// and a transition table that maps the current state and the current byte
/// of input to the next state. A DFA can use this information to implement
/// fast searching. In particular, the use of a dense DFA generally makes the
/// trade off that match speed is the most valuable characteristic, even if
/// building the DFA may take significant time *and* space. (More concretely,
/// building a DFA takes time and space that is exponential in the size of the
/// pattern in the worst case.) As such, the processing of every byte of input
/// is done with a small constant number of operations that does not vary with
/// the pattern, its size or the size of the alphabet. If your needs don't line
/// up with this trade off, then a dense DFA may not be an adequate solution to
/// your problem.
///
/// In contrast, a [`sparse::DFA`] makes the opposite
/// trade off: it uses less space but will execute a variable number of
/// instructions per byte at match time, which makes it slower for matching.
/// (Note that space usage is still exponential in the size of the pattern in
/// the worst case.)
///
/// A DFA can be built using the default configuration via the
/// [`DFA::new`] constructor. Otherwise, one can
/// configure various aspects via [`dense::Builder`](Builder).
///
/// A single DFA fundamentally supports the following operations:
///
/// 1. Detection of a match.
/// 2. Location of the end of a match.
/// 3. In the case of a DFA with multiple patterns, which pattern matched is
/// reported as well.
///
/// A notable absence from the above list of capabilities is the location of
/// the *start* of a match. In order to provide both the start and end of
/// a match, *two* DFAs are required. This functionality is provided by a
/// [`Regex`](crate::dfa::regex::Regex).
///
/// # Type parameters
///
/// A `DFA` has one type parameter, `T`, which is used to represent state IDs,
/// pattern IDs and accelerators. `T` is typically a `Vec<u32>` or a `&[u32]`.
///
/// # The `Automaton` trait
///
/// This type implements the [`Automaton`] trait, which means it can be used
/// for searching. For example:
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let expected = HalfMatch::must(0, 8);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone)]
pub struct DFA<T> {
/// The transition table for this DFA. This includes the transitions
/// themselves, along with the stride, number of states and the equivalence
/// class mapping.
tt: TransitionTable<T>,
/// The set of starting state identifiers for this DFA. The starting state
/// IDs act as pointers into the transition table. The specific starting
/// state chosen for each search is dependent on the context at which the
/// search begins.
st: StartTable<T>,
/// The set of match states and the patterns that match for each
/// corresponding match state.
///
/// This structure is technically only needed because of support for
/// multi-regexes. Namely, multi-regexes require answering not just whether
/// a match exists, but _which_ patterns match. So we need to store the
/// matching pattern IDs for each match state. We do this even when there
/// is only one pattern for the sake of simplicity. In practice, this uses
/// up very little space for the case of one pattern.
ms: MatchStates<T>,
/// Information about which states are "special." Special states are states
/// that are dead, quit, matching, starting or accelerated. For more info,
/// see the docs for `Special`.
special: Special,
/// The accelerators for this DFA.
///
/// If a state is accelerated, then there exist only a small number of
/// bytes that can cause the DFA to leave the state. This permits searching
/// to use optimized routines to find those specific bytes instead of using
/// the transition table.
///
/// All accelerated states exist in a contiguous range in the DFA's
/// transition table. See dfa/special.rs for more details on how states are
/// arranged.
accels: Accels<T>,
/// Any prefilter attached to this DFA.
///
/// Note that currently prefilters are not serialized. When deserializing
/// a DFA from bytes, this is always set to `None`.
pre: Option<Prefilter>,
/// The set of "quit" bytes for this DFA.
///
/// This is only used when computing the start state for a particular
/// position in a haystack. Namely, in the case where there is a quit
/// byte immediately before the start of the search, this set needs to be
/// explicitly consulted. In all other cases, quit bytes are detected by
/// the DFA itself, by transitioning all quit bytes to a special "quit
/// state."
quitset: ByteSet,
/// Various flags describing the behavior of this DFA.
flags: Flags,
}
#[cfg(feature = "dfa-build")]
impl OwnedDFA {
/// Parse the given regular expression using a default configuration and
/// return the corresponding DFA.
///
/// If you want a non-default configuration, then use the
/// [`dense::Builder`](Builder) to set your own configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dfa = dense::DFA::new("foo[0-9]+bar")?;
/// let expected = Some(HalfMatch::must(0, 11));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn new(pattern: &str) -> Result<OwnedDFA, BuildError> {
Builder::new().build(pattern)
}
/// Parse the given regular expressions using a default configuration and
/// return the corresponding multi-DFA.
///
/// If you want a non-default configuration, then use the
/// [`dense::Builder`](Builder) to set your own configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
/// let expected = Some(HalfMatch::must(1, 3));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn new_many<P: AsRef<str>>(
patterns: &[P],
) -> Result<OwnedDFA, BuildError> {
Builder::new().build_many(patterns)
}
}
#[cfg(feature = "dfa-build")]
impl OwnedDFA {
/// Create a new DFA that matches every input.
///
/// # Example
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input};
///
/// let dfa = dense::DFA::always_match()?;
///
/// let expected = Some(HalfMatch::must(0, 0));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn always_match() -> Result<OwnedDFA, BuildError> {
let nfa = thompson::NFA::always_match();
Builder::new().build_from_nfa(&nfa)
}
/// Create a new DFA that never matches any input.
///
/// # Example
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense}, Input};
///
/// let dfa = dense::DFA::never_match()?;
/// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
/// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn never_match() -> Result<OwnedDFA, BuildError> {
let nfa = thompson::NFA::never_match();
Builder::new().build_from_nfa(&nfa)
}
/// Create an initial DFA with the given equivalence classes, pattern
/// length and whether anchored starting states are enabled for each
/// pattern. An initial DFA can be further mutated via determinization.
fn initial(
classes: ByteClasses,
pattern_len: usize,
starts: StartKind,
lookm: &LookMatcher,
--> --------------------
--> maximum size reached
--> --------------------
[ Dauer der Verarbeitung: 0.9 Sekunden
(vorverarbeitet)
]
|
2026-04-02
|
|
|
|
|