Ziele Untersuchung
mit Columbo Integrität von
Datenbanken Interaktion und
Portierbarkeit Ergonomie der
Schnittstellen

Angebot Produkte Projekt Beratung

Mittel Analytik Modellierung Sprachen Algebra Logik Hardware Denken Kreativität

Zusammenhänge Gesellschaft Wirtschaft Branche Firma


products/Sources/formale Sprachen/C/Firefox/third_party/rust/aho-corasick/src/packed/ (Browser von der Mozilla Stiftung Version 136.0.1^©) Datei vom 10.2.2025 mit Größe 17 kB

Quelle pattern.rs Sprache: unbekannt

Spracherkennung für: .rs vermutete Sprache: Unknown {[0] [0] [0]} [Methode: Schwerpunktbildung, einfache Gewichte, sechs Dimensionen]

use core::{cmp, fmt, mem, u16, usize};

use alloc::{boxed::Box, string::String, vec, vec::Vec};

use crate::{
    packed::{api::MatchKind, ext::Pointer},
    PatternID,
};

/// A non-empty collection of non-empty patterns to search for.
///
/// This collection of patterns is what is passed around to both execute
/// searches and to construct the searchers themselves. Namely, this permits
/// searches to avoid copying all of the patterns, and allows us to keep only
/// one copy throughout all packed searchers.
///
/// Note that this collection is not a set. The same pattern can appear more
/// than once.
#[derive(Clone, Debug)]
pub(crate) struct Patterns {
    /// The match semantics supported by this collection of patterns.
    ///
    /// The match semantics determines the order of the iterator over patterns.
    /// For leftmost-first, patterns are provided in the same order as were
    /// provided by the caller. For leftmost-longest, patterns are provided in
    /// descending order of length, with ties broken by the order in which they
    /// were provided by the caller.
    kind: MatchKind,
    /// The collection of patterns, indexed by their identifier.
    by_id: Vec<Vec<u8>>,
    /// The order of patterns defined for iteration, given by pattern
    /// identifiers. The order of `by_id` and `order` is always the same for
    /// leftmost-first semantics, but may be different for leftmost-longest
    /// semantics.
    order: Vec<PatternID>,
    /// The length of the smallest pattern, in bytes.
    minimum_len: usize,
    /// The total number of pattern bytes across the entire collection. This
    /// is used for reporting total heap usage in constant time.
    total_pattern_bytes: usize,
}

// BREADCRUMBS: I think we want to experiment with a different bucket
// representation. Basically, each bucket is just a Range<usize> to a single
// contiguous allocation? Maybe length-prefixed patterns or something? The
// idea is to try to get rid of the pointer chasing in verification. I don't
// know that that is the issue, but I suspect it is.

impl Patterns {
    /// Create a new collection of patterns for the given match semantics. The
    /// ID of each pattern is the index of the pattern at which it occurs in
    /// the `by_id` slice.
    ///
    /// If any of the patterns in the slice given are empty, then this panics.
    /// Similarly, if the number of patterns given is zero, then this also
    /// panics.
    pub(crate) fn new() -> Patterns {
        Patterns {
            kind: MatchKind::default(),
            by_id: vec![],
            order: vec![],
            minimum_len: usize::MAX,
            total_pattern_bytes: 0,
        }
    }

    /// Add a pattern to this collection.
    ///
    /// This panics if the pattern given is empty.
    pub(crate) fn add(&mut self, bytes: &[u8]) {
        assert!(!bytes.is_empty());
        assert!(self.by_id.len() <= u16::MAX as usize);

        let id = PatternID::new(self.by_id.len()).unwrap();
        self.order.push(id);
        self.by_id.push(bytes.to_vec());
        self.minimum_len = cmp::min(self.minimum_len, bytes.len());
        self.total_pattern_bytes += bytes.len();
    }

    /// Set the match kind semantics for this collection of patterns.
    ///
    /// If the kind is not set, then the default is leftmost-first.
    pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {
        self.kind = kind;
        match self.kind {
            MatchKind::LeftmostFirst => {
                self.order.sort();
            }
            MatchKind::LeftmostLongest => {
                let (order, by_id) = (&mut self.order, &mut self.by_id);
                order.sort_by(|&id1, &id2| {
                    by_id[id1].len().cmp(&by_id[id2].len()).reverse()
                });
            }
        }
    }

    /// Return the number of patterns in this collection.
    ///
    /// This is guaranteed to be greater than zero.
    pub(crate) fn len(&self) -> usize {
        self.by_id.len()
    }

    /// Returns true if and only if this collection of patterns is empty.
    pub(crate) fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the approximate total amount of heap used by these patterns, in
    /// units of bytes.
    pub(crate) fn memory_usage(&self) -> usize {
        self.order.len() * mem::size_of::<PatternID>()
            + self.by_id.len() * mem::size_of::<Vec<u8>>()
            + self.total_pattern_bytes
    }

    /// Clears all heap memory associated with this collection of patterns and
    /// resets all state such that it is a valid empty collection.
    pub(crate) fn reset(&mut self) {
        self.kind = MatchKind::default();
        self.by_id.clear();
        self.order.clear();
        self.minimum_len = usize::MAX;
    }

    /// Returns the length, in bytes, of the smallest pattern.
    ///
    /// This is guaranteed to be at least one.
    pub(crate) fn minimum_len(&self) -> usize {
        self.minimum_len
    }

    /// Returns the match semantics used by these patterns.
    pub(crate) fn match_kind(&self) -> &MatchKind {
        &self.kind
    }

    /// Return the pattern with the given identifier. If such a pattern does
    /// not exist, then this panics.
    pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {
        Pattern(&self.by_id[id])
    }

    /// Return the pattern with the given identifier without performing bounds
    /// checks.
    ///
    /// # Safety
    ///
    /// Callers must ensure that a pattern with the given identifier exists
    /// before using this method.
    pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
        Pattern(self.by_id.get_unchecked(id.as_usize()))
    }

    /// Return an iterator over all the patterns in this collection, in the
    /// order in which they should be matched.
    ///
    /// Specifically, in a naive multi-pattern matcher, the following is
    /// guaranteed to satisfy the match semantics of this collection of
    /// patterns:
    ///
    /// ```ignore
    /// for i in 0..haystack.len():
    ///   for p in patterns.iter():
    ///     if haystack[i..].starts_with(p.bytes()):
    ///       return Match(p.id(), i, i + p.bytes().len())
    /// ```
    ///
    /// Namely, among the patterns in a collection, if they are matched in
    /// the order provided by this iterator, then the result is guaranteed
    /// to satisfy the correct match semantics. (Either leftmost-first or
    /// leftmost-longest.)
    pub(crate) fn iter(&self) -> PatternIter<'_> {
        PatternIter { patterns: self, i: 0 }
    }
}

/// An iterator over the patterns in the `Patterns` collection.
///
/// The order of the patterns provided by this iterator is consistent with the
/// match semantics of the originating collection of patterns.
///
/// The lifetime `'p` corresponds to the lifetime of the collection of patterns
/// this is iterating over.
#[derive(Debug)]
pub(crate) struct PatternIter<'p> {
    patterns: &'p Patterns,
    i: usize,
}

impl<'p> Iterator for PatternIter<'p> {
    type Item = (PatternID, Pattern<'p>);

    fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
        if self.i >= self.patterns.len() {
            return None;
        }
        let id = self.patterns.order[self.i];
        let p = self.patterns.get(id);
        self.i += 1;
        Some((id, p))
    }
}

/// A pattern that is used in packed searching.
#[derive(Clone)]
pub(crate) struct Pattern<'a>(&'a [u8]);

impl<'a> fmt::Debug for Pattern<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Pattern")
            .field("lit", &String::from_utf8_lossy(&self.0))
            .finish()
    }
}

impl<'p> Pattern<'p> {
    /// Returns the length of this pattern, in bytes.
    pub(crate) fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns the bytes of this pattern.
    pub(crate) fn bytes(&self) -> &[u8] {
        &self.0
    }

    /// Returns the first `len` low nybbles from this pattern. If this pattern
    /// is shorter than `len`, then this panics.
    pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> {
        let mut nybs = vec![0; len].into_boxed_slice();
        for (i, byte) in self.bytes().iter().take(len).enumerate() {
            nybs[i] = byte & 0xF;
        }
        nybs
    }

    /// Returns true if this pattern is a prefix of the given bytes.
    #[inline(always)]
    pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool {
        is_prefix(bytes, self.bytes())
    }

    /// Returns true if this pattern is a prefix of the haystack given by the
    /// raw `start` and `end` pointers.
    ///
    /// # Safety
    ///
    /// * It must be the case that `start < end` and that the distance between
    /// them is at least equal to `V::BYTES`. That is, it must always be valid
    /// to do at least an unaligned load of `V` at `start`.
    /// * Both `start` and `end` must be valid for reads.
    /// * Both `start` and `end` must point to an initialized value.
    /// * Both `start` and `end` must point to the same allocated object and
    /// must either be in bounds or at most one byte past the end of the
    /// allocated object.
    /// * Both `start` and `end` must be _derived from_ a pointer to the same
    /// object.
    /// * The distance between `start` and `end` must not overflow `isize`.
    /// * The distance being in bounds must not rely on "wrapping around" the
    /// address space.
    #[inline(always)]
    pub(crate) unsafe fn is_prefix_raw(
        &self,
        start: *const u8,
        end: *const u8,
    ) -> bool {
        let patlen = self.bytes().len();
        let haylen = end.distance(start);
        if patlen > haylen {
            return false;
        }
        // SAFETY: We've checked that the haystack has length at least equal
        // to this pattern. All other safety concerns are the responsibility
        // of the caller.
        is_equal_raw(start, self.bytes().as_ptr(), patlen)
    }
}

/// Returns true if and only if `needle` is a prefix of `haystack`.
///
/// This uses a latency optimized variant of `memcmp` internally which *might*
/// make this faster for very short strings.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
#[inline(always)]
fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
    if needle.len() > haystack.len() {
        return false;
    }
    // SAFETY: Our pointers are derived directly from borrowed slices which
    // uphold all of our safety guarantees except for length. We account for
    // length with the check above.
    unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) }
}

/// Compare corresponding bytes in `x` and `y` for equality.
///
/// That is, this returns true if and only if `x.len() == y.len()` and
/// `x[i] == y[i]` for all `0 <= i < x.len()`.
///
/// Note that this isn't used. We only use it in tests as a convenient way
/// of testing `is_equal_raw`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
#[cfg(test)]
#[inline(always)]
fn is_equal(x: &[u8], y: &[u8]) -> bool {
    if x.len() != y.len() {
        return false;
    }
    // SAFETY: Our pointers are derived directly from borrowed slices which
    // uphold all of our safety guarantees except for length. We account for
    // length with the check above.
    unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
}

/// Compare `n` bytes at the given pointers for equality.
///
/// This returns true if and only if `*x.add(i) == *y.add(i)` for all
/// `0 <= i < n`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results in
/// a call out to the current platform's `libc` which might not be inlineable
/// or have other overhead. This routine isn't guaranteed to be a win, but it
/// might be in some cases.
///
/// # Safety
///
/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
/// * Both `x` and `y` must point to an initialized value.
/// * Both `x` and `y` must each point to an allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object. `x` and `y` do not need to point to the same allocated
/// object, but they may.
/// * Both `x` and `y` must be _derived from_ a pointer to their respective
/// allocated objects.
/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
/// for `y` and `y+n`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
#[inline(always)]
unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {
    // If we don't have enough bytes to do 4-byte at a time loads, then
    // handle each possible length specially. Note that I used to have a
    // byte-at-a-time loop here and that turned out to be quite a bit slower
    // for the memmem/pathological/defeat-simple-vector-alphabet benchmark.
    if n < 4 {
        return match n {
            0 => true,
            1 => x.read() == y.read(),
            2 => {
                x.cast::<u16>().read_unaligned()
                    == y.cast::<u16>().read_unaligned()
            }
            // I also tried copy_nonoverlapping here and it looks like the
            // codegen is the same.
            3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
            _ => unreachable!(),
        };
    }
    // When we have 4 or more bytes to compare, then proceed in chunks of 4 at
    // a time using unaligned loads.
    //
    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
    // that this particular version of memcmp is likely to be called with tiny
    // needles. That means that if we do 8 byte loads, then a higher proportion
    // of memcmp calls will use the slower variant above. With that said, this
    // is a hypothesis and is only loosely supported by benchmarks. There's
    // likely some improvement that could be made here. The main thing here
    // though is to optimize for latency, not throughput.

    // SAFETY: The caller is responsible for ensuring the pointers we get are
    // valid and readable for at least `n` bytes. We also do unaligned loads,
    // so there's no need to ensure we're aligned. (This is justified by this
    // routine being specifically for short strings.)
    let xend = x.add(n.wrapping_sub(4));
    let yend = y.add(n.wrapping_sub(4));
    while x < xend {
        let vx = x.cast::<u32>().read_unaligned();
        let vy = y.cast::<u32>().read_unaligned();
        if vx != vy {
            return false;
        }
        x = x.add(4);
        y = y.add(4);
    }
    let vx = xend.cast::<u32>().read_unaligned();
    let vy = yend.cast::<u32>().read_unaligned();
    vx == vy
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn equals_different_lengths() {
        assert!(!is_equal(b"", b"a"));
        assert!(!is_equal(b"a", b""));
        assert!(!is_equal(b"ab", b"a"));
        assert!(!is_equal(b"a", b"ab"));
    }

    #[test]
    fn equals_mismatch() {
        let one_mismatch = [
            (&b"a"[..], &b"x"[..]),
            (&b"ab"[..], &b"ax"[..]),
            (&b"abc"[..], &b"abx"[..]),
            (&b"abcd"[..], &b"abcx"[..]),
            (&b"abcde"[..], &b"abcdx"[..]),
            (&b"abcdef"[..], &b"abcdex"[..]),
            (&b"abcdefg"[..], &b"abcdefx"[..]),
            (&b"abcdefgh"[..], &b"abcdefgx"[..]),
            (&b"abcdefghi"[..], &b"abcdefghx"[..]),
            (&b"abcdefghij"[..], &b"abcdefghix"[..]),
            (&b"abcdefghijk"[..], &b"abcdefghijx"[..]),
            (&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),
            (&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),
            (&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),
        ];
        for (x, y) in one_mismatch {
            assert_eq!(x.len(), y.len(), "lengths should match");
            assert!(!is_equal(x, y));
            assert!(!is_equal(y, x));
        }
    }

    #[test]
    fn equals_yes() {
        assert!(is_equal(b"", b""));
        assert!(is_equal(b"a", b"a"));
        assert!(is_equal(b"ab", b"ab"));
        assert!(is_equal(b"abc", b"abc"));
        assert!(is_equal(b"abcd", b"abcd"));
        assert!(is_equal(b"abcde", b"abcde"));
        assert!(is_equal(b"abcdef", b"abcdef"));
        assert!(is_equal(b"abcdefg", b"abcdefg"));
        assert!(is_equal(b"abcdefgh", b"abcdefgh"));
        assert!(is_equal(b"abcdefghi", b"abcdefghi"));
    }

    #[test]
    fn prefix() {
        assert!(is_prefix(b"", b""));
        assert!(is_prefix(b"a", b""));
        assert!(is_prefix(b"ab", b""));
        assert!(is_prefix(b"foo", b"foo"));
        assert!(is_prefix(b"foobar", b"foo"));

        assert!(!is_prefix(b"foo", b"fob"));
        assert!(!is_prefix(b"foobar", b"fob"));
    }
}