// Copyright (c) 2013 The Chromium Authors. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// The EnergyEndpointer class finds likely speech onset and offset points. // // The implementation described here is about the simplest possible. // It is based on timings of threshold crossings for overall signal // RMS. It is suitable for light weight applications. // // As written, the basic idea is that one specifies intervals that // must be occupied by super- and sub-threshold energy levels, and // defers decisions re onset and offset times until these // specifications have been met. Three basic intervals are tested: an // onset window, a speech-on window, and an offset window. We require // super-threshold to exceed some mimimum total durations in the onset // and speech-on windows before declaring the speech onset time, and // we specify a required sub-threshold residency in the offset window // before declaring speech offset. As the various residency requirements are // met, the EnergyEndpointer instance assumes various states, and can return the // ID of these states to the client (see EpStatus below). // // The levels of the speech and background noise are continuously updated. It is // important that the background noise level be estimated initially for // robustness in noisy conditions. The first frames are assumed to be background // noise and a fast update rate is used for the noise level. The duration for // fast update is controlled by the fast_update_dur_ paramter. // // If used in noisy conditions, the endpointer should be started and run in the // EnvironmentEstimation mode, for at least 200ms, before switching to // UserInputMode. // Audio feedback contamination can appear in the input audio, if not cut // out or handled by echo cancellation. Audio feedback can trigger a false // accept. The false accepts can be ignored by setting // ep_contamination_rejection_period.
class EnergyEndpointer { public: // The default construction MUST be followed by Init(), before any // other use can be made of the instance.
EnergyEndpointer(); virtual ~EnergyEndpointer();
void Init(const EnergyEndpointerParams& params);
// Start the endpointer. This should be called at the beginning of a session. void StartSession();
// Stop the endpointer. void EndSession();
// Start environment estimation. Audio will be used for environment estimation // i.e. noise level estimation. void SetEnvironmentEstimationMode();
// Start user input. This should be called when the user indicates start of // input, e.g. by pressing a button. void SetUserInputMode();
// Computes the next input frame and modifies EnergyEndpointer status as // appropriate based on the computation. void ProcessAudioFrame(int64_t time_us, const int16_t* samples, int num_samples, float* rms_out);
// Returns the current state of the EnergyEndpointer and the time // corresponding to the most recently computed frame.
EpStatus Status(int64_t* status_time_us) const;
// Returns estimated noise level in dB. float GetNoiseLevelDb() const;
private: class HistoryRing;
// Resets the endpointer internal state. If reset_threshold is true, the // state will be reset completely, including adaptive thresholds and the // removal of all history information. void Restart(bool reset_threshold);
// Update internal speech and noise levels. void UpdateLevels(float rms);
// Returns the number of frames (or frame number) corresponding to // the 'time' (in seconds). int TimeToFrame(float time) const;
EpStatus status_; // The current state of this instance. float offset_confirm_dur_sec_; // max on time allowed to confirm POST_SPEECH
int64_t endpointer_time_us_; // Time of the most recently received audio frame.
int64_t fast_update_frames_; // Number of frames for initial level adaptation.
int64_t frame_counter_; // Number of frames seen. Used for initial adaptation. float max_window_dur_; // Largest search window size (seconds) float sample_rate_; // Sampling rate.
// Ring buffers to hold the speech activity history.
UniquePtr<HistoryRing> history_;
// RMS which must be exceeded to conclude frame is speech. float decision_threshold_;
// Flag to indicate that audio should be used to estimate environment, prior // to receiving user input. bool estimating_environment_;
// Estimate of the background noise level. Used externally for UI feedback. float noise_level_;
// An adaptive threshold used to update decision_threshold_ when appropriate. float rms_adapt_;
// Start lag corresponds to the highest fundamental frequency. int start_lag_;
// End lag corresponds to the lowest fundamental frequency. int end_lag_;
// Time when mode switched from environment estimation to user input. This // is used to time forced rejection of audio feedback contamination.
int64_t user_input_start_time_us_;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.