// Copyright (c) 2013 The Chromium Authors. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include"energy_endpointer.h"
#include <math.h>
namespace {
// Returns the RMS of the input signal measured about its mean, i.e.
// sqrt(mean(x^2) - mean(x)^2), so a constant (DC-only) signal yields 0.
//
// |samples| must point to at least |num_samples| values. When |num_samples|
// is not positive there is nothing to average, and 0 is returned instead of
// dividing by zero and handing NaN back to the caller.
float RMS(const int16_t* samples, int num_samples) {
  if (num_samples <= 0)
    return 0.0f;

  int64_t ssq_int64_t = 0;
  int64_t sum_int64_t = 0;
  for (int i = 0; i < num_samples; ++i) {
    // Widen first so that the square is accumulated in 64-bit arithmetic.
    const int64_t sample = samples[i];
    sum_int64_t += sample;
    ssq_int64_t += sample * sample;
  }

  // Now convert to floating point.
  const double mean = static_cast<double>(sum_int64_t) / num_samples;
  const double mean_square = static_cast<double>(ssq_int64_t) / num_samples;

  // Rounding could leave the difference marginally below zero; clamp it so
  // that sqrt() is never asked for the root of a negative number.
  double variance = mean_square - (mean * mean);
  if (variance < 0.0)
    variance = 0.0;

  return static_cast<float>(sqrt(variance));
}
// Stores threshold-crossing histories for making decisions about the speech // state. class EnergyEndpointer::HistoryRing { public:
HistoryRing() : insertion_index_(0) {}
// Resets the ring to |size| elements each with state |initial_state| void SetRing(int size, bool initial_state);
// Inserts a new entry into the ring and drops the oldest entry. void Insert(int64_t time_us, bool decision);
// Returns the time in microseconds of the most recently added entry.
int64_t EndTime() const;
// Returns the sum of all intervals during which 'decision' is true within // the time in seconds specified by 'duration'. The returned interval is // in seconds. float RingSum(float duration_sec);
if (reset_threshold) {
decision_threshold_ = params_.decision_threshold();
rms_adapt_ = decision_threshold_;
noise_level_ = params_.decision_threshold() / 2.0f;
frame_counter_ = 0; // Used for rapid initial update of levels.
}
// Set up the memories to hold the history windows.
history_->SetRing(TimeToFrame(max_window_dur_), false);
// Flag that indicates that current input should be used for // estimating the environment. The user has not yet started input // by e.g. pressed the push-to-talk button. By default, this is // false for backward compatibility.
estimating_environment_ = false;
}
// Find the longest history interval to be used, and make the ring // large enough to accommodate that number of frames. NOTE: This // depends upon ep_frame_period being set correctly in the factory // that did this instantiation.
max_window_dur_ = params_.onset_window(); if (params_.speech_on_window() > max_window_dur_)
max_window_dur_ = params_.speech_on_window(); if (params_.offset_window() > max_window_dur_)
max_window_dur_ = params_.offset_window();
Restart(true);
// Flag that indicates that current input should be used for // estimating the environment. The user has not yet started input // by e.g. pressed the push-to-talk button. By default, this is // false for backward compatibility.
estimating_environment_ = false; // The initial value of the noise and speech levels is inconsequential. // The level of the first frame will overwrite these values.
noise_level_ = params_.decision_threshold() / 2.0f;
fast_update_frames_ = static_cast<int64_t>(params_.fast_update_dur() / params_.frame_period());
frame_counter_ = 0; // Used for rapid initial update of levels.
// Check that this is user input audio vs. pre-input adaptation audio. // Input audio starts when the user indicates start of input, by e.g. // pressing push-to-talk. Audio received prior to that is used to update // noise and speech level estimates. if (!estimating_environment_) { bool decision = false; if ((endpointer_time_us_ - user_input_start_time_us_) <
Secs2Usecs(params_.contamination_rejection_period())) {
decision = false; //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("decision: forced to false, time: %d", endpointer_time_us_));
} else {
decision = (rms > decision_threshold_);
}
history_->Insert(endpointer_time_us_, decision);
switch (status_) { case EP_PRE_SPEECH: if (history_->RingSum(params_.onset_window()) >
params_.onset_detect_dur()) {
status_ = EP_POSSIBLE_ONSET;
} break;
case EP_POSSIBLE_ONSET: { float tsum = history_->RingSum(params_.onset_window()); if (tsum > params_.onset_confirm_dur()) {
status_ = EP_SPEECH_PRESENT;
} else { // If signal is not maintained, drop back to pre-speech. if (tsum <= params_.onset_detect_dur())
status_ = EP_PRE_SPEECH;
} break;
}
case EP_SPEECH_PRESENT: { // To induce hysteresis in the state residency, we allow a // smaller residency time in the on_ring, than was required to // enter the SPEECH_PRESENT state. float on_time = history_->RingSum(params_.speech_on_window()); if (on_time < params_.on_maintain_dur())
status_ = EP_POSSIBLE_OFFSET; break;
}
case EP_POSSIBLE_OFFSET: if (history_->RingSum(params_.offset_window()) <=
offset_confirm_dur_sec_) { // Note that this offset time may be beyond the end // of the input buffer in a real-time system. It will be up // to the RecognizerSession to decide what to do.
status_ = EP_PRE_SPEECH; // Automatically reset for next utterance.
} else { // If speech picks up again we allow return to SPEECH_PRESENT. if (history_->RingSum(params_.speech_on_window()) >=
params_.on_maintain_dur())
status_ = EP_SPEECH_PRESENT;
} break;
default: break;
}
// If this is a quiet, non-speech region, slowly adapt the detection // threshold to be about 6dB above the average RMS. if ((!decision) && (status_ == EP_PRE_SPEECH)) {
decision_threshold_ = (0.98f * decision_threshold_) + (0.02f * 2 * rms);
rms_adapt_ = decision_threshold_;
} else { // If this is in a speech region, adapt the decision threshold to // be about 10dB below the average RMS. If the noise level is high, // the threshold is pushed up. // Adaptation up to a higher level is 5 times faster than decay to // a lower level. if ((status_ == EP_SPEECH_PRESENT) && decision) { if (rms_adapt_ > rms) {
rms_adapt_ = (0.99f * rms_adapt_) + (0.01f * rms);
} else {
rms_adapt_ = (0.95f * rms_adapt_) + (0.05f * rms);
} float target_threshold = 0.3f * rms_adapt_ + noise_level_;
decision_threshold_ = (.90f * decision_threshold_) +
(0.10f * target_threshold);
}
}
// Set a floor if (decision_threshold_ < params_.min_decision_threshold())
decision_threshold_ = params_.min_decision_threshold();
}
// Update speech and noise levels.
UpdateLevels(rms);
++frame_counter_;
void EnergyEndpointer::UpdateLevels(float rms) { // Update quickly initially. We assume this is noise and that // speech is 6dB above the noise. if (frame_counter_ < fast_update_frames_) { // Alpha increases from 0 to (k-1)/k where k is the number of time // steps in the initial adaptation period. float alpha = static_cast<float>(frame_counter_) / static_cast<float>(fast_update_frames_);
noise_level_ = (alpha * noise_level_) + ((1 - alpha) * rms); //PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, ("FAST UPDATE, frame_counter_ %d, fast_update_frames_ %d", frame_counter_, fast_update_frames_));
} else { // Update Noise level. The noise level adapts quickly downward, but // slowly upward. The noise_level_ parameter is not currently used // for threshold adaptation. It is used for UI feedback. if (noise_level_ < rms)
noise_level_ = (0.999f * noise_level_) + (0.001f * rms); else
noise_level_ = (0.95f * noise_level_) + (0.05f * rms);
} if (estimating_environment_ || (frame_counter_ < fast_update_frames_)) {
decision_threshold_ = noise_level_ * 2; // 6dB above noise level. // Set a floor if (decision_threshold_ < params_.min_decision_threshold())
decision_threshold_ = params_.min_decision_threshold();
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.