/*
 *  Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
// Spectrum Weighting staticconst int16_t kSpectrumWeight[kNumChannels] = { 6, 8, 10, 12, 14, 16 }; staticconst int16_t kNoiseUpdateConst = 655; // Q15 staticconst int16_t kSpeechUpdateConst = 6554; // Q15 staticconst int16_t kBackEta = 154; // Q8 // Minimum difference between the two models, Q5 staticconst int16_t kMinimumDifference[kNumChannels] = {
544, 544, 576, 576, 576, 576 }; // Upper limit of mean value for speech model, Q7 staticconst int16_t kMaximumSpeech[kNumChannels] = {
11392, 11392, 11520, 11520, 11520, 11520 }; // Minimum value for mean value staticconst int16_t kMinimumMean[kNumGaussians] = { 640, 768 }; // Upper limit of mean value for noise model, Q7 staticconst int16_t kMaximumNoise[kNumChannels] = {
9216, 9088, 8960, 8832, 8704, 8576 }; // Start values for the Gaussian models, Q7 // Weights for the two Gaussians for the six channels (noise) staticconst int16_t kNoiseDataWeights[kTableSize] = {
34, 62, 72, 66, 53, 25, 94, 66, 56, 62, 75, 103 }; // Weights for the two Gaussians for the six channels (speech) staticconst int16_t kSpeechDataWeights[kTableSize] = {
48, 82, 45, 87, 50, 47, 80, 46, 83, 41, 78, 81 }; // Means for the two Gaussians for the six channels (noise) staticconst int16_t kNoiseDataMeans[kTableSize] = {
6738, 4892, 7065, 6715, 6771, 3369, 7646, 3863, 7820, 7266, 5020, 4362 }; // Means for the two Gaussians for the six channels (speech) staticconst int16_t kSpeechDataMeans[kTableSize] = {
8306, 10085, 10078, 11823, 11843, 6309, 9473, 9571, 10879, 7581, 8180, 7483
}; // Stds for the two Gaussians for the six channels (noise) staticconst int16_t kNoiseDataStds[kTableSize] = {
378, 1064, 493, 582, 688, 593, 474, 697, 475, 688, 421, 455 }; // Stds for the two Gaussians for the six channels (speech) staticconst int16_t kSpeechDataStds[kTableSize] = {
555, 505, 567, 524, 585, 1231, 509, 828, 492, 1540, 1079, 850 };
// Constants used in GmmProbability(). // // Maximum number of counted speech (VAD = 1) frames in a row. staticconst int16_t kMaxSpeechFrames = 6; // Minimum standard deviation for both speech and noise. staticconst int16_t kMinStd = 384;
// Calculates the weighted average w.r.t. number of Gaussians. The `data` are // updated with an `offset` before averaging. // // - data [i/o] : Data to average. // - offset [i] : An offset added to `data`. // - weights [i] : Weights used for averaging. // // returns : The weighted average. static int32_t WeightedAverage(int16_t* data, int16_t offset, const int16_t* weights) { int k;
int32_t weighted_average = 0;
// An s16 x s32 -> s32 multiplication that's allowed to overflow. (It's still // undefined behavior, so not a good idea; this just makes UBSan ignore the // violation, so that our old code can continue to do what it's always been // doing.) staticinline int32_t RTC_NO_SANITIZE("signed-integer-overflow")
OverflowingMulS16ByS32ToS32(int16_t a, int32_t b) { return a * b;
}
// Calculates the probabilities for both speech and background noise using
// Gaussian Mixture Models (GMM). A hypothesis-test is performed to decide which
// type of signal is most probable.
//
// - self           [i/o] : Pointer to VAD instance
// - features       [i]   : Feature vector of length `kNumChannels`
//                          = log10(energy in frequency band)
// - total_power    [i]   : Total power in audio frame.
// - frame_length   [i]   : Number of input samples
//
// - returns              : the VAD decision (0 - noise, 1 - speech).
//
// NOTE(review): this function is visibly truncated in this file. Several
// assignments present upstream are missing (initialization of
// `individualTest`/`totalTest` from the threshold tables, and the
// `nmk`/`nmk2`/`smk`/`smk2`/`tmp2_s16` computations inside the update loop),
// and the trailing variance updates, hangover logic and `return vadflag;`
// are absent — the braces do not even balance. As written, several locals
// are read uninitialized. Restore the missing pieces from the original
// source before using this code.
static int16_t GmmProbability(VadInstT* self, int16_t* features,
                              int16_t total_power, size_t frame_length) {
  int channel, k;
  int16_t feature_minimum;
  int16_t h0, h1;
  int16_t log_likelihood_ratio;
  int16_t vadflag = 0;
  int16_t shifts_h0, shifts_h1;
  int16_t tmp_s16, tmp1_s16, tmp2_s16;
  int16_t diff;
  int gaussian;
  int16_t nmk, nmk2, nmk3, smk, smk2, nsk, ssk;
  int16_t delt, ndelt;
  int16_t maxspe, maxmu;
  int16_t deltaN[kTableSize], deltaS[kTableSize];
  int16_t ngprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int16_t sgprvec[kTableSize] = { 0 };  // Conditional probability = 0.
  int32_t h0_test, h1_test;
  int32_t tmp1_s32, tmp2_s32;
  int32_t sum_log_likelihood_ratios = 0;
  int32_t noise_global_mean, speech_global_mean;
  int32_t noise_probability[kNumGaussians], speech_probability[kNumGaussians];
  int16_t overhead1, overhead2, individualTest, totalTest;

  if (total_power > kMinEnergy) {
    // The signal power of current frame is large enough for processing. The
    // processing consists of two parts:
    // 1) Calculating the likelihood of speech and thereby a VAD decision.
    // 2) Updating the underlying model, w.r.t., the decision made.

    // The detection scheme is an LRT with hypothesis
    // H0: Noise
    // H1: Speech
    //
    // We combine a global LRT with local tests, for each frequency sub-band,
    // here defined as `channel`.
    for (channel = 0; channel < kNumChannels; channel++) {
      // For each channel we model the probability with a GMM consisting of
      // `kNumGaussians`, with different means and standard deviations depending
      // on H0 or H1.
      h0_test = 0;
      h1_test = 0;
      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;
        // Probability under H0, that is, probability of frame being noise.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->noise_means[gaussian],
                                                 self->noise_stds[gaussian],
                                                 &deltaN[gaussian]);
        noise_probability[k] = kNoiseDataWeights[gaussian] * tmp1_s32;
        h0_test += noise_probability[k];  // Q27

        // Probability under H1, that is, probability of frame being speech.
        // Value given in Q27 = Q7 * Q20.
        tmp1_s32 = WebRtcVad_GaussianProbability(features[channel],
                                                 self->speech_means[gaussian],
                                                 self->speech_stds[gaussian],
                                                 &deltaS[gaussian]);
        speech_probability[k] = kSpeechDataWeights[gaussian] * tmp1_s32;
        h1_test += speech_probability[k];  // Q27
      }

      // Calculate the log likelihood ratio: log2(Pr{X|H1} / Pr{X|H0}).
      // Approximation:
      // log2(Pr{X|H1} / Pr{X|H0}) = log2(Pr{X|H1}*2^Q) - log2(Pr{X|H0}*2^Q)
      //                           = log2(h1_test) - log2(h0_test)
      //                           = log2(2^(31-shifts_h1)*(1+b1))
      //                             - log2(2^(31-shifts_h0)*(1+b0))
      //                           = shifts_h0 - shifts_h1
      //                             + log2(1+b1) - log2(1+b0)
      //                          ~= shifts_h0 - shifts_h1
      //
      // Note that b0 and b1 are values less than 1, hence, 0 <= log2(1+b0) < 1.
      // Further, b0 and b1 are independent and on the average the two terms
      // cancel.
      shifts_h0 = WebRtcSpl_NormW32(h0_test);
      shifts_h1 = WebRtcSpl_NormW32(h1_test);
      if (h0_test == 0) {
        shifts_h0 = 31;
      }
      if (h1_test == 0) {
        shifts_h1 = 31;
      }
      log_likelihood_ratio = shifts_h0 - shifts_h1;

      // Update `sum_log_likelihood_ratios` with spectrum weighting. This is
      // used for the global VAD decision.
      sum_log_likelihood_ratios +=
          (int32_t) (log_likelihood_ratio * kSpectrumWeight[channel]);

      // Local VAD decision.
      // NOTE(review): `individualTest` is read here but never assigned in the
      // code visible in this file — its initialization is missing.
      if ((log_likelihood_ratio * 4) > individualTest) {
        vadflag = 1;
      }

      // TODO(bjornv): The conditional probabilities below are applied on the
      // hard coded number of Gaussians set to two. Find a way to generalize.
      // Calculate local noise probabilities used later when updating the GMM.
      h0 = (int16_t) (h0_test >> 12);  // Q15
      if (h0 > 0) {
        // High probability of noise. Assign conditional probabilities for each
        // Gaussian in the GMM.
        tmp1_s32 = (noise_probability[0] & 0xFFFFF000) << 2;  // Q29
        ngprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h0);  // Q14
        ngprvec[channel + kNumChannels] = 16384 - ngprvec[channel];
      } else {
        // Low noise probability. Assign conditional probability 1 to the first
        // Gaussian and 0 to the rest (which is already set at initialization).
        ngprvec[channel] = 16384;
      }

      // Calculate local speech probabilities used later when updating the GMM.
      h1 = (int16_t) (h1_test >> 12);  // Q15
      if (h1 > 0) {
        // High probability of speech. Assign conditional probabilities for each
        // Gaussian in the GMM. Otherwise use the initialized values, i.e., 0.
        tmp1_s32 = (speech_probability[0] & 0xFFFFF000) << 2;  // Q29
        sgprvec[channel] = (int16_t) WebRtcSpl_DivW32W16(tmp1_s32, h1);  // Q14
        sgprvec[channel + kNumChannels] = 16384 - sgprvec[channel];
      }
    }

    // Make a global VAD decision.
    // NOTE(review): `totalTest` is also read uninitialized in the visible code.
    vadflag |= (sum_log_likelihood_ratios >= totalTest);

    // Update the model parameters.
    maxspe = 12800;
    for (channel = 0; channel < kNumChannels; channel++) {
      // Get minimum value in past which is used for long term correction in Q4.
      feature_minimum = WebRtcVad_FindMinimum(self, features[channel], channel);

      // Compute the "global" mean, that is the sum of the two means weighted.
      noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                          &kNoiseDataWeights[channel]);
      tmp1_s16 = (int16_t) (noise_global_mean >> 6);  // Q8

      for (k = 0; k < kNumGaussians; k++) {
        gaussian = channel + k * kNumChannels;

        // Long term correction of the noise mean.
        // Q8 - Q8 = Q8.
        ndelt = (feature_minimum << 4) - tmp1_s16;
        // Q7 + (Q8 * Q8) >> 9 = Q7.
        // NOTE(review): `nmk2` is read here, but the upstream code that
        // computes `nmk`/`nmk2` is missing from this file.
        nmk3 = nmk2 + (int16_t)((ndelt * kBackEta) >> 9);

        // Control that the noise mean does not drift to much.
        tmp_s16 = (int16_t) ((k + 5) << 7);
        if (nmk3 < tmp_s16) {
          nmk3 = tmp_s16;
        }
        tmp_s16 = (int16_t) ((72 + k - channel) << 7);
        if (nmk3 > tmp_s16) {
          nmk3 = tmp_s16;
        }
        self->noise_means[gaussian] = nmk3;

        // Control that the speech mean does not drift to much.
        // NOTE(review): `smk2` is read here, but the upstream code that
        // computes `smk`/`smk2` is missing from this file.
        maxmu = maxspe + 640;
        if (smk2 < kMinimumMean[k]) {
          smk2 = kMinimumMean[k];
        }
        if (smk2 > maxmu) {
          smk2 = maxmu;
        }
        self->speech_means[gaussian] = smk2;  // Q7.

        // Separate models if they are too close.
        // `noise_global_mean` in Q14 (= Q7 * Q7).
        noise_global_mean = WeightedAverage(&self->noise_means[channel], 0,
                                            &kNoiseDataWeights[channel]);

        // Move Gaussian means for speech model by `tmp1_s16` and update
        // `speech_global_mean`. Note that `self->speech_means[channel]` is
        // changed after the call.
        speech_global_mean = WeightedAverage(&self->speech_means[channel],
                                             tmp1_s16,
                                             &kSpeechDataWeights[channel]);

        // Move Gaussian means for noise model by -`tmp2_s16` and update
        // `noise_global_mean`. Note that `self->noise_means[channel]` is
        // changed after the call.
        // NOTE(review): `tmp2_s16` has not been assigned at this point in the
        // visible code — its computation is missing from this file.
        noise_global_mean = WeightedAverage(&self->noise_means[channel],
                                            -tmp2_s16,
                                            &kNoiseDataWeights[channel]);
      }

      // Control that the speech & noise means do not drift to much.
      maxspe = kMaximumSpeech[channel];
      tmp2_s16 = (int16_t) (speech_global_mean >> 7);
      if (tmp2_s16 > maxspe) {
        // Upper limit of speech model.
        tmp2_s16 -= maxspe;

        for (k = 0; k < kNumGaussians; k++) {
          self->speech_means[channel + k * kNumChannels] -= tmp2_s16;
        }
      }
      // NOTE(review): truncated here — the remainder of the update loop, the
      // std-deviation updates, hangover handling and `return vadflag;` from
      // the upstream implementation are missing from this file.
// Initialize the VAD. Set aggressiveness mode to default value.
//
// NOTE(review): this function appears truncated in this file — the body is
// never closed before the next definition begins. Upstream additionally
// initializes the splitting/high-pass filter states and mean-value memory,
// sets the default aggressiveness mode, sets the init flag and returns 0;
// none of that is visible here. Confirm against the original source.
int WebRtcVad_InitCore(VadInstT* self) {
  int i;

  // Guard against a NULL instance pointer; -1 signals failure to the caller.
  if (self == NULL) {
    return -1;
  }

  // Initialization of general struct variables.
  self->vad = 1;  // Speech active (=1).
  self->frame_counter = 0;
  self->over_hang = 0;
  self->num_of_speech = 0;

  // Initialization of downsampling filter state.
  memset(self->downsampling_filter_states, 0,
         sizeof(self->downsampling_filter_states));

  // Initialization of 48 to 8 kHz downsampling.
  WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);

  // Read initial PDF parameters: copy the start-value tables defined above
  // into the per-instance GMM state.
  for (i = 0; i < kTableSize; i++) {
    self->noise_means[i] = kNoiseDataMeans[i];
    self->speech_means[i] = kSpeechDataMeans[i];
    self->noise_stds[i] = kNoiseDataStds[i];
    self->speech_stds[i] = kSpeechDataStds[i];
  }

  // Initialize Index and Minimum value vectors (16 entries per channel).
  for (i = 0; i < 16 * kNumChannels; i++) {
    self->low_value_vector[i] = 10000;
    self->index_vector[i] = 0;
  }
// Calculate VAD decision by first extracting feature values and then calculate
// probability for both speech and background noise.

// Runs the VAD on a 48 kHz frame by resampling it to 8 kHz in 10 ms chunks
// and delegating to the 8 kHz detector.
//
// - inst         [i/o] : VAD instance (holds the 48->8 kHz resampler state).
// - speech_frame [i]   : Input frame at 48 kHz.
// - frame_length [i]   : Number of input samples (multiple of 480 expected;
//                        any remainder is ignored).
//
// - returns            : VAD decision from WebRtcVad_CalcVad8khz().
int WebRtcVad_CalcVad48khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length) {
  int vad;
  size_t i;
  int16_t speech_nb[240];  // 30 ms in 8 kHz.
  // `tmp_mem` is a temporary memory used by resample function, length is
  // frame length in 10 ms (480 samples) + 256 extra.
  int32_t tmp_mem[480 + 256] = { 0 };
  const size_t kFrameLen10ms48khz = 480;
  const size_t kFrameLen10ms8khz = 80;
  size_t num_10ms_frames = frame_length / kFrameLen10ms48khz;

  for (i = 0; i < num_10ms_frames; i++) {
    // BUG FIX: advance the input pointer for each 10 ms chunk. The original
    // passed `speech_frame` unchanged, so every chunk resampled the first
    // 480 samples and all later input was ignored.
    WebRtcSpl_Resample48khzTo8khz(&speech_frame[i * kFrameLen10ms48khz],
                                  &speech_nb[i * kFrameLen10ms8khz],
                                  &inst->state_48_to_8,
                                  tmp_mem);
  }

  // Do VAD on an 8 kHz signal (48 kHz length / 6 = 8 kHz length).
  vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);

  return vad;
}
// Runs the VAD on a 32 kHz (super-wideband) frame: downsample in two stages,
// 32 kHz -> 16 kHz -> 8 kHz, then delegate to the 8 kHz detector.
//
// - inst         [i/o] : VAD instance (holds the downsampling filter states).
// - speech_frame [i]   : Input frame at 32 kHz.
// - frame_length [i]   : Number of input samples.
//
// - returns            : VAD decision from WebRtcVad_CalcVad8khz().
int WebRtcVad_CalcVad32khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
  int16_t band_wb[480];   // Downsampled speech frame: 960 samples (30ms in SWB)
  int16_t band_nb[240];   // Downsampled speech frame: 480 samples (30ms in WB)
  size_t remaining;
  int decision;

  // Downsample signal 32->16->8 before doing VAD.
  // Stage 1 (32 -> 16 kHz) uses the second pair of filter states.
  WebRtcVad_Downsampling(speech_frame, band_wb,
                         &(inst->downsampling_filter_states[2]), frame_length);
  remaining = frame_length / 2;

  // Stage 2 (16 -> 8 kHz) uses the first pair of filter states.
  WebRtcVad_Downsampling(band_wb, band_nb, inst->downsampling_filter_states,
                         remaining);
  remaining /= 2;

  // Do VAD on an 8 kHz signal.
  decision = WebRtcVad_CalcVad8khz(inst, band_nb, remaining);

  return decision;
}
// Runs the VAD on a 16 kHz (wideband) frame: downsample to 8 kHz, then
// delegate to the 8 kHz detector.
//
// - inst         [i/o] : VAD instance (holds the downsampling filter states).
// - speech_frame [i]   : Input frame at 16 kHz.
// - frame_length [i]   : Number of input samples.
//
// - returns            : VAD decision from WebRtcVad_CalcVad8khz().
//
// NOTE(review): the tail of this function (`return vad;` and the closing
// brace) had been replaced by unrelated non-code text in this file; it has
// been restored here.
int WebRtcVad_CalcVad16khz(VadInstT* inst, const int16_t* speech_frame,
                           size_t frame_length)
{
  size_t len;
  int vad;
  int16_t speechNB[240];  // Downsampled speech frame: 480 samples (30ms in WB)

  // Wideband: Downsample signal before doing VAD.
  WebRtcVad_Downsampling(speech_frame, speechNB,
                         inst->downsampling_filter_states, frame_length);

  len = frame_length / 2;
  vad = WebRtcVad_CalcVad8khz(inst, speechNB, len);

  return vad;
}