Line data Source code
1 : /*
2 : * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #ifndef WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
12 : #define WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
13 :
14 : // MSVC++ requires this to be set before any other includes to get M_PI.
15 : #ifndef _USE_MATH_DEFINES
16 : #define _USE_MATH_DEFINES
17 : #endif
18 :
19 : #include <math.h>
20 : #include <stddef.h> // size_t
21 : #include <stdio.h> // FILE
22 : #include <vector>
23 :
24 : #include "webrtc/base/arraysize.h"
25 : #include "webrtc/base/platform_file.h"
26 : #include "webrtc/modules/audio_processing/beamformer/array_util.h"
27 : #include "webrtc/modules/audio_processing/include/config.h"
28 : #include "webrtc/typedefs.h"
29 :
30 : namespace webrtc {
31 :
32 : struct AecCore;
33 :
34 : class AudioFrame;
35 :
36 : class NonlinearBeamformer;
37 :
38 : class StreamConfig;
39 : class ProcessingConfig;
40 :
41 : class EchoCancellation;
42 : class EchoControlMobile;
43 : class GainControl;
44 : class HighPassFilter;
45 : class LevelEstimator;
46 : class NoiseSuppression;
47 : class VoiceDetection;
48 :
49 : // Use to enable the extended filter mode in the AEC, along with robustness
50 : // measures around the reported system delays. It comes with a significant
51 : // increase in AEC complexity, but is much more robust to unreliable reported
52 : // delays.
53 : //
54 : // Detailed changes to the algorithm:
55 : // - The filter length is changed from 48 to 128 ms. This comes with tuning of
56 : // several parameters: i) filter adaptation stepsize and error threshold;
57 : // ii) non-linear processing smoothing and overdrive.
58 : // - Option to ignore the reported delays on platforms which we deem
59 : // sufficiently unreliable. See WEBRTC_UNTRUSTED_DELAY in echo_cancellation.c.
60 : // - Faster startup times by removing the excessive "startup phase" processing
61 : // of reported delays.
62 : // - Much more conservative adjustments to the far-end read pointer. We smooth
63 : // the delay difference more heavily, and back off from the difference more.
64 : // Adjustments force a readaptation of the filter, so they should be avoided
65 : // except when really necessary.
66 : struct ExtendedFilter {
67 0 : ExtendedFilter() : enabled(false) {}
68 : explicit ExtendedFilter(bool enabled) : enabled(enabled) {}
69 : static const ConfigOptionID identifier = ConfigOptionID::kExtendedFilter;
70 : bool enabled;
71 : };
72 :
73 : // Enables the refined linear filter adaptation in the echo canceller.
74 : // This configuration only applies to EchoCancellation and not
75 : // EchoControlMobile. It can be set in the constructor
76 : // or using AudioProcessing::SetExtraOptions().
77 : struct RefinedAdaptiveFilter {
78 0 : RefinedAdaptiveFilter() : enabled(false) {}
79 : explicit RefinedAdaptiveFilter(bool enabled) : enabled(enabled) {}
80 : static const ConfigOptionID identifier =
81 : ConfigOptionID::kAecRefinedAdaptiveFilter;
82 : bool enabled;
83 : };
84 :
85 : // Enables delay-agnostic echo cancellation. This feature relies on internally
86 : // estimated delays between the process and reverse streams, thus not relying
87 : // on reported system delays. This configuration only applies to
88 : // EchoCancellation and not EchoControlMobile. It can be set in the constructor
89 : // or using AudioProcessing::SetExtraOptions().
90 : struct DelayAgnostic {
91 0 : DelayAgnostic() : enabled(false) {}
92 : explicit DelayAgnostic(bool enabled) : enabled(enabled) {}
93 : static const ConfigOptionID identifier = ConfigOptionID::kDelayAgnostic;
94 : bool enabled;
95 : };
96 :
97 : // Use to enable experimental gain control (AGC). At startup the experimental
98 : // AGC moves the microphone volume up to |startup_min_volume| if the current
99 : // microphone volume is set too low. The value is clamped to its operating range
100 : // [12, 255]. Here, 255 maps to 100%.
101 : //
102 : // Must be provided through AudioProcessing::Create(Config&).
// Default for ExperimentalAgc::startup_min_volume below: Chromium builds use
// a non-zero startup volume floor, other builds leave it at 0.
#if defined(WEBRTC_CHROMIUM_BUILD)
static const int kAgcStartupMinVolume = 85;
#else
static const int kAgcStartupMinVolume = 0;
#endif  // defined(WEBRTC_CHROMIUM_BUILD)
// Default for ExperimentalAgc::clipped_level_min below.
static constexpr int kClippedLevelMin = 170;
109 : struct ExperimentalAgc {
110 0 : ExperimentalAgc() = default;
111 0 : explicit ExperimentalAgc(bool enabled) : enabled(enabled) {}
112 : ExperimentalAgc(bool enabled, int startup_min_volume)
113 : : enabled(enabled), startup_min_volume(startup_min_volume) {}
114 : ExperimentalAgc(bool enabled, int startup_min_volume, int clipped_level_min)
115 : : enabled(enabled),
116 : startup_min_volume(startup_min_volume),
117 : clipped_level_min(clipped_level_min) {}
118 : static const ConfigOptionID identifier = ConfigOptionID::kExperimentalAgc;
119 : bool enabled = true;
120 : int startup_min_volume = kAgcStartupMinVolume;
121 : // Lowest microphone level that will be applied in response to clipping.
122 : int clipped_level_min = kClippedLevelMin;
123 : };
124 :
125 : // Use to enable experimental noise suppression. It can be set in the
126 : // constructor or using AudioProcessing::SetExtraOptions().
127 : struct ExperimentalNs {
128 0 : ExperimentalNs() : enabled(false) {}
129 : explicit ExperimentalNs(bool enabled) : enabled(enabled) {}
130 : static const ConfigOptionID identifier = ConfigOptionID::kExperimentalNs;
131 : bool enabled;
132 : };
133 :
134 : // Use to enable beamforming. Must be provided through the constructor. It will
135 : // have no impact if used with AudioProcessing::SetExtraOptions().
struct Beamforming {
  // Constructors and destructor are defined out-of-line (in the .cc file).
  // The default constructor presumably produces a disabled configuration --
  // TODO confirm against the implementation.
  Beamforming();
  Beamforming(bool enabled, const std::vector<Point>& array_geometry);
  Beamforming(bool enabled,
              const std::vector<Point>& array_geometry,
              SphericalPointf target_direction);
  ~Beamforming();

  static const ConfigOptionID identifier = ConfigOptionID::kBeamforming;
  // All members are const: the configuration is immutable once constructed.
  const bool enabled;
  // Microphone positions (Point is declared in beamformer/array_util.h).
  const std::vector<Point> array_geometry;
  // Direction to steer the beam toward, per the constructor parameter.
  const SphericalPointf target_direction;
};
149 :
150 : // Use to enable intelligibility enhancer in audio processing.
151 : //
152 : // Note: If enabled and the reverse stream has more than one output channel,
153 : // the reverse stream will become an upmixed mono signal.
154 : struct Intelligibility {
155 0 : Intelligibility() : enabled(false) {}
156 : explicit Intelligibility(bool enabled) : enabled(enabled) {}
157 : static const ConfigOptionID identifier = ConfigOptionID::kIntelligibility;
158 : bool enabled;
159 : };
160 :
161 : // The Audio Processing Module (APM) provides a collection of voice processing
162 : // components designed for real-time communications software.
163 : //
164 : // APM operates on two audio streams on a frame-by-frame basis. Frames of the
165 : // primary stream, on which all processing is applied, are passed to
166 : // |ProcessStream()|. Frames of the reverse direction stream are passed to
167 : // |ProcessReverseStream()|. On the client-side, this will typically be the
168 : // near-end (capture) and far-end (render) streams, respectively. APM should be
169 : // placed in the signal chain as close to the audio hardware abstraction layer
170 : // (HAL) as possible.
171 : //
172 : // On the server-side, the reverse stream will normally not be used, with
173 : // processing occurring on each incoming stream.
174 : //
175 : // Component interfaces follow a similar pattern and are accessed through
176 : // corresponding getters in APM. All components are disabled at create-time,
177 : // with default settings that are recommended for most situations. New settings
178 : // can be applied without enabling a component. Enabling a component triggers
179 : // memory allocation and initialization to allow it to start processing the
180 : // streams.
181 : //
182 : // Thread safety is provided with the following assumptions to reduce locking
183 : // overhead:
184 : // 1. The stream getters and setters are called from the same thread as
185 : // ProcessStream(). More precisely, stream functions are never called
186 : // concurrently with ProcessStream().
187 : // 2. Parameter getters are never called concurrently with the corresponding
188 : // setter.
189 : //
190 : // APM accepts only linear PCM audio data in chunks of 10 ms. The int16
191 : // interfaces use interleaved data, while the float interfaces use deinterleaved
192 : // data.
193 : //
194 : // Usage example, omitting error checking:
195 : // AudioProcessing* apm = AudioProcessing::Create(0);
196 : //
197 : // AudioProcessing::Config config;
198 : // config.level_controller.enabled = true;
199 : // config.high_pass_filter.enabled = true;
200 : // apm->ApplyConfig(config)
201 : //
202 : // apm->echo_cancellation()->enable_drift_compensation(false);
203 : // apm->echo_cancellation()->Enable(true);
204 : //
205 : // apm->noise_reduction()->set_level(kHighSuppression);
206 : // apm->noise_reduction()->Enable(true);
207 : //
208 : // apm->gain_control()->set_analog_level_limits(0, 255);
209 : // apm->gain_control()->set_mode(kAdaptiveAnalog);
210 : // apm->gain_control()->Enable(true);
211 : //
212 : // apm->voice_detection()->Enable(true);
213 : //
214 : // // Start a voice call...
215 : //
216 : // // ... Render frame arrives bound for the audio HAL ...
217 : // apm->ProcessReverseStream(render_frame);
218 : //
219 : // // ... Capture frame arrives from the audio HAL ...
220 : // // Call required set_stream_ functions.
221 : // apm->set_stream_delay_ms(delay_ms);
222 : // apm->gain_control()->set_stream_analog_level(analog_level);
223 : //
224 : // apm->ProcessStream(capture_frame);
225 : //
226 : // // Call required stream_ functions.
227 : // analog_level = apm->gain_control()->stream_analog_level();
228 : // has_voice = apm->stream_has_voice();
229 : //
230 : // // Repeat render and capture processing for the duration of the call...
231 : // // Start a new call...
232 : // apm->Initialize();
233 : //
234 : // // Close the application...
235 : // delete apm;
236 : //
class AudioProcessing {
 public:
  // The struct below constitutes the new parameter scheme for the audio
  // processing. It is being introduced gradually and until it is fully
  // introduced, it is prone to change.
  // TODO(peah): Remove this comment once the new config scheme is fully rolled
  // out.
  //
  // The parameters and behavior of the audio processing module are controlled
  // by changing the default values in the AudioProcessing::Config struct.
  // The config is applied by passing the struct to the ApplyConfig method.
  struct Config {
    struct LevelController {
      bool enabled = false;

      // Sets the initial peak level to use inside the level controller in order
      // to compute the signal gain. The unit for the peak level is dBFS and
      // the allowed range is [-100, 0].
      float initial_peak_level_dbfs = -6.0206f;
    } level_controller;
    struct ResidualEchoDetector {
      bool enabled = true;
    } residual_echo_detector;

    struct HighPassFilter {
      bool enabled = false;
    } high_pass_filter;

    // Enables the next generation AEC functionality. This feature replaces the
    // standard methods for echo removal in the AEC.
    // The functionality is not yet activated in the code and turning this on
    // does not yet have the desired behavior.
    struct EchoCanceller3 {
      bool enabled = false;
    } echo_canceller3;
  };

  // TODO(mgraczyk): Remove once all methods that use ChannelLayout are gone.
  enum ChannelLayout {
    kMono,
    // Left, right.
    kStereo,
    // Mono, keyboard, and mic.
    kMonoAndKeyboard,
    // Left, right, keyboard, and mic.
    kStereoAndKeyboard
  };

  // Creates an APM instance. Use one instance for every primary audio stream
  // requiring processing. On the client-side, this would typically be one
  // instance for the near-end stream, and additional instances for each far-end
  // stream which requires processing. On the server-side, this would typically
  // be one instance for every incoming stream.
  static AudioProcessing* Create();
  // Allows passing in an optional configuration at create-time.
  static AudioProcessing* Create(const webrtc::Config& config);
  // Only for testing.
  static AudioProcessing* Create(const webrtc::Config& config,
                                 NonlinearBeamformer* beamformer);
  virtual ~AudioProcessing() {}

  // Initializes internal states, while retaining all user settings. This
  // should be called before beginning to process a new audio stream. However,
  // it is not necessary to call before processing the first stream after
  // creation.
  //
  // It is also not necessary to call if the audio parameters (sample
  // rate and number of channels) have changed. Passing updated parameters
  // directly to |ProcessStream()| and |ProcessReverseStream()| is permissible.
  // If the parameters are known at init-time though, they may be provided.
  virtual int Initialize() = 0;

  // The int16 interfaces require:
  //   - only |NativeRate|s be used
  //   - that the input, output and reverse rates must match
  //   - that |processing_config.output_stream()| matches
  //     |processing_config.input_stream()|.
  //
  // The float interfaces accept arbitrary rates and support differing input and
  // output layouts, but the output must have either one channel or the same
  // number of channels as the input.
  virtual int Initialize(const ProcessingConfig& processing_config) = 0;

  // Initialize with unpacked parameters. See Initialize() above for details.
  //
  // TODO(mgraczyk): Remove once clients are updated to use the new interface.
  virtual int Initialize(int capture_input_sample_rate_hz,
                         int capture_output_sample_rate_hz,
                         int render_sample_rate_hz,
                         ChannelLayout capture_input_layout,
                         ChannelLayout capture_output_layout,
                         ChannelLayout render_input_layout) = 0;

  // TODO(peah): This method is a temporary solution used to take control
  // over the parameters in the audio processing module and is likely to change.
  virtual void ApplyConfig(const Config& config) = 0;

  // Pass down additional options which don't have explicit setters. This
  // ensures the options are applied immediately.
  virtual void SetExtraOptions(const webrtc::Config& config) = 0;

  // TODO(ajm): Only intended for internal use. Make private and friend the
  // necessary classes?
  virtual int proc_sample_rate_hz() const = 0;
  virtual int proc_split_sample_rate_hz() const = 0;
  virtual size_t num_input_channels() const = 0;
  virtual size_t num_proc_channels() const = 0;
  virtual size_t num_output_channels() const = 0;
  virtual size_t num_reverse_channels() const = 0;

  // Set to true when the output of AudioProcessing will be muted or in some
  // other way not used. Ideally, the captured audio would still be processed,
  // but some components may change behavior based on this information.
  // Default false.
  virtual void set_output_will_be_muted(bool muted) = 0;

  // Processes a 10 ms |frame| of the primary audio stream. On the client-side,
  // this is the near-end (or captured) audio.
  //
  // If needed for enabled functionality, any function with the set_stream_ tag
  // must be called prior to processing the current frame. Any getter function
  // with the stream_ tag which is needed should be called after processing.
  //
  // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_|
  // members of |frame| must be valid. If changed from the previous call to this
  // method, it will trigger an initialization.
  virtual int ProcessStream(AudioFrame* frame) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element
  // of |src| points to a channel buffer, arranged according to
  // |input_layout|. At output, the channels will be arranged according to
  // |output_layout| at |output_sample_rate_hz| in |dest|.
  //
  // The output layout must have one channel or as many channels as the input.
  // |src| and |dest| may use the same memory, if desired.
  //
  // TODO(mgraczyk): Remove once clients are updated to use the new interface.
  virtual int ProcessStream(const float* const* src,
                            size_t samples_per_channel,
                            int input_sample_rate_hz,
                            ChannelLayout input_layout,
                            int output_sample_rate_hz,
                            ChannelLayout output_layout,
                            float* const* dest) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element of
  // |src| points to a channel buffer, arranged according to |input_stream|. At
  // output, the channels will be arranged according to |output_stream| in
  // |dest|.
  //
  // The output must have one channel or as many channels as the input. |src|
  // and |dest| may use the same memory, if desired.
  virtual int ProcessStream(const float* const* src,
                            const StreamConfig& input_config,
                            const StreamConfig& output_config,
                            float* const* dest) = 0;

  // Processes a 10 ms |frame| of the reverse direction audio stream. The frame
  // may be modified. On the client-side, this is the far-end (or to be
  // rendered) audio.
  //
  // It is necessary to provide this if echo processing is enabled, as the
  // reverse stream forms the echo reference signal. It is recommended, but not
  // necessary, to provide if gain control is enabled. On the server-side this
  // typically will not be used. If you're not sure what to pass in here,
  // chances are you don't need to use it.
  //
  // The |sample_rate_hz_|, |num_channels_|, and |samples_per_channel_|
  // members of |frame| must be valid.
  virtual int ProcessReverseStream(AudioFrame* frame) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element
  // of |data| points to a channel buffer, arranged according to |layout|.
  // TODO(mgraczyk): Remove once clients are updated to use the new interface.
  virtual int AnalyzeReverseStream(const float* const* data,
                                   size_t samples_per_channel,
                                   int sample_rate_hz,
                                   ChannelLayout layout) = 0;

  // Accepts deinterleaved float audio with the range [-1, 1]. Each element of
  // |src| points to a channel buffer, arranged according to |input_config|. At
  // output, the channels will be arranged according to |output_config| in
  // |dest|.
  virtual int ProcessReverseStream(const float* const* src,
                                   const StreamConfig& input_config,
                                   const StreamConfig& output_config,
                                   float* const* dest) = 0;

  // This must be called if and only if echo processing is enabled.
  //
  // Sets the |delay| in ms between ProcessReverseStream() receiving a far-end
  // frame and ProcessStream() receiving a near-end frame containing the
  // corresponding echo. On the client-side this can be expressed as
  //   delay = (t_render - t_analyze) + (t_process - t_capture)
  // where,
  //   - t_analyze is the time a frame is passed to ProcessReverseStream() and
  //     t_render is the time the first sample of the same frame is rendered by
  //     the audio hardware.
  //   - t_capture is the time the first sample of a frame is captured by the
  //     audio hardware and t_process is the time the same frame is passed to
  //     ProcessStream().
  virtual int set_stream_delay_ms(int delay) = 0;
  virtual int stream_delay_ms() const = 0;
  virtual bool was_stream_delay_set() const = 0;

  // Call to signal that a key press occurred (true) or did not occur (false)
  // with this chunk of audio.
  virtual void set_stream_key_pressed(bool key_pressed) = 0;

  // Sets a delay |offset| in ms to add to the values passed in through
  // set_stream_delay_ms(). May be positive or negative.
  //
  // Note that this could cause an otherwise valid value passed to
  // set_stream_delay_ms() to return an error.
  virtual void set_delay_offset_ms(int offset) = 0;
  virtual int delay_offset_ms() const = 0;

  // Starts recording debugging information to a file specified by |filename|,
  // a NULL-terminated string. If there is an ongoing recording, the old file
  // will be closed, and recording will continue in the newly specified file.
  // An already existing file will be overwritten without warning. A maximum
  // file size (in bytes) for the log can be specified. The logging is stopped
  // once the limit has been reached. If max_log_size_bytes is set to a value
  // <= 0, no limit will be used.
  static const size_t kMaxFilenameSize = 1024;
  virtual int StartDebugRecording(const char filename[kMaxFilenameSize],
                                  int64_t max_log_size_bytes) = 0;

  // Same as above but uses an existing file handle. Takes ownership
  // of |handle| and closes it at StopDebugRecording().
  virtual int StartDebugRecording(FILE* handle, int64_t max_log_size_bytes) = 0;

  // TODO(ivoc): Remove this function after Chrome stops using it.
  virtual int StartDebugRecording(FILE* handle) = 0;

  // Same as above but uses an existing PlatformFile handle. Takes ownership
  // of |handle| and closes it at StopDebugRecording().
  // TODO(xians): Make this interface pure virtual.
  virtual int StartDebugRecordingForPlatformFile(rtc::PlatformFile handle) = 0;

  // Stops recording debugging information, and closes the file. Recording
  // cannot be resumed in the same file (without overwriting it).
  virtual int StopDebugRecording() = 0;

  // Use to send UMA histograms at end of a call. Note that all histogram
  // specific member variables are reset.
  virtual void UpdateHistogramsOnCallEnd() = 0;

  // TODO(ivoc): Remove when the calling code no longer uses the old Statistics
  // API.
  struct Statistic {
    int instant = 0;  // Instantaneous value.
    int average = 0;  // Long-term average.
    int maximum = 0;  // Long-term maximum.
    int minimum = 0;  // Long-term minimum.
  };

  // Float-valued counterpart of Statistic, used by the new statistics API
  // below. Can be populated from a legacy Statistic via Set().
  struct Stat {
    void Set(const Statistic& other) {
      Set(other.instant, other.average, other.maximum, other.minimum);
    }
    void Set(float instant, float average, float maximum, float minimum) {
      instant_ = instant;
      average_ = average;
      maximum_ = maximum;
      minimum_ = minimum;
    }
    float instant() const { return instant_; }
    float average() const { return average_; }
    float maximum() const { return maximum_; }
    float minimum() const { return minimum_; }

   private:
    float instant_ = 0.0f;  // Instantaneous value.
    float average_ = 0.0f;  // Long-term average.
    float maximum_ = 0.0f;  // Long-term maximum.
    float minimum_ = 0.0f;  // Long-term minimum.
  };

  // Snapshot of APM statistics; negative values mean "not available".
  struct AudioProcessingStatistics {
    AudioProcessingStatistics();
    AudioProcessingStatistics(const AudioProcessingStatistics& other);
    ~AudioProcessingStatistics();

    // AEC Statistics.
    // RERL = ERL + ERLE
    Stat residual_echo_return_loss;
    // ERL = 10log_10(P_far / P_echo)
    Stat echo_return_loss;
    // ERLE = 10log_10(P_echo / P_out)
    Stat echo_return_loss_enhancement;
    // (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a)
    Stat a_nlp;
    // Fraction of time that the AEC linear filter is divergent, in a 1-second
    // non-overlapped aggregation window.
    float divergent_filter_fraction = -1.0f;

    // The delay metrics consists of the delay median and standard deviation. It
    // also consists of the fraction of delay estimates that can make the echo
    // cancellation perform poorly. The values are aggregated until the first
    // call to |GetStatistics()| and afterwards aggregated and updated every
    // second. Note that if there are several clients pulling metrics from
    // |GetStatistics()| during a session the first call from any of them will
    // change to one second aggregation window for all.
    int delay_median = -1;
    int delay_standard_deviation = -1;
    float fraction_poor_delays = -1.0f;

    // Residual echo detector likelihood.
    float residual_echo_likelihood = -1.0f;
    // Maximum residual echo likelihood from the last time period.
    float residual_echo_likelihood_recent_max = -1.0f;
  };

  // TODO(ivoc): Make this pure virtual when all subclasses have been updated.
  virtual AudioProcessingStatistics GetStatistics() const;

  // These provide access to the component interfaces and should never return
  // NULL. The pointers will be valid for the lifetime of the APM instance.
  // The memory for these objects is entirely managed internally.
  virtual EchoCancellation* echo_cancellation() const = 0;
  virtual EchoControlMobile* echo_control_mobile() const = 0;
  virtual GainControl* gain_control() const = 0;
  // TODO(peah): Deprecate this API call.
  virtual HighPassFilter* high_pass_filter() const = 0;
  virtual LevelEstimator* level_estimator() const = 0;
  virtual NoiseSuppression* noise_suppression() const = 0;
  virtual VoiceDetection* voice_detection() const = 0;

  enum Error {
    // Fatal errors.
    kNoError = 0,
    kUnspecifiedError = -1,
    kCreationFailedError = -2,
    kUnsupportedComponentError = -3,
    kUnsupportedFunctionError = -4,
    kNullPointerError = -5,
    kBadParameterError = -6,
    kBadSampleRateError = -7,
    kBadDataLengthError = -8,
    kBadNumberChannelsError = -9,
    kFileError = -10,
    kStreamParameterNotSetError = -11,
    kNotEnabledError = -12,

    // Warnings are non-fatal.
    // This results when a set_stream_ parameter is out of range. Processing
    // will continue, but the parameter may have been truncated.
    kBadStreamParameterWarning = -13
  };

  // Sample rates handled natively (without resampling) by the int16 API.
  enum NativeRate {
    kSampleRate8kHz = 8000,
    kSampleRate16kHz = 16000,
    kSampleRate32kHz = 32000,
    kSampleRate44_1kHz = 44100,
    kSampleRate48kHz = 48000
  };

  // TODO(kwiberg): We currently need to support a compiler (Visual C++) that
  // complains if we don't explicitly state the size of the array here. Remove
  // the size when that's no longer the case.
  static constexpr int kNativeSampleRatesHz[4] = {
      kSampleRate8kHz, kSampleRate16kHz, kSampleRate32kHz, kSampleRate48kHz};
  static constexpr size_t kNumNativeSampleRates =
      arraysize(kNativeSampleRatesHz);
  static constexpr int kMaxNativeSampleRateHz =
      kNativeSampleRatesHz[kNumNativeSampleRates - 1];

  // All processing is done on 10 ms chunks of audio.
  static const int kChunkSizeMs = 10;
};
606 :
607 : class StreamConfig {
608 : public:
609 : // sample_rate_hz: The sampling rate of the stream.
610 : //
611 : // num_channels: The number of audio channels in the stream, excluding the
612 : // keyboard channel if it is present. When passing a
613 : // StreamConfig with an array of arrays T*[N],
614 : //
615 : // N == {num_channels + 1 if has_keyboard
616 : // {num_channels if !has_keyboard
617 : //
618 : // has_keyboard: True if the stream has a keyboard channel. When has_keyboard
619 : // is true, the last channel in any corresponding list of
620 : // channels is the keyboard channel.
621 0 : StreamConfig(int sample_rate_hz = 0,
622 : size_t num_channels = 0,
623 : bool has_keyboard = false)
624 0 : : sample_rate_hz_(sample_rate_hz),
625 : num_channels_(num_channels),
626 : has_keyboard_(has_keyboard),
627 0 : num_frames_(calculate_frames(sample_rate_hz)) {}
628 :
629 0 : void set_sample_rate_hz(int value) {
630 0 : sample_rate_hz_ = value;
631 0 : num_frames_ = calculate_frames(value);
632 0 : }
633 0 : void set_num_channels(size_t value) { num_channels_ = value; }
634 0 : void set_has_keyboard(bool value) { has_keyboard_ = value; }
635 :
636 0 : int sample_rate_hz() const { return sample_rate_hz_; }
637 :
638 : // The number of channels in the stream, not including the keyboard channel if
639 : // present.
640 0 : size_t num_channels() const { return num_channels_; }
641 :
642 0 : bool has_keyboard() const { return has_keyboard_; }
643 0 : size_t num_frames() const { return num_frames_; }
644 0 : size_t num_samples() const { return num_channels_ * num_frames_; }
645 :
646 0 : bool operator==(const StreamConfig& other) const {
647 0 : return sample_rate_hz_ == other.sample_rate_hz_ &&
648 0 : num_channels_ == other.num_channels_ &&
649 0 : has_keyboard_ == other.has_keyboard_;
650 : }
651 :
652 0 : bool operator!=(const StreamConfig& other) const { return !(*this == other); }
653 :
654 : private:
655 0 : static size_t calculate_frames(int sample_rate_hz) {
656 : return static_cast<size_t>(
657 0 : AudioProcessing::kChunkSizeMs * sample_rate_hz / 1000);
658 : }
659 :
660 : int sample_rate_hz_;
661 : size_t num_channels_;
662 : bool has_keyboard_;
663 : size_t num_frames_;
664 : };
665 :
666 0 : class ProcessingConfig {
667 : public:
668 : enum StreamName {
669 : kInputStream,
670 : kOutputStream,
671 : kReverseInputStream,
672 : kReverseOutputStream,
673 : kNumStreamNames,
674 : };
675 :
676 0 : const StreamConfig& input_stream() const {
677 0 : return streams[StreamName::kInputStream];
678 : }
679 0 : const StreamConfig& output_stream() const {
680 0 : return streams[StreamName::kOutputStream];
681 : }
682 : const StreamConfig& reverse_input_stream() const {
683 : return streams[StreamName::kReverseInputStream];
684 : }
685 : const StreamConfig& reverse_output_stream() const {
686 : return streams[StreamName::kReverseOutputStream];
687 : }
688 :
689 0 : StreamConfig& input_stream() { return streams[StreamName::kInputStream]; }
690 0 : StreamConfig& output_stream() { return streams[StreamName::kOutputStream]; }
691 0 : StreamConfig& reverse_input_stream() {
692 0 : return streams[StreamName::kReverseInputStream];
693 : }
694 0 : StreamConfig& reverse_output_stream() {
695 0 : return streams[StreamName::kReverseOutputStream];
696 : }
697 :
698 0 : bool operator==(const ProcessingConfig& other) const {
699 0 : for (int i = 0; i < StreamName::kNumStreamNames; ++i) {
700 0 : if (this->streams[i] != other.streams[i]) {
701 0 : return false;
702 : }
703 : }
704 0 : return true;
705 : }
706 :
707 : bool operator!=(const ProcessingConfig& other) const {
708 : return !(*this == other);
709 : }
710 :
711 : StreamConfig streams[StreamName::kNumStreamNames];
712 : };
713 :
// The acoustic echo cancellation (AEC) component provides better performance
// than AECM but also requires more processing power and is dependent on delay
// stability and reporting accuracy. As such it is well-suited and recommended
// for PC and IP phone applications.
//
// Not recommended to be enabled on the server-side.
//
// NOTE(review): this is a pure-virtual interface; the declaration order of
// the virtual functions is part of the vtable ABI and must not be changed.
class EchoCancellation {
 public:
  // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
  // Enabling one will disable the other.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Differences in clock speed on the primary and reverse streams can impact
  // the AEC performance. On the client-side, this could be seen when different
  // render and capture devices are used, particularly with webcams.
  //
  // This enables a compensation mechanism, and requires that
  // set_stream_drift_samples() be called.
  virtual int enable_drift_compensation(bool enable) = 0;
  virtual bool is_drift_compensation_enabled() const = 0;

  // Sets the difference between the number of samples rendered and captured by
  // the audio devices since the last call to |ProcessStream()|. Must be called
  // if drift compensation is enabled, prior to |ProcessStream()|.
  virtual void set_stream_drift_samples(int drift) = 0;
  virtual int stream_drift_samples() const = 0;

  // Suppression aggressiveness, ordered from least to most aggressive.
  enum SuppressionLevel {
    kLowSuppression,
    kModerateSuppression,
    kHighSuppression
  };

  // Sets the aggressiveness of the suppressor. A higher level trades off
  // double-talk performance for increased echo suppression.
  virtual int set_suppression_level(SuppressionLevel level) = 0;
  virtual SuppressionLevel suppression_level() const = 0;

  // Returns false if the current frame almost certainly contains no echo
  // and true if it _might_ contain echo.
  virtual bool stream_has_echo() const = 0;

  // Enables the computation of various echo metrics. These are obtained
  // through |GetMetrics()|.
  virtual int enable_metrics(bool enable) = 0;
  virtual bool are_metrics_enabled() const = 0;

  // Each statistic is reported in dB.
  // P_far: Far-end (render) signal power.
  // P_echo: Near-end (capture) echo signal power.
  // P_out: Signal power at the output of the AEC.
  // P_a: Internal signal power at the point before the AEC's non-linear
  //      processor.
  struct Metrics {
    // RERL = ERL + ERLE
    AudioProcessing::Statistic residual_echo_return_loss;

    // ERL = 10log_10(P_far / P_echo)
    AudioProcessing::Statistic echo_return_loss;

    // ERLE = 10log_10(P_echo / P_out)
    AudioProcessing::Statistic echo_return_loss_enhancement;

    // (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a)
    AudioProcessing::Statistic a_nlp;

    // Fraction of time that the AEC linear filter is divergent, in a 1-second
    // non-overlapped aggregation window.
    float divergent_filter_fraction;
  };

  // Deprecated. Use GetStatistics on the AudioProcessing interface instead.
  // TODO(ajm): discuss the metrics update period.
  virtual int GetMetrics(Metrics* metrics) = 0;

  // Enables computation and logging of delay values. Statistics are obtained
  // through |GetDelayMetrics()|.
  virtual int enable_delay_logging(bool enable) = 0;
  virtual bool is_delay_logging_enabled() const = 0;

  // The delay metrics consists of the delay |median| and the delay standard
  // deviation |std|. It also consists of the fraction of delay estimates
  // |fraction_poor_delays| that can make the echo cancellation perform poorly.
  // The values are aggregated until the first call to |GetDelayMetrics()| and
  // afterwards aggregated and updated every second.
  // Note that if there are several clients pulling metrics from
  // |GetDelayMetrics()| during a session the first call from any of them will
  // change to one second aggregation window for all.
  // Deprecated. Use GetStatistics on the AudioProcessing interface instead.
  virtual int GetDelayMetrics(int* median, int* std) = 0;
  // Deprecated. Use GetStatistics on the AudioProcessing interface instead.
  virtual int GetDelayMetrics(int* median, int* std,
                              float* fraction_poor_delays) = 0;

  // Returns a pointer to the low level AEC component. In case of multiple
  // channels, the pointer to the first one is returned. A NULL pointer is
  // returned when the AEC component is disabled or has not been initialized
  // successfully.
  virtual struct AecCore* aec_core() const = 0;

 protected:
  // Protected non-public destructor: lifetime is managed by the owning
  // AudioProcessing implementation, never through this interface pointer.
  virtual ~EchoCancellation() {}
};
818 :
// The acoustic echo control for mobile (AECM) component is a low complexity
// robust option intended for use on mobile devices.
//
// Not recommended to be enabled on the server-side.
//
// NOTE(review): pure-virtual interface; virtual declaration order is part of
// the vtable ABI and must not be changed.
class EchoControlMobile {
 public:
  // EchoCancellation and EchoControlMobile may not be enabled simultaneously.
  // Enabling one will disable the other.
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Recommended settings for particular audio routes. In general, the louder
  // the echo is expected to be, the higher this value should be set. The
  // preferred setting may vary from device to device.
  enum RoutingMode {
    kQuietEarpieceOrHeadset,
    kEarpiece,
    kLoudEarpiece,
    kSpeakerphone,
    kLoudSpeakerphone
  };

  // Sets echo control appropriate for the audio routing |mode| on the device.
  // It can and should be updated during a call if the audio routing changes.
  virtual int set_routing_mode(RoutingMode mode) = 0;
  virtual RoutingMode routing_mode() const = 0;

  // Comfort noise replaces suppressed background noise to maintain a
  // consistent signal level.
  virtual int enable_comfort_noise(bool enable) = 0;
  virtual bool is_comfort_noise_enabled() const = 0;

  // A typical use case is to initialize the component with an echo path from a
  // previous call. The echo path is retrieved using |GetEchoPath()|, typically
  // at the end of a call. The data can then be stored for later use as an
  // initializer before the next call, using |SetEchoPath()|.
  //
  // Controlling the echo path this way requires the data |size_bytes| to match
  // the internal echo path size. This size can be acquired using
  // |echo_path_size_bytes()|. |SetEchoPath()| causes an entire reset, worth
  // noting if it is to be called during an ongoing call.
  //
  // It is possible that version incompatibilities may result in a stored echo
  // path of the incorrect size. In this case, the stored path should be
  // discarded.
  virtual int SetEchoPath(const void* echo_path, size_t size_bytes) = 0;
  virtual int GetEchoPath(void* echo_path, size_t size_bytes) const = 0;

  // The returned path size is guaranteed not to change for the lifetime of
  // the application.
  static size_t echo_path_size_bytes();

 protected:
  // Protected non-public destructor: lifetime is managed by the owning
  // AudioProcessing implementation, never through this interface pointer.
  virtual ~EchoControlMobile() {}
};
874 :
// The automatic gain control (AGC) component brings the signal to an
// appropriate range. This is done by applying a digital gain directly and, in
// the analog mode, prescribing an analog gain to be applied at the audio HAL.
//
// Recommended to be enabled on the client-side.
//
// NOTE(review): pure-virtual interface; virtual declaration order is part of
// the vtable ABI and must not be changed.
class GainControl {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // When an analog mode is set, this must be called prior to |ProcessStream()|
  // to pass the current analog level from the audio HAL. Must be within the
  // range provided to |set_analog_level_limits()|.
  virtual int set_stream_analog_level(int level) = 0;

  // When an analog mode is set, this should be called after |ProcessStream()|
  // to obtain the recommended new analog level for the audio HAL. It is the
  // users responsibility to apply this level.
  virtual int stream_analog_level() = 0;

  enum Mode {
    // Adaptive mode intended for use if an analog volume control is available
    // on the capture device. It will require the user to provide coupling
    // between the OS mixer controls and AGC through the |stream_analog_level()|
    // functions.
    //
    // It consists of an analog gain prescription for the audio device and a
    // digital compression stage.
    kAdaptiveAnalog,

    // Adaptive mode intended for situations in which an analog volume control
    // is unavailable. It operates in a similar fashion to the adaptive analog
    // mode, but with scaling instead applied in the digital domain. As with
    // the analog mode, it additionally uses a digital compression stage.
    kAdaptiveDigital,

    // Fixed mode which enables only the digital compression stage also used by
    // the two adaptive modes.
    //
    // It is distinguished from the adaptive modes by considering only a
    // short time-window of the input signal. It applies a fixed gain through
    // most of the input level range, and compresses (gradually reduces gain
    // with increasing level) the input signal at higher levels. This mode is
    // preferred on embedded devices where the capture signal level is
    // predictable, so that a known gain can be applied.
    kFixedDigital
  };

  virtual int set_mode(Mode mode) = 0;
  virtual Mode mode() const = 0;

  // Sets the target peak |level| (or envelope) of the AGC in dBFs (decibels
  // from digital full-scale). The convention is to use positive values. For
  // instance, passing in a value of 3 corresponds to -3 dBFs, or a target
  // level 3 dB below full-scale. Limited to [0, 31].
  //
  // TODO(ajm): use a negative value here instead, if/when VoE will similarly
  // update its interface.
  virtual int set_target_level_dbfs(int level) = 0;
  virtual int target_level_dbfs() const = 0;

  // Sets the maximum |gain| the digital compression stage may apply, in dB. A
  // higher number corresponds to greater compression, while a value of 0 will
  // leave the signal uncompressed. Limited to [0, 90].
  virtual int set_compression_gain_db(int gain) = 0;
  virtual int compression_gain_db() const = 0;

  // When enabled, the compression stage will hard limit the signal to the
  // target level. Otherwise, the signal will be compressed but not limited
  // above the target level.
  virtual int enable_limiter(bool enable) = 0;
  virtual bool is_limiter_enabled() const = 0;

  // Sets the |minimum| and |maximum| analog levels of the audio capture device.
  // Must be set if and only if an analog mode is used. Limited to [0, 65535].
  virtual int set_analog_level_limits(int minimum,
                                      int maximum) = 0;
  virtual int analog_level_minimum() const = 0;
  virtual int analog_level_maximum() const = 0;

  // Returns true if the AGC has detected a saturation event (period where the
  // signal reaches digital full-scale) in the current frame and the analog
  // level cannot be reduced.
  //
  // This could be used as an indicator to reduce or disable analog mic gain at
  // the audio HAL.
  virtual bool stream_is_saturated() const = 0;

 protected:
  // Protected non-public destructor: lifetime is managed by the owning
  // AudioProcessing implementation, never through this interface pointer.
  virtual ~GainControl() {}
};
// TODO(peah): Remove this interface.
// A filtering component which removes DC offset and low-frequency noise.
// Recommended to be enabled on the client-side.
class HighPassFilter {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // NOTE(review): unlike the sibling component interfaces, the destructor
  // here is public rather than protected — presumably intentional given the
  // pending removal above; confirm before relying on delete-through-interface.
  virtual ~HighPassFilter() {}
};
976 :
// An estimation component used to retrieve level metrics.
//
// NOTE(review): pure-virtual interface; virtual declaration order is part of
// the vtable ABI and must not be changed.
class LevelEstimator {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Returns the root mean square (RMS) level in dBFs (decibels from digital
  // full-scale), or alternately dBov. It is computed over all primary stream
  // frames since the last call to RMS(). The returned value is positive but
  // should be interpreted as negative. It is constrained to [0, 127].
  //
  // The computation follows: https://tools.ietf.org/html/rfc6465
  // with the intent that it can provide the RTP audio level indication.
  //
  // Frames passed to ProcessStream() with an |_energy| of zero are considered
  // to have been muted. The RMS of the frame will be interpreted as -127.
  virtual int RMS() = 0;

 protected:
  // Protected non-public destructor: lifetime is managed by the owning
  // AudioProcessing implementation, never through this interface pointer.
  virtual ~LevelEstimator() {}
};
998 :
// The noise suppression (NS) component attempts to remove noise while
// retaining speech.
//
// Recommended to be enabled on the client-side.
//
// NOTE(review): pure-virtual interface; virtual declaration order is part of
// the vtable ABI and must not be changed.
class NoiseSuppression {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Determines the aggressiveness of the suppression. Increasing the level
  // will reduce the noise level at the expense of a higher speech distortion.
  enum Level {
    kLow,
    kModerate,
    kHigh,
    kVeryHigh
  };

  virtual int set_level(Level level) = 0;
  virtual Level level() const = 0;

  // Returns the internally computed prior speech probability of current frame
  // averaged over output channels. This is not supported in fixed point, for
  // which |kUnsupportedFunctionError| is returned.
  virtual float speech_probability() const = 0;

  // Returns the noise estimate per frequency bin averaged over all channels.
  virtual std::vector<float> NoiseEstimate() = 0;

 protected:
  // Protected non-public destructor: lifetime is managed by the owning
  // AudioProcessing implementation, never through this interface pointer.
  virtual ~NoiseSuppression() {}
};
1031 :
// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present. A facility is also provided to pass in an
// external VAD decision.
//
// In addition to |stream_has_voice()| the VAD decision is provided through the
// |AudioFrame| passed to |ProcessStream()|. The |vad_activity_| member will be
// modified to reflect the current decision.
//
// NOTE(review): pure-virtual interface; virtual declaration order is part of
// the vtable ABI and must not be changed.
class VoiceDetection {
 public:
  virtual int Enable(bool enable) = 0;
  virtual bool is_enabled() const = 0;

  // Returns true if voice is detected in the current frame. Should be called
  // after |ProcessStream()|.
  virtual bool stream_has_voice() const = 0;

  // Some of the APM functionality requires a VAD decision. In the case that
  // a decision is externally available for the current frame, it can be passed
  // in here, before |ProcessStream()| is called.
  //
  // VoiceDetection does _not_ need to be enabled to use this. If it happens to
  // be enabled, detection will be skipped for any frame in which an external
  // VAD decision is provided.
  virtual int set_stream_has_voice(bool has_voice) = 0;

  // Specifies the likelihood that a frame will be declared to contain voice.
  // A higher value makes it more likely that speech will not be clipped, at
  // the expense of more noise being detected as voice.
  enum Likelihood {
    kVeryLowLikelihood,
    kLowLikelihood,
    kModerateLikelihood,
    kHighLikelihood
  };

  virtual int set_likelihood(Likelihood likelihood) = 0;
  virtual Likelihood likelihood() const = 0;

  // Sets the |size| of the frames in ms on which the VAD will operate. Larger
  // frames will improve detection accuracy, but reduce the frequency of
  // updates.
  //
  // This does not impact the size of frames passed to |ProcessStream()|.
  virtual int set_frame_size_ms(int size) = 0;
  virtual int frame_size_ms() const = 0;

 protected:
  // Protected non-public destructor: lifetime is managed by the owning
  // AudioProcessing implementation, never through this interface pointer.
  virtual ~VoiceDetection() {}
};
1081 : } // namespace webrtc
1082 :
1083 : #endif // WEBRTC_MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_
|