From 35e5e21e2098f755cf63cb9ed21ef8bb294fa282 Mon Sep 17 00:00:00 2001 From: Ryo Kawaguchi Date: Mon, 30 Aug 2021 10:45:47 +0900 Subject: [PATCH] Upgrade webrtc-audio-processing lib to v1.3 (not tested to build) --- .gitmodules | 2 +- Cargo.toml | 6 +- README.md | 10 +- examples/karaoke.rs | 25 +- examples/recording.rs | 19 +- examples/simple.rs | 18 +- src/config.rs | 549 ++++++++++-------- src/lib.rs | 96 +-- src/stats.rs | 62 ++ webrtc-audio-processing-sys/Cargo.toml | 8 +- webrtc-audio-processing-sys/README.md | 4 +- webrtc-audio-processing-sys/build.rs | 208 +++---- webrtc-audio-processing-sys/src/lib.rs | 223 ++++--- webrtc-audio-processing-sys/src/wrapper.cpp | 225 ++----- webrtc-audio-processing-sys/src/wrapper.hpp | 313 ++-------- .../webrtc-audio-processing | 2 +- 16 files changed, 759 insertions(+), 1011 deletions(-) create mode 100644 src/stats.rs diff --git a/.gitmodules b/.gitmodules index daa4770..7b527c7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "webrtc-audio-processing-sys/webrtc-audio-processing"] path = webrtc-audio-processing-sys/webrtc-audio-processing - url = https://github.com/tonarino/pulseaudio-webrtc-audio-processing.git + url = https://gitlab.freedesktop.org/pulseaudio/webrtc-audio-processing diff --git a/Cargo.toml b/Cargo.toml index 848cdd9..64faf59 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,11 +27,11 @@ name = "recording" required-features = ["derive_serde"] [dev-dependencies] +anyhow = "1" crossbeam-channel = "0.5" ctrlc = { version = "3", features = ["termination"] } -failure = "0.1" -hound = "3.4" -json5 = "0.3" +hound = "3" +json5 = "0.4" portaudio = "0.7" regex = "1" serde = { version = "1", features = ["derive"]} diff --git a/README.md b/README.md index 157f053..fc71c7f 100644 --- a/README.md +++ b/README.md @@ -39,10 +39,12 @@ The webrtc source code is included as a git submodule. Be sure to clone this rep Building from source and static linking can be enabled with the `bundled` feature flag. 
You need the following tools to build from source: * `clang` or `gcc` -* `autotools` (MacOS: `brew install automake`, `brew install autoconf`) -* `libtoolize` (typically `glibtoolize` on MacOS: `brew install libtool`) -* `pkg-config` (MacOS: `brew install pkg-config`) -* `automake` (MacOS: `brew install automake`) +* `pkg-config` (macOS: `brew install pkg-config`) +* `meson` (macOS: `brew install meson`) +* `ninja` (macOS: `brew install ninja`) +* (to confirm) `autotools` (MacOS: `brew install automake`, `brew install autoconf`) +* (to confirm) `libtoolize` (typically `glibtoolize` on MacOS: `brew install libtool`) +* (to confirm) `automake` (MacOS: `brew install automake`) ## Publishing diff --git a/examples/karaoke.rs b/examples/karaoke.rs index 0f5a4b2..3f59431 100644 --- a/examples/karaoke.rs +++ b/examples/karaoke.rs @@ -1,8 +1,8 @@ // This example loops the microphone input back to the speakers, while applying echo cancellation, // creating an experience similar to Karaoke microphones. It uses PortAudio as an interface to the // underlying audio devices. +use anyhow::Error; use ctrlc; -use failure::Error; use portaudio; use std::{ sync::{ @@ -21,26 +21,17 @@ const SAMPLE_RATE: f64 = 48_000.0; const FRAMES_PER_BUFFER: u32 = 480; fn create_processor( - num_capture_channels: i32, - num_render_channels: i32, + num_capture_channels: usize, + num_render_channels: usize, ) -> Result { let mut processor = Processor::new(&InitializationConfig { num_capture_channels, num_render_channels, - ..InitializationConfig::default() + sample_rate_hz: SAMPLE_RATE as u32, })?; - // High pass filter is a prerequisite to running echo cancellation. - let config = Config { - echo_cancellation: Some(EchoCancellation { - suppression_level: EchoCancellationSuppressionLevel::Low, - stream_delay_ms: Some(0), - enable_delay_agnostic: true, - enable_extended_filter: true, - }), - enable_high_pass_filter: true, - ..Config::default() - }; + // The default AEC configuration enables HPF, too. 
+ let config = Config { echo_canceller: Some(EchoCanceller::default()), ..Config::default() }; processor.set_config(config); Ok(processor) @@ -74,8 +65,8 @@ fn main() -> Result<(), Error> { let pa = portaudio::PortAudio::new()?; let stream_settings = pa.default_duplex_stream_settings( - input_channels, - output_channels, + input_channels as i32, + output_channels as i32, SAMPLE_RATE, FRAMES_PER_BUFFER, )?; diff --git a/examples/recording.rs b/examples/recording.rs index a8ab7c4..f646754 100644 --- a/examples/recording.rs +++ b/examples/recording.rs @@ -21,7 +21,7 @@ /// $ cargo run --example recording --features bundled --features derive_serde -- --config-file \ /// examples/recording-configs/record-pipeline.json5 /// ``` -use failure::{format_err, Error}; +use anyhow::{anyhow, Error}; use hound::{WavIntoSamples, WavReader, WavWriter}; use regex::Regex; use serde::{Deserialize, Serialize}; @@ -96,11 +96,12 @@ fn match_device( return Ok(device.0); } } - Err(format_err!("Audio device matching \"{}\" not found.", device_name)) + Err(anyhow!("Audio device matching \"{}\" not found.", device_name)) } fn create_stream_settings( pa: &portaudio::PortAudio, + processor: &Processor, opt: &Options, ) -> Result, Error> { let input_device = match_device(pa, Regex::new(&opt.capture.device_name)?)?; @@ -127,7 +128,7 @@ fn create_stream_settings( input_params, output_params, f64::from(AUDIO_SAMPLE_RATE), - NUM_SAMPLES_PER_FRAME as u32, + processor.num_samples_per_frame() as u32, )) } @@ -181,9 +182,9 @@ fn main() -> Result<(), Error> { let pa = portaudio::PortAudio::new()?; let mut processor = Processor::new(&InitializationConfig { - num_capture_channels: opt.capture.num_channels as i32, - num_render_channels: opt.render.num_channels as i32, - ..Default::default() + num_capture_channels: opt.capture.num_channels as usize, + num_render_channels: opt.render.num_channels as usize, + sample_rate_hz: AUDIO_SAMPLE_RATE, })?; processor.set_config(opt.config.clone()); @@ -208,13 +209,13 
@@ fn main() -> Result<(), Error> { let audio_callback = { // Allocate buffers outside the performance-sensitive audio loop. let mut input_mut = - vec![0f32; NUM_SAMPLES_PER_FRAME as usize * opt.capture.num_channels as usize]; + vec![0f32; processor.num_samples_per_frame() * opt.capture.num_channels as usize]; let running = running.clone(); let mute = opt.render.mute; let mut processor = processor.clone(); move |portaudio::DuplexStreamCallbackArgs { in_buffer, out_buffer, frames, .. }| { - assert_eq!(frames, NUM_SAMPLES_PER_FRAME as usize); + assert_eq!(frames, processor.num_samples_per_frame()); let mut should_continue = true; @@ -263,7 +264,7 @@ fn main() -> Result<(), Error> { } }; - let stream_settings = create_stream_settings(&pa, &opt)?; + let stream_settings = create_stream_settings(&pa, &processor, &opt)?; let mut stream = pa.open_non_blocking_stream(stream_settings, audio_callback)?; stream.start()?; diff --git a/examples/simple.rs b/examples/simple.rs index 90c45b1..953bf41 100644 --- a/examples/simple.rs +++ b/examples/simple.rs @@ -4,25 +4,17 @@ fn main() { let config = InitializationConfig { num_capture_channels: 2, // Stereo mic input num_render_channels: 2, // Stereo speaker output - ..InitializationConfig::default() + sample_rate_hz: 48_000, // The maximum processing rate }; let mut ap = Processor::new(&config).unwrap(); - let config = Config { - echo_cancellation: Some(EchoCancellation { - suppression_level: EchoCancellationSuppressionLevel::High, - enable_delay_agnostic: false, - enable_extended_filter: false, - stream_delay_ms: None, - }), - ..Config::default() - }; + let config = Config { echo_canceller: Some(EchoCanceller::default()), ..Default::default() }; ap.set_config(config); // The render_frame is what is sent to the speakers, and // capture_frame is audio captured from a microphone. 
- let (render_frame, capture_frame) = sample_stereo_frames(); + let (render_frame, capture_frame) = sample_stereo_frames(&ap); let mut render_frame_output = render_frame.clone(); ap.process_render_frame(&mut render_frame_output).unwrap(); @@ -43,8 +35,8 @@ fn main() { /// Generate example stereo frames that simulates a situation where the /// microphone (capture) would be picking up the speaker (render) output. -fn sample_stereo_frames() -> (Vec, Vec) { - let num_samples_per_frame = NUM_SAMPLES_PER_FRAME as usize; +fn sample_stereo_frames(processor: &Processor) -> (Vec, Vec) { + let num_samples_per_frame = processor.num_samples_per_frame(); let mut render_frame = Vec::with_capacity(num_samples_per_frame * 2); let mut capture_frame = Vec::with_capacity(num_samples_per_frame * 2); diff --git a/src/config.rs b/src/config.rs index f1fbf50..f006210 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,150 +1,146 @@ use webrtc_audio_processing_sys as ffi; -pub use ffi::InitializationConfig; - #[cfg(feature = "derive_serde")] use serde::{Deserialize, Serialize}; -/// A level of non-linear suppression during AEC (aka NLP). +/// A configuration for initializing a Processor instance. +#[derive(Debug, Clone, PartialEq)] +pub struct InitializationConfig { + /// Number of the input and output channels for the capture frame. + pub num_capture_channels: usize, + /// Number of the input and output channels for the render frame. + pub num_render_channels: usize, + /// Sampling rate of the capture and render frames. Accepts an arbitrary value, but the maximum + /// internal processing rate is 48000, so the audio quality is capped as such. + pub sample_rate_hz: u32, +} + +/// Internal processing rate. #[derive(Debug, Copy, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] -pub enum EchoCancellationSuppressionLevel { - /// Lowest suppression level. - /// Minimum overdrive exponent = 1.0 (zero suppression). - Lowest, - /// Lower suppression level. 
- /// Minimum overdrive exponent = 2.0. - Lower, - /// Low suppression level. - /// Minimum overdrive exponent = 3.0. - Low, - /// Moderate suppression level. - /// Minimum overdrive exponent = 6.0. - Moderate, - /// Higher suppression level. - /// Minimum overdrive exponent = 15.0. - High, +pub enum PipelineProcessingRate { + /// Limit the rate to 32k Hz. + Max32000Hz = 32_000, + /// Limit the rate to 48k Hz. + Max48000Hz = 48_000, } -impl From for ffi::EchoCancellation_SuppressionLevel { - fn from(other: EchoCancellationSuppressionLevel) -> ffi::EchoCancellation_SuppressionLevel { - match other { - EchoCancellationSuppressionLevel::Lowest => { - ffi::EchoCancellation_SuppressionLevel::LOWEST - }, - EchoCancellationSuppressionLevel::Lower => { - ffi::EchoCancellation_SuppressionLevel::LOWER - }, - EchoCancellationSuppressionLevel::Low => ffi::EchoCancellation_SuppressionLevel::LOW, - EchoCancellationSuppressionLevel::Moderate => { - ffi::EchoCancellation_SuppressionLevel::MODERATE - }, - EchoCancellationSuppressionLevel::High => ffi::EchoCancellation_SuppressionLevel::HIGH, +impl Default for PipelineProcessingRate { + fn default() -> Self { + // cf. https://gitlab.freedesktop.org/pulseaudio/webrtc-audio-processing/-/blob/master/webrtc/modules/audio_processing/include/audio_processing.cc#L55 + if cfg!(target_arch = "arm") { + Self::Max32000Hz + } else { + Self::Max48000Hz } } } -/// Echo cancellation configuration. -#[derive(Debug, Clone, PartialEq)] +/// Audio processing pipeline configuration. +#[derive(Debug, Default, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] -pub struct EchoCancellation { - /// Determines the aggressiveness of the suppressor. A higher level trades off - /// double-talk performance for increased echo suppression. - pub suppression_level: EchoCancellationSuppressionLevel, - - /// Use to enable the extended filter mode in the AEC, along with robustness - /// measures around the reported system delays. 
It comes with a significant - /// increase in AEC complexity, but is much more robust to unreliable reported - /// delays. - pub enable_extended_filter: bool, - - /// Enables delay-agnostic echo cancellation. This feature relies on internally - /// estimated delays between the process and reverse streams, thus not relying - /// on reported system delays. - pub enable_delay_agnostic: bool, - - /// Sets the delay in ms between process_render_frame() receiving a far-end - /// frame and process_capture_frame() receiving a near-end frame containing - /// the corresponding echo. You should set this only if you are certain that - /// the delay will be stable and constant. enable_delay_agnostic will be - /// ignored when this option is set. - pub stream_delay_ms: Option, +pub struct Pipeline { + /// Maximum allowed processing rate used internally. The default rate is currently selected + /// based on the CPU architecture. + pub maximum_internal_processing_rate: PipelineProcessingRate, + + /// Allow multi-channel processing of capture audio when AEC3 is active. + pub multi_channel_capture: bool, + + /// Allow multi-channel processing of render audio. + pub multi_channel_render: bool, } -impl From for ffi::EchoCancellation { - fn from(other: EchoCancellation) -> ffi::EchoCancellation { - ffi::EchoCancellation { - enable: true, - suppression_level: other.suppression_level.into(), - enable_extended_filter: other.enable_extended_filter, - enable_delay_agnostic: other.enable_delay_agnostic, - stream_delay_ms: other.stream_delay_ms.into(), +impl From for ffi::AudioProcessing_Config_Pipeline { + fn from(other: Pipeline) -> Self { + Self { + maximum_internal_processing_rate: other.maximum_internal_processing_rate as i32, + multi_channel_capture: other.multi_channel_capture, + multi_channel_render: other.multi_channel_render, } } } -/// Mode of gain control. -#[derive(Debug, Copy, Clone, PartialEq)] +/// Pre-amplifier configuration. 
+#[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] -pub enum GainControlMode { - /// Bring the signal to an appropriate range by applying an adaptive gain - /// control. The volume is dynamically amplified with a microphone with - /// small pickup and vice versa. - AdaptiveDigital, +pub struct PreAmplifier { + /// Fixed linear gain multiplier. The default is 1.0 (no effect). + pub fixed_gain_factor: f32, +} - /// Unlike ADAPTIVE_DIGITAL, it only compresses (i.e. gradually reduces - /// gain with increasing level) the input signal when at higher levels. - /// Use this where the capture signal level is predictable, so that a - /// known gain can be applied. - FixedDigital, +impl Default for PreAmplifier { + fn default() -> Self { + Self { fixed_gain_factor: 1.0 } + } } -impl From for ffi::GainControl_Mode { - fn from(other: GainControlMode) -> ffi::GainControl_Mode { - match other { - GainControlMode::AdaptiveDigital => ffi::GainControl_Mode::ADAPTIVE_DIGITAL, - GainControlMode::FixedDigital => ffi::GainControl_Mode::FIXED_DIGITAL, - } +impl From for ffi::AudioProcessing_Config_PreAmplifier { + fn from(other: PreAmplifier) -> Self { + Self { enabled: true, fixed_gain_factor: other.fixed_gain_factor } } } -/// Gain control configuration. +/// HPF (high-pass filter) configuration. #[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] -pub struct GainControl { - /// Determines what type of gain control is applied. - pub mode: GainControlMode, - - /// Sets the target peak level (or envelope) of the AGC in dBFs (decibels from - /// digital full-scale). The convention is to use positive values. - /// For instance, passing in a value of 3 corresponds to -3 dBFs, or a target - /// level 3 dB below full-scale. Limited to [0, 31]. - pub target_level_dbfs: i32, - - /// Sets the maximum gain the digital compression stage may apply, in dB. 
A - /// higher number corresponds to greater compression, while a value of 0 will - /// leave the signal uncompressed. Limited to [0, 90]. - pub compression_gain_db: i32, - - /// When enabled, the compression stage will hard limit the signal to the - /// target level. Otherwise, the signal will be compressed but not limited - /// above the target level. - pub enable_limiter: bool, +pub struct HighPassFilter { + /// HPF should be applied in the full-band (i.e. 20 – 20,000 Hz). + pub apply_in_full_band: bool, } -impl From for ffi::GainControl { - fn from(other: GainControl) -> ffi::GainControl { - ffi::GainControl { - enable: true, - mode: other.mode.into(), - target_level_dbfs: other.target_level_dbfs, - compression_gain_db: other.compression_gain_db, - enable_limiter: other.enable_limiter, +impl Default for HighPassFilter { + fn default() -> Self { + Self { apply_in_full_band: true } + } +} + +impl From for ffi::AudioProcessing_Config_HighPassFilter { + fn from(other: HighPassFilter) -> Self { + Self { enabled: true, apply_in_full_band: other.apply_in_full_band } + } +} + +/// AEC (acoustic echo cancellation) configuration. +#[derive(Debug, Clone, PartialEq)] +#[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] +pub enum EchoCanceller { + /// Uses low-complexity AEC implementation that is optimized for mobile. + Mobile, + + /// Uses the full AEC3 implementation. + Full { + /// Enforce the highpass filter to be on. It has no effect for the mobile mode. 
+ enforce_high_pass_filtering: bool, + }, +} + +impl Default for EchoCanceller { + fn default() -> Self { + Self::Full { enforce_high_pass_filtering: true } + } +} + +impl From for ffi::AudioProcessing_Config_EchoCanceller { + fn from(other: EchoCanceller) -> Self { + match other { + EchoCanceller::Mobile => Self { + enabled: true, + mobile_mode: true, + enforce_high_pass_filtering: false, + export_linear_aec_output: false, + }, + EchoCanceller::Full { enforce_high_pass_filtering } => Self { + enabled: true, + mobile_mode: false, + enforce_high_pass_filtering, + export_linear_aec_output: false, + }, } } } -/// A level of noise suppression. +/// Noise suppression level. #[derive(Debug, Copy, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] pub enum NoiseSuppressionLevel { @@ -158,13 +154,17 @@ pub enum NoiseSuppressionLevel { VeryHigh, } -impl From for ffi::NoiseSuppression_SuppressionLevel { - fn from(other: NoiseSuppressionLevel) -> ffi::NoiseSuppression_SuppressionLevel { +impl From for ffi::AudioProcessing_Config_NoiseSuppression_Level { + fn from(other: NoiseSuppressionLevel) -> Self { match other { - NoiseSuppressionLevel::Low => ffi::NoiseSuppression_SuppressionLevel::LOW, - NoiseSuppressionLevel::Moderate => ffi::NoiseSuppression_SuppressionLevel::MODERATE, - NoiseSuppressionLevel::High => ffi::NoiseSuppression_SuppressionLevel::HIGH, - NoiseSuppressionLevel::VeryHigh => ffi::NoiseSuppression_SuppressionLevel::VERY_HIGH, + NoiseSuppressionLevel::Low => ffi::AudioProcessing_Config_NoiseSuppression_Level_kLow, + NoiseSuppressionLevel::Moderate => { + ffi::AudioProcessing_Config_NoiseSuppression_Level_kModerate + }, + NoiseSuppressionLevel::High => ffi::AudioProcessing_Config_NoiseSuppression_Level_kHigh, + NoiseSuppressionLevel::VeryHigh => { + ffi::AudioProcessing_Config_NoiseSuppression_Level_kVeryHigh + }, } } } @@ -173,186 +173,241 @@ impl From for ffi::NoiseSuppression_SuppressionLevel { #[derive(Debug, Clone, 
PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] pub struct NoiseSuppression { - /// Determines the aggressiveness of the suppression. Increasing the level will - /// reduce the noise level at the expense of a higher speech distortion. - pub suppression_level: NoiseSuppressionLevel, + /// Determines the aggressiveness of the suppression. Increasing the level will reduce the + /// noise level at the expense of a higher speech distortion. + pub level: NoiseSuppressionLevel, + /// Analyze the output of the linear AEC instead of the capture frame. Has no effect if echo + /// cancellation is not enabled. + pub analyze_linear_aec_output: bool, } -impl From for ffi::NoiseSuppression { - fn from(other: NoiseSuppression) -> ffi::NoiseSuppression { - ffi::NoiseSuppression { enable: true, suppression_level: other.suppression_level.into() } +impl Default for NoiseSuppression { + fn default() -> Self { + Self { level: NoiseSuppressionLevel::Moderate, analyze_linear_aec_output: false } } } -/// The sensitivity of the noise detector. +impl From for ffi::AudioProcessing_Config_NoiseSuppression { + fn from(other: NoiseSuppression) -> Self { + Self { + enabled: true, + level: other.level.into(), + analyze_linear_aec_output_when_available: other.analyze_linear_aec_output, + } + } +} + +/// Gain control mode. #[derive(Debug, Copy, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] -pub enum VoiceDetectionLikelihood { - /// Even lower detection likelihood. - VeryLow, - /// Lower detection likelihood. - Low, - /// Moderate detection likelihood. - Moderate, - /// Higher detection likelihood. - High, +pub enum GainControllerMode { + /// Adaptive mode intended for use if an analog volume control is available on the capture + /// device. It will require the user to provide coupling between the OS mixer controls and AGC + /// through the stream_analog_level() functions. 
It consists of an analog gain prescription for + /// the audio device and a digital compression stage. + /// TODO: this mode is not supported yet. + AdaptiveAnalog, + /// Adaptive mode intended for situations in which an analog volume control is unavailable. It + /// operates in a similar fashion to the adaptive analog mode, but with scaling instead applied + /// in the digital domain. As with the analog mode, it additionally uses a digital compression + /// stage. + AdaptiveDigital, + /// Fixed mode which enables only the digital compression stage also used by the two adaptive + /// modes. It is distinguished from the adaptive modes by considering only a short time-window + /// of the input signal. It applies a fixed gain through most of the input level range, and + /// compresses (gradually reduces gain with increasing level) the input signal at higher + /// levels. This mode is preferred on embedded devices where the capture signal level is + /// predictable, so that a known gain can be applied. 
+ FixedDigital, } -impl From for ffi::VoiceDetection_DetectionLikelihood { - fn from(other: VoiceDetectionLikelihood) -> ffi::VoiceDetection_DetectionLikelihood { +impl From for ffi::AudioProcessing_Config_GainController1_Mode { + fn from(other: GainControllerMode) -> Self { match other { - VoiceDetectionLikelihood::VeryLow => ffi::VoiceDetection_DetectionLikelihood::VERY_LOW, - VoiceDetectionLikelihood::Low => ffi::VoiceDetection_DetectionLikelihood::LOW, - VoiceDetectionLikelihood::Moderate => ffi::VoiceDetection_DetectionLikelihood::MODERATE, - VoiceDetectionLikelihood::High => ffi::VoiceDetection_DetectionLikelihood::HIGH, + GainControllerMode::AdaptiveAnalog => { + ffi::AudioProcessing_Config_GainController1_Mode_kAdaptiveAnalog + }, + GainControllerMode::AdaptiveDigital => { + ffi::AudioProcessing_Config_GainController1_Mode_kAdaptiveDigital + }, + GainControllerMode::FixedDigital => { + ffi::AudioProcessing_Config_GainController1_Mode_kFixedDigital + }, } } } -/// Voice detection configuration. +/// AGC (automatic gain control) configuration. #[derive(Debug, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] -pub struct VoiceDetection { - /// Specifies the likelihood that a frame will be declared to contain voice. A - /// higher value makes it more likely that speech will not be clipped, at the - /// expense of more noise being detected as voice. - pub detection_likelihood: VoiceDetectionLikelihood, +pub struct GainController { + /// AGC mode. + pub mode: GainControllerMode, + + /// Sets the target peak level (or envelope) of the AGC in dBFs (decibels from digital + /// full-scale). The convention is to use positive values. For instance, passing in a value of + /// 3 corresponds to -3 dBFs, or a target level 3 dB below full-scale. Limited to [0, 31]. + pub target_level_dbfs: u8, + + /// Sets the maximum gain the digital compression stage may apply, in dB. 
A higher number + /// corresponds to greater compression, while a value of 0 will leave the signal uncompressed. + /// Limited to [0, 90]. For updates after APM setup, use a RuntimeSetting instead. + pub compression_gain_db: u8, + + /// When enabled, the compression stage will hard limit the signal to the target level. + /// Otherwise, the signal will be compressed but not limited above the target level. + pub enable_limiter: bool, +} + +impl Default for GainController { + fn default() -> Self { + Self { + mode: GainControllerMode::AdaptiveDigital, + target_level_dbfs: 3, + compression_gain_db: 9, + enable_limiter: true, + } + } } -impl From for ffi::VoiceDetection { - fn from(other: VoiceDetection) -> ffi::VoiceDetection { - ffi::VoiceDetection { - enable: true, - detection_likelihood: other.detection_likelihood.into(), +impl From for ffi::AudioProcessing_Config_GainController1 { + fn from(other: GainController) -> Self { + Self { + enabled: true, + mode: other.mode.into(), + target_level_dbfs: other.target_level_dbfs as i32, + compression_gain_db: other.compression_gain_db as i32, + enable_limiter: other.enable_limiter, + ..Default::default() } } } -/// Config that can be used mid-processing. +/// The parameters to control reporting of selected field in [`Stats`]. +#[derive(Debug, Default, Clone, PartialEq)] +#[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] +pub struct ReportingConfig { + /// Enables reporting of [`voice_detected`] in [`Stats`]. + pub enable_voice_detection: bool, + + /// Enables reporting of [`residual_echo_likelihood`] and + /// [`residual_echo_likelihood_recent_max`] in [`Stats`]. + pub enable_residual_echo_detector: bool, + + /// Enables reporting of [`output_rms_dbfs`] in [`Stats`]. + pub enable_level_estimation: bool, +} + +/// The parameters and behavior of the audio processing module are controlled +/// by changing the default values in this `Config` struct. 
+/// The config is applied by passing the struct to the [`set_config`] method. #[derive(Debug, Default, Clone, PartialEq)] #[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] pub struct Config { - /// Enable and configure AEC (acoustic echo cancellation). - pub echo_cancellation: Option, + /// Sets the properties of the audio processing pipeline. + #[serde(default)] + pub pipeline: Pipeline, + + /// Enables and configures the pre-amplifier. It amplifies the capture signal before any other + /// processing is done. + #[serde(default)] + pub pre_amplifier: Option, + + /// Enables and configures high pass filter. + #[serde(default)] + pub high_pass_filter: Option, - /// Enable and configure AGC (automatic gain control). - pub gain_control: Option, + /// Enables and configures acoustic echo cancellation. + #[serde(default)] + pub echo_canceller: Option, - /// Enable and configure noise suppression. + /// Enables and configures background noise suppression. + #[serde(default)] pub noise_suppression: Option, - /// Enable and configure voice detection. - pub voice_detection: Option, + /// Enables transient noise suppression. + #[serde(default)] + pub enable_transient_suppression: bool, - /// Use to enable experimental transient noise suppression. - #[cfg_attr(feature = "derive_serde", serde(default))] - pub enable_transient_suppressor: bool, + /// Enables and configures automatic gain control. + /// TODO: Experiment with and migrate to GainController2. + #[serde(default)] + pub gain_controller: Option, - /// Use to enable a filtering component which removes DC offset and - /// low-frequency noise. - #[cfg_attr(feature = "derive_serde", serde(default))] - pub enable_high_pass_filter: bool, + /// Toggles reporting of selected fields in [`Stats`]. 
+ #[serde(default)] + pub reporting: ReportingConfig, } -impl From for ffi::Config { - fn from(other: Config) -> ffi::Config { - let echo_cancellation = if let Some(enabled_value) = other.echo_cancellation { - enabled_value.into() +impl From for ffi::AudioProcessing_Config { + fn from(other: Config) -> Self { + let pre_amplifier = if let Some(config) = other.pre_amplifier { + config.into() } else { - ffi::EchoCancellation { enable: false, ..ffi::EchoCancellation::default() } + ffi::AudioProcessing_Config_PreAmplifier { enabled: false, ..Default::default() } }; - let gain_control = if let Some(enabled_value) = other.gain_control { - enabled_value.into() + let high_pass_filter = if let Some(config) = other.high_pass_filter { + config.into() } else { - ffi::GainControl { enable: false, ..ffi::GainControl::default() } + ffi::AudioProcessing_Config_HighPassFilter { enabled: false, ..Default::default() } }; - let noise_suppression = if let Some(enabled_value) = other.noise_suppression { - enabled_value.into() + let echo_canceller = if let Some(config) = other.echo_canceller { + let mut echo_canceller = ffi::AudioProcessing_Config_EchoCanceller::from(config); + echo_canceller.export_linear_aec_output = if let Some(ns) = &other.noise_suppression { + ns.analyze_linear_aec_output + } else { + false + }; + echo_canceller } else { - ffi::NoiseSuppression { enable: false, ..ffi::NoiseSuppression::default() } + ffi::AudioProcessing_Config_EchoCanceller { enabled: false, ..Default::default() } }; - let voice_detection = if let Some(enabled_value) = other.voice_detection { - enabled_value.into() + let noise_suppression = if let Some(config) = other.noise_suppression { + config.into() } else { - ffi::VoiceDetection { enable: false, ..ffi::VoiceDetection::default() } + ffi::AudioProcessing_Config_NoiseSuppression { enabled: false, ..Default::default() } }; - ffi::Config { - echo_cancellation, - gain_control, - noise_suppression, - voice_detection, - enable_transient_suppressor: 
other.enable_transient_suppressor, - enable_high_pass_filter: other.enable_high_pass_filter, - } - } -} - -/// Statistics about the processor state. -#[derive(Debug, Clone)] -#[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] -pub struct Stats { - /// True if voice is detected in the current frame. - pub has_voice: Option, - - /// False if the current frame almost certainly contains no echo and true if it - /// _might_ contain echo. - pub has_echo: Option, - - /// Root mean square (RMS) level in dBFs (decibels from digital full-scale), or - /// alternately dBov. It is computed over all primary stream frames since the - /// last call to |get_stats()|. The returned value is constrained to [-127, 0], - /// where -127 indicates muted. - pub rms_dbfs: Option, - - /// Prior speech probability of the current frame averaged over output - /// channels, internally computed by noise suppressor. - pub speech_probability: Option, - - /// RERL = ERL + ERLE - pub residual_echo_return_loss: Option, - - /// ERL = 10log_10(P_far / P_echo) - pub echo_return_loss: Option, + let transient_suppression = ffi::AudioProcessing_Config_TransientSuppression { + enabled: other.enable_transient_suppression, + }; - /// ERLE = 10log_10(P_echo / P_out) - pub echo_return_loss_enhancement: Option, + let voice_detection = ffi::AudioProcessing_Config_VoiceDetection { + enabled: other.reporting.enable_voice_detection, + }; - /// (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a) - pub a_nlp: Option, + let gain_controller1 = if let Some(config) = other.gain_controller { + config.into() + } else { + ffi::AudioProcessing_Config_GainController1 { enabled: false, ..Default::default() } + }; - /// Median of the measured delay in ms. The values are aggregated until the - /// first call to |get_stats()| and afterwards aggregated and updated every - /// second. 
- pub delay_median_ms: Option, + let gain_controller2 = + ffi::AudioProcessing_Config_GainController2 { enabled: false, ..Default::default() }; - /// Standard deviation of the measured delay in ms. The values are aggregated - /// until the first call to |get_stats()| and afterwards aggregated and updated - /// every second. - pub delay_standard_deviation_ms: Option, + let residual_echo_detector = ffi::AudioProcessing_Config_ResidualEchoDetector { + enabled: other.reporting.enable_residual_echo_detector, + }; - /// The fraction of delay estimates that can make the echo cancellation perform - /// poorly. - pub delay_fraction_poor_delays: Option, -} + let level_estimation = ffi::AudioProcessing_Config_LevelEstimation { + enabled: other.reporting.enable_level_estimation, + }; -impl From for Stats { - fn from(other: ffi::Stats) -> Stats { - Stats { - has_voice: other.has_voice.into(), - has_echo: other.has_echo.into(), - rms_dbfs: other.rms_dbfs.into(), - speech_probability: other.speech_probability.into(), - residual_echo_return_loss: other.residual_echo_return_loss.into(), - echo_return_loss: other.echo_return_loss.into(), - echo_return_loss_enhancement: other.echo_return_loss_enhancement.into(), - a_nlp: other.a_nlp.into(), - delay_median_ms: other.delay_median_ms.into(), - delay_standard_deviation_ms: other.delay_standard_deviation_ms.into(), - delay_fraction_poor_delays: other.delay_fraction_poor_delays.into(), + Self { + pipeline: other.pipeline.into(), + pre_amplifier, + high_pass_filter, + echo_canceller, + noise_suppression, + transient_suppression, + voice_detection, + gain_controller1, + gain_controller2, + residual_echo_detector, + level_estimation, } } } diff --git a/src/lib.rs b/src/lib.rs index 549e588..11376b9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,12 +6,13 @@ #![warn(missing_docs)] mod config; +mod stats; use std::{error, fmt, sync::Arc}; use webrtc_audio_processing_sys as ffi; pub use config::*; -pub use ffi::NUM_SAMPLES_PER_FRAME; +pub use 
stats::*; /// Represents an error inside webrtc::AudioProcessing. /// See the documentation of [`webrtc::AudioProcessing::Error`](https://cgit.freedesktop.org/pulseaudio/webrtc-audio-processing/tree/webrtc/modules/audio_processing/include/audio_processing.h?id=9def8cf10d3c97640d32f1328535e881288f700f) @@ -47,15 +48,17 @@ impl Processor { /// Creates a new `Processor`. `InitializationConfig` is only used on /// instantiation, however new configs can be be passed to `set_config()` /// at any time during processing. - pub fn new(config: &ffi::InitializationConfig) -> Result { + pub fn new(config: &InitializationConfig) -> Result { + let inner = Arc::new(AudioProcessing::new(config)?); + let num_samples = inner.num_samples_per_frame(); Ok(Self { - inner: Arc::new(AudioProcessing::new(config)?), + inner, deinterleaved_capture_frame: vec![ - vec![0f32; NUM_SAMPLES_PER_FRAME as usize]; + vec![0f32; num_samples]; config.num_capture_channels as usize ], deinterleaved_render_frame: vec![ - vec![0f32; NUM_SAMPLES_PER_FRAME as usize]; + vec![0f32; num_samples]; config.num_render_channels as usize ], }) @@ -107,6 +110,11 @@ impl Processor { self.inner.get_stats() } + /// Returns the number of samples per frame based on the sample rate and frame size. + pub fn num_samples_per_frame(&self) -> usize { + self.inner.num_samples_per_frame() + } + /// Immediately updates the configurations of the internal signal processor. /// May be called multiple times after the initialization and during /// processing. 
@@ -174,9 +182,16 @@ struct AudioProcessing { } impl AudioProcessing { - fn new(config: &ffi::InitializationConfig) -> Result { + fn new(config: &InitializationConfig) -> Result { let mut code = 0; - let inner = unsafe { ffi::audio_processing_create(config, &mut code) }; + let inner = unsafe { + ffi::audio_processing_create( + config.num_capture_channels as i32, + config.num_render_channels as i32, + config.sample_rate_hz as i32, + &mut code, + ) + }; if !inner.is_null() { Ok(Self { inner }) } else { @@ -212,6 +227,10 @@ impl AudioProcessing { unsafe { ffi::get_stats(self.inner).into() } } + fn num_samples_per_frame(&self) -> usize { + unsafe { ffi::get_num_samples_per_frame(self.inner) as usize } + } + fn set_config(&self, config: Config) { unsafe { ffi::set_config(self.inner, &config.into()); @@ -239,8 +258,8 @@ impl Drop for AudioProcessing { } } -// ffi::AudioProcessing provides thread safety with a few exceptions around -// the concurrent usage of its getters and setters e.g. `set_stream_delay_ms()`. +// ffi::AudioProcessing provides thread safety with a few exceptions around the concurrent usage of +// its corresponding getters and setters. 
unsafe impl Sync for AudioProcessing {} unsafe impl Send for AudioProcessing {} @@ -249,20 +268,23 @@ mod tests { use super::*; use std::{thread, time::Duration}; + fn init_config(num_channels: usize) -> InitializationConfig { + InitializationConfig { + num_capture_channels: num_channels, + num_render_channels: num_channels, + sample_rate_hz: 48_000, + } + } + #[test] fn test_create_failure() { - let config = - InitializationConfig { num_capture_channels: 0, ..InitializationConfig::default() }; + let config = init_config(0); assert!(Processor::new(&config).is_err()); } #[test] fn test_create_drop() { - let config = InitializationConfig { - num_capture_channels: 1, - num_render_channels: 1, - ..InitializationConfig::default() - }; + let config = init_config(1); let _p = Processor::new(&config).unwrap(); } @@ -281,8 +303,8 @@ mod tests { assert_eq!(interleaved, interleaved_out); } - fn sample_stereo_frames() -> (Vec, Vec) { - let num_samples_per_frame = NUM_SAMPLES_PER_FRAME as usize; + fn sample_stereo_frames(ap: &Processor) -> (Vec, Vec) { + let num_samples_per_frame = ap.num_samples_per_frame(); // Stereo frame with a lower frequency cosine wave. 
let mut render_frame = Vec::with_capacity(num_samples_per_frame * 2); @@ -304,25 +326,14 @@ mod tests { #[test] fn test_nominal() { - let config = InitializationConfig { - num_capture_channels: 2, - num_render_channels: 2, - ..InitializationConfig::default() - }; + let config = init_config(2); let mut ap = Processor::new(&config).unwrap(); - let config = Config { - echo_cancellation: Some(EchoCancellation { - suppression_level: EchoCancellationSuppressionLevel::High, - stream_delay_ms: None, - enable_delay_agnostic: false, - enable_extended_filter: false, - }), - ..Config::default() - }; + let config = + Config { echo_canceller: Some(EchoCanceller::default()), ..Default::default() }; ap.set_config(config); - let (render_frame, capture_frame) = sample_stereo_frames(); + let (render_frame, capture_frame) = sample_stereo_frames(&ap); let mut render_frame_output = render_frame.clone(); ap.process_render_frame(&mut render_frame_output).unwrap(); @@ -345,28 +356,17 @@ mod tests { #[test] #[ignore] fn test_nominal_threaded() { - let config = InitializationConfig { - num_capture_channels: 2, - num_render_channels: 2, - ..InitializationConfig::default() - }; + let config = init_config(2); let ap = Processor::new(&config).unwrap(); - let (render_frame, capture_frame) = sample_stereo_frames(); + let (render_frame, capture_frame) = sample_stereo_frames(&ap); let mut config_ap = ap.clone(); let config_thread = thread::spawn(move || { thread::sleep(Duration::from_millis(100)); - let config = Config { - echo_cancellation: Some(EchoCancellation { - suppression_level: EchoCancellationSuppressionLevel::High, - stream_delay_ms: None, - enable_delay_agnostic: false, - enable_extended_filter: false, - }), - ..Config::default() - }; + let config = + Config { echo_canceller: Some(EchoCanceller::default()), ..Default::default() }; config_ap.set_config(config); }); diff --git a/src/stats.rs b/src/stats.rs new file mode 100644 index 0000000..16fd708 --- /dev/null +++ b/src/stats.rs @@ -0,0 
+1,62 @@ +use webrtc_audio_processing_sys as ffi; + +#[cfg(feature = "derive_serde")] +use serde::{Deserialize, Serialize}; + +/// Statistics about the processor state. +#[derive(Debug, Clone)] +#[cfg_attr(feature = "derive_serde", derive(Serialize, Deserialize))] +pub struct Stats { + /// The root mean square (RMS) level in dBFS (decibels from digital full-scale) of the last + /// capture frame, after processing. It is constrained to [-127, 0]. The computation follows: + /// https://tools.ietf.org/html/rfc6465 with the intent that it can provide the RTP audio level + /// indication. Only reported if level estimation is enabled in [`ReportingConfig`]. + pub output_rms_dbfs: Option, + + /// True if voice is detected in the last capture frame, after processing. It is conservative + /// in flagging audio as speech, with low likelihood of incorrectly flagging a frame as voice. + /// Only reported if voice detection is enabled in [`Config`]. + pub voice_detected: Option, + + /// AEC stats: ERL = 10log_10(P_far / P_echo) + pub echo_return_loss: Option, + /// AEC stats: ERLE = 10log_10(P_echo / P_out) + pub echo_return_loss_enhancement: Option, + /// AEC stats: Fraction of time that the AEC linear filter is divergent, in a 1-second + /// non-overlapped aggregation window. + pub divergent_filter_fraction: Option, + + /// The delay median in milliseconds. The values are aggregated until the first call to + /// [`get_stats()`] and afterwards aggregated and updated every second. + pub delay_median_ms: Option, + /// The delay standard deviation in milliseconds. The values are aggregated until the first + /// call to [`get_stats()`] and afterwards aggregated and updated every second. + pub delay_standard_deviation_ms: Option, + + /// Residual echo detector likelihood. + pub residual_echo_likelihood: Option, + /// Maximum residual echo likelihood from the last time period. 
+ pub residual_echo_likelihood_recent_max: Option, + + /// The instantaneous delay estimate produced in the AEC. The unit is in milliseconds and the + /// value is the instantaneous value at the time of the call to [`get_stats()`]. + pub delay_ms: Option, +} + +impl From for Stats { + fn from(other: ffi::Stats) -> Self { + Self { + output_rms_dbfs: Option::::from(other.output_rms_dbfs).map(|v| v as i8), + voice_detected: other.voice_detected.into(), + echo_return_loss: other.echo_return_loss.into(), + echo_return_loss_enhancement: other.echo_return_loss_enhancement.into(), + divergent_filter_fraction: other.divergent_filter_fraction.into(), + delay_median_ms: Option::::from(other.delay_median_ms).map(|v| v as u32), + delay_standard_deviation_ms: Option::::from(other.delay_standard_deviation_ms) + .map(|v| v as u32), + residual_echo_likelihood: other.residual_echo_likelihood.into(), + residual_echo_likelihood_recent_max: other.residual_echo_likelihood_recent_max.into(), + delay_ms: Option::::from(other.delay_ms).map(|v| v as u32), + } + } +} diff --git a/webrtc-audio-processing-sys/Cargo.toml b/webrtc-audio-processing-sys/Cargo.toml index e8b6b99..e00cae6 100644 --- a/webrtc-audio-processing-sys/Cargo.toml +++ b/webrtc-audio-processing-sys/Cargo.toml @@ -21,12 +21,10 @@ derive_serde = ["serde"] bundled = [] [build-dependencies] -autotools = "0.2" -bindgen = "0" +anyhow = "1" +bindgen = "0.69" cc = "1" -failure = "0.1" -fs_extra = "1" -regex = "1" +cmake = "0.1" pkg-config = "0.3" [dependencies] diff --git a/webrtc-audio-processing-sys/README.md b/webrtc-audio-processing-sys/README.md index dc80864..2f87fd6 100644 --- a/webrtc-audio-processing-sys/README.md +++ b/webrtc-audio-processing-sys/README.md @@ -27,5 +27,5 @@ Static linking can be enabled with the `bundled` feature flag. 
The following tools are needed in order to use the `bundled` feature flag: -* libtool (`$ sudo apt install libtool`) -* autotools (`$ sudo apt install autotools-dev`) +* meson +* ninja diff --git a/webrtc-audio-processing-sys/build.rs b/webrtc-audio-processing-sys/build.rs index f43d365..faf6681 100644 --- a/webrtc-audio-processing-sys/build.rs +++ b/webrtc-audio-processing-sys/build.rs @@ -1,11 +1,5 @@ -use failure::Error; -use regex::Regex; -use std::{ - env, - fs::File, - io::{Read, Write}, - path::{Path, PathBuf}, -}; +use anyhow::{bail, Error, Result}; +use std::{env, path::PathBuf}; const DEPLOYMENT_TARGET_VAR: &str = "MACOSX_DEPLOYMENT_TARGET"; @@ -16,11 +10,11 @@ fn out_dir() -> PathBuf { #[cfg(not(feature = "bundled"))] mod webrtc { use super::*; - use failure::bail; - const LIB_NAME: &str = "webrtc-audio-processing"; + const LIB_NAME: &str = "webrtc-audio-processing-1"; + const LIB_MIN_VERSION: &str = "1.0"; - pub(super) fn get_build_paths() -> Result<(PathBuf, PathBuf), Error> { + pub(super) fn get_build_paths() -> Result<(Vec, Vec), Error> { let (pkgconfig_include_path, pkgconfig_lib_path) = find_pkgconfig_paths()?; let include_path = std::env::var("WEBRTC_AUDIO_PROCESSING_INCLUDE") @@ -32,12 +26,13 @@ mod webrtc { .map(|x| x.into()) .or(pkgconfig_lib_path); - println!("{:?}, {:?}", include_path, lib_path); - match (include_path, lib_path) { - (Some(include_path), Some(lib_path)) => Ok((include_path, lib_path)), + (Some(include_path), Some(lib_path)) => Ok((vec![include_path], vec![lib_path])), _ => { - eprintln!("Couldn't find either header or lib files for {}.", LIB_NAME); + eprintln!( + "Couldn't find either header or lib files for {}>={}.", + LIB_NAME, LIB_MIN_VERSION + ); eprintln!("See the crate README for installation instructions, or use the 'bundled' feature to statically compile."); bail!("Aborting compilation due to linker failure."); }, @@ -50,6 +45,7 @@ mod webrtc { fn find_pkgconfig_paths() -> Result<(Option, Option), Error> { 
Ok(pkg_config::Config::new() + .atleast_version(LIB_MIN_VERSION) .probe(LIB_NAME) .and_then(|mut lib| Ok((lib.include_paths.pop(), lib.link_paths.pop())))?) } @@ -58,18 +54,29 @@ mod webrtc { #[cfg(feature = "bundled")] mod webrtc { use super::*; - use failure::bail; + use std::{path::Path, process::Command}; const BUNDLED_SOURCE_PATH: &str = "./webrtc-audio-processing"; - pub(super) fn get_build_paths() -> Result<(PathBuf, PathBuf), Error> { - let include_path = out_dir().join(BUNDLED_SOURCE_PATH); - let lib_path = out_dir().join("lib"); - Ok((include_path, lib_path)) + pub(super) fn get_build_paths() -> Result<(Vec, Vec), Error> { + let mut include_paths = vec![ + out_dir().join("include"), + out_dir().join("include").join("webrtc-audio-processing-1"), + ]; + let mut lib_paths = vec![out_dir().join("lib")]; + + // If abseil package is installed locally, meson would have picked it for building + // webrtc-audio-processing-1. Use the same resource for ourselves, too. + if let Ok(mut lib) = pkg_config::Config::new().probe("absl_base") { + include_paths.append(&mut lib.include_paths); + lib_paths.append(&mut lib.link_paths); + } + + Ok((include_paths, lib_paths)) } - fn copy_source_to_out_dir() -> Result { - use fs_extra::dir::CopyOptions; + pub(super) fn build_if_necessary() -> Result<(), Error> { + use anyhow::Context; if Path::new(BUNDLED_SOURCE_PATH).read_dir()?.next().is_none() { eprintln!("The webrtc-audio-processing source directory is empty."); @@ -78,85 +85,54 @@ mod webrtc { bail!("Aborting compilation because bundled source directory is empty."); } - let out_dir = out_dir(); - let mut options = CopyOptions::new(); - options.overwrite = true; - - fs_extra::dir::copy(BUNDLED_SOURCE_PATH, &out_dir, &options)?; - - Ok(out_dir.join(BUNDLED_SOURCE_PATH)) - } - - pub(super) fn build_if_necessary() -> Result<(), Error> { - let build_dir = copy_source_to_out_dir()?; - - if cfg!(target_os = "macos") { - run_command(&build_dir, "glibtoolize", None)?; - } else { - 
run_command(&build_dir, "libtoolize", None)?; - } - - run_command(&build_dir, "aclocal", None)?; - run_command(&build_dir, "automake", Some(&["--add-missing", "--copy"]))?; - run_command(&build_dir, "autoconf", None)?; - - autotools::Config::new(build_dir) - .cflag("-fPIC") - .cxxflag("-fPIC") - .disable_shared() - .enable_static() - .build(); - - Ok(()) - } - - fn run_command>( - curr_dir: P, - cmd: &str, - args_opt: Option<&[&str]>, - ) -> Result<(), Error> { - let mut command = std::process::Command::new(cmd); - - command.current_dir(curr_dir); - - if let Some(args) = args_opt { - command.args(args); - } - - let _output = command.output().map_err(|e| { - failure::format_err!( - "Error running command '{}' with args '{:?}' - {:?}", - cmd, - args_opt, - e - ) - })?; + let build_dir = out_dir(); + let install_dir = out_dir(); + + let webrtc_build_dir = build_dir.join(BUNDLED_SOURCE_PATH); + let mut meson = Command::new("meson"); + let status = meson + .args(&["--prefix", install_dir.to_str().unwrap()]) + .arg("-Ddefault_library=static") + .arg(BUNDLED_SOURCE_PATH) + .arg(webrtc_build_dir.to_str().unwrap()) + .status() + .context("Failed to execute meson. Do you have it installed?")?; + assert!(status.success(), "Command failed: {:?}", &meson); + + let mut ninja = Command::new("ninja"); + let status = ninja + .args(&["-C", webrtc_build_dir.to_str().unwrap()]) + .arg("install") + .status() + .context("Failed to execute ninja. Do you have it installed?")?; + assert!(status.success(), "Command failed: {:?}", &ninja); Ok(()) } } -// TODO: Consider fixing this with the upstream. 
-// https://github.com/rust-lang/rust-bindgen/issues/1089 -// https://github.com/rust-lang/rust-bindgen/issues/1301 -fn derive_serde(binding_file: &Path) -> Result<(), Error> { - let mut contents = String::new(); - File::open(binding_file)?.read_to_string(&mut contents)?; - - let new_contents = format!( - "use serde::{{Serialize, Deserialize}};\n{}", - Regex::new(r"#\s*\[\s*derive\s*\((?P[^)]+)\)\s*\]\s*pub\s*(?Pstruct|enum)")? - .replace_all(&contents, "#[derive($d, Serialize, Deserialize)] pub $s") - ); +fn main() -> Result<(), Error> { + webrtc::build_if_necessary()?; + let (include_dirs, lib_dirs) = webrtc::get_build_paths()?; - File::create(&binding_file)?.write_all(new_contents.as_bytes())?; + for dir in &lib_dirs { + println!("cargo:rustc-link-search=native={}", dir.display()); + } - Ok(()) -} + if cfg!(feature = "bundled") { + println!("cargo:rustc-link-lib=static=webrtc-audio-processing-1"); + } else { + println!("cargo:rustc-link-lib=dylib=webrtc-audio-processing-1"); + } -fn main() -> Result<(), Error> { - webrtc::build_if_necessary()?; - let (webrtc_include, webrtc_lib) = webrtc::get_build_paths()?; + if cfg!(target_os = "macos") { + // TODO: Remove after confirming this is not necessary. + //println!("cargo:rustc-link-lib=dylib=c++"); + println!("cargo:rustc-link-lib=framework=CoreFoundation"); + } else { + // TODO: Remove after confirming this is not necessary. 
+ //println!("cargo:rustc-link-lib=dylib=stdc++"); + } let mut cc_build = cc::Build::new(); @@ -182,48 +158,40 @@ fn main() -> Result<(), Error> { cc_build .cpp(true) .file("src/wrapper.cpp") - .include(&webrtc_include) + .includes(&include_dirs) + .flag("-std=c++17") .flag("-Wno-unused-parameter") .flag("-Wno-deprecated-declarations") - .flag("-std=c++11") .out_dir(&out_dir()) .compile("webrtc_audio_processing_wrapper"); - println!("cargo:rustc-link-search=native={}", webrtc_lib.display()); println!("cargo:rustc-link-lib=static=webrtc_audio_processing_wrapper"); - println!("cargo:rerun-if-env-changed={}", DEPLOYMENT_TARGET_VAR); - - if cfg!(feature = "bundled") { - println!("cargo:rustc-link-lib=static=webrtc_audio_processing"); - } else { - println!("cargo:rustc-link-lib=dylib=webrtc_audio_processing"); - } - - if cfg!(target_os = "macos") { - println!("cargo:rustc-link-lib=dylib=c++"); - } else { - println!("cargo:rustc-link-lib=dylib=stdc++"); - } - let binding_file = out_dir().join("bindings.rs"); - bindgen::Builder::default() + let mut builder = bindgen::Builder::default() .header("src/wrapper.hpp") + .clang_args(&["-x", "c++", "-std=c++17", "-fparse-all-comments"]) .generate_comments(true) - .rustified_enum(".*") + .enable_cxx_namespaces() + .allowlist_type("webrtc::AudioProcessing_Error") + .allowlist_type("webrtc::AudioProcessing_Config") + .allowlist_type("webrtc::AudioProcessing_RealtimeSetting") + .allowlist_type("webrtc::StreamConfig") + .allowlist_type("webrtc::ProcessingConfig") + .allowlist_function("webrtc_audio_processing_wrapper::.*") + // The functions returns std::string, and is not FFI-safe. 
+ .blocklist_item("webrtc::AudioProcessing_Config_ToString") + .opaque_type("std::.*") .derive_debug(true) - .derive_default(true) - .derive_partialeq(true) - .clang_arg(&format!("-I{}", &webrtc_include.display())) - .disable_name_namespacing() + .derive_default(true); + for dir in &include_dirs { + builder = builder.clang_arg(&format!("-I{}", dir.display())); + } + builder .generate() .expect("Unable to generate bindings") .write_to_file(&binding_file) .expect("Couldn't write bindings!"); - if cfg!(feature = "derive_serde") { - derive_serde(&binding_file).expect("Failed to modify derive macros"); - } - Ok(()) } diff --git a/webrtc-audio-processing-sys/src/lib.rs b/webrtc-audio-processing-sys/src/lib.rs index f18760b..9090dbd 100644 --- a/webrtc-audio-processing-sys/src/lib.rs +++ b/webrtc-audio-processing-sys/src/lib.rs @@ -1,65 +1,39 @@ #![allow(non_upper_case_globals)] #![allow(non_camel_case_types)] #![allow(non_snake_case)] +// https://github.com/rust-lang/rust-bindgen/issues/1651 +#![allow(deref_nullptr)] include!(concat!(env!("OUT_DIR"), "/bindings.rs")); -impl Into> for OptionalBool { - fn into(self) -> Option { - if self.has_value { - Some(self.value) - } else { - None - } - } -} +pub use root::{webrtc::*, webrtc_audio_processing_wrapper::*}; -impl From> for OptionalBool { - fn from(other: Option) -> OptionalBool { - if let Some(value) = other { - OptionalBool { has_value: true, value } - } else { - OptionalBool { has_value: false, value: false } - } - } -} - -impl Into> for OptionalInt { - fn into(self) -> Option { - if self.has_value { - Some(self.value) +impl From for Option { + fn from(other: OptionalBool) -> Option { + if other.has_value { + Some(other.value) } else { None } } } -impl From> for OptionalInt { - fn from(other: Option) -> OptionalInt { - if let Some(value) = other { - OptionalInt { has_value: true, value } - } else { - OptionalInt { has_value: false, value: 0 } - } - } -} - -impl Into> for OptionalDouble { - fn into(self) -> Option { - 
if self.has_value { - Some(self.value) +impl From for Option { + fn from(other: OptionalInt) -> Option { + if other.has_value { + Some(other.value) } else { None } } } -impl From> for OptionalDouble { - fn from(other: Option) -> OptionalDouble { - if let Some(value) = other { - OptionalDouble { has_value: true, value } +impl From for Option { + fn from(other: OptionalDouble) -> Option { + if other.has_value { + Some(other.value) } else { - OptionalDouble { has_value: false, value: 0.0 } + None } } } @@ -68,48 +42,74 @@ impl From> for OptionalDouble { mod tests { use super::*; - fn init_config_with_all_enabled() -> InitializationConfig { - InitializationConfig { - num_capture_channels: 1, - num_render_channels: 1, - enable_experimental_agc: true, - enable_intelligibility_enhancer: true, - } - } + const SAMPLE_RATE_HZ: i32 = 48_000; - fn config_with_all_enabled() -> Config { - Config { - echo_cancellation: EchoCancellation { - enable: true, - suppression_level: EchoCancellation_SuppressionLevel::HIGH, + fn config_with_all_enabled() -> AudioProcessing_Config { + AudioProcessing_Config { + pipeline: AudioProcessing_Config_Pipeline { + maximum_internal_processing_rate: SAMPLE_RATE_HZ, + ..Default::default() + }, + pre_amplifier: AudioProcessing_Config_PreAmplifier { + enabled: true, + ..Default::default() + }, + high_pass_filter: AudioProcessing_Config_HighPassFilter { + enabled: true, + ..Default::default() + }, + echo_canceller: AudioProcessing_Config_EchoCanceller { + enabled: true, + ..Default::default() + }, + noise_suppression: AudioProcessing_Config_NoiseSuppression { + enabled: true, + ..Default::default() + }, + transient_suppression: AudioProcessing_Config_TransientSuppression { + enabled: true, + ..Default::default() + }, + voice_detection: AudioProcessing_Config_VoiceDetection { + enabled: true, + ..Default::default() + }, + gain_controller1: AudioProcessing_Config_GainController1 { + enabled: true, + mode: 
AudioProcessing_Config_GainController1_Mode_kAdaptiveDigital, + analog_gain_controller: + AudioProcessing_Config_GainController1_AnalogGainController { + enabled: false, + ..Default::default() + }, + ..Default::default() }, - gain_control: GainControl { - enable: true, - target_level_dbfs: 3, - compression_gain_db: 3, - enable_limiter: true, + gain_controller2: AudioProcessing_Config_GainController2 { + enabled: false, + ..Default::default() }, - noise_suppression: NoiseSuppression { - enable: true, - suppression_level: NoiseSuppression_SuppressionLevel::HIGH, + residual_echo_detector: AudioProcessing_Config_ResidualEchoDetector { + enabled: true, + ..Default::default() }, - voice_detection: VoiceDetection { - enable: true, - detection_likelihood: VoiceDetection_DetectionLikelihood::HIGH, + level_estimation: AudioProcessing_Config_LevelEstimation { + enabled: true, + ..Default::default() }, - enable_extended_filter: true, - enable_delay_agnostic: true, - enable_transient_suppressor: true, - enable_high_pass_filter: true, + } + } + + fn assert_success(code: i32) { + unsafe { + assert!(is_success(code), "code={}", code); } } #[test] fn test_create_failure() { unsafe { - let config = InitializationConfig::default(); let mut error = 0; - let ap = audio_processing_create(&config, &mut error); + let ap = audio_processing_create(0, 0, SAMPLE_RATE_HZ, &mut error); assert!(ap.is_null()); assert!(!is_success(error)); } @@ -118,15 +118,10 @@ mod tests { #[test] fn test_create_delete() { unsafe { - let config = InitializationConfig { - num_capture_channels: 1, - num_render_channels: 1, - ..InitializationConfig::default() - }; let mut error = 0; - let ap = audio_processing_create(&config, &mut error); + let ap = audio_processing_create(1, 1, SAMPLE_RATE_HZ, &mut error); assert!(!ap.is_null()); - assert!(is_success(error)); + assert_success(error); audio_processing_delete(ap); } } @@ -135,11 +130,11 @@ mod tests { fn test_config() { unsafe { let mut error = 0; - let ap = 
audio_processing_create(&init_config_with_all_enabled(), &mut error); + let ap = audio_processing_create(1, 1, SAMPLE_RATE_HZ, &mut error); assert!(!ap.is_null()); - assert!(is_success(error)); + assert_success(error); - let config = Config::default(); + let config = AudioProcessing_Config::default(); set_config(ap, &config); let config = config_with_all_enabled(); @@ -153,17 +148,18 @@ mod tests { fn test_process() { unsafe { let mut error = 0; - let ap = audio_processing_create(&init_config_with_all_enabled(), &mut error); + let ap = audio_processing_create(1, 1, SAMPLE_RATE_HZ, &mut error); assert!(!ap.is_null()); - assert!(is_success(error)); + assert_success(error); let config = config_with_all_enabled(); set_config(ap, &config); - let mut frame = vec![vec![0f32; NUM_SAMPLES_PER_FRAME as usize]; 1]; + let num_samples = get_num_samples_per_frame(ap); + let mut frame = vec![vec![0f32; num_samples as usize]; 1]; let mut frame_ptr = frame.iter_mut().map(|v| v.as_mut_ptr()).collect::>(); - assert!(is_success(process_render_frame(ap, frame_ptr.as_mut_ptr()))); - assert!(is_success(process_capture_frame(ap, frame_ptr.as_mut_ptr()))); + assert_success(process_render_frame(ap, frame_ptr.as_mut_ptr())); + assert_success(process_capture_frame(ap, frame_ptr.as_mut_ptr())); audio_processing_delete(ap); } @@ -172,29 +168,23 @@ mod tests { #[test] fn test_empty_stats() { unsafe { - let config = InitializationConfig { - num_capture_channels: 1, - num_render_channels: 1, - ..InitializationConfig::default() - }; let mut error = 0; - let ap = audio_processing_create(&config, &mut error); + let ap = audio_processing_create(1, 1, SAMPLE_RATE_HZ, &mut error); assert!(!ap.is_null()); - assert!(is_success(error)); + assert_success(error); let stats = get_stats(ap); println!("Stats:\n{:#?}", stats); - assert!(!stats.has_voice.has_value); - assert!(!stats.has_echo.has_value); - assert!(!stats.rms_dbfs.has_value); - assert!(!stats.speech_probability.has_value); - 
assert!(!stats.residual_echo_return_loss.has_value); + assert!(!stats.output_rms_dbfs.has_value); + assert!(!stats.voice_detected.has_value); assert!(!stats.echo_return_loss.has_value); assert!(!stats.echo_return_loss_enhancement.has_value); - assert!(!stats.a_nlp.has_value); + assert!(!stats.divergent_filter_fraction.has_value); assert!(!stats.delay_median_ms.has_value); assert!(!stats.delay_standard_deviation_ms.has_value); - assert!(!stats.delay_fraction_poor_delays.has_value); + assert!(!stats.residual_echo_likelihood.has_value); + assert!(!stats.residual_echo_likelihood_recent_max.has_value); + assert!(!stats.delay_ms.has_value); audio_processing_delete(ap); } @@ -204,30 +194,33 @@ mod tests { fn test_some_stats() { unsafe { let mut error = 0; - let ap = audio_processing_create(&init_config_with_all_enabled(), &mut error); + let ap = audio_processing_create(1, 1, SAMPLE_RATE_HZ, &mut error); assert!(!ap.is_null()); - assert!(is_success(error)); + assert_success(error); let config = config_with_all_enabled(); set_config(ap, &config); - let mut frame = vec![vec![0f32; NUM_SAMPLES_PER_FRAME as usize]; 1]; + let num_samples = get_num_samples_per_frame(ap); + let mut frame = vec![vec![0f32; num_samples as usize]; 1]; let mut frame_ptr = frame.iter_mut().map(|v| v.as_mut_ptr()).collect::>(); - assert!(is_success(process_render_frame(ap, frame_ptr.as_mut_ptr()))); - assert!(is_success(process_capture_frame(ap, frame_ptr.as_mut_ptr()))); + assert_success(process_render_frame(ap, frame_ptr.as_mut_ptr())); + assert_success(process_capture_frame(ap, frame_ptr.as_mut_ptr())); + let stats = get_stats(ap); println!("Stats:\n{:#?}", stats); - assert!(stats.has_voice.has_value); - assert!(stats.has_echo.has_value); - assert!(stats.rms_dbfs.has_value); - assert!(stats.speech_probability.has_value); - assert!(stats.residual_echo_return_loss.has_value); + assert!(stats.output_rms_dbfs.has_value); + assert!(stats.voice_detected.has_value); 
assert!(stats.echo_return_loss.has_value); assert!(stats.echo_return_loss_enhancement.has_value); - assert!(stats.a_nlp.has_value); - assert!(stats.delay_median_ms.has_value); - assert!(stats.delay_standard_deviation_ms.has_value); - assert!(stats.delay_fraction_poor_delays.has_value); + assert!(stats.residual_echo_likelihood.has_value); + assert!(stats.residual_echo_likelihood_recent_max.has_value); + assert!(stats.delay_ms.has_value); + + // TODO: Investigate why these stats are not filled. + assert!(!stats.divergent_filter_fraction.has_value); + assert!(!stats.delay_median_ms.has_value); + assert!(!stats.delay_standard_deviation_ms.has_value); audio_processing_delete(ap); } diff --git a/webrtc-audio-processing-sys/src/wrapper.cpp b/webrtc-audio-processing-sys/src/wrapper.cpp index 59af81e..46d25a8 100644 --- a/webrtc-audio-processing-sys/src/wrapper.cpp +++ b/webrtc-audio-processing-sys/src/wrapper.cpp @@ -1,40 +1,31 @@ -// TODO(ryo): Add TraceCallback. - #include "wrapper.hpp" #include #include #define WEBRTC_POSIX -#define WEBRTC_AUDIO_PROCESSING_ONLY_BUILD - -#include -#include -namespace webrtc_audio_processing { +namespace webrtc_audio_processing_wrapper { namespace { -// This is the default that Chromium uses. 
-const int AGC_STARTUP_MIN_VOLUME = 85; - -OptionalDouble make_optional_double(const double value) { +OptionalDouble from_absl_optional(const absl::optional& optional) { OptionalDouble rv; - rv.has_value = true; - rv.value = value; + rv.has_value = optional.has_value(); + rv.value = optional.value_or(0.0); return rv; } -OptionalInt make_optional_int(const int value) { +OptionalInt from_absl_optional(const absl::optional& optional) { OptionalInt rv; - rv.has_value = true; - rv.value = value; + rv.has_value = optional.has_value(); + rv.value = optional.value_or(0); return rv; } -OptionalBool make_optional_bool(const bool value) { +OptionalBool from_absl_optional(const absl::optional& optional) { OptionalBool rv; - rv.has_value = true; - rv.value = value; + rv.has_value = optional.has_value(); + rv.value = optional.value_or(false); return rv; } @@ -42,39 +33,32 @@ OptionalBool make_optional_bool(const bool value) { struct AudioProcessing { std::unique_ptr processor; + webrtc::AudioProcessing::Config config; webrtc::StreamConfig capture_stream_config; webrtc::StreamConfig render_stream_config; - OptionalInt stream_delay_ms; + absl::optional stream_delay_ms; }; AudioProcessing* audio_processing_create( - const InitializationConfig& init_config, + int num_capture_channels, + int num_render_channels, + int sample_rate_hz, int* error) { - webrtc::Config config; - if (init_config.enable_experimental_agc) { - config.Set( - new webrtc::ExperimentalAgc(true, AGC_STARTUP_MIN_VOLUME)); - } - if (init_config.enable_intelligibility_enhancer) { - config.Set(new webrtc::Intelligibility(true)); - } - // TODO(ryo): Experiment with the webrtc's builtin beamformer. There are some - // preconditions; see |ec_fixate_spec()| in the pulseaudio's example. 
- AudioProcessing* ap = new AudioProcessing; - ap->processor.reset(webrtc::AudioProcessing::Create(config)); + ap->processor.reset(webrtc::AudioProcessingBuilder().Create()); const bool has_keyboard = false; ap->capture_stream_config = webrtc::StreamConfig( - SAMPLE_RATE_HZ, init_config.num_capture_channels, has_keyboard); + sample_rate_hz, num_capture_channels, has_keyboard); ap->render_stream_config = webrtc::StreamConfig( - SAMPLE_RATE_HZ, init_config.num_render_channels, has_keyboard); + sample_rate_hz, num_render_channels, has_keyboard); + // The input and output streams must have the same number of channels. webrtc::ProcessingConfig pconfig = { - ap->capture_stream_config, - ap->capture_stream_config, - ap->render_stream_config, - ap->render_stream_config, + ap->capture_stream_config, // capture input + ap->capture_stream_config, // capture output + ap->render_stream_config, // render input + ap->render_stream_config, // render output }; const int code = ap->processor->Initialize(pconfig); if (code != webrtc::AudioProcessing::kNoError) { @@ -86,15 +70,17 @@ AudioProcessing* audio_processing_create( return ap; } -int process_capture_frame(AudioProcessing* ap, float** channels) { - auto* p = ap->processor.get(); +void initialize(AudioProcessing* ap) { + ap->processor->Initialize(); +} - if (p->echo_cancellation()->is_enabled()) { - p->set_stream_delay_ms( - ap->stream_delay_ms.has_value ? 
ap->stream_delay_ms.value : 0); +int process_capture_frame(AudioProcessing* ap, float** channels) { + if (ap->config.echo_canceller.enabled) { + ap->processor->set_stream_delay_ms( + ap->stream_delay_ms.value_or(0)); } - return p->ProcessStream( + return ap->processor->ProcessStream( channels, ap->capture_stream_config, ap->capture_stream_config, channels); } @@ -104,129 +90,38 @@ int process_render_frame(AudioProcessing* ap, float** channels) { } Stats get_stats(AudioProcessing* ap) { - auto* p = ap->processor.get(); - - Stats stats; - if (p->voice_detection()->is_enabled()) { - stats.has_voice = - make_optional_bool(p->voice_detection()->stream_has_voice()); - } - if (p->echo_cancellation()->is_enabled()) { - stats.has_echo = - make_optional_bool(p->echo_cancellation()->stream_has_echo()); - } - if (p->level_estimator()->is_enabled()) { - stats.rms_dbfs = make_optional_int(-1 * p->level_estimator()->RMS()); - } - if (p->noise_suppression()->is_enabled()) { - if (p->noise_suppression()->speech_probability() - != webrtc::AudioProcessing::kUnsupportedFunctionError) { - stats.speech_probability = - make_optional_double(p->noise_suppression()->speech_probability()); - } - // TODO(ryo): NoiseSuppression supports NoiseEstimate function in the latest - // master. - } - - // TODO(ryo): AudioProcessing supports useful GetStatistics function in the - // latest master. 
- if (p->echo_cancellation()->is_enabled()) { - webrtc::EchoCancellation::Metrics metrics; - if (p->echo_cancellation()->GetMetrics(&metrics) - == webrtc::AudioProcessing::kNoError) { - stats.residual_echo_return_loss = - make_optional_double(metrics.residual_echo_return_loss.instant); - stats.echo_return_loss = - make_optional_double(metrics.echo_return_loss.instant); - stats.echo_return_loss_enhancement = - make_optional_double(metrics.echo_return_loss_enhancement.instant); - stats.a_nlp = make_optional_double(metrics.a_nlp.instant); - } - - int delay_median_ms = -1; - int delay_stddev_ms = -1; - float fraction_poor_delays = -1; - if (p->echo_cancellation()->GetDelayMetrics( - &delay_median_ms, &delay_stddev_ms, &fraction_poor_delays) - == webrtc::AudioProcessing::kNoError) { - stats.delay_median_ms = make_optional_int(delay_median_ms); - stats.delay_standard_deviation_ms = make_optional_int(delay_stddev_ms); - stats.delay_fraction_poor_delays = - make_optional_double(fraction_poor_delays); - } - } - - return stats; -} - -void set_config(AudioProcessing* ap, const Config& config) { - auto* p = ap->processor.get(); - - webrtc::Config extra_config; - extra_config.Set( - new webrtc::ExtendedFilter( - config.echo_cancellation.enable_extended_filter)); - extra_config.Set( - new webrtc::DelayAgnostic( - !config.echo_cancellation.stream_delay_ms.has_value && - config.echo_cancellation.enable_delay_agnostic)); - extra_config.Set( - new webrtc::ExperimentalNs(config.enable_transient_suppressor)); - // TODO(ryo): There is a new RefinedAdaptiveFilter in the latest master. - p->SetExtraOptions(extra_config); - - // TODO(ryo): Look into EchoCanceller3. - if (config.echo_cancellation.enable) { - ap->stream_delay_ms = config.echo_cancellation.stream_delay_ms; - // According to the webrtc documentation, drift compensation should not be - // necessary as long as we are using the same audio device for input and - // output. 
- p->echo_cancellation()->enable_drift_compensation(false); - p->echo_cancellation()->enable_metrics(true); - p->echo_cancellation()->enable_delay_logging(true); - p->echo_cancellation()->set_suppression_level( - static_cast( - config.echo_cancellation.suppression_level)); - p->echo_cancellation()->Enable(true); - } else { - p->echo_cancellation()->Enable(false); - } - - if (config.gain_control.enable) { - p->gain_control()->set_mode( - static_cast(config.gain_control.mode)); - p->gain_control()->set_target_level_dbfs( - config.gain_control.target_level_dbfs); - p->gain_control()->set_compression_gain_db( - config.gain_control.compression_gain_db); - p->gain_control()->enable_limiter(config.gain_control.enable_limiter); - p->gain_control()->Enable(true); - } else { - p->gain_control()->Enable(false); - } + const webrtc::AudioProcessingStats& stats = ap->processor->GetStatistics(); + + return Stats { + from_absl_optional(stats.output_rms_dbfs), + from_absl_optional(stats.voice_detected), + from_absl_optional(stats.echo_return_loss), + from_absl_optional(stats.echo_return_loss_enhancement), + from_absl_optional(stats.divergent_filter_fraction), + from_absl_optional(stats.delay_median_ms), + from_absl_optional(stats.delay_standard_deviation_ms), + from_absl_optional(stats.residual_echo_likelihood), + from_absl_optional(stats.residual_echo_likelihood_recent_max), + from_absl_optional(stats.delay_ms), + }; +} - if (config.noise_suppression.enable) { - p->noise_suppression()->set_level( - static_cast( - config.noise_suppression.suppression_level)); - p->noise_suppression()->Enable(true); - } else { - p->noise_suppression()->Enable(false); - } +int get_num_samples_per_frame(AudioProcessing* ap) { + return ap->capture_stream_config.sample_rate_hz() * webrtc::AudioProcessing::kChunkSizeMs / 1000; +} - if (config.voice_detection.enable) { - p->voice_detection()->set_likelihood( - static_cast( - config.voice_detection.detection_likelihood)); - 
p->voice_detection()->set_frame_size_ms(FRAME_MS); - p->voice_detection()->Enable(true); - } else { - p->voice_detection()->Enable(false); - } +void set_config(AudioProcessing* ap, const webrtc::AudioProcessing::Config& config) { + ap->config = config; + ap->processor->ApplyConfig(config); +} - p->high_pass_filter()->Enable(config.enable_high_pass_filter); +void set_runtime_setting(AudioProcessing* ap, webrtc::AudioProcessing::RuntimeSetting setting) { + ap->processor->SetRuntimeSetting(setting); +} - p->level_estimator()->Enable(true); +void set_stream_delay_ms(AudioProcessing* ap, int delay) { + // TODO: Need to mutex lock. + ap->stream_delay_ms = delay; } void set_output_will_be_muted(AudioProcessing* ap, bool muted) { @@ -245,4 +140,4 @@ bool is_success(const int code) { return code == webrtc::AudioProcessing::kNoError; } -} // namespace webrtc_audio_processing +} // namespace webrtc_audio_processing_wrapper diff --git a/webrtc-audio-processing-sys/src/wrapper.hpp b/webrtc-audio-processing-sys/src/wrapper.hpp index 63ecf7b..9521a6b 100644 --- a/webrtc-audio-processing-sys/src/wrapper.hpp +++ b/webrtc-audio-processing-sys/src/wrapper.hpp @@ -1,20 +1,13 @@ // This is a c++ header file, but we are using minimal c++ constructs and not // including any complex header files to keep Rust interoperability simple. +// The provided functions are thread-safe. +// +// TODO: Add support for AEC dump. webrtc-audio-processing library does not include TaskQueue +// implementation, which is needed. -#ifndef WEBRTC_AUDIO_PROCESSING_WRAPPER_HPP_ -#define WEBRTC_AUDIO_PROCESSING_WRAPPER_HPP_ +#include -namespace webrtc_audio_processing { - -// AudioProcessing accepts only one of 48000, 32000, 16000, and 8000 hz. -// TODO: support multiple sample rates. -const int SAMPLE_RATE_HZ = 48000; - -// AudioProcessing expects each frame to be of fixed 10 ms. -const int FRAME_MS = 10; - -///
The number of expected samples per frame.
-const int NUM_SAMPLES_PER_FRAME = SAMPLE_RATE_HZ * FRAME_MS / 1000; +namespace webrtc_audio_processing_wrapper { struct AudioProcessing; @@ -33,278 +26,76 @@ struct OptionalBool { bool value = false; }; -///
A configuration used only when initializing a Processor.
-struct InitializationConfig { - int num_capture_channels; - int num_render_channels; - - // TODO: Investigate how it's different from the default gain control and the effect of using the two at the same time. - bool enable_experimental_agc; - - bool enable_intelligibility_enhancer; -}; - -///
Echo cancellation configuration.
-struct EchoCancellation { - ///
Whether to use echo cancellation.
- bool enable; - - ///
A level of echo suppression.
- enum SuppressionLevel { - LOWEST, - LOWER, - LOW, - MODERATE, - HIGH, - }; - - ///
- /// Determines the aggressiveness of the suppressor. A higher level trades off - /// double-talk performance for increased echo suppression. - ///
- SuppressionLevel suppression_level; - - ///
- /// Use to enable the extended filter mode in the AEC, along with robustness - /// measures around the reported system delays. It comes with a significant - /// increase in AEC complexity, but is much more robust to unreliable reported - /// delays. - ///
- bool enable_extended_filter; - - ///
- /// Enables delay-agnostic echo cancellation. This feature relies on internally - /// estimated delays between the process and reverse streams, thus not relying - /// on reported system delays. - ///
- bool enable_delay_agnostic; - - ///
- /// Sets the delay in ms between process_render_frame() receiving a far-end - /// frame and process_capture_frame() receiving a near-end frame containing - /// the corresponding echo. You should set this only if you are certain that - /// the delay will be stable and constant. enable_delay_agnostic will be - /// ignored when this option is set. - ///
- OptionalInt stream_delay_ms; -}; - -///
Gain control configuration.
-struct GainControl { - ///
Whether to use gain control.
- bool enable; - - ///
Mode of gain control.
- enum Mode { - ///
Not supported yet.
- /// TODO(skywhale): Expose set_stream_analog_level() and - /// stream_analog_level(). - ADAPTIVE_ANALOG, - - ///
- /// Bring the signal to an appropriate range by applying an adaptive gain - /// control. The volume is dynamically amplified with a microphone with - /// small pickup and vice versa. - ///
- ADAPTIVE_DIGITAL, - - ///
- /// Unlike ADAPTIVE_DIGITAL, it only compresses (i.e. gradually reduces - /// gain with increasing level) the input signal when at higher levels. - /// Use this where the capture signal level is predictable, so that a - /// known gain can be applied. - ///
- FIXED_DIGITAL, - }; - - ///
Determines what type of gain control is applied.
- Mode mode; - - ///
- /// Sets the target peak level (or envelope) of the AGC in dBFs (decibels from - /// digital full-scale). The convention is to use positive values. - /// For instance, passing in a value of 3 corresponds to -3 dBFs, or a target - /// level 3 dB below full-scale. Limited to [0, 31]. - ///
- int target_level_dbfs; - - ///
- /// Sets the maximum gain the digital compression stage may apply, in dB. A - /// higher number corresponds to greater compression, while a value of 0 will - /// leave the signal uncompressed. Limited to [0, 90]. - ///
- int compression_gain_db; - - ///
- /// When enabled, the compression stage will hard limit the signal to the - /// target level. Otherwise, the signal will be compressed but not limited - /// above the target level. - ///
- bool enable_limiter; -}; - -///
Noise suppression configuration.
-struct NoiseSuppression { - ///
Whether to use noise supression.
- bool enable; - - ///
A level of noise suppression.
- enum SuppressionLevel { - LOW, - MODERATE, - HIGH, - VERY_HIGH, - }; - - ///
- /// Determines the aggressiveness of the suppression. Increasing the level will - /// reduce the noise level at the expense of a higher speech distortion. - ///
- SuppressionLevel suppression_level; -}; - -///
Voice detection configuration.
-struct VoiceDetection { - ///
Whether to use voice detection.
- bool enable; - - ///
The sensitivity of the noise detector.
- enum DetectionLikelihood { - VERY_LOW, - LOW, - MODERATE, - HIGH, - }; - - ///
- /// Specifies the likelihood that a frame will be declared to contain voice. A - /// higher value makes it more likely that speech will not be clipped, at the - /// expense of more noise being detected as voice. - ///
- DetectionLikelihood detection_likelihood; -}; - -///
Config that can be used mid-processing.
-struct Config { - EchoCancellation echo_cancellation; - GainControl gain_control; - NoiseSuppression noise_suppression; - VoiceDetection voice_detection; - - ///
- /// Use to enable experimental transient noise suppression. - ///
- bool enable_transient_suppressor; - - ///
- /// Use to enable a filtering component which removes DC offset and - /// low-frequency noise. - ///
- bool enable_high_pass_filter; -}; - -///
Statistics about the processor state.
+// A variant of AudioProcessingStats without absl::optional dependency, +// which can not be bindgen-ed. struct Stats { - ///
- /// True if voice is detected in the current frame. - ///
- OptionalBool has_voice; - - ///
- /// False if the current frame almost certainly contains no echo and true if it - /// _might_ contain echo. - ///
- OptionalBool has_echo; - - ///
- /// Root mean square (RMS) level in dBFs (decibels from digital full-scale), or - /// alternately dBov. It is computed over all primary stream frames since the - /// last call to |get_stats()|. The returned value is constrained to [-127, 0], - /// where -127 indicates muted. - ///
- OptionalInt rms_dbfs; - - ///
- /// Prior speech probability of the current frame averaged over output - /// channels, internally computed by noise suppressor. - ///
- OptionalDouble speech_probability; - - ///
- /// RERL = ERL + ERLE - ///
- OptionalDouble residual_echo_return_loss; - - ///
- /// ERL = 10log_10(P_far / P_echo) - ///
+ OptionalInt output_rms_dbfs; + OptionalBool voice_detected; OptionalDouble echo_return_loss; - - ///
- /// ERLE = 10log_10(P_echo / P_out) - ///
OptionalDouble echo_return_loss_enhancement; - - ///
- /// (Pre non-linear processing suppression) A_NLP = 10log_10(P_echo / P_a) - ///
- OptionalDouble a_nlp; - - ///
- /// Median of the measured delay in ms. The values are aggregated until the - /// first call to |get_stats()| and afterwards aggregated and updated every - /// second. - ///
+ OptionalDouble divergent_filter_fraction; OptionalInt delay_median_ms; - - ///
- /// Standard deviation of the measured delay in ms. The values are aggregated - /// until the first call to |get_stats()| and afterwards aggregated and updated - /// every second. - ///
OptionalInt delay_standard_deviation_ms; - - ///
- /// The fraction of delay estimates that can make the echo cancellation perform - /// poorly. - ///
- OptionalDouble delay_fraction_poor_delays; + OptionalDouble residual_echo_likelihood; + OptionalDouble residual_echo_likelihood_recent_max; + OptionalInt delay_ms; }; -// Creates a new instance of the signal processor. -AudioProcessing* audio_processing_create(const InitializationConfig& init_config, int* error); +// Creates a new instance of AudioProcessing. +AudioProcessing* audio_processing_create( + int num_capture_channels, + int num_render_channels, + int sample_rate_hz, + int* error); -// Processes and modifies the audio frame from a capture device. Each element in -// |channels| is an array of float representing a single-channel frame of 10 ms -// length. Returns an error code or |kNoError|. +// Processes and modifies the audio frame from a capture device. +// Each element in |channels| is an array of float representing a single-channel frame of 10 ms +// length (i.e. deinterleaved). Returns an error code or |kNoError|. int process_capture_frame(AudioProcessing* ap, float** channels); // Processes and optionally modifies the audio frame from a playback device. -// Each element in |channels| is an array of float representing a single-channel -// frame of 10 ms length. Returns an error code or |kNoError|. +// Each element in |channels| is an array of float representing a single-channel frame of 10 ms +// length (i.e. deinterleaved). Returns an error code or |kNoError|. int process_render_frame(AudioProcessing* ap, float** channel3); // Returns statistics from the last |process_capture_frame()| call. Stats get_stats(AudioProcessing* ap); -// Immediately updates the configurations of the signal processor. -// May be called multiple times after the initialization and during processing. -void set_config(AudioProcessing* ap, const Config& config); +// Returns the number of samples per frame based on the current configuration of sample rate and the +// frame chunk size. As of 2021/08/21, the chunk size is fixed to 10ms. 
+int get_num_samples_per_frame(AudioProcessing* ap);
 
-// Signals the AEC and AGC that the audio output will be / is muted.
-// They may use the hint to improve their parameter adaptation.
+// Immediately updates the configurations of the signal processor.
+// This config is intended to be used during setup, and to enable/disable top-level processing
+// effects. Use during processing may cause undesired submodule resets, affecting the audio quality.
+// Use the RuntimeSetting construct for runtime configuration.
+void set_config(AudioProcessing* ap, const webrtc::AudioProcessing::Config& config);
+
+// Enqueues a runtime setting.
+void set_runtime_setting(AudioProcessing* ap, webrtc::AudioProcessing::RuntimeSetting setting);
+
+// Sets the |delay| in ms between process_render_frame() receiving a far-end frame and
+// process_capture_frame() receiving a near-end frame containing the corresponding echo.
+// It assumes that there is no such delay if this function is not called.
+void set_stream_delay_ms(AudioProcessing* ap, int delay);
+
+// Set to true when the output of AudioProcessing will be muted or in some other way not used.
+// Ideally, the captured audio would still be processed, but some components may change behavior
+// based on this information.
 void set_output_will_be_muted(AudioProcessing* ap, bool muted);
 
 /// Signals the AEC and AGC that the next frame will contain key press sound
 void set_stream_key_pressed(AudioProcessing* ap, bool pressed);
 
-// Every processor created by |audio_processing_create()| needs to destroyed by
-// this function.
+// Initializes internal states, while retaining all user settings. This should be called before
+// beginning to process a new audio stream. However, it is not necessary to call before processing
+// the first stream after creation.
+void initialize(AudioProcessing* ap);
+
+// Every AudioProcessing created by |audio_processing_create()| needs to be destroyed by this function.
void audio_processing_delete(AudioProcessing* ap); // Returns true iff the code indicates a successful operation. bool is_success(int code); -} // namespace webrtc_audio_processing - -#endif // WEBRTC_AUDIO_PROCESSING_WRAPPER_HPP_ +} // namespace webrtc_audio_processing_wrapper diff --git a/webrtc-audio-processing-sys/webrtc-audio-processing b/webrtc-audio-processing-sys/webrtc-audio-processing index 967d06c..8e258a1 160000 --- a/webrtc-audio-processing-sys/webrtc-audio-processing +++ b/webrtc-audio-processing-sys/webrtc-audio-processing @@ -1 +1 @@ -Subproject commit 967d06c24e9bc74b1b28a330a8b37fbf584b0162 +Subproject commit 8e258a1933d405073c9e6465628a69ac7d2a1f13