Started milestone 2

sam
2026-05-03 15:53:25 +02:00
parent 43483c2145
commit 041955345b
18 changed files with 4616 additions and 60 deletions

View File

@@ -6,9 +6,17 @@ edition = "2024"
[dependencies]
anyhow = "1.0.102"
core_protocol = { version = "0.1.0", path = "../core_protocol" }
cpal = "0.17.3"
eframe = "0.34.1"
egui = "0.34.1"
futures = "0.3.32"
hound = "3.5.1"
rdev = "0.5.3"
ringbuf = "0.5.0"
tokio = { version = "1.52.1", features = ["rt-multi-thread", "net", "macros"] }
tokio-serde = { version = "0.9.0", features = ["bincode"] }
tokio-util = { version = "0.7.18", features = ["codec"] }
tracing = "0.1.44"
tracing-subscriber = "0.3.23"
webrtc-audio-processing = { version = "2.0.4", features = ["bundled"] }
webrtc-audio-processing-config = "2.0.4"

View File

@@ -0,0 +1,59 @@
//! Headless and hardware-backed audio capture logic.
//!
//! This module abstracts `cpal` to capture microphone input cleanly,
//! pushing raw samples directly to a lock-free ringbuffer.
//!
//! The `cpal` data callback is kept strictly lock-free and allocation-free
//! to satisfy real-time audio constraints.
use anyhow::{Result, anyhow};
use cpal::StreamConfig;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use ringbuf::HeapProd;
use ringbuf::traits::Producer;
use tracing::{error, info};
use super::{INPUT_CHANNELS, SAMPLE_RATE};
/// Sets up the default microphone device and returns the active cpal stream.
///
/// The stream pushes raw f32 samples into the provided ringbuffer producer.
///
/// # Errors
/// Returns an error if no input device is found or if the stream cannot be built.
pub fn start_audio_capture(mut producer: HeapProd<f32>) -> Result<cpal::Stream> {
let host = cpal::default_host();
let device = host
.default_input_device()
.ok_or_else(|| anyhow!("No default input device found"))?;
// cpal 0.17 deprecates `name()` in favor of `description()`.
let device_desc = device
.description()
.map_or_else(|_| "Unknown".to_string(), |d| d.name().to_string());
info!("Using input device: {}", device_desc);
let config = StreamConfig {
channels: INPUT_CHANNELS,
// cpal 0.17 changed SampleRate from a tuple struct to a plain u32 alias.
sample_rate: SAMPLE_RATE,
buffer_size: cpal::BufferSize::Default,
};
let stream = device.build_input_stream(
&config,
move |data: &[f32], _: &cpal::InputCallbackInfo| {
// STRICT RULE: No locks, no allocations in this callback.
// Just push samples to the ringbuffer.
let _ = producer.push_slice(data);
},
move |err| {
error!("An error occurred on the audio capture stream: {}", err);
},
None, // None = default timeout
)?;
stream.play()?;
Ok(stream)
}
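
An aside on why `let _ = producer.push_slice(data)` is real-time safe: `push_slice` writes as many samples as fit and returns the count written, never blocking or allocating; overflow is simply dropped. A minimal sketch of that contract (illustrative only, assuming the ringbuf 0.5 trait API matches the 0.4-style `Producer`/`Consumer`/`Observer` semantics):

use ringbuf::HeapRb;
use ringbuf::traits::{Consumer, Observer, Producer, Split};

fn ringbuf_overflow_demo() {
    let rb = HeapRb::<f32>::new(8);
    let (mut prod, mut cons) = rb.split();
    // Push 12 samples into an 8-slot buffer: the extra 4 are dropped and
    // the call returns immediately with the number actually written.
    let written = prod.push_slice(&[0.5_f32; 12]);
    assert_eq!(written, 8);
    assert_eq!(cons.occupied_len(), 8);
    let mut out = [0.0_f32; 8];
    assert_eq!(cons.pop_slice(&mut out), 8);
}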

View File

@@ -0,0 +1,108 @@
//! DSP and Voice Activity Detection (VAD) thread.
//!
//! Pulls audio from the lock-free ringbuffer, applies WebRTC noise suppression
//! and echo cancellation, then checks for voice activity before signalling
//! the UI via a `tokio::sync::watch` channel.
//!
//! This thread is a dedicated `std::thread` (not a Tokio task) because
//! real-time audio processing must never be at the mercy of a cooperative
//! async scheduler.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use std::time::Duration;
use ringbuf::HeapCons;
use ringbuf::traits::{Consumer, Observer};
use tokio::sync::watch;
use tracing::info;
use webrtc_audio_processing::Processor;
use webrtc_audio_processing_config::{
Config, EchoCanceller, NoiseSuppression, NoiseSuppressionLevel,
};
use super::{FRAME_SIZE, SAMPLE_RATE};
/// RMS threshold below which a frame is considered silence.
/// This provides a simple amplitude-based VAD since the WebRTC v2 API
/// removed the standalone voice detection configuration.
const VAD_RMS_THRESHOLD: f32 = 0.01;
/// Spawns the dedicated background DSP thread.
///
/// Reads 960-sample frames from the ringbuffer, applies WebRTC
/// noise suppression + echo cancellation, and updates the active
/// speaker state via the provided watch channel.
pub fn spawn_dsp_thread(
mut consumer: HeapCons<f32>,
ptt_flag: Arc<AtomicBool>,
active_speaker_tx: watch::Sender<bool>,
) {
thread::spawn(move || {
info!("DSP thread started.");
let mut ap = match Processor::new(SAMPLE_RATE) {
Ok(ap) => ap,
Err(e) => {
tracing::error!("Failed to initialize WebRTC APM: {:?}", e);
return;
}
};
let config = Config {
echo_canceller: Some(EchoCanceller::default()),
noise_suppression: Some(NoiseSuppression {
level: NoiseSuppressionLevel::High,
analyze_linear_aec_output: false,
}),
..Default::default()
};
ap.set_config(config);
// Mono capture: one channel with FRAME_SIZE samples.
let mut frame_buf = vec![vec![0.0f32; FRAME_SIZE]];
loop {
// Wait until we have a full 20ms frame (960 samples at 48kHz).
if consumer.occupied_len() >= FRAME_SIZE {
let _ = consumer.pop_slice(&mut frame_buf[0]);
let is_transmitting = ptt_flag.load(Ordering::Relaxed);
// Run the WebRTC DSP pipeline on the capture frame.
if let Err(e) = ap.process_capture_frame(&mut frame_buf) {
tracing::warn!("APM processing failed: {:?}", e);
}
// Simple RMS-based VAD since webrtc-audio-processing v2
// removed the dedicated VoiceDetection config field.
let rms = compute_rms(&frame_buf[0]);
let has_voice = rms > VAD_RMS_THRESHOLD;
let should_transmit = is_transmitting && has_voice;
// Only update the watch channel when the state actually changes
// to avoid unnecessary UI repaints.
if *active_speaker_tx.borrow() != should_transmit {
let _ = active_speaker_tx.send(should_transmit);
}
} else {
thread::sleep(Duration::from_millis(2));
}
}
});
}
/// Computes the Root Mean Square (RMS) of a sample buffer.
///
/// Used as a lightweight VAD: if the RMS is below a threshold,
/// the frame is considered silence.
fn compute_rms(samples: &[f32]) -> f32 {
if samples.is_empty() {
return 0.0;
}
let sum_sq: f32 = samples.iter().map(|s| s * s).sum();
#[allow(clippy::cast_precision_loss)] // FRAME_SIZE (960) is well within f32's 23-bit mantissa.
let divisor = samples.len() as f32;
(sum_sq / divisor).sqrt()
}
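
A quick numeric sanity check on the threshold (illustrative, not part of this commit; assumes access to the module-private `compute_rms`): a full-scale sine has RMS 1/√2 ≈ 0.707, so `VAD_RMS_THRESHOLD = 0.01` sits roughly 37 dB below a full-scale tone, well under typical speech levels.

fn vad_threshold_sanity() {
    // 500 Hz fits exactly 10 cycles into one 960-sample frame at 48 kHz,
    // so the frame's RMS is 1/sqrt(2) up to f32 rounding.
    let frame: Vec<f32> = (0..FRAME_SIZE)
        .map(|n| (2.0 * std::f32::consts::PI * 500.0 * n as f32 / 48_000.0).sin())
        .collect();
    let rms = compute_rms(&frame);
    assert!((rms - std::f32::consts::FRAC_1_SQRT_2).abs() < 1e-3);
    // A full-scale tone lands far above the silence threshold.
    assert!(rms > VAD_RMS_THRESHOLD);
}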

View File

@@ -0,0 +1,19 @@
//! Shared audio configuration constants.
//!
//! Defines the fixed sample-rate and framing constants that Opus encoding and decoding require.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
pub mod capture;
pub mod dsp;
/// The strict sample rate required across the entire DSP pipeline.
pub const SAMPLE_RATE: u32 = 48_000;
/// The number of channels for microphone capture (Mono).
pub const INPUT_CHANNELS: u16 = 1;
/// The exact number of samples required per frame for Opus (20ms).
pub const FRAME_SIZE: usize = 960;
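
These constants are mutually constrained: 960 samples is exactly 20 ms at 48 kHz (48 000 × 0.020 = 960). A compile-time guard along these lines (a possible addition, not part of this commit) would keep them in sync:

// Hypothetical guard: FRAME_SIZE must equal 20 ms of mono audio at SAMPLE_RATE.
const _: () = assert!(FRAME_SIZE == (SAMPLE_RATE as usize * 20) / 1000);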

View File

@@ -0,0 +1,34 @@
//! Global Hotkey listener.
//!
//! Uses `rdev` to capture global PTT events across the OS without requiring window focus.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
use rdev::{Event, EventType, Key, listen};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use tracing::info;
/// Spawns the global hotkey listener.
pub fn spawn_hotkey_listener(ptt_flag: Arc<AtomicBool>) {
thread::spawn(move || {
info!("Hotkey listener started. Press 'V' to talk.");
let callback = move |event: Event| match event.event_type {
EventType::KeyPress(Key::KeyV) => {
ptt_flag.store(true, Ordering::Relaxed);
}
EventType::KeyRelease(Key::KeyV) => {
ptt_flag.store(false, Ordering::Relaxed);
}
_ => {}
};
if let Err(e) = listen(callback) {
tracing::error!("Error listening to global hotkeys: {:?}", e);
}
});
}
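
For illustration only (not in the diff), the handshake this relies on: the `rdev` callback is the single writer and the DSP thread the single reader, so a relaxed `AtomicBool` is all the synchronization needed.

use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

fn ptt_flag_demo() {
    let ptt_flag = Arc::new(AtomicBool::new(false));
    let dsp_view = Arc::clone(&ptt_flag);
    ptt_flag.store(true, Ordering::Relaxed); // KeyPress(KeyV)
    assert!(dsp_view.load(Ordering::Relaxed)); // DSP thread observes it
    ptt_flag.store(false, Ordering::Relaxed); // KeyRelease(KeyV)
}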

View File

@@ -1,24 +1,76 @@
//! Client Node entry point.
//!
//! This module initializes the desktop client application, sets up the Tokio
//! background thread for networking, and eventually binds to the UI framework.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
mod audio;
mod hotkey;
mod network;
mod ui;
use anyhow::Result;
use eframe::egui;
use ringbuf::HeapRb;
use ringbuf::traits::Split;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use tokio::sync::watch;
use tracing::{error, info};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
fn main() -> Result<()> {
tracing_subscriber::fmt::init();
info!("Starting client node...");
if let Err(e) = network::control::connect_and_auth("TestUser").await {
error!("Connection error: {:?}", e);
}
// Setup communication channels
let (active_speaker_tx, active_speaker_rx) = watch::channel(false);
let ptt_flag = Arc::new(AtomicBool::new(false));
// Setup lock-free ringbuffer for audio capture (4096 capacity)
let audio_rb = HeapRb::<f32>::new(4096);
let (producer, consumer) = audio_rb.split();
// Spawn DSP and audio capture threads
audio::dsp::spawn_dsp_thread(consumer, ptt_flag.clone(), active_speaker_tx);
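// NOTE: keep the returned `cpal::Stream` bound for the whole life of `main`;
// dropping it stops microphone capture.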
let _stream = audio::capture::start_audio_capture(producer).map_err(|e| {
error!("Failed to start audio capture: {:?}", e);
e
});
// Spawn Global Hotkey listener
hotkey::spawn_hotkey_listener(ptt_flag);
// Spawn custom tokio runtime for network background tasks
std::thread::spawn(move || {
let Ok(rt) = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
else {
tracing::error!("Failed to build tokio runtime");
return;
};
rt.block_on(async {
if let Err(e) = network::control::connect_and_auth("TestUser").await {
error!("Connection error: {:?}", e);
}
});
});
let options = eframe::NativeOptions {
viewport: egui::ViewportBuilder::default().with_inner_size([800.0, 600.0]),
..Default::default()
};
eframe::run_native(
"Voice App",
options,
Box::new(|_cc| Ok(Box::new(ui::VoiceApp::new(active_speaker_rx)))),
)
.map_err(|e| anyhow::anyhow!("eframe error: {e:?}"))?;
Ok(())
}

View File

@@ -8,8 +8,8 @@ use core_protocol::constants;
use core_protocol::tcp_events::TcpEvent;
use futures::{SinkExt, StreamExt};
use tokio::net::TcpStream;
use tokio_serde::SymmetricallyFramed;
use tokio_serde::formats::Bincode;
use tokio_util::codec::{Framed, LengthDelimitedCodec};
use tracing::{info, warn};
@@ -22,7 +22,7 @@ type FramedStream = SymmetricallyFramed<
/// Connects to the server and performs the initial authentication handshake.
///
/// This establishes a length-delimited TCP connection so message boundaries are preserved across the byte stream,
/// sends an `AuthRequest` with the given username, and awaits the `AuthResponse`.
///
/// # Arguments
@@ -34,31 +34,37 @@ type FramedStream = SymmetricallyFramed<
pub async fn connect_and_auth(username: &str) -> Result<()> {
let addr = format!("127.0.0.1:{}", constants::TCP_PORT);
info!("Connecting to server at {}...", addr);
let stream = TcpStream::connect(&addr).await.context("Failed to connect to server")?;
let stream = TcpStream::connect(&addr)
.await
.context("Failed to connect to server")?;
info!("Connected!");
// Construct the codec pipeline exactly mirroring the server's configuration
// to ensure reliable packet framing.
let length_delimited = Framed::new(stream, LengthDelimitedCodec::new());
let mut framed: FramedStream = SymmetricallyFramed::new(
length_delimited,
Bincode::<TcpEvent, TcpEvent>::default(),
);
let mut framed: FramedStream =
SymmetricallyFramed::new(length_delimited, Bincode::<TcpEvent, TcpEvent>::default());
let auth_req = TcpEvent::AuthRequest {
username: username.to_string(),
};
framed.send(auth_req).await.context("Failed to send AuthRequest")?;
framed
.send(auth_req)
.await
.context("Failed to send AuthRequest")?;
info!("Sent AuthRequest for user: {}", username);
if let Some(response) = framed.next().await {
let response = response.context("Failed to deserialize response")?;
match response {
TcpEvent::AuthResponse { session_token } => {
info!("Successfully authenticated! Session token: {}", session_token);
info!(
"Successfully authenticated! Session token: {}",
session_token
);
}
_ => {
warn!("Received unexpected event instead of AuthResponse");

client_node/src/ui/app.rs
View File

@@ -0,0 +1,64 @@
//! The core application state for the eframe UI.
//!
//! This module defines the `VoiceApp` struct which implements `eframe::App`.
//! It listens to background events via `tokio::sync::watch` and draws the UI at 60 FPS.
//!
//! We implement `ui()` (not the deprecated `update()`) because eframe 0.34
//! changed the required trait method to receive an `&mut egui::Ui` directly
//! instead of a raw `egui::Context`.
use eframe::egui;
use tokio::sync::watch;
/// The central state for the eframe UI.
pub struct VoiceApp {
/// Receiver for the active speaker state, updated by the DSP thread.
pub active_speaker_rx: watch::Receiver<bool>,
/// Whether the audio dumper is enabled for debugging.
pub audio_dumper_enabled: bool,
}
impl VoiceApp {
/// Creates a new `VoiceApp` instance.
#[must_use]
pub fn new(active_speaker_rx: watch::Receiver<bool>) -> Self {
Self {
active_speaker_rx,
audio_dumper_enabled: false,
}
}
}
impl eframe::App for VoiceApp {
fn ui(&mut self, ui: &mut egui::Ui, _frame: &mut eframe::Frame) {
let is_active_speaker = *self.active_speaker_rx.borrow();
// Use columns to simulate a side-panel layout within the single Ui.
ui.columns(2, |columns| {
// Left column: Channel tree view
columns[0].heading("Channels");
columns[0].label("General");
columns[0].label("Gaming");
columns[0].label("AFK");
// Right column: Voice chat state + dev tools
columns[1].heading("Voice Chat");
columns[1].horizontal(|ui| {
ui.label("You: ");
if is_active_speaker {
ui.label(egui::RichText::new("Speaking").color(egui::Color32::GREEN));
} else {
ui.label(egui::RichText::new("Silent").color(egui::Color32::GRAY));
}
});
columns[1].separator();
columns[1].heading("Developer Settings");
columns[1].checkbox(&mut self.audio_dumper_enabled, "Enable Audio Dumper (.wav)");
});
// Force continuous repaint so watch-channel updates are reflected immediately.
ui.ctx().request_repaint();
}
}
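
A side note on the non-blocking read the UI depends on: `watch::Receiver::borrow` always yields the most recently sent value without awaiting, so a repainting UI can poll it every frame. Sketch (illustrative, not part of the commit):

use tokio::sync::watch;

fn watch_borrow_demo() {
    let (tx, rx) = watch::channel(false);
    tx.send(true).ok(); // DSP-thread side
    assert!(*rx.borrow()); // UI side: latest value, no await needed
}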

client_node/src/ui/mod.rs
View File

@@ -0,0 +1,11 @@
//! UI Module entry point.
//!
//! This module handles the egui visual interface and maintains the `eframe` Application state.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
pub mod app;
pub use app::VoiceApp;