Started milestone 2

sam
2026-05-03 15:53:25 +02:00
parent 43483c2145
commit 041955345b
18 changed files with 4616 additions and 60 deletions

View File

@@ -6,9 +6,17 @@ edition = "2024"
[dependencies]
anyhow = "1.0.102"
core_protocol = { version = "0.1.0", path = "../core_protocol" }
cpal = "0.17.3"
eframe = "0.34.1"
egui = "0.34.1"
futures = "0.3.32"
hound = "3.5.1"
rdev = "0.5.3"
ringbuf = "0.5.0"
tokio = { version = "1.52.1", features = ["rt-multi-thread", "net", "macros"] }
tokio-serde = { version = "0.9.0", features = ["bincode"] }
tokio-util = { version = "0.7.18", features = ["codec"] }
tracing = "0.1.44"
tracing-subscriber = "0.3.23"
webrtc-audio-processing = { version = "2.0.4", features = ["bundled"] }
webrtc-audio-processing-config = "2.0.4"

View File

@@ -0,0 +1,59 @@
//! Headless and hardware-backed audio capture logic.
//!
//! This module abstracts `cpal` to capture microphone input cleanly,
//! pushing raw samples directly to a lock-free ringbuffer.
//!
//! The `cpal` data callback is kept strictly lock-free and allocation-free
//! to satisfy real-time audio constraints.
use anyhow::{Result, anyhow};
use cpal::StreamConfig;
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use ringbuf::HeapProd;
use ringbuf::traits::Producer;
use tracing::{error, info};
use super::{INPUT_CHANNELS, SAMPLE_RATE};
/// Sets up the default microphone device and returns the active cpal stream.
///
/// The stream pushes raw f32 samples into the provided ringbuffer producer.
///
/// # Errors
/// Returns an error if no input device is found or if the stream cannot be built.
pub fn start_audio_capture(mut producer: HeapProd<f32>) -> Result<cpal::Stream> {
let host = cpal::default_host();
let device = host
.default_input_device()
.ok_or_else(|| anyhow!("No default input device found"))?;
// cpal 0.17 deprecates `name()` in favor of `description()`.
let device_desc = device
.description()
.map_or_else(|_| "Unknown".to_string(), |d| d.name().to_string());
info!("Using input device: {}", device_desc);
let config = StreamConfig {
channels: INPUT_CHANNELS,
// cpal 0.17 changed SampleRate from a tuple struct to a plain u32 alias.
sample_rate: SAMPLE_RATE,
buffer_size: cpal::BufferSize::Default,
};
let stream = device.build_input_stream(
&config,
move |data: &[f32], _: &cpal::InputCallbackInfo| {
// STRICT RULE: No locks, no allocations in this callback.
// Just push samples to the ringbuffer.
let _ = producer.push_slice(data);
},
move |err| {
error!("An error occurred on the audio capture stream: {}", err);
},
None, // None = default timeout
)?;
stream.play()?;
Ok(stream)
}
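
An aside on why `let _ = producer.push_slice(data)` is real-time safe: `push_slice` writes as many samples as fit and returns the count written, never blocking or allocating; overflow is simply dropped. A minimal sketch of that contract (illustrative only, assuming the ringbuf 0.5 trait API matches the 0.4-style `Producer`/`Consumer`/`Observer` semantics):

use ringbuf::HeapRb;
use ringbuf::traits::{Consumer, Observer, Producer, Split};

fn ringbuf_overflow_demo() {
    let rb = HeapRb::<f32>::new(8);
    let (mut prod, mut cons) = rb.split();
    // Push 12 samples into an 8-slot buffer: the extra 4 are dropped and
    // the call returns immediately with the number actually written.
    let written = prod.push_slice(&[0.5_f32; 12]);
    assert_eq!(written, 8);
    assert_eq!(cons.occupied_len(), 8);
    let mut out = [0.0_f32; 8];
    assert_eq!(cons.pop_slice(&mut out), 8);
}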

View File

@@ -0,0 +1,108 @@
//! DSP and Voice Activity Detection (VAD) thread.
//!
//! Pulls audio from the lock-free ringbuffer, applies WebRTC noise suppression
//! and echo cancellation, then checks for voice activity before signalling
//! the UI via a `tokio::sync::watch` channel.
//!
//! This thread is a dedicated `std::thread` (not a Tokio task) because
//! real-time audio processing must never be at the mercy of a cooperative
//! async scheduler.
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use std::time::Duration;
use ringbuf::HeapCons;
use ringbuf::traits::{Consumer, Observer};
use tokio::sync::watch;
use tracing::info;
use webrtc_audio_processing::Processor;
use webrtc_audio_processing_config::{
Config, EchoCanceller, NoiseSuppression, NoiseSuppressionLevel,
};
use super::{FRAME_SIZE, SAMPLE_RATE};
/// RMS threshold below which a frame is considered silence.
/// This provides a simple amplitude-based VAD since the WebRTC v2 API
/// removed the standalone voice detection configuration.
const VAD_RMS_THRESHOLD: f32 = 0.01;
/// Spawns the dedicated background DSP thread.
///
/// Reads 960-sample frames from the ringbuffer, applies WebRTC
/// noise suppression + echo cancellation, and updates the active
/// speaker state via the provided watch channel.
pub fn spawn_dsp_thread(
mut consumer: HeapCons<f32>,
ptt_flag: Arc<AtomicBool>,
active_speaker_tx: watch::Sender<bool>,
) {
thread::spawn(move || {
info!("DSP thread started.");
let mut ap = match Processor::new(SAMPLE_RATE) {
Ok(ap) => ap,
Err(e) => {
tracing::error!("Failed to initialize WebRTC APM: {:?}", e);
return;
}
};
let config = Config {
echo_canceller: Some(EchoCanceller::default()),
noise_suppression: Some(NoiseSuppression {
level: NoiseSuppressionLevel::High,
analyze_linear_aec_output: false,
}),
..Default::default()
};
ap.set_config(config);
// Mono capture: one channel with FRAME_SIZE samples.
let mut frame_buf = vec![vec![0.0f32; FRAME_SIZE]];
loop {
// Wait until we have a full 20ms frame (960 samples at 48kHz).
if consumer.occupied_len() >= FRAME_SIZE {
let _ = consumer.pop_slice(&mut frame_buf[0]);
let is_transmitting = ptt_flag.load(Ordering::Relaxed);
// Run the WebRTC DSP pipeline on the capture frame.
if let Err(e) = ap.process_capture_frame(&mut frame_buf) {
tracing::warn!("APM processing failed: {:?}", e);
}
// Simple RMS-based VAD since webrtc-audio-processing v2
// removed the dedicated VoiceDetection config field.
let rms = compute_rms(&frame_buf[0]);
let has_voice = rms > VAD_RMS_THRESHOLD;
let should_transmit = is_transmitting && has_voice;
// Only update the watch channel when the state actually changes
// to avoid unnecessary UI repaints.
if *active_speaker_tx.borrow() != should_transmit {
let _ = active_speaker_tx.send(should_transmit);
}
} else {
thread::sleep(Duration::from_millis(2));
}
}
});
}
/// Computes the Root Mean Square (RMS) of a sample buffer.
///
/// Used as a lightweight VAD: if the RMS is below a threshold,
/// the frame is considered silence.
fn compute_rms(samples: &[f32]) -> f32 {
if samples.is_empty() {
return 0.0;
}
let sum_sq: f32 = samples.iter().map(|s| s * s).sum();
#[allow(clippy::cast_precision_loss)] // FRAME_SIZE (960) is well within f32's 23-bit mantissa.
let divisor = samples.len() as f32;
(sum_sq / divisor).sqrt()
}
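
A quick numeric sanity check on the threshold (illustrative, not part of this commit; assumes access to the module-private `compute_rms`): a full-scale sine has RMS 1/√2 ≈ 0.707, so `VAD_RMS_THRESHOLD = 0.01` sits roughly 37 dB below a full-scale tone, well under typical speech levels.

fn vad_threshold_sanity() {
    // 500 Hz fits exactly 10 cycles into one 960-sample frame at 48 kHz,
    // so the frame's RMS is 1/sqrt(2) up to f32 rounding.
    let frame: Vec<f32> = (0..FRAME_SIZE)
        .map(|n| (2.0 * std::f32::consts::PI * 500.0 * n as f32 / 48_000.0).sin())
        .collect();
    let rms = compute_rms(&frame);
    assert!((rms - std::f32::consts::FRAC_1_SQRT_2).abs() < 1e-3);
    // A full-scale tone lands far above the silence threshold.
    assert!(rms > VAD_RMS_THRESHOLD);
}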

View File

@@ -0,0 +1,19 @@
//! Shared audio configuration constants.
//!
//! Defines the fixed sample-rate and framing constants that Opus encoding and decoding require.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
pub mod capture;
pub mod dsp;
/// The strict sample rate required across the entire DSP pipeline.
pub const SAMPLE_RATE: u32 = 48_000;
/// The number of channels for microphone capture (Mono).
pub const INPUT_CHANNELS: u16 = 1;
/// The exact number of samples required per frame for Opus (20ms).
pub const FRAME_SIZE: usize = 960;
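
These constants are mutually constrained: 960 samples is exactly 20 ms at 48 kHz (48 000 × 0.020 = 960). A compile-time guard along these lines (a possible addition, not part of this commit) would keep them in sync:

// Hypothetical guard: FRAME_SIZE must equal 20 ms of mono audio at SAMPLE_RATE.
const _: () = assert!(FRAME_SIZE == (SAMPLE_RATE as usize * 20) / 1000);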

View File

@@ -0,0 +1,34 @@
//! Global Hotkey listener.
//!
//! Uses `rdev` to capture global PTT events across the OS without requiring window focus.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
use rdev::{Event, EventType, Key, listen};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::thread;
use tracing::info;
/// Spawns the global hotkey listener.
pub fn spawn_hotkey_listener(ptt_flag: Arc<AtomicBool>) {
thread::spawn(move || {
info!("Hotkey listener started. Press 'V' to talk.");
let callback = move |event: Event| match event.event_type {
EventType::KeyPress(Key::KeyV) => {
ptt_flag.store(true, Ordering::Relaxed);
}
EventType::KeyRelease(Key::KeyV) => {
ptt_flag.store(false, Ordering::Relaxed);
}
_ => {}
};
if let Err(e) = listen(callback) {
tracing::error!("Error listening to global hotkeys: {:?}", e);
}
});
}
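
For illustration only (not in the diff), the handshake this relies on: the `rdev` callback is the single writer and the DSP thread the single reader, so a relaxed `AtomicBool` is all the synchronization needed.

use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

fn ptt_flag_demo() {
    let ptt_flag = Arc::new(AtomicBool::new(false));
    let dsp_view = Arc::clone(&ptt_flag);
    ptt_flag.store(true, Ordering::Relaxed); // KeyPress(KeyV)
    assert!(dsp_view.load(Ordering::Relaxed)); // DSP thread observes it
    ptt_flag.store(false, Ordering::Relaxed); // KeyRelease(KeyV)
}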

View File

@@ -1,24 +1,76 @@
//! Client Node entry point.
//!
//! This module initializes the desktop client application, sets up the Tokio
//! background thread for networking, and eventually binds to the UI framework.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
mod audio;
mod hotkey;
mod network;
mod ui;
use anyhow::Result;
use eframe::egui;
use ringbuf::HeapRb;
use ringbuf::traits::Split;
use std::sync::Arc;
use std::sync::atomic::AtomicBool;
use tokio::sync::watch;
use tracing::{error, info};
#[tokio::main]
async fn main() -> anyhow::Result<()> {
fn main() -> Result<()> {
tracing_subscriber::fmt::init();
info!("Starting client node...");
if let Err(e) = network::control::connect_and_auth("TestUser").await {
error!("Connection error: {:?}", e);
}
// Setup communication channels
let (active_speaker_tx, active_speaker_rx) = watch::channel(false);
let ptt_flag = Arc::new(AtomicBool::new(false));
// Setup lock-free ringbuffer for audio capture (4096 capacity)
let audio_rb = HeapRb::<f32>::new(4096);
let (producer, consumer) = audio_rb.split();
// Spawn DSP and audio capture threads
audio::dsp::spawn_dsp_thread(consumer, ptt_flag.clone(), active_speaker_tx);
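// NOTE: keep the returned `cpal::Stream` bound for the whole life of `main`;
// dropping it stops microphone capture.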
let _stream = audio::capture::start_audio_capture(producer).map_err(|e| {
error!("Failed to start audio capture: {:?}", e);
e
});
// Spawn Global Hotkey listener
hotkey::spawn_hotkey_listener(ptt_flag);
// Spawn custom tokio runtime for network background tasks
std::thread::spawn(move || {
let Ok(rt) = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()
else {
tracing::error!("Failed to build tokio runtime");
return;
};
rt.block_on(async {
if let Err(e) = network::control::connect_and_auth("TestUser").await {
error!("Connection error: {:?}", e);
}
});
});
let options = eframe::NativeOptions {
viewport: egui::ViewportBuilder::default().with_inner_size([800.0, 600.0]),
..Default::default()
};
eframe::run_native(
"Voice App",
options,
Box::new(|_cc| Ok(Box::new(ui::VoiceApp::new(active_speaker_rx)))),
)
.map_err(|e| anyhow::anyhow!("eframe error: {e:?}"))?;
Ok(())
}

View File

@@ -8,8 +8,8 @@ use core_protocol::constants;
use core_protocol::tcp_events::TcpEvent;
use futures::{SinkExt, StreamExt};
use tokio::net::TcpStream;
use tokio_serde::SymmetricallyFramed;
use tokio_serde::formats::Bincode;
use tokio_util::codec::{Framed, LengthDelimitedCodec};
use tracing::{info, warn};
@@ -22,7 +22,7 @@ type FramedStream = SymmetricallyFramed<
/// Connects to the server and performs the initial authentication handshake.
///
/// This establishes a length-delimited TCP connection so message boundaries are preserved across the byte stream,
/// sends an `AuthRequest` with the given username, and awaits the `AuthResponse`.
///
/// # Arguments
@@ -34,31 +34,37 @@ type FramedStream = SymmetricallyFramed<
pub async fn connect_and_auth(username: &str) -> Result<()> {
let addr = format!("127.0.0.1:{}", constants::TCP_PORT);
info!("Connecting to server at {}...", addr);
let stream = TcpStream::connect(&addr).await.context("Failed to connect to server")?;
let stream = TcpStream::connect(&addr)
.await
.context("Failed to connect to server")?;
info!("Connected!");
// Construct the codec pipeline exactly mirroring the server's configuration
// to ensure reliable packet framing.
let length_delimited = Framed::new(stream, LengthDelimitedCodec::new());
let mut framed: FramedStream = SymmetricallyFramed::new(
length_delimited,
Bincode::<TcpEvent, TcpEvent>::default(),
);
let mut framed: FramedStream =
SymmetricallyFramed::new(length_delimited, Bincode::<TcpEvent, TcpEvent>::default());
let auth_req = TcpEvent::AuthRequest {
username: username.to_string(),
};
framed.send(auth_req).await.context("Failed to send AuthRequest")?;
framed
.send(auth_req)
.await
.context("Failed to send AuthRequest")?;
info!("Sent AuthRequest for user: {}", username);
if let Some(response) = framed.next().await {
let response = response.context("Failed to deserialize response")?;
match response {
TcpEvent::AuthResponse { session_token } => {
info!("Successfully authenticated! Session token: {}", session_token);
info!(
"Successfully authenticated! Session token: {}",
session_token
);
}
_ => {
warn!("Received unexpected event instead of AuthResponse");

client_node/src/ui/app.rs
View File

@@ -0,0 +1,64 @@
//! The core application state for the eframe UI.
//!
//! This module defines the `VoiceApp` struct which implements `eframe::App`.
//! It listens to background events via `tokio::sync::watch` and draws the UI at 60 FPS.
//!
//! We implement `ui()` (not the deprecated `update()`) because eframe 0.34
//! changed the required trait method to receive an `&mut egui::Ui` directly
//! instead of a raw `egui::Context`.
use eframe::egui;
use tokio::sync::watch;
/// The central state for the eframe UI.
pub struct VoiceApp {
/// Receiver for the active speaker state, updated by the DSP thread.
pub active_speaker_rx: watch::Receiver<bool>,
/// Whether the audio dumper is enabled for debugging.
pub audio_dumper_enabled: bool,
}
impl VoiceApp {
/// Creates a new `VoiceApp` instance.
#[must_use]
pub fn new(active_speaker_rx: watch::Receiver<bool>) -> Self {
Self {
active_speaker_rx,
audio_dumper_enabled: false,
}
}
}
impl eframe::App for VoiceApp {
fn ui(&mut self, ui: &mut egui::Ui, _frame: &mut eframe::Frame) {
let is_active_speaker = *self.active_speaker_rx.borrow();
// Use columns to simulate a side-panel layout within the single Ui.
ui.columns(2, |columns| {
// Left column: Channel tree view
columns[0].heading("Channels");
columns[0].label("General");
columns[0].label("Gaming");
columns[0].label("AFK");
// Right column: Voice chat state + dev tools
columns[1].heading("Voice Chat");
columns[1].horizontal(|ui| {
ui.label("You: ");
if is_active_speaker {
ui.label(egui::RichText::new("Speaking").color(egui::Color32::GREEN));
} else {
ui.label(egui::RichText::new("Silent").color(egui::Color32::GRAY));
}
});
columns[1].separator();
columns[1].heading("Developer Settings");
columns[1].checkbox(&mut self.audio_dumper_enabled, "Enable Audio Dumper (.wav)");
});
// Force continuous repaint so watch-channel updates are reflected immediately.
ui.ctx().request_repaint();
}
}
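
A side note on the non-blocking read the UI depends on: `watch::Receiver::borrow` always yields the most recently sent value without awaiting, so a repainting UI can poll it every frame. Sketch (illustrative, not part of the commit):

use tokio::sync::watch;

fn watch_borrow_demo() {
    let (tx, rx) = watch::channel(false);
    tx.send(true).ok(); // DSP-thread side
    assert!(*rx.borrow()); // UI side: latest value, no await needed
}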

client_node/src/ui/mod.rs
View File

@@ -0,0 +1,11 @@
//! UI Module entry point.
//!
//! This module handles the egui visual interface and maintains the `eframe` Application state.
#![forbid(unsafe_code)]
#![deny(clippy::all, clippy::pedantic)]
#![deny(clippy::unwrap_used, clippy::expect_used)]
pub mod app;
pub use app::VoiceApp;