Documentation Index: fetch the complete documentation index at https://docs.runanywhere.ai/llms.txt
Use that file to discover all available pages before exploring further.
Voice Activity Detection (VAD) identifies speech in audio streams, enabling hands-free voice interfaces and efficient audio processing by filtering out silence.
Overview
VAD answers the question: “Is someone speaking right now?” This enables:
Wake word detection – Start listening when speech begins
Audio trimming – Only process speech segments
Turn-taking – Know when to respond in conversations
Battery efficiency – Don’t process silence
Basic Usage
// Minimal flow: initialize VAD once, then ask whether one audio buffer contains speech.
// Both SDK calls are `try await`, so this must run inside an async, throwing context.
// Initialize VAD
try await RunAnywhere. initializeVAD ()
// Detect speech in audio buffer
import AVFoundation
// Placeholder: supply a real buffer here, e.g. one delivered by an AVAudioEngine input tap.
let buffer: AVAudioPCMBuffer = // From microphone
// The result is used directly as an `if` condition below, i.e. it is a Boolean.
let isSpeaking = try await RunAnywhere. detectSpeech ( in : buffer)
if isSpeaking {
print ( "Speech detected!" )
}
Setup
import RunAnywhere
import ONNXRuntime
// Register ONNX module (at app launch)
// `setup()` is main-actor isolated; call it before any VAD initialization below.
@MainActor
func setup () {
ONNX. register ()
}
// Initialize VAD with default configuration
try await RunAnywhere. initializeVAD ()
// Or with custom configuration
// The values below are the same as the defaults declared on VADConfiguration's initializer.
let config = VADConfiguration (
sampleRate : 16000 , // Audio sample rate
frameLength : 0.032 , // 32ms frames
energyThreshold : 0.5 // Detection threshold (0-1); higher values are less sensitive
)
try await RunAnywhere. initializeVAD (config)
VADConfiguration
/// Immutable settings passed to `RunAnywhere.initializeVAD(_:)`.
///
/// This is a declaration summary: the initializer is shown without a body.
/// All three parameters have defaults, so any subset can be supplied.
public struct VADConfiguration : Sendable {
public let sampleRate: Int // Audio sample rate (default: 16000)
public let frameLength: Double // Frame length in seconds (default: 0.032)
// Lower thresholds are more sensitive (more false positives); higher thresholds are less sensitive.
public let energyThreshold: Double // Detection threshold 0-1 (default: 0.5)
/// Creates a configuration; omitted arguments fall back to the defaults listed below.
public init (
sampleRate : Int = 16000 ,
frameLength : Double = 0.032 ,
energyThreshold : Double = 0.5
)
}
Threshold Tuning
| Threshold | Behavior | Use Case |
|-----------|----------|----------|
| 0.2-0.4 | Very sensitive, more false positives | Quiet environments |
| 0.4-0.6 | Balanced (recommended) | Normal rooms |
| 0.6-0.8 | Less sensitive, fewer false positives | Noisy environments |
Detection Methods
From AVAudioPCMBuffer
import AVFoundation
// Placeholder: supply a buffer obtained from your audio input.
// NOTE(review): presumably the buffer should match the configured sample rate — confirm against the SDK.
let buffer: AVAudioPCMBuffer = // From audio input
// Async, throwing call; the result is a Boolean speech / no-speech decision for this buffer.
let isSpeaking = try await RunAnywhere. detectSpeech ( in : buffer)
From Float Array
// Placeholder: supply the raw samples to analyze.
// NOTE(review): presumably mono samples at the configured sample rate — confirm against the SDK.
let samples: [ Float ] = // Raw audio samples
// Same call as the buffer variant, but taking a plain `[Float]` instead of an AVAudioPCMBuffer.
let isSpeaking = try await RunAnywhere. detectSpeech ( in : samples)
Continuous VAD with Callbacks
For real-time applications, use the callback-based API:
// Callback-driven flow: register handlers first, then start VAD; stop it when finished.
// `startRecording`, `stopRecording` and `visualizeSamples` are app-side helpers not defined in this snippet.
// Set speech activity callback
await RunAnywhere. setVADSpeechActivityCallback { event in
// NOTE(review): the thread/actor this closure runs on is not shown here; hop to the main actor
// before touching UI state.
switch event {
case . started :
print ( "User started speaking" )
startRecording ()
case . ended :
print ( "User stopped speaking" )
stopRecording ()
}
}
// Set audio buffer callback (optional)
await RunAnywhere. setVADAudioBufferCallback { samples in
// Process audio samples
visualizeSamples (samples)
}
// Start VAD processing
// Starting and stopping can both throw; registering the callbacks above cannot.
try await RunAnywhere. startVAD ()
// ... later
try await RunAnywhere. stopVAD ()
Complete Voice Recording Example
/// Captures microphone audio while the VAD reports speech and publishes the state for SwiftUI.
///
/// `startListening()` initializes VAD, taps the microphone, and starts both the audio engine
/// and VAD processing. While speech is detected, 16 kHz mono Float32 samples are accumulated;
/// when the VAD reports the end of speech the accumulated bytes are published as `recordedAudio`.
/// All `@Published` properties and the internal sample buffer are only mutated on the main actor.
class VoiceRecorder: ObservableObject {
    /// Failures raised while preparing the audio pipeline.
    enum RecorderError: Error {
        /// No usable microphone format was available, or the converter to 16 kHz mono
        /// could not be created.
        case audioFormatUnavailable
    }

    @Published var isListening = false
    @Published var isSpeaking = false
    @Published var recordedAudio: Data?

    private var audioEngine: AVAudioEngine?
    private var audioBuffer = Data()

    /// Starts VAD and microphone capture.
    ///
    /// Calling this while a session is already active (or starting) is a no-op.
    /// - Throws: Any error from the SDK or from `AVAudioEngine.start()`, or
    ///   `RecorderError.audioFormatUnavailable`. On failure everything that was set up
    ///   is torn down again before the error is rethrown.
    func startListening() async throws {
        // Installing a second tap on the same bus raises an exception, so ignore repeated starts.
        guard audioEngine == nil else { return }

        let engine = AVAudioEngine()
        audioEngine = engine

        do {
            try await configureAndStart(engine)
        } catch {
            // Leave nothing half-started: no tap, no running engine, no VAD resources.
            engine.inputNode.removeTap(onBus: 0)
            engine.stop()
            audioEngine = nil
            await RunAnywhere.cleanupVAD()
            throw error
        }

        await MainActor.run { self.isListening = true }
    }

    /// Initializes VAD, registers the speech callback, installs the microphone tap and starts
    /// the engine and VAD processing, in that order.
    private func configureAndStart(_ engine: AVAudioEngine) async throws {
        try await RunAnywhere.initializeVAD(
            VADConfiguration(energyThreshold: 0.5)
        )

        await RunAnywhere.setVADSpeechActivityCallback { [weak self] event in
            Task { @MainActor in
                guard let self else { return }
                switch event {
                case .started:
                    self.isSpeaking = true
                    // Each utterance starts from an empty buffer.
                    self.audioBuffer = Data()
                case .ended:
                    self.isSpeaking = false
                    self.recordedAudio = self.audioBuffer
                }
            }
        }

        let inputNode = engine.inputNode
        let hardwareFormat = inputNode.outputFormat(forBus: 0)

        // A 0 Hz / 0 channel format means there is no usable input (no device or no permission).
        guard
            hardwareFormat.sampleRate > 0,
            hardwareFormat.channelCount > 0,
            let vadFormat = AVAudioFormat(
                commonFormat: .pcmFormatFloat32,
                sampleRate: 16000,
                channels: 1,
                interleaved: false
            ),
            let converter = AVAudioConverter(from: hardwareFormat, to: vadFormat)
        else {
            throw RecorderError.audioFormatUnavailable
        }

        // Tap in the input node's own format: AVAudioEngine raises an exception when an input
        // tap asks for a sample rate that differs from the hardware's. The conversion to
        // 16 kHz mono is done explicitly per buffer instead.
        inputNode.installTap(onBus: 0, bufferSize: 4096, format: hardwareFormat) { [weak self] buffer, _ in
            guard let resampled = VoiceRecorder.resample(buffer, with: converter, to: vadFormat) else { return }
            self?.processBuffer(resampled)
        }

        try engine.start()
        try await RunAnywhere.startVAD()
    }

    /// Converts one captured buffer to `format`, returning `nil` if the conversion fails
    /// or produces no frames.
    private static func resample(
        _ input: AVAudioPCMBuffer,
        with converter: AVAudioConverter,
        to format: AVAudioFormat
    ) -> AVAudioPCMBuffer? {
        let inputRate = input.format.sampleRate
        guard inputRate > 0, input.frameLength > 0 else { return nil }

        // Size the output for the rate change, with one spare frame for rounding.
        let ratio = format.sampleRate / inputRate
        let capacity = AVAudioFrameCount((Double(input.frameLength) * ratio).rounded(.up)) + 1
        guard let output = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: capacity) else { return nil }

        // Hand the converter exactly one input buffer, then report that no more data is available.
        var didProvideInput = false
        var conversionError: NSError?
        let status = converter.convert(to: output, error: &conversionError) { _, inputStatus in
            if didProvideInput {
                inputStatus.pointee = .noDataNow
                return nil
            }
            didProvideInput = true
            inputStatus.pointee = .haveData
            return input
        }

        guard status != .error, output.frameLength > 0 else { return nil }
        return output
    }

    /// Runs VAD on one buffer and, if it contains speech, appends its samples to the recording.
    private func processBuffer(_ buffer: AVAudioPCMBuffer) {
        Task {
            // Best effort: a buffer whose detection fails is treated as "no speech" and skipped.
            let isSpeech = try? await RunAnywhere.detectSpeech(in: buffer)

            if isSpeech == true, let channelData = buffer.floatChannelData?[0] {
                let byteCount = Int(buffer.frameLength) * MemoryLayout<Float>.size
                let data = Data(bytes: channelData, count: byteCount)
                await MainActor.run {
                    self.audioBuffer.append(data)
                }
            }
        }
    }

    /// Stops capture and VAD processing and releases VAD resources. Safe to call when not listening.
    func stopListening() async {
        if let engine = audioEngine {
            engine.stop()
            engine.inputNode.removeTap(onBus: 0)
            // Drop the engine so a later `startListening()` builds a fresh pipeline.
            audioEngine = nil
        }

        try? await RunAnywhere.stopVAD()
        await RunAnywhere.cleanupVAD()

        await MainActor.run {
            self.isListening = false
            // No further `.ended` event is expected once VAD is stopped, so clear the flag here.
            self.isSpeaking = false
        }
    }
}
SwiftUI Voice Activity UI
/// Demo screen for `VoiceRecorder`: a pulsing indicator, a status line, a start/stop button,
/// and feedback for captured audio or a failed start.
struct VoiceActivityView: View {
    @StateObject private var recorder = VoiceRecorder()

    /// Message from the most recent failed start attempt; `nil` when there is nothing to report.
    @State private var startErrorMessage: String?

    var body: some View {
        VStack(spacing: 30) {
            // Visual indicator
            ZStack {
                Circle()
                    .fill(indicatorColor)
                    .frame(width: 120, height: 120)
                    .scaleEffect(recorder.isSpeaking ? 1.2 : 1.0)
                    .animation(.easeInOut(duration: 0.2), value: recorder.isSpeaking)

                Image(systemName: recorder.isSpeaking ? "waveform" : "mic.fill")
                    .font(.system(size: 40))
                    .foregroundColor(.white)
            }

            // Status text
            Text(statusText)
                .font(.headline)

            // Control button
            Button(action: toggleListening) {
                Text(recorder.isListening ? "Stop" : "Start Listening")
                    .frame(maxWidth: .infinity)
                    .padding()
                    .background(recorder.isListening ? Color.red : Color.blue)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            .padding(.horizontal)

            // Recorded audio indicator
            if recorder.recordedAudio != nil {
                Label("Audio captured!", systemImage: "checkmark.circle.fill")
                    .foregroundColor(.green)
            }

            // Start failures (e.g. denied microphone access) would otherwise be invisible.
            if let startErrorMessage {
                Label(startErrorMessage, systemImage: "exclamationmark.triangle.fill")
                    .foregroundColor(.red)
            }
        }
        .padding()
    }

    /// Gray when idle, green while speech is detected, blue while listening to silence.
    var indicatorColor: Color {
        if !recorder.isListening {
            return .gray
        } else if recorder.isSpeaking {
            return .green
        } else {
            return .blue
        }
    }

    /// Short description of the current recorder state.
    var statusText: String {
        if !recorder.isListening {
            return "Tap to start"
        } else if recorder.isSpeaking {
            return "Speaking..."
        } else {
            return "Listening..."
        }
    }

    /// Starts listening when idle and stops when active; a failed start is shown to the user.
    func toggleListening() {
        Task { @MainActor in
            if recorder.isListening {
                await recorder.stopListening()
                return
            }

            do {
                try await recorder.startListening()
                startErrorMessage = nil
            } catch {
                startErrorMessage = "Could not start listening: \(error.localizedDescription)"
            }
        }
    }
}
VAD State Management
// Check if VAD is ready
// `isVADReady` is read with `await`, so it must be accessed from an async context.
// NOTE(review): presumably true once initializeVAD has completed — confirm against the SDK.
let isReady = await RunAnywhere. isVADReady
// Start/stop processing
// Both calls can throw; they are shown back to back only to list the API.
try await RunAnywhere. startVAD ()
try await RunAnywhere. stopVAD ()
// Cleanup resources
// Unlike start/stop, cleanup is not a throwing call.
await RunAnywhere. cleanupVAD ()
Best Practices
Adjust threshold for environment
Start with 0.5 and adjust based on testing. Noisy environments need higher thresholds.
// Add a small delay before acting on speech end events to handle brief pauses.
// `handleSpeechStart` and `handleSpeechEnd` are app-side handlers not defined in this snippet.
var speechEndTimer: Task<Void, Never>?

await RunAnywhere.setVADSpeechActivityCallback { event in
    switch event {
    case .started:
        // Speech resumed within the grace period: drop the pending end-of-speech handling.
        speechEndTimer?.cancel()
        speechEndTimer = nil
        handleSpeechStart()
    case .ended:
        // Only the most recent end event may fire, so replace any timer still pending.
        speechEndTimer?.cancel()
        speechEndTimer = Task {
            do {
                try await Task.sleep(for: .milliseconds(500))
            } catch {
                // The sleep throws when the task is cancelled. Returning here is what makes
                // the debounce work: `try?` would fall through and run the handler immediately.
                return
            }
            handleSpeechEnd()
        }
    }
}
Configure AVAudioSession appropriately to handle interruptions.
Always call cleanupVAD() when done to free resources.
Error Handling
// Initialize and start VAD, reporting SDK failures by error code.
do {
    try await RunAnywhere.initializeVAD()
    try await RunAnywhere.startVAD()
} catch let error as SDKError {
    switch error.code {
    case .initializationFailed:
        print("VAD initialization failed")
    case .microphonePermissionDenied:
        print("Microphone access required")
    default:
        print("Error: \(error)")
    }
} catch {
    // Without this clause the `do` is not exhaustive: any error that is not an `SDKError`
    // would be unhandled here.
    print("Unexpected error: \(error)")
}
Voice Agent Build complete voice experiences with VAD + STT + LLM + TTS →