Voice Activity Detection (VAD) identifies speech in audio streams, enabling hands-free voice interfaces and efficient audio processing by filtering out silence.

Overview

VAD answers the question: “Is someone speaking right now?” This enables:
  • Wake word detection – Start listening when speech begins
  • Audio trimming – Only process speech segments
  • Turn-taking – Know when to respond in conversations
  • Battery efficiency – Don’t process silence

Basic Usage

import AVFoundation

// Initialize VAD
try await RunAnywhere.initializeVAD()

// Detect speech in an audio buffer
let buffer: AVAudioPCMBuffer = // From microphone
let isSpeaking = try await RunAnywhere.detectSpeech(in: buffer)

if isSpeaking {
    print("Speech detected!")
}

Setup

import RunAnywhere
import ONNXRuntime

// Register ONNX module (at app launch)
@MainActor
func setup() {
    ONNX.register()
}

// Initialize VAD with default configuration
try await RunAnywhere.initializeVAD()

// Or with custom configuration
let config = VADConfiguration(
    sampleRate: 16000,
    frameLength: 0.032,     // 32ms frames
    energyThreshold: 0.5    // Sensitivity (0-1)
)
try await RunAnywhere.initializeVAD(config)

VADConfiguration

public struct VADConfiguration: Sendable {
    public let sampleRate: Int          // Audio sample rate (default: 16000)
    public let frameLength: Double      // Frame length in seconds (default: 0.032)
    public let energyThreshold: Double  // Detection threshold 0-1 (default: 0.5)

    public init(
        sampleRate: Int = 16000,
        frameLength: Double = 0.032,
        energyThreshold: Double = 0.5
    )
}

Threshold Tuning

Threshold   Behavior                                  Use Case
0.2-0.4     Very sensitive, more false positives      Quiet environments
0.4-0.6     Balanced (recommended)                    Normal rooms
0.6-0.8     Less sensitive, fewer false positives     Noisy environments
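
The threshold is simply the energyThreshold field of VADConfiguration, so it can be chosen per deployment environment. A minimal sketch (the Environment enum below is illustrative, not part of the SDK):

// Map the expected environment to an energy threshold from the table above.
enum Environment {
    case quiet, normal, noisy
}

func vadConfiguration(for environment: Environment) -> VADConfiguration {
    switch environment {
    case .quiet:  return VADConfiguration(energyThreshold: 0.3)
    case .normal: return VADConfiguration(energyThreshold: 0.5)
    case .noisy:  return VADConfiguration(energyThreshold: 0.7)
    }
}

try await RunAnywhere.initializeVAD(vadConfiguration(for: .noisy))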

Detection Methods

From AVAudioPCMBuffer

import AVFoundation

let buffer: AVAudioPCMBuffer = // From audio input
let isSpeaking = try await RunAnywhere.detectSpeech(in: buffer)

From Float Array

let samples: [Float] = // Raw audio samples
let isSpeaking = try await RunAnywhere.detectSpeech(in: samples)
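
If your audio arrives as an AVAudioPCMBuffer but you want the float-array overload, you can copy the samples out first. A minimal sketch, assuming mono, non-interleaved Float32 audio:

import AVFoundation

// Copy Float32 samples out of a mono, non-interleaved buffer.
func samples(from buffer: AVAudioPCMBuffer) -> [Float] {
    guard let channelData = buffer.floatChannelData else { return [] }
    return Array(UnsafeBufferPointer(start: channelData[0], count: Int(buffer.frameLength)))
}

let isSpeaking = try await RunAnywhere.detectSpeech(in: samples(from: buffer))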

Continuous VAD with Callbacks

For real-time applications, use the callback-based API:
// Set speech activity callback
await RunAnywhere.setVADSpeechActivityCallback { event in
    switch event {
    case .started:
        print("User started speaking")
        startRecording()
    case .ended:
        print("User stopped speaking")
        stopRecording()
    }
}

// Set audio buffer callback (optional)
await RunAnywhere.setVADAudioBufferCallback { samples in
    // Process audio samples
    visualizeSamples(samples)
}

// Start VAD processing
try await RunAnywhere.startVAD()

// ... later
try await RunAnywhere.stopVAD()

Complete Voice Recording Example

import AVFoundation
import Combine
import RunAnywhere

class VoiceRecorder: ObservableObject {
    @Published var isListening = false
    @Published var isSpeaking = false
    @Published var recordedAudio: Data?

    private var audioEngine: AVAudioEngine?
    private var audioBuffer = Data()

    func startListening() async throws {
        // Initialize VAD
        try await RunAnywhere.initializeVAD(
            VADConfiguration(energyThreshold: 0.5)
        )

        // Set up speech callbacks
        await RunAnywhere.setVADSpeechActivityCallback { [weak self] event in
            Task { @MainActor in
                switch event {
                case .started:
                    self?.isSpeaking = true
                    self?.audioBuffer = Data()
                case .ended:
                    self?.isSpeaking = false
                    self?.recordedAudio = self?.audioBuffer
                }
            }
        }

        // Set up audio engine
        audioEngine = AVAudioEngine()
        let inputNode = audioEngine!.inputNode

        // Note: the hardware input format may not be 16 kHz mono; if it
        // differs, convert with AVAudioConverter before feeding VAD.
        let format = AVAudioFormat(
            commonFormat: .pcmFormatFloat32,
            sampleRate: 16000,
            channels: 1,
            interleaved: false
        )!

        inputNode.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, _ in
            self?.processBuffer(buffer)
        }

        try audioEngine!.start()
        try await RunAnywhere.startVAD()

        await MainActor.run { isListening = true }
    }

    private func processBuffer(_ buffer: AVAudioPCMBuffer) {
        Task {
            // Feed to VAD
            let isSpeech = try? await RunAnywhere.detectSpeech(in: buffer)

            // If speaking, accumulate audio
            if isSpeech == true, let channelData = buffer.floatChannelData?[0] {
                let data = Data(bytes: channelData, count: Int(buffer.frameLength) * MemoryLayout<Float>.size)
                await MainActor.run {
                    audioBuffer.append(data)
                }
            }
        }
    }

    func stopListening() async {
        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)
        try? await RunAnywhere.stopVAD()
        await RunAnywhere.cleanupVAD()

        await MainActor.run { isListening = false }
    }
}

SwiftUI Voice Activity UI

import SwiftUI

struct VoiceActivityView: View {
    @StateObject private var recorder = VoiceRecorder()

    var body: some View {
        VStack(spacing: 30) {
            // Visual indicator
            ZStack {
                Circle()
                    .fill(indicatorColor)
                    .frame(width: 120, height: 120)
                    .scaleEffect(recorder.isSpeaking ? 1.2 : 1.0)
                    .animation(.easeInOut(duration: 0.2), value: recorder.isSpeaking)

                Image(systemName: recorder.isSpeaking ? "waveform" : "mic.fill")
                    .font(.system(size: 40))
                    .foregroundColor(.white)
            }

            // Status text
            Text(statusText)
                .font(.headline)

            // Control button
            Button(action: toggleListening) {
                Text(recorder.isListening ? "Stop" : "Start Listening")
                    .frame(maxWidth: .infinity)
                    .padding()
                    .background(recorder.isListening ? Color.red : Color.blue)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            .padding(.horizontal)

            // Recorded audio indicator
            if recorder.recordedAudio != nil {
                Label("Audio captured!", systemImage: "checkmark.circle.fill")
                    .foregroundColor(.green)
            }
        }
        .padding()
    }

    var indicatorColor: Color {
        if !recorder.isListening {
            return .gray
        } else if recorder.isSpeaking {
            return .green
        } else {
            return .blue
        }
    }

    var statusText: String {
        if !recorder.isListening {
            return "Tap to start"
        } else if recorder.isSpeaking {
            return "Speaking..."
        } else {
            return "Listening..."
        }
    }

    func toggleListening() {
        Task {
            if recorder.isListening {
                await recorder.stopListening()
            } else {
                try? await recorder.startListening()
            }
        }
    }
}

VAD State Management

// Check if VAD is ready
let isReady = await RunAnywhere.isVADReady

// Start/stop processing
try await RunAnywhere.startVAD()
try await RunAnywhere.stopVAD()

// Cleanup resources
await RunAnywhere.cleanupVAD()
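
A small guard pattern built from these calls (a sketch; it only re-initializes when VAD is not already ready):

// Initialize on demand, then start processing
let isReady = await RunAnywhere.isVADReady
if !isReady {
    try await RunAnywhere.initializeVAD()
}
try await RunAnywhere.startVAD()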

Best Practices

  • Threshold – Start with 0.5 and adjust based on testing; noisy environments need higher thresholds.
  • Debouncing – Add a small delay before acting on speech-end events to handle brief pauses, as in the snippet below.
var speechEndTimer: Task<Void, Never>?

await RunAnywhere.setVADSpeechActivityCallback { event in
    switch event {
    case .started:
        speechEndTimer?.cancel()
        handleSpeechStart()
    case .ended:
        speechEndTimer = Task {
            try? await Task.sleep(for: .milliseconds(500))
            handleSpeechEnd()
        }
    }
}
  • Audio session – Configure AVAudioSession appropriately to handle interruptions (see the sketch below).
  • Cleanup – Always call cleanupVAD() when done to free resources.
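
One way to handle the audio session, as a sketch: the category, mode, and options below are assumptions to adapt to your app, and only stopVAD() comes from the SDK.

import AVFoundation

// Configure the shared audio session for voice capture.
func configureAudioSession() throws {
    let session = AVAudioSession.sharedInstance()
    try session.setCategory(.playAndRecord, mode: .voiceChat, options: [.defaultToSpeaker, .allowBluetooth])
    try session.setActive(true)
}

// Stop VAD when the session is interrupted (phone call, Siri, etc.).
// Keep a reference to the observer and remove it when you tear down.
let interruptionObserver = NotificationCenter.default.addObserver(
    forName: AVAudioSession.interruptionNotification,
    object: AVAudioSession.sharedInstance(),
    queue: .main
) { notification in
    guard let typeValue = notification.userInfo?[AVAudioSessionInterruptionTypeKey] as? UInt,
          let type = AVAudioSession.InterruptionType(rawValue: typeValue),
          type == .began else { return }
    Task { try? await RunAnywhere.stopVAD() }
}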

Error Handling

do {
    try await RunAnywhere.initializeVAD()
    try await RunAnywhere.startVAD()
} catch let error as SDKError {
    switch error.code {
    case .initializationFailed:
        print("VAD initialization failed")
    case .microphonePermissionDenied:
        print("Microphone access required")
    default:
        print("Error: \(error)")
    }
}
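
VAD needs microphone input, so request record permission before initializing. A minimal sketch using standard AVFoundation (on iOS 17+ you can use AVAudioApplication.requestRecordPermission instead); the SDK calls are the ones shown above:

import AVFoundation

AVAudioSession.sharedInstance().requestRecordPermission { granted in
    guard granted else {
        print("Microphone access required")
        return
    }
    Task {
        do {
            try await RunAnywhere.initializeVAD()
            try await RunAnywhere.startVAD()
        } catch {
            print("Error: \(error)")
        }
    }
}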

Voice Agent

Build complete voice experiences with VAD + STT + LLM + TTS →