Voice Activity Detection (VAD) identifies speech in audio streams, enabling hands-free voice interfaces and efficient audio processing by filtering out silence.

Overview

VAD answers the question: “Is someone speaking right now?” This enables:
  • Wake word detection – Start listening when speech begins
  • Audio trimming – Only process speech segments
  • Turn-taking – Know when to respond in conversations
  • Battery efficiency – Don’t process silence

Basic Usage

import AVFoundation

// Initialize VAD
try await RunAnywhere.initializeVAD()

// Detect speech in an audio buffer
let buffer: AVAudioPCMBuffer = // From microphone
let isSpeaking = try await RunAnywhere.detectSpeech(in: buffer)

if isSpeaking {
    print("Speech detected!")
}

Setup

import RunAnywhere
import ONNXRuntime

// Register ONNX module (at app launch)
@MainActor
func setup() {
    ONNX.register()
}

// Initialize VAD with default configuration
try await RunAnywhere.initializeVAD()

// Or with custom configuration
let config = VADConfiguration(
    sampleRate: 16000,
    frameLength: 0.032,     // 32ms frames
    energyThreshold: 0.5    // Sensitivity (0-1)
)
try await RunAnywhere.initializeVAD(config)

VADConfiguration

public struct VADConfiguration: Sendable {
    public let sampleRate: Int          // Audio sample rate (default: 16000)
    public let frameLength: Double      // Frame length in seconds (default: 0.032)
    public let energyThreshold: Double  // Detection threshold 0-1 (default: 0.5)

    public init(
        sampleRate: Int = 16000,
        frameLength: Double = 0.032,
        energyThreshold: Double = 0.5
    )
}

Threshold Tuning

Threshold   Behavior                                  Use Case
0.2-0.4     Very sensitive, more false positives      Quiet environments
0.4-0.6     Balanced (recommended)                    Normal rooms
0.6-0.8     Less sensitive, fewer false positives     Noisy environments
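
The threshold is simply the energyThreshold field of VADConfiguration, so it can be chosen per deployment environment. A minimal sketch (the Environment enum below is illustrative, not part of the SDK):

// Map the expected environment to an energy threshold from the table above.
enum Environment {
    case quiet, normal, noisy
}

func vadConfiguration(for environment: Environment) -> VADConfiguration {
    switch environment {
    case .quiet:  return VADConfiguration(energyThreshold: 0.3)
    case .normal: return VADConfiguration(energyThreshold: 0.5)
    case .noisy:  return VADConfiguration(energyThreshold: 0.7)
    }
}

try await RunAnywhere.initializeVAD(vadConfiguration(for: .noisy))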

Detection Methods

From AVAudioPCMBuffer

import AVFoundation

let buffer: AVAudioPCMBuffer = // From audio input
let isSpeaking = try await RunAnywhere.detectSpeech(in: buffer)

From Float Array

let samples: [Float] = // Raw audio samples
let isSpeaking = try await RunAnywhere.detectSpeech(in: samples)
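
If your audio arrives as an AVAudioPCMBuffer but you want the float-array overload, you can copy the samples out first. A minimal sketch, assuming mono, non-interleaved Float32 audio:

import AVFoundation

// Copy Float32 samples out of a mono, non-interleaved buffer.
func samples(from buffer: AVAudioPCMBuffer) -> [Float] {
    guard let channelData = buffer.floatChannelData else { return [] }
    return Array(UnsafeBufferPointer(start: channelData[0], count: Int(buffer.frameLength)))
}

let isSpeaking = try await RunAnywhere.detectSpeech(in: samples(from: buffer))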

Continuous VAD with Callbacks

For real-time applications, use the callback-based API:
// Set speech activity callback
await RunAnywhere.setVADSpeechActivityCallback { event in
    switch event {
    case .started:
        print("User started speaking")
        startRecording()
    case .ended:
        print("User stopped speaking")
        stopRecording()
    }
}

// Set audio buffer callback (optional)
await RunAnywhere.setVADAudioBufferCallback { samples in
    // Process audio samples
    visualizeSamples(samples)
}

// Start VAD processing
try await RunAnywhere.startVAD()

// ... later
try await RunAnywhere.stopVAD()

Complete Voice Recording Example

import AVFoundation
import Combine
import RunAnywhere

class VoiceRecorder: ObservableObject {
    @Published var isListening = false
    @Published var isSpeaking = false
    @Published var recordedAudio: Data?

    private var audioEngine: AVAudioEngine?
    private var audioBuffer = Data()

    func startListening() async throws {
        // Initialize VAD
        try await RunAnywhere.initializeVAD(
            VADConfiguration(energyThreshold: 0.5)
        )

        // Set up speech callbacks
        await RunAnywhere.setVADSpeechActivityCallback { [weak self] event in
            Task { @MainActor in
                switch event {
                case .started:
                    self?.isSpeaking = true
                    self?.audioBuffer = Data()
                case .ended:
                    self?.isSpeaking = false
                    self?.recordedAudio = self?.audioBuffer
                }
            }
        }

        // Set up audio engine
        audioEngine = AVAudioEngine()
        let inputNode = audioEngine!.inputNode

        // Note: the hardware input format may not be 16 kHz mono; if it
        // differs, convert with AVAudioConverter before feeding VAD.
        let format = AVAudioFormat(
            commonFormat: .pcmFormatFloat32,
            sampleRate: 16000,
            channels: 1,
            interleaved: false
        )!

        inputNode.installTap(onBus: 0, bufferSize: 4096, format: format) { [weak self] buffer, _ in
            self?.processBuffer(buffer)
        }

        try audioEngine!.start()
        try await RunAnywhere.startVAD()

        await MainActor.run { isListening = true }
    }

    private func processBuffer(_ buffer: AVAudioPCMBuffer) {
        Task {
            // Feed to VAD
            let isSpeech = try? await RunAnywhere.detectSpeech(in: buffer)

            // If speaking, accumulate audio
            if isSpeech == true, let channelData = buffer.floatChannelData?[0] {
                let data = Data(bytes: channelData, count: Int(buffer.frameLength) * MemoryLayout<Float>.size)
                await MainActor.run {
                    audioBuffer.append(data)
                }
            }
        }
    }

    func stopListening() async {
        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)
        try? await RunAnywhere.stopVAD()
        await RunAnywhere.cleanupVAD()

        await MainActor.run { isListening = false }
    }
}

SwiftUI Voice Activity UI

import SwiftUI

struct VoiceActivityView: View {
    @StateObject private var recorder = VoiceRecorder()

    var body: some View {
        VStack(spacing: 30) {
            // Visual indicator
            ZStack {
                Circle()
                    .fill(indicatorColor)
                    .frame(width: 120, height: 120)
                    .scaleEffect(recorder.isSpeaking ? 1.2 : 1.0)
                    .animation(.easeInOut(duration: 0.2), value: recorder.isSpeaking)

                Image(systemName: recorder.isSpeaking ? "waveform" : "mic.fill")
                    .font(.system(size: 40))
                    .foregroundColor(.white)
            }

            // Status text
            Text(statusText)
                .font(.headline)

            // Control button
            Button(action: toggleListening) {
                Text(recorder.isListening ? "Stop" : "Start Listening")
                    .frame(maxWidth: .infinity)
                    .padding()
                    .background(recorder.isListening ? Color.red : Color.blue)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            .padding(.horizontal)

            // Recorded audio indicator
            if recorder.recordedAudio != nil {
                Label("Audio captured!", systemImage: "checkmark.circle.fill")
                    .foregroundColor(.green)
            }
        }
        .padding()
    }

    var indicatorColor: Color {
        if !recorder.isListening {
            return .gray
        } else if recorder.isSpeaking {
            return .green
        } else {
            return .blue
        }
    }

    var statusText: String {
        if !recorder.isListening {
            return "Tap to start"
        } else if recorder.isSpeaking {
            return "Speaking..."
        } else {
            return "Listening..."
        }
    }

    func toggleListening() {
        Task {
            if recorder.isListening {
                await recorder.stopListening()
            } else {
                try? await recorder.startListening()
            }
        }
    }
}

VAD State Management

// Check if VAD is ready
let isReady = await RunAnywhere.isVADReady

// Start/stop processing
try await RunAnywhere.startVAD()
try await RunAnywhere.stopVAD()

// Cleanup resources
await RunAnywhere.cleanupVAD()
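
A small guard pattern built from these calls (a sketch; it only re-initializes when VAD is not already ready):

// Initialize on demand, then start processing
let isReady = await RunAnywhere.isVADReady
if !isReady {
    try await RunAnywhere.initializeVAD()
}
try await RunAnywhere.startVAD()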

Best Practices

  • Threshold – Start with 0.5 and adjust based on testing; noisy environments need higher thresholds.
  • Debouncing – Add a small delay before acting on speech-end events to handle brief pauses, as in the snippet below.
var speechEndTimer: Task<Void, Never>?

await RunAnywhere.setVADSpeechActivityCallback { event in
    switch event {
    case .started:
        speechEndTimer?.cancel()
        handleSpeechStart()
    case .ended:
        speechEndTimer = Task {
            try? await Task.sleep(for: .milliseconds(500))
            handleSpeechEnd()
        }
    }
}
  • Audio session – Configure AVAudioSession appropriately to handle interruptions (see the sketch below).
  • Cleanup – Always call cleanupVAD() when done to free resources.
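
One way to handle the audio session, as a sketch: the category, mode, and options below are assumptions to adapt to your app, and only stopVAD() comes from the SDK.

import AVFoundation

// Configure the shared audio session for voice capture.
func configureAudioSession() throws {
    let session = AVAudioSession.sharedInstance()
    try session.setCategory(.playAndRecord, mode: .voiceChat, options: [.defaultToSpeaker, .allowBluetooth])
    try session.setActive(true)
}

// Stop VAD when the session is interrupted (phone call, Siri, etc.).
// Keep a reference to the observer and remove it when you tear down.
let interruptionObserver = NotificationCenter.default.addObserver(
    forName: AVAudioSession.interruptionNotification,
    object: AVAudioSession.sharedInstance(),
    queue: .main
) { notification in
    guard let typeValue = notification.userInfo?[AVAudioSessionInterruptionTypeKey] as? UInt,
          let type = AVAudioSession.InterruptionType(rawValue: typeValue),
          type == .began else { return }
    Task { try? await RunAnywhere.stopVAD() }
}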

Error Handling

do {
    try await RunAnywhere.initializeVAD()
    try await RunAnywhere.startVAD()
} catch let error as SDKError {
    switch error.code {
    case .initializationFailed:
        print("VAD initialization failed")
    case .microphonePermissionDenied:
        print("Microphone access required")
    default:
        print("Error: \(error)")
    }
}
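
VAD needs microphone input, so request record permission before initializing. A minimal sketch using standard AVFoundation (on iOS 17+ you can use AVAudioApplication.requestRecordPermission instead); the SDK calls are the ones shown above:

import AVFoundation

AVAudioSession.sharedInstance().requestRecordPermission { granted in
    guard granted else {
        print("Microphone access required")
        return
    }
    Task {
        do {
            try await RunAnywhere.initializeVAD()
            try await RunAnywhere.startVAD()
        } catch {
            print("Error: \(error)")
        }
    }
}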

Voice Agent

Build complete voice experiences with VAD + STT + LLM + TTS →