Skip to main content

Overview

Voice Activity Detection (VAD) determines when speech is present in an audio stream. It’s essential for building voice interfaces that can automatically detect when users start and stop speaking.

Basic Usage

import { RunAnywhere } from '@runanywhere/core'

// Configure the detector: energy threshold and input sample rate
const vadOptions = { energyThreshold: 0.5, sampleRate: 16000 }
await RunAnywhere.initializeVAD(vadOptions)

// Run detection over one buffer of audio samples
const result = await RunAnywhere.processVAD(audioSamples)

// Report the confidence only when speech was actually found
const speechFound = result.isSpeech
if (speechFound) {
  console.log('Speech detected!', result.confidence)
}

Setup

import { RunAnywhere, ModelCategory, SDKEnvironment } from '@runanywhere/core'
import { ONNX, ModelArtifactType } from '@runanywhere/onnx'

// 1. Initialize SDK and register ONNX backend
await RunAnywhere.initialize({ environment: SDKEnvironment.Development })
ONNX.register()

// 2. Add Silero VAD model
await ONNX.addModel({
  id: 'silero-vad',
  name: 'Silero VAD',
  url: 'https://github.com/RunanywhereAI/sherpa-onnx/releases/.../silero-vad.tar.gz',
  modality: ModelCategory.Audio,
  artifactType: ModelArtifactType.TarGzArchive,
  memoryRequirement: 5_000_000,
})

// 3. Download and load
await RunAnywhere.downloadModel('silero-vad')
const modelInfo = await RunAnywhere.getModelInfo('silero-vad')

// Stop with a clear message instead of handing an undefined path to the loader
if (!modelInfo?.localPath) {
  throw new Error('silero-vad has no local path; the download may not have completed')
}
await RunAnywhere.loadVADModel(modelInfo.localPath)

API Reference

initializeVAD

Configure VAD settings.
await RunAnywhere.initializeVAD(config?: VADConfiguration): Promise<boolean>

Configuration

/**
 * Options accepted by `RunAnywhere.initializeVAD`.
 * Every field is optional.
 */
interface VADConfiguration {
  /**
   * Energy threshold for speech detection (0.0-1.0).
   * Lower values are more sensitive (quiet speech is picked up, at the cost
   * of more false positives); higher values are less sensitive.
   * 0.5 is the balanced default.
   */
  energyThreshold?: number

  /** Audio sample rate in Hz (default: 16000) */
  sampleRate?: number

  /** Frame length in milliseconds (default: 30) */
  frameLength?: number

  /**
   * Enable auto-calibration to adapt to ambient noise; the initial
   * threshold is adjusted based on background noise.
   */
  autoCalibration?: boolean
}

processVAD

Process audio samples for voice activity.
await RunAnywhere.processVAD(audioSamples: number[]): Promise<VADResult>

VADResult

/** Outcome of a `RunAnywhere.processVAD` call on a buffer of audio samples. */
interface VADResult {
  /** Whether speech is detected */
  isSpeech: boolean

  /** Confidence score (0.0-1.0) */
  confidence: number

  /**
   * Start time of speech segment (if detected).
   * NOTE(review): the time unit and reference point are not stated here — confirm.
   */
  startTime?: number

  /**
   * End time of speech segment (if completed).
   * NOTE(review): the time unit and reference point are not stated here — confirm.
   */
  endTime?: number
}

Continuous VAD

// Start continuous VAD processing
await RunAnywhere.startVAD(): Promise<void>

// Stop continuous VAD
await RunAnywhere.stopVAD(): Promise<void>

// Set callback for speech events
RunAnywhere.setVADSpeechActivityCallback(
  callback: (event: SpeechActivityEvent) => void
): void

SpeechActivityEvent

/**
 * Event delivered to the callback registered with
 * `RunAnywhere.setVADSpeechActivityCallback`.
 */
interface SpeechActivityEvent {
  /** Event type: speech began, speech is ongoing, or speech finished */
  type: 'speechStarted' | 'speechEnded' | 'speechContinuing'

  /**
   * Timestamp of the event.
   * NOTE(review): the unit (ms vs. s) and epoch are not stated here — confirm.
   */
  timestamp: number

  /** Confidence reported with the event */
  confidence: number

  /** Audio buffer (for speechEnded); holds the audio of the finished speech */
  audioBuffer?: number[]
}

Examples

Single Frame Processing

// Run detection on one audio frame, then report both fields of the result
const result = await RunAnywhere.processVAD(audioFrame)
const { isSpeech, confidence } = result

console.log('Is speech:', isSpeech)
console.log('Confidence:', confidence)

Continuous Listening

VoiceListener.tsx
import React, { useState, useEffect, useCallback } from 'react'
import { View, Text, StyleSheet } from 'react-native'
import { RunAnywhere, SpeechActivityEvent } from '@runanywhere/core'

/**
 * Live speaking/listening indicator driven by continuous VAD events.
 *
 * On mount it registers a speech-activity callback and starts VAD; on
 * unmount it stops VAD. Start/stop failures are logged rather than left
 * as unhandled promise rejections.
 */
export function VoiceListener() {
  const [isSpeaking, setIsSpeaking] = useState(false)
  const [confidence, setConfidence] = useState(0)

  useEffect(() => {
    // Register the callback before VAD is started
    RunAnywhere.setVADSpeechActivityCallback((event: SpeechActivityEvent) => {
      switch (event.type) {
        case 'speechStarted':
          setIsSpeaking(true)
          setConfidence(event.confidence)
          console.log('User started speaking')
          break
        case 'speechEnded':
          setIsSpeaking(false)
          console.log('User stopped speaking')
          // event.audioBuffer contains the speech audio
          break
        case 'speechContinuing':
          setConfidence(event.confidence)
          break
      }
    })

    // Effects cannot be async, so attach the rejection handler directly
    RunAnywhere.startVAD().catch((error: unknown) => {
      console.error('Failed to start VAD:', error)
    })

    return () => {
      // Cleanup is synchronous as well; log a failed stop instead of dropping it
      RunAnywhere.stopVAD().catch((error: unknown) => {
        console.error('Failed to stop VAD:', error)
      })
    }
  }, [])

  return (
    <View style={styles.container}>
      <View style={[styles.indicator, isSpeaking && styles.speaking]} />
      <Text style={styles.status}>{isSpeaking ? 'Speaking...' : 'Listening...'}</Text>
      <Text style={styles.confidence}>Confidence: {(confidence * 100).toFixed(0)}%</Text>
    </View>
  )
}

// Diameter of the round activity indicator; the radius is derived from it
const INDICATOR_DIAMETER = 80

// Styles for the VoiceListener layout: indicator dot, status line, confidence readout
const styles = StyleSheet.create({
  container: {
    alignItems: 'center',
    padding: 32,
  },
  // Idle (grey) indicator
  indicator: {
    width: INDICATOR_DIAMETER,
    height: INDICATOR_DIAMETER,
    borderRadius: INDICATOR_DIAMETER / 2,
    backgroundColor: '#ddd',
    marginBottom: 16,
  },
  // Overrides the indicator colour while speech is active
  speaking: { backgroundColor: '#4caf50' },
  status: {
    fontSize: 18,
    fontWeight: '600',
  },
  confidence: {
    fontSize: 14,
    color: '#666',
    marginTop: 8,
  },
})

VAD with Recording

import { RunAnywhere, SpeechActivityEvent } from '@runanywhere/core'

/**
 * Captures one utterance at a time using continuous VAD and transcribes it
 * once the speaker stops.
 */
class VoiceRecorder {
  /** Audio of the most recently completed utterance */
  private audioBuffer: number[] = []
  /** True between a speechStarted event and the matching speechEnded (or stop) */
  private isRecording = false

  /** Registers the VAD event handler, then starts continuous VAD. */
  async start() {
    RunAnywhere.setVADSpeechActivityCallback(this.handleVADEvent.bind(this))
    await RunAnywhere.startVAD()
  }

  /** Tracks utterance boundaries and kicks off transcription when one ends. */
  private handleVADEvent(event: SpeechActivityEvent) {
    switch (event.type) {
      case 'speechStarted':
        this.isRecording = true
        this.audioBuffer = []
        console.log('Started recording')
        break

      case 'speechContinuing':
        // Audio is being accumulated by VAD
        break

      case 'speechEnded':
        this.isRecording = false
        if (event.audioBuffer) {
          this.audioBuffer = event.audioBuffer
          // The callback returns void, so nothing awaits this promise;
          // handle rejection here to avoid an unhandled rejection.
          this.processRecording().catch((error: unknown) => {
            console.error('Transcription failed:', error)
          })
        }
        break
    }
  }

  /** Transcribes the stored utterance and logs the text. */
  private async processRecording() {
    // Transcribe the speech
    const result = await RunAnywhere.transcribeBuffer(this.audioBuffer, 16000, { language: 'en' })
    console.log('Transcription:', result.text)
  }

  /** Stops continuous VAD and clears the recording flag. */
  async stop() {
    await RunAnywhere.stopVAD()
    // Stopping mid-utterance would otherwise leave the flag set
    this.isRecording = false
  }
}

Push-to-Talk Alternative

PushToTalk.tsx
import React, { useState, useCallback } from 'react'
import { View, Pressable, Text, StyleSheet } from 'react-native'
import { RunAnywhere } from '@runanywhere/core'

/**
 * Hold-to-talk button: VAD runs while the button is held and stops on release.
 *
 * @param onTranscription Receives the transcribed text. It is not invoked by
 *   this component yet; call it once the captured audio has been transcribed.
 */
export function PushToTalk({ onTranscription }: { onTranscription: (text: string) => void }) {
  const [isPressed, setIsPressed] = useState(false)

  const handlePressIn = useCallback(async () => {
    setIsPressed(true)
    try {
      // Run VAD for the duration of the press
      await RunAnywhere.startVAD()
    } catch (error: unknown) {
      // Do not leave the button showing "Recording..." when nothing started
      setIsPressed(false)
      console.error('Failed to start VAD:', error)
    }
  }, [])

  const handlePressOut = useCallback(async () => {
    setIsPressed(false)
    try {
      await RunAnywhere.stopVAD()
    } catch (error: unknown) {
      console.error('Failed to stop VAD:', error)
      return
    }

    // Process the recorded audio
    // (Implementation depends on how audio is captured)
  }, [])

  return (
    <Pressable
      onPressIn={handlePressIn}
      onPressOut={handlePressOut}
      style={[styles.button, isPressed && styles.pressed]}
    >
      <Text style={styles.text}>{isPressed ? '🎤 Recording...' : '🎤 Hold to Talk'}</Text>
    </Pressable>
  )
}

// Styles for the PushToTalk button: resting state, pressed override, and label
const styles = StyleSheet.create({
  // Resting (blue) button
  button: {
    alignItems: 'center',
    backgroundColor: '#2196f3',
    borderRadius: 12,
    padding: 24,
  },
  // Applied on top of `button` while the press is held (red)
  pressed: { backgroundColor: '#f44336' },
  // Button label
  text: {
    color: '#fff',
    fontSize: 18,
    fontWeight: '600',
  },
})

Configuration Tips

Energy Threshold

The energyThreshold determines how sensitive VAD is to speech:
// More sensitive (picks up quiet speech, but more false positives)
await RunAnywhere.initializeVAD({ energyThreshold: 0.3 })

// Less sensitive (misses quiet speech, fewer false positives)
await RunAnywhere.initializeVAD({ energyThreshold: 0.7 })

// Balanced (default)
await RunAnywhere.initializeVAD({ energyThreshold: 0.5 })
| Threshold | Sensitivity | Use Case |
| --- | --- | --- |
| 0.2-0.4 | High | Quiet environments |
| 0.4-0.6 | Medium | Normal conversation |
| 0.6-0.8 | Low | Noisy environments |

Auto Calibration

Enable auto-calibration to adapt to ambient noise:
await RunAnywhere.initializeVAD({
  autoCalibration: true,
  // Initial threshold will be adjusted based on background noise
})

Error Handling

import { isSDKError, SDKErrorCode } from '@runanywhere/core'

try {
  await RunAnywhere.processVAD(samples)
} catch (error: unknown) {
  // Anything that is not an SDK error is unexpected here: let it propagate
  if (!isSDKError(error)) {
    throw error
  }

  switch (error.code) {
    case SDKErrorCode.notInitialized:
      console.error('Initialize VAD first')
      break
    case SDKErrorCode.vadFailed:
      console.error('VAD processing failed')
      break
    default:
      // Surface any other SDK error code instead of dropping it
      console.error('Unexpected SDK error during VAD processing:', error)
  }
}