Overview
The Voice Agent provides a complete voice conversation pipeline that orchestrates VAD → STT → LLM → TTS in a seamless flow. Use it to build voice assistants, hands-free interfaces, and conversational AI applications.
Basic Usage
import { RunAnywhere } from '@runanywhere/core'

// Initialize voice agent
await RunAnywhere.initializeVoiceAgent({
  llmModelId: 'smollm2-360m',
  sttModelId: 'whisper-tiny-en',
  ttsModelId: 'piper-en-lessac',
  systemPrompt: 'You are a helpful voice assistant. Keep responses brief.',
})

// Process a complete voice turn
const result = await RunAnywhere.processVoiceTurn(audioData)
console.log('User said:', result.userTranscript)
console.log('AI response:', result.assistantResponse)
// result.audio contains the spoken response
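The synthesized audio arrives as base64-encoded float32 PCM, so it can be decoded into a Float32Array for inspection or playback. A minimal sketch; the 22_050 Hz sample rate is an assumption for typical Piper voices, so check your TTS model's actual rate:

// Decode base64 float32 PCM into samples. Modern Hermes ships atob;
// older React Native runtimes may need a base64 polyfill.
function decodePcm(base64Audio: string): Float32Array {
  const binary = atob(base64Audio)
  const bytes = new Uint8Array(binary.length)
  for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i)
  return new Float32Array(bytes.buffer)
}

const samples = decodePcm(result.audio)
console.log('Response duration:', (samples.length / 22_050).toFixed(2), 's') // assumed sample rate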
Setup
1. Download Required Models
import { RunAnywhere, SDKEnvironment, ModelCategory } from '@runanywhere/core'
import { LlamaCPP } from '@runanywhere/llamacpp'
import { ONNX, ModelArtifactType } from '@runanywhere/onnx'

// Initialize SDK
await RunAnywhere.initialize({ environment: SDKEnvironment.Development })

// Register backends
LlamaCPP.register()
ONNX.register()

// Add models
await LlamaCPP.addModel({
  id: 'smollm2-360m',
  name: 'SmolLM2 360M',
  url: 'https://huggingface.co/.../SmolLM2-360M.Q8_0.gguf',
  memoryRequirement: 500_000_000,
})
await ONNX.addModel({
  id: 'whisper-tiny-en',
  name: 'Whisper Tiny English',
  url: 'https://github.com/.../sherpa-onnx-whisper-tiny.en.tar.gz',
  modality: ModelCategory.SpeechRecognition,
  artifactType: ModelArtifactType.TarGzArchive,
  memoryRequirement: 75_000_000,
})
await ONNX.addModel({
  id: 'piper-en-lessac',
  name: 'Piper English',
  url: 'https://github.com/.../vits-piper-en_US-lessac-medium.tar.gz',
  modality: ModelCategory.SpeechSynthesis,
  artifactType: ModelArtifactType.TarGzArchive,
  memoryRequirement: 65_000_000,
})

// Download all models
await Promise.all([
  RunAnywhere.downloadModel('smollm2-360m'),
  RunAnywhere.downloadModel('whisper-tiny-en'),
  RunAnywhere.downloadModel('piper-en-lessac'),
])
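Promise.all rejects as soon as any one download fails. To surface every failure individually, Promise.allSettled works with the same downloadModel calls:

// Report each model download individually instead of failing fast.
const modelIds = ['smollm2-360m', 'whisper-tiny-en', 'piper-en-lessac']
const outcomes = await Promise.allSettled(
  modelIds.map((id) => RunAnywhere.downloadModel(id))
)
outcomes.forEach((outcome, i) => {
  if (outcome.status === 'rejected') {
    console.error(`Download failed for ${modelIds[i]}:`, outcome.reason)
  }
})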
2. Initialize Voice Agent
await RunAnywhere.initializeVoiceAgent({
  llmModelId: 'smollm2-360m',
  sttModelId: 'whisper-tiny-en',
  ttsModelId: 'piper-en-lessac',
  systemPrompt: 'You are a helpful voice assistant. Be concise and friendly.',
  generationOptions: {
    maxTokens: 150,
    temperature: 0.7,
  },
})
API Reference
initializeVoiceAgent
Initialize the voice agent pipeline.
await RunAnywhere.initializeVoiceAgent(
  config: VoiceAgentConfig
): Promise<boolean>
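Since the call resolves to a boolean, it is worth checking the result before processing turns. A minimal guard:

const ready = await RunAnywhere.initializeVoiceAgent({
  llmModelId: 'smollm2-360m',
  sttModelId: 'whisper-tiny-en',
  ttsModelId: 'piper-en-lessac',
})
if (!ready) {
  throw new Error('Voice agent failed to initialize. Are all three models downloaded?')
}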
VoiceAgentConfig
interface VoiceAgentConfig {
  /** LLM model ID (must be downloaded) */
  llmModelId: string
  /** STT model ID (must be downloaded) */
  sttModelId: string
  /** TTS model ID (must be downloaded) */
  ttsModelId: string
  /** System prompt for LLM */
  systemPrompt?: string
  /** Generation options */
  generationOptions?: GenerationOptions
}
processVoiceTurn
Process a complete voice turn (STT → LLM → TTS).
await RunAnywhere.processVoiceTurn(
  audioData: string // base64-encoded audio of the user's speech
): Promise<VoiceTurnResult>
VoiceTurnResult
interface VoiceTurnResult {
  /** Transcribed user speech */
  userTranscript: string
  /** LLM-generated response */
  assistantResponse: string
  /** Synthesized audio (base64 float32 PCM) */
  audio: string
  /** Performance metrics */
  metrics: VoiceAgentMetrics
}

interface VoiceAgentMetrics {
  /** STT latency in ms */
  sttLatencyMs: number
  /** LLM latency in ms */
  llmLatencyMs: number
  /** TTS latency in ms */
  ttsLatencyMs: number
  /** Total turn latency in ms */
  totalLatencyMs: number
  /** Tokens generated */
  tokensGenerated: number
}
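These fields compose into useful derived numbers such as decode throughput and pipeline overhead. A small helper using only the documented fields:

// Derive throughput and overhead from a turn's metrics.
function summarizeMetrics(m: VoiceAgentMetrics): string {
  // Tokens per second during the LLM stage.
  const tokensPerSecond = m.tokensGenerated / (m.llmLatencyMs / 1000)
  // Time not accounted for by the three stages (audio I/O, orchestration).
  const overheadMs =
    m.totalLatencyMs - (m.sttLatencyMs + m.llmLatencyMs + m.ttsLatencyMs)
  return `${tokensPerSecond.toFixed(1)} tok/s, ${overheadMs}ms overhead`
}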
startVoiceSession
Start an interactive voice session with automatic VAD.
await RunAnywhere.startVoiceSession(
  config: VoiceSessionConfig,
  callback: (event: VoiceSessionEvent) => void
): Promise<VoiceSessionHandle>
VoiceSessionConfig
interface VoiceSessionConfig {
  /** Voice agent config */
  agentConfig: VoiceAgentConfig
  /** Enable VAD for automatic speech detection */
  enableVAD?: boolean
  /** VAD sensitivity (0.0-1.0) */
  vadSensitivity?: number
  /** Timeout for user speech (ms) */
  speechTimeout?: number
}
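For example, a hands-free setup that raises the VAD sensitivity and allows five seconds of silence before timing out (the values are illustrative starting points, not recommendations):

const handle = await RunAnywhere.startVoiceSession(
  {
    agentConfig: {
      llmModelId: 'smollm2-360m',
      sttModelId: 'whisper-tiny-en',
      ttsModelId: 'piper-en-lessac',
    },
    enableVAD: true,
    vadSensitivity: 0.7, // More eager to treat quiet input as speech
    speechTimeout: 5000, // Give up after 5s of silence
  },
  (event) => console.log(event.type)
)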
VoiceSessionEvent
interface VoiceSessionEvent {
  type:
    | 'sessionStarted'
    | 'listeningStarted'
    | 'speechDetected'
    | 'speechEnded'
    | 'transcribing'
    | 'transcriptionComplete'
    | 'generating'
    | 'generationComplete'
    | 'synthesizing'
    | 'synthesisComplete'
    | 'speaking'
    | 'turnComplete'
    | 'error'
  /** Event data */
  data?: {
    transcript?: string
    response?: string
    audio?: string
    error?: string
  }
}
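The events bracket each pipeline stage, so timestamping them is a quick way to see where a turn spends its time. A small logging callback; sessionConfig stands in for a VoiceSessionConfig like the one shown above:

// Log each session event with the time elapsed since the previous one.
let lastEventAt = Date.now()
const logEvents = (event: VoiceSessionEvent) => {
  const now = Date.now()
  console.log(`[+${now - lastEventAt}ms] ${event.type}`)
  lastEventAt = now
}
await RunAnywhere.startVoiceSession(sessionConfig, logEvents)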
Examples
Complete Voice Assistant
VoiceAssistant.tsx
import React, { useState, useCallback, useRef } from 'react'
import { View, Text, Button, StyleSheet } from 'react-native'
import { RunAnywhere, VoiceSessionEvent, VoiceSessionHandle } from '@runanywhere/core'

export function VoiceAssistant() {
  const [isActive, setIsActive] = useState(false)
  const [status, setStatus] = useState('Ready')
  const [transcript, setTranscript] = useState('')
  const [response, setResponse] = useState('')
  const sessionRef = useRef<VoiceSessionHandle | null>(null)

  const handleEvent = useCallback((event: VoiceSessionEvent) => {
    switch (event.type) {
      case 'sessionStarted':
        setStatus('Listening...')
        break
      case 'speechDetected':
        setStatus('Hearing you...')
        break
      case 'transcribing':
        setStatus('Processing...')
        break
      case 'transcriptionComplete':
        setTranscript(event.data?.transcript || '')
        setStatus('Thinking...')
        break
      case 'generationComplete':
        setResponse(event.data?.response || '')
        setStatus('Speaking...')
        break
      case 'speaking':
        // Play audio: event.data?.audio
        break
      case 'turnComplete':
        setStatus('Listening...')
        break
      case 'error':
        setStatus('Error: ' + event.data?.error)
        break
    }
  }, [])

  const startSession = useCallback(async () => {
    setIsActive(true)
    setTranscript('')
    setResponse('')
    sessionRef.current = await RunAnywhere.startVoiceSession(
      {
        agentConfig: {
          llmModelId: 'smollm2-360m',
          sttModelId: 'whisper-tiny-en',
          ttsModelId: 'piper-en-lessac',
          systemPrompt: 'You are a helpful voice assistant.',
        },
        enableVAD: true,
        vadSensitivity: 0.5,
      },
      handleEvent
    )
  }, [handleEvent])

  const stopSession = useCallback(async () => {
    if (sessionRef.current) {
      await sessionRef.current.stop()
      sessionRef.current = null
    }
    setIsActive(false)
    setStatus('Ready')
  }, [])

  return (
    <View style={styles.container}>
      <View style={[styles.statusBadge, isActive && styles.active]}>
        <Text style={styles.statusText}>{status}</Text>
      </View>
      {transcript !== '' && (
        <View style={styles.bubble}>
          <Text style={styles.bubbleLabel}>You</Text>
          <Text style={styles.bubbleText}>{transcript}</Text>
        </View>
      )}
      {response !== '' && (
        <View style={[styles.bubble, styles.assistantBubble]}>
          <Text style={styles.bubbleLabel}>Assistant</Text>
          <Text style={styles.bubbleText}>{response}</Text>
        </View>
      )}
      <Button
        title={isActive ? 'Stop' : 'Start Voice Assistant'}
        onPress={isActive ? stopSession : startSession}
      />
    </View>
  )
}

const styles = StyleSheet.create({
  container: { flex: 1, padding: 16 },
  statusBadge: {
    alignSelf: 'center',
    paddingHorizontal: 16,
    paddingVertical: 8,
    borderRadius: 20,
    backgroundColor: '#e0e0e0',
    marginBottom: 24,
  },
  active: { backgroundColor: '#4caf50' },
  statusText: { fontWeight: '600', color: '#333' },
  bubble: {
    padding: 12,
    borderRadius: 12,
    backgroundColor: '#f5f5f5',
    marginBottom: 12,
  },
  assistantBubble: { backgroundColor: '#e3f2fd' },
  bubbleLabel: { fontSize: 12, color: '#666', marginBottom: 4 },
  bubbleText: { fontSize: 16 },
})
Single Turn Processing
// For push-to-talk or single-turn interactions
async function processSingleTurn(audioBase64: string) {
  const result = await RunAnywhere.processVoiceTurn(audioBase64)

  console.log('=== Voice Turn Results ===')
  console.log('User:', result.userTranscript)
  console.log('Assistant:', result.assistantResponse)
  console.log('')
  console.log('=== Metrics ===')
  console.log('STT:', result.metrics.sttLatencyMs, 'ms')
  console.log('LLM:', result.metrics.llmLatencyMs, 'ms')
  console.log('TTS:', result.metrics.ttsLatencyMs, 'ms')
  console.log('Total:', result.metrics.totalLatencyMs, 'ms')

  // Play the response audio
  playAudio(result.audio)
  return result
}
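The playAudio call above is left to the host app, since audio output in React Native goes through a native module. One approach, sketched below, wraps the base64 float32 PCM in a WAV header so any file-based player library can handle it; the 22_050 Hz default and the player integration are assumptions to adapt to your setup:

// Sketch: wrap base64 float32 PCM in a WAV container so a file-based
// audio player can handle it. Writing the file and playing it depend on
// your chosen libraries and are left as a placeholder below.
function pcmToWav(base64Pcm: string, sampleRate = 22_050): Uint8Array {
  const binary = atob(base64Pcm)
  const pcm = new Uint8Array(binary.length)
  for (let i = 0; i < binary.length; i++) pcm[i] = binary.charCodeAt(i)

  const header = new DataView(new ArrayBuffer(44))
  const tag = (off: number, s: string) =>
    [...s].forEach((c, i) => header.setUint8(off + i, c.charCodeAt(0)))
  tag(0, 'RIFF'); header.setUint32(4, 36 + pcm.length, true); tag(8, 'WAVE')
  tag(12, 'fmt '); header.setUint32(16, 16, true)
  header.setUint16(20, 3, true) // audio format 3 = IEEE float
  header.setUint16(22, 1, true) // mono
  header.setUint32(24, sampleRate, true)
  header.setUint32(28, sampleRate * 4, true) // byte rate: 4 bytes per sample
  header.setUint16(32, 4, true) // block align
  header.setUint16(34, 32, true) // bits per sample
  tag(36, 'data'); header.setUint32(40, pcm.length, true)

  const wav = new Uint8Array(44 + pcm.length)
  wav.set(new Uint8Array(header.buffer), 0)
  wav.set(pcm, 44)
  return wav
}

async function playAudio(base64Pcm: string, sampleRate?: number): Promise<void> {
  const wav = pcmToWav(base64Pcm, sampleRate)
  // Write `wav` to a temp file and hand it to your audio player library here.
}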
Custom Voice Pipeline
Build a custom pipeline for more control:
class CustomVoiceAgent {
  private conversationHistory: { role: string; content: string }[] = []

  async processTurn(audioPath: string): Promise<void> {
    // 1. Transcribe user speech
    const sttResult = await RunAnywhere.transcribeFile(audioPath, {
      language: 'en',
    })
    console.log('User:', sttResult.text)

    // 2. Add to conversation history
    this.conversationHistory.push({
      role: 'user',
      content: sttResult.text,
    })

    // 3. Generate LLM response with history
    const prompt = this.buildPrompt()
    const llmResult = await RunAnywhere.generate(prompt, {
      maxTokens: 150,
      temperature: 0.7,
      systemPrompt: 'You are a helpful assistant.',
    })
    console.log('Assistant:', llmResult.text)

    // 4. Add response to history
    this.conversationHistory.push({
      role: 'assistant',
      content: llmResult.text,
    })

    // 5. Synthesize and play response
    const ttsResult = await RunAnywhere.synthesize(llmResult.text)
    await playAudio(ttsResult.audio, ttsResult.sampleRate)
  }

  private buildPrompt(): string {
    return (
      this.conversationHistory
        .map((m) => `${m.role === 'user' ? 'User' : 'Assistant'}: ${m.content}`)
        .join('\n') + '\nAssistant:'
    )
  }

  clearHistory() {
    this.conversationHistory = []
  }
}
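Using the custom agent across turns (the file paths are illustrative):

const agent = new CustomVoiceAgent()
await agent.processTurn('/path/to/turn1.wav')
await agent.processTurn('/path/to/turn2.wav') // History carries across turns
agent.clearHistory() // Start a fresh conversation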
Performance Optimization
Latency Breakdown
| Stage | Typical Latency | Optimization |
|---|---|---|
| VAD | < 50ms | Use appropriate frame size |
| STT | 100-500ms | Use smaller models |
| LLM | 200-2000ms | Use smaller models, limit tokens |
| TTS | 100-300ms | Use streaming |
| Total | 400-3000ms | See tips below |
Tips for Lower Latency
Use smaller models for faster response times. A 360M LLM with Whisper Tiny can achieve sub-second
voice turns.
// Optimized for speed
await RunAnywhere.initializeVoiceAgent({
  llmModelId: 'smollm2-360m', // Small, fast model
  sttModelId: 'whisper-tiny-en', // Fastest STT
  ttsModelId: 'piper-en-lessac', // Neural TTS
  generationOptions: {
    maxTokens: 100, // Shorter responses
    temperature: 0.5, // More focused, typically shorter output
  },
})
Error Handling
// isSDKError and SDKErrorCode are assumed to be exported by @runanywhere/core
import { isSDKError, SDKErrorCode } from '@runanywhere/core'

try {
  const session = await RunAnywhere.startVoiceSession(config, (event) => {
    if (event.type === 'error') {
      handleError(event.data?.error)
    }
  })
} catch (error) {
  if (isSDKError(error)) {
    switch (error.code) {
      case SDKErrorCode.modelNotFound:
        console.error('Download required models first')
        break
      case SDKErrorCode.voiceAgentFailed:
        console.error('Voice agent failed:', error.message)
        break
    }
  }
}