Overview
The Voice Agent orchestrates the complete voice interaction pipeline: VAD → STT → LLM → TTS. This enables full conversational AI with a single API.
Prerequisites
All three models (LLM + STT + TTS) must be loaded before starting a voice session:
Copy
Ask AI
// Load each pipeline stage; all three must be loaded before a voice session can start.
RunAnywhere.loadLLMModel("smollm2-360m-instruct-q8_0")
RunAnywhere.loadSTTModel("sherpa-onnx-whisper-tiny.en")
RunAnywhere.loadTTSVoice("vits-piper-en_US-lessac-medium")
// Verify readiness of the full VAD → STT → LLM → TTS pipeline
val ready = RunAnywhere.isVoiceAgentReady()
VoiceSessionConfig
Copy
Ask AI
import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionConfig
// Tunes speech detection and turn-taking behavior for the session.
val config = VoiceSessionConfig(
silenceDuration = 1.5, // Seconds of silence before the captured speech is processed
speechThreshold = 0.1f, // Audio level threshold for speech detection
autoPlayTTS = false, // If true, SDK handles TTS playback; if false, the app plays TurnCompleted.audio itself
continuousMode = true // Auto-resume listening after each turn
)
Stream Voice Session
The voice session takes a Flow&lt;ByteArray&gt; of audio data and returns a Flow&lt;VoiceSessionEvent&gt;:
Copy
Ask AI
import com.runanywhere.sdk.public.extensions.VoiceAgent.VoiceSessionEvent
import com.runanywhere.sdk.public.extensions.streamVoiceSession
// Collect pipeline events for one session; the when is exhaustive over VoiceSessionEvent.
lifecycleScope.launch {
RunAnywhere.streamVoiceSession(audioFlow, config)
.collect { event ->
when (event) {
is VoiceSessionEvent.Started -> {
updateUI("Session started")
}
// Emitted continuously while waiting for speech; carries the current mic level.
is VoiceSessionEvent.Listening -> {
updateAudioLevel(event.audioLevel)
}
is VoiceSessionEvent.SpeechStarted -> {
updateUI("Speech detected...")
}
// Silence threshold reached; captured audio is going through STT → LLM.
is VoiceSessionEvent.Processing -> {
updateUI("Processing...")
}
is VoiceSessionEvent.Transcribed -> {
updateUI("You: ${event.text}")
}
is VoiceSessionEvent.Responded -> {
updateUI("AI: ${event.text}")
}
is VoiceSessionEvent.Speaking -> {
updateUI("Speaking...")
}
// audio is nullable — presumably null when autoPlayTTS = true; confirm against SDK docs.
is VoiceSessionEvent.TurnCompleted -> {
event.audio?.let { playAudio(it) }
}
is VoiceSessionEvent.Stopped -> {
updateUI("Session ended")
}
is VoiceSessionEvent.Error -> {
showError(event.message)
}
}
}
}
VoiceSessionEvent
Copy
Ask AI
/**
 * Events emitted by a voice session, in roughly the order of a single turn:
 * Started → Listening → SpeechStarted → Processing → Transcribed → Responded
 * → Speaking → TurnCompleted, with Stopped/Error as terminal states.
 */
sealed class VoiceSessionEvent {
    object Started : VoiceSessionEvent()
    /** Waiting for speech; [audioLevel] is the current input level. */
    data class Listening(val audioLevel: Float) : VoiceSessionEvent()
    object SpeechStarted : VoiceSessionEvent()
    object Processing : VoiceSessionEvent()
    /** STT result for the user's utterance. */
    data class Transcribed(val text: String) : VoiceSessionEvent()
    /** LLM response text for this turn. */
    data class Responded(val text: String) : VoiceSessionEvent()
    object Speaking : VoiceSessionEvent()
    /** End of a turn; [audio] holds synthesized TTS audio, or null if none. */
    data class TurnCompleted(val audio: ByteArray?) : VoiceSessionEvent() {
        // ByteArray properties break data-class structural equality (arrays compare
        // by reference), so compare and hash by contents explicitly.
        override fun equals(other: Any?): Boolean =
            other is TurnCompleted && audio contentEquals other.audio
        override fun hashCode(): Int = audio?.contentHashCode() ?: 0
    }
    object Stopped : VoiceSessionEvent()
    /** Pipeline failure; [message] is a human-readable description. */
    data class Error(val message: String) : VoiceSessionEvent()
}
Providing Audio Input
The voice session expects a Flow&lt;ByteArray&gt; of 16kHz mono PCM 16-bit audio in ~100ms chunks. Use Kotlin’s callbackFlow with Android’s AudioRecord:
Copy
Ask AI
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.channels.awaitClose
import kotlinx.coroutines.flow.callbackFlow
import kotlinx.coroutines.flow.flowOn
// Streams 16kHz mono PCM 16-bit microphone audio as ~100ms chunks.
val audioFlow = callbackFlow<ByteArray> {
    val bufferSize = AudioRecord.getMinBufferSize(16000, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT)
    val recorder = AudioRecord(MediaRecorder.AudioSource.MIC, 16000, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT, bufferSize)
    recorder.startRecording()
    // 100ms at 16kHz, 16-bit mono: 16000 samples/s * 2 bytes * 0.1s = 3200 bytes.
    // (1600 bytes would be only 50ms per chunk.)
    val buffer = ByteArray(3200)
    // Loop exits when the collector cancels (isActive flips false), after which
    // awaitClose releases the recorder.
    while (isActive) {
        val read = recorder.read(buffer, 0, buffer.size)
        if (read > 0) trySend(buffer.copyOf(read))
    }
    awaitClose { recorder.stop(); recorder.release() }
}.flowOn(Dispatchers.IO) // AudioRecord.read blocks — keep it off the main thread
Example: Complete Voice Assistant
Copy
Ask AI
/**
 * Complete voice-assistant screen: configures the agent in onCreate, then
 * drives the UI from the VoiceSessionEvent stream.
 */
class VoiceAssistantActivity : AppCompatActivity() {
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_voice_assistant)
        lifecycleScope.launch {
            // Configure voice agent
            RunAnywhere.configureVoiceAgent(VoiceAgentConfiguration(
                sttModelId = "whisper-tiny",
                llmModelId = "qwen-0.5b",
                ttsVoiceId = "en-us-default",
                systemPrompt = """
                You are a helpful voice assistant.
                Keep responses under 2 sentences.
                Be friendly and conversational.
                """.trimIndent()
            ))
            // Load any models that are not yet in memory.
            if (!RunAnywhere.isVoiceAgentReady()) {
                showMessage("Loading models...")
                RunAnywhere.initializeVoiceAgentWithLoadedModels()
            }
        }
        startButton.setOnClickListener {
            startVoiceSession()
        }
        stopButton.setOnClickListener {
            lifecycleScope.launch {
                RunAnywhere.stopVoiceSession()
            }
        }
    }

    /** Starts a session and maps each event to a UI state. */
    private fun startVoiceSession() {
        lifecycleScope.launch {
            RunAnywhere.startVoiceSession()
                .collect { event ->
                    // lifecycleScope collects on the main dispatcher; runOnUiThread
                    // is kept as a defensive guard.
                    runOnUiThread {
                        when (event) {
                            is VoiceSessionEvent.Listening -> {
                                statusText.text = "🎤 Listening..."
                                micButton.setColorFilter(Color.RED)
                            }
                            is VoiceSessionEvent.Transcribed -> {
                                transcriptionText.text = event.text
                            }
                            // Processing is the event emitted while the LLM is
                            // generating (there is no "Thinking" event).
                            is VoiceSessionEvent.Processing -> {
                                statusText.text = "🤔 Thinking..."
                            }
                            is VoiceSessionEvent.Responded -> {
                                responseText.text = event.text
                            }
                            is VoiceSessionEvent.Speaking -> {
                                statusText.text = "🔊 Speaking..."
                            }
                            // A completed turn returns the UI to its idle state
                            // (there is no "Idle" event).
                            is VoiceSessionEvent.TurnCompleted -> {
                                statusText.text = "Ready"
                                micButton.clearColorFilter()
                            }
                            is VoiceSessionEvent.Error -> {
                                Toast.makeText(this@VoiceAssistantActivity,
                                    event.message, Toast.LENGTH_SHORT).show()
                            }
                            // Started, SpeechStarted, Stopped: no UI change needed.
                            else -> Unit
                        }
                    }
                }
        }
    }
}
Component States
Check individual component readiness:
Copy
Ask AI
// Query per-component readiness for the voice pipeline.
val states = RunAnywhere.voiceAgentComponentStates()
with(states) {
    println("STT ready: $sttReady")
    println("LLM ready: $llmReady")
    println("TTS ready: $ttsReady")
    println("VAD ready: $vadReady")
}
Performance Optimization
Minimize total latency: Use smaller models (whisper-tiny, 0.5B LLM), enable streaming for both
LLM and TTS, and preload all models at app startup.
Typical Latencies (Pixel 7)
| Stage | Latency |
|---|---|
| STT | 200-400ms for 5s audio |
| LLM | 150-300ms for 50 tokens |
| TTS | 100-200ms for short responses |
| Total round-trip | 500-900ms |