Skip to main content

Overview

The Voice Agent orchestrates the complete voice interaction pipeline: VAD → STT → LLM → TTS. This enables full conversational AI with a single API.

Configure Voice Agent

// Configure the full voice pipeline (VAD → STT → LLM → TTS) once, before any voice turn.
RunAnywhere.configureVoiceAgent(VoiceAgentConfiguration(
    sttModelId = "whisper-tiny",        // speech-to-text model id
    llmModelId = "qwen-0.5b",           // language model id
    ttsVoiceId = "en-us-default",       // text-to-speech voice id
    systemPrompt = "You are a helpful assistant. Keep responses brief.",
    interruptionEnabled = true          // allow the user to interrupt while the agent speaks
))

VoiceAgentConfiguration

/**
 * Configuration for the voice agent pipeline (VAD → STT → LLM → TTS).
 *
 * @property sttModelId Identifier of the speech-to-text model.
 * @property llmModelId Identifier of the language model.
 * @property ttsVoiceId Identifier of the text-to-speech voice.
 * @property systemPrompt Optional system prompt passed to the LLM; null for none.
 * @property vadConfiguration Optional voice-activity-detection settings.
 * @property interruptionEnabled Whether the user may interrupt the agent mid-response.
 */
data class VoiceAgentConfiguration(
    val sttModelId: String,              // Speech-to-text model
    val llmModelId: String,              // Language model
    val ttsVoiceId: String,              // Text-to-speech voice
    val systemPrompt: String? = null,    // LLM system prompt
    val vadConfiguration: VADConfiguration? = null,
    val interruptionEnabled: Boolean = true  // Allow user interruption
)

Process Voice Turn

Process a complete voice turn (listen → transcribe → respond → speak):
// Process one complete voice turn: transcribe the user's audio, generate an
// LLM response, and synthesize the reply.
//
// NOTE(review): recordUserAudio() is a placeholder for your app's audio
// capture. The original snippet (`val audioData = // ...`) did not parse —
// an assignment cannot end in a comment.
val audioData: ByteArray = recordUserAudio() // recorded audio from the user

val result = RunAnywhere.processVoice(audioData)

println("User said: ${result.transcription}")
println("AI response: ${result.response}")
println("Total latency: ${result.totalLatencyMs}ms")

// Play the response audio, if any was synthesized (audioData is nullable).
result.audioData?.let { audio ->
    audioPlayer.play(audio)
}

VoiceAgentResult

| Property | Type | Description |
| --- | --- | --- |
| transcription | String | User's transcribed speech |
| response | String | AI's text response |
| audioData | ByteArray? | Synthesized response audio |
| totalLatencyMs | Double | Total processing time |
| sttLatencyMs | Double | Speech-to-text time |
| llmLatencyMs | Double | LLM generation time |
| ttsLatencyMs | Double | Text-to-speech time |

Voice Session (Event-Driven)

Start a continuous voice session with event streaming:
lifecycleScope.launch {
    RunAnywhere.startVoiceSession()
        .collect { event ->
            // Exhaustive over the sealed hierarchy — no `else` branch needed,
            // so the compiler flags any event type added later.
            when (event) {
                VoiceSessionEvent.Listening -> {
                    updateUI("Listening...")
                    showMicrophoneActive()
                }
                VoiceSessionEvent.Thinking -> {
                    updateUI("Thinking...")
                    showLoadingIndicator()
                }
                VoiceSessionEvent.Speaking -> updateUI("Speaking...")
                VoiceSessionEvent.Idle -> updateUI("Ready")
                is VoiceSessionEvent.Transcribed -> updateUI("You: ${event.text}")
                is VoiceSessionEvent.Responded -> updateUI("AI: ${event.text}")
                is VoiceSessionEvent.Error -> showError(event.message)
            }
        }
}

VoiceSessionEvent

/**
 * Events emitted by a continuous voice session, in roughly the order of a
 * single turn: Listening → Transcribed → Thinking → Responded → Speaking → Idle.
 * [Error] may be emitted at any point.
 */
sealed class VoiceSessionEvent {
    // Microphone is open and the agent is waiting for speech.
    object Listening : VoiceSessionEvent()
    // User speech was transcribed; [text] is the STT output.
    data class Transcribed(val text: String) : VoiceSessionEvent()
    // LLM is generating a response.
    object Thinking : VoiceSessionEvent()
    // LLM finished; [text] is the AI's text response.
    data class Responded(val text: String) : VoiceSessionEvent()
    // TTS playback of the response is in progress.
    object Speaking : VoiceSessionEvent()
    // Session is ready for the next turn.
    object Idle : VoiceSessionEvent()
    // Something failed; [message] is a human-readable description.
    data class Error(val message: String) : VoiceSessionEvent()
}

Session Control

// Stop the voice session
RunAnywhere.stopVoiceSession()

// Check if session is active
val isActive = RunAnywhere.isVoiceSessionActive()

// Clear conversation history (presumably the accumulated LLM turn context — verify)
RunAnywhere.clearVoiceConversation()

// Update system prompt mid-session; subsequent turns use the new prompt
RunAnywhere.setVoiceSystemPrompt("You are now a coding assistant.")

Example: Complete Voice Assistant

/**
 * Complete example: an Activity wiring the RunAnywhere voice agent to simple
 * status / transcription / response views, with start and stop buttons.
 */
class VoiceAssistantActivity : AppCompatActivity() {

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_voice_assistant)

        lifecycleScope.launch {
            // Configure the whole pipeline up front; models are referenced by id.
            RunAnywhere.configureVoiceAgent(VoiceAgentConfiguration(
                sttModelId = "whisper-tiny",
                llmModelId = "qwen-0.5b",
                ttsVoiceId = "en-us-default",
                systemPrompt = """
                    You are a helpful voice assistant.
                    Keep responses under 2 sentences.
                    Be friendly and conversational.
                """.trimIndent()
            ))

            // Load models now if they are not already resident, so the first
            // voice turn doesn't pay the load cost.
            if (!RunAnywhere.isVoiceAgentReady()) {
                showMessage("Loading models...")
                RunAnywhere.initializeVoiceAgentWithLoadedModels()
            }
        }

        startButton.setOnClickListener {
            startVoiceSession()
        }

        stopButton.setOnClickListener {
            lifecycleScope.launch {
                RunAnywhere.stopVoiceSession()
            }
        }
    }

    /** Starts collecting session events and mirrors each state into the UI. */
    private fun startVoiceSession() {
        lifecycleScope.launch {
            // Guard against double-starts: tapping Start while a session is
            // already running would otherwise launch a second, competing collector.
            if (RunAnywhere.isVoiceSessionActive()) return@launch

            RunAnywhere.startVoiceSession()
                .collect { event ->
                    // Kept defensively; if collection already happens on the
                    // main thread this is a cheap pass-through.
                    runOnUiThread {
                        when (event) {
                            is VoiceSessionEvent.Listening -> {
                                statusText.text = "🎤 Listening..."
                                micButton.setColorFilter(Color.RED)
                            }
                            is VoiceSessionEvent.Transcribed -> {
                                transcriptionText.text = event.text
                            }
                            is VoiceSessionEvent.Thinking -> {
                                statusText.text = "🤔 Thinking..."
                            }
                            is VoiceSessionEvent.Responded -> {
                                responseText.text = event.text
                            }
                            is VoiceSessionEvent.Speaking -> {
                                statusText.text = "🔊 Speaking..."
                            }
                            is VoiceSessionEvent.Idle -> {
                                statusText.text = "Ready"
                                micButton.clearColorFilter()
                            }
                            is VoiceSessionEvent.Error -> {
                                Toast.makeText(this@VoiceAssistantActivity,
                                    event.message, Toast.LENGTH_SHORT).show()
                            }
                        }
                    }
                }
        }
    }
}

Component States

Check individual component readiness:
// Query the readiness of each pipeline component individually — useful for
// showing fine-grained loading progress instead of a single "loading" state.
val states = RunAnywhere.voiceAgentComponentStates()
println("STT ready: ${states.sttReady}")
println("LLM ready: ${states.llmReady}")
println("TTS ready: ${states.ttsReady}")
println("VAD ready: ${states.vadReady}")

Performance Optimization

Minimize total latency: Use smaller models (whisper-tiny, 0.5B LLM), enable streaming for both LLM and TTS, and preload all models at app startup.

Typical Latencies (Pixel 7)

| Stage | Latency |
| --- | --- |
| STT | 200-400ms for 5s of audio |
| LLM | 150-300ms for 50 tokens |
| TTS | 100-200ms for short responses |
| Total round-trip | 500-900ms |