The Voice Agent orchestrates the complete voice interaction pipeline: VAD → STT → LLM → TTS. It provides a unified API for building voice assistants, hands-free interfaces, and conversational AI.

Overview

The Voice Agent handles:
  • VAD (voice activity detection): Detects when the user starts and stops speaking
  • STT (speech-to-text): Transcribes the user's speech to text
  • LLM (large language model): Generates the AI response
  • TTS (text-to-speech): Synthesizes spoken audio from the response

Quick Start

// Initialize the voice agent
try await RunAnywhere.initializeVoiceAgent(
    VoiceAgentConfiguration(
        sttModelId: "whisper-base-onnx",
        llmModelId: "llama-3.2-1b-instruct-q4",
        ttsVoice: "piper-en-us-amy"
    )
)

// Process a voice turn
let audioData: Data = ... // Recorded user speech
let result = try await RunAnywhere.processVoiceTurn(audioData)

print("User said: \(result.transcription ?? "")")
print("AI response: \(result.response ?? "")")

// Play the synthesized audio
if let audio = result.synthesizedAudio {
    playAudio(audio)
}

Configuration

VoiceAgentConfiguration

public struct VoiceAgentConfiguration: Sendable {
    public let sttModelId: String?       // STT model (e.g., "whisper-base-onnx")
    public let llmModelId: String?       // LLM model (e.g., "llama-3.2-1b-instruct-q4")
    public let ttsVoice: String?         // TTS voice (e.g., "piper-en-us-amy")
    public let vadSampleRate: Int        // VAD sample rate in Hz (default: 16000)
    public let vadFrameLength: Float     // VAD frame length in seconds (default: 0.032)
    public let vadEnergyThreshold: Float // VAD sensitivity (default: 0.5)

    public init(
        sttModelId: String? = nil,
        llmModelId: String? = nil,
        ttsVoice: String? = nil,
        vadSampleRate: Int = 16000,
        vadFrameLength: Float = 0.032,
        vadEnergyThreshold: Float = 0.5
    )
}
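
For example, a push-to-talk UI in a noisy room might raise the VAD energy threshold so background noise is less likely to be treated as speech (the 0.7 value below is illustrative, not a recommendation):
// Stricter VAD for noisy environments
try await RunAnywhere.initializeVoiceAgent(
    VoiceAgentConfiguration(
        sttModelId: "whisper-base-onnx",
        llmModelId: "llama-3.2-1b-instruct-q4",
        ttsVoice: "piper-en-us-amy",
        vadEnergyThreshold: 0.7  // higher = less sensitive than the 0.5 default
    )
)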

Initialize with Pre-loaded Models

If you’ve already loaded models, use:
// Load models separately
try await RunAnywhere.loadSTTModel("whisper-base-onnx")
try await RunAnywhere.loadModel("llama-3.2-1b-instruct-q4")
try await RunAnywhere.loadTTSVoice("piper-en-us-amy")

// Initialize voice agent with loaded models
try await RunAnywhere.initializeVoiceAgentWithLoadedModels()

VoiceAgentResult

public struct VoiceAgentResult: Sendable {
    public let speechDetected: Bool       // Was speech detected?
    public let transcription: String?     // What the user said
    public let response: String?          // AI response text
    public let synthesizedAudio: Data?    // Audio of response
}
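
Every field other than speechDetected is optional, so a turn with no detected speech can be handled up front (a minimal sketch):
let result = try await RunAnywhere.processVoiceTurn(audioData)

if result.speechDetected {
    print("User said: \(result.transcription ?? "")")
    print("AI response: \(result.response ?? "")")
} else {
    print("No speech detected in this recording")
}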

Component State

Check readiness of individual components:
// Check all components
let states = await RunAnywhere.getVoiceAgentComponentStates()
print("STT: \(states.stt)")
print("LLM: \(states.llm)")
print("TTS: \(states.tts)")

// Check if everything is ready
if await RunAnywhere.areAllVoiceComponentsReady {
    // Ready for voice interaction
}

// Check if voice agent is initialized
let isReady = await RunAnywhere.isVoiceAgentReady

ComponentLoadState

public enum ComponentLoadState: Sendable {
    case notLoaded
    case loading
    case loaded(modelId: String)
    case failed(Error)
}
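
For example, each state can be mapped to a status label for the UI (a sketch; statusLabel(for:) is a helper introduced here, not part of the SDK):
func statusLabel(for state: ComponentLoadState) -> String {
    switch state {
    case .notLoaded:
        return "Not loaded"
    case .loading:
        return "Loading..."
    case .loaded(let modelId):
        return "Ready (\(modelId))"
    case .failed(let error):
        return "Failed: \(error.localizedDescription)"
    }
}

let states = await RunAnywhere.getVoiceAgentComponentStates()
print("STT: \(statusLabel(for: states.stt))")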

Individual Operations

Use components separately when needed:
// Just transcribe
let transcription = try await RunAnywhere.voiceAgentTranscribe(audioData)

// Just generate response
let response = try await RunAnywhere.voiceAgentGenerateResponse(transcription)

// Just synthesize speech
let audio = try await RunAnywhere.voiceAgentSynthesizeSpeech(response)

Complete Voice Assistant

import SwiftUI
import AVFoundation

@Observable
class VoiceAssistant {
    var isListening = false
    var isProcessing = false
    var transcription = ""
    var response = ""
    var conversationHistory: [(role: String, text: String)] = []

    private var audioRecorder: AVAudioRecorder?
    private var audioPlayer: AVAudioPlayer?

    func initialize() async throws {
        // Initialize voice agent
        try await RunAnywhere.initializeVoiceAgent(
            VoiceAgentConfiguration(
                sttModelId: "whisper-base-onnx",
                llmModelId: "llama-3.2-1b-instruct-q4",
                ttsVoice: "piper-en-us-amy",
                vadEnergyThreshold: 0.5
            )
        )
    }

    func startListening() async throws {
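        // Requires NSMicrophoneUsageDescription in Info.plist and granted microphone permission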
        let session = AVAudioSession.sharedInstance()
        try session.setCategory(.playAndRecord, mode: .default)
        try session.setActive(true)

        let url = FileManager.default.temporaryDirectory.appendingPathComponent("voice.wav")

        let settings: [String: Any] = [
            AVFormatIDKey: Int(kAudioFormatLinearPCM),
            AVSampleRateKey: 16000,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false
        ]

        audioRecorder = try AVAudioRecorder(url: url, settings: settings)
        audioRecorder?.record()

        await MainActor.run { isListening = true }
    }

    func stopAndProcess() async throws {
        guard let recorder = audioRecorder else { return }

        recorder.stop()
        await MainActor.run {
            isListening = false
            isProcessing = true
        }

        let audioData = try Data(contentsOf: recorder.url)

        // Process voice turn
        let result = try await RunAnywhere.processVoiceTurn(audioData)

        await MainActor.run {
            transcription = result.transcription ?? ""
            response = result.response ?? ""

            if !transcription.isEmpty {
                conversationHistory.append((role: "user", text: transcription))
            }
            if !response.isEmpty {
                conversationHistory.append((role: "assistant", text: response))
            }

            isProcessing = false
        }

        // Play response
        if let audio = result.synthesizedAudio {
            audioPlayer = try AVAudioPlayer(data: audio)
            audioPlayer?.play()
        }
    }

    func cleanup() async {
        await RunAnywhere.cleanupVoiceAgent()
    }
}

struct VoiceAssistantView: View {
    @State private var assistant = VoiceAssistant()
    @State private var isInitialized = false
    @State private var error: String?

    var body: some View {
        VStack(spacing: 20) {
            // Conversation history
            ScrollView {
                LazyVStack(alignment: .leading, spacing: 12) {
                    ForEach(Array(assistant.conversationHistory.enumerated()), id: \.offset) { _, message in
                        HStack {
                            if message.role == "user" { Spacer() }

                            VStack(alignment: message.role == "user" ? .trailing : .leading) {
                                Text(message.role == "user" ? "You" : "Assistant")
                                    .font(.caption)
                                    .foregroundColor(.secondary)

                                Text(message.text)
                                    .padding()
                                    .background(message.role == "user" ? Color.blue : Color.gray.opacity(0.2))
                                    .foregroundColor(message.role == "user" ? .white : .primary)
                                    .cornerRadius(16)
                            }

                            if message.role == "assistant" { Spacer() }
                        }
                    }
                }
                .padding()
            }

            // Voice button
            Button(action: toggleVoice) {
                ZStack {
                    Circle()
                        .fill(buttonColor)
                        .frame(width: 80, height: 80)

                    if assistant.isProcessing {
                        ProgressView()
                            .tint(.white)
                    } else {
                        Image(systemName: assistant.isListening ? "stop.fill" : "mic.fill")
                            .font(.title)
                            .foregroundColor(.white)
                    }
                }
            }
            .disabled(!isInitialized || assistant.isProcessing)

            Text(statusText)
                .font(.caption)
                .foregroundColor(.secondary)
        }
        .task {
            do {
                try await assistant.initialize()
                isInitialized = true
            } catch {
                self.error = error.localizedDescription
            }
        }
        .alert("Error", isPresented: .constant(error != nil)) {
            Button("OK") { error = nil }
        } message: {
            Text(error ?? "")
        }
    }

    var buttonColor: Color {
        if !isInitialized { return .gray }
        if assistant.isProcessing { return .orange }
        if assistant.isListening { return .red }
        return .blue
    }

    var statusText: String {
        if !isInitialized { return "Initializing..." }
        if assistant.isProcessing { return "Processing..." }
        if assistant.isListening { return "Listening... tap to stop" }
        return "Tap to speak"
    }

    func toggleVoice() {
        Task {
            if assistant.isListening {
                try? await assistant.stopAndProcess()
            } else {
                try? await assistant.startListening()
            }
        }
    }
}

Conversation Context

Maintain conversation history for context-aware responses:
class ConversationalVoiceAgent {
    private var conversationContext: [Message] = []

    struct Message {
        let role: String
        let content: String
    }

    func processWithContext(_ audioData: Data) async throws -> VoiceAgentResult {
        // Transcribe
        let transcription = try await RunAnywhere.voiceAgentTranscribe(audioData)

        // Add to context
        conversationContext.append(Message(role: "user", content: transcription))

        // Build context prompt
        let contextPrompt = conversationContext
            .suffix(10)  // Last 10 messages
            .map { "\($0.role): \($0.content)" }
            .joined(separator: "\n")

        // Generate response with context
        let response = try await RunAnywhere.voiceAgentGenerateResponse(contextPrompt)

        // Add response to context
        conversationContext.append(Message(role: "assistant", content: response))

        // Synthesize
        let audio = try await RunAnywhere.voiceAgentSynthesizeSpeech(response)

        return VoiceAgentResult(
            speechDetected: true,
            transcription: transcription,
            response: response,
            synthesizedAudio: audio
        )
    }

    func clearContext() {
        conversationContext.removeAll()
    }
}

Error Handling

do {
    try await RunAnywhere.initializeVoiceAgent(config)
    let result = try await RunAnywhere.processVoiceTurn(audioData)
} catch let error as SDKError {
    switch error.code {
    case .notInitialized:
        print("Initialize voice agent first")
    case .modelNotFound:
        print("Required model not loaded")
    case .processingFailed:
        print("Processing failed: \(error.message)")
    default:
        print("Error: \(error)")
    }
}
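
One simple recovery strategy is to re-initialize the agent when it reports .notInitialized and retry the turn once (a sketch; retryVoiceTurn(_:config:) is a helper introduced here, not an SDK API):
func retryVoiceTurn(_ audioData: Data, config: VoiceAgentConfiguration) async throws -> VoiceAgentResult {
    do {
        return try await RunAnywhere.processVoiceTurn(audioData)
    } catch let error as SDKError where error.code == .notInitialized {
        // Re-initialize once, then retry the same turn
        try await RunAnywhere.initializeVoiceAgent(config)
        return try await RunAnywhere.processVoiceTurn(audioData)
    }
}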

Cleanup

Always clean up when done:
await RunAnywhere.cleanupVoiceAgent()
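
In the SwiftUI example above, this can be attached to the view's lifecycle (a sketch; assistant is the VoiceAssistant from VoiceAssistantView):
.onDisappear {
    Task { await assistant.cleanup() }
}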

Best Practices

  • Load all models during app initialization to avoid delays during the first conversation turn.
  • Handle audio session interruptions (phone calls, Siri, alarms) so an interrupted recording doesn't leave the UI stuck in a listening state; a sketch follows below.
  • Show clear visual indicators for the listening, processing, and speaking states.
  • Implement graceful error recovery so a single failed turn doesn't end the conversation.
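
A minimal sketch of interruption handling, assuming the VoiceAssistant instance (assistant) from the complete example above:
import AVFoundation

// Observe audio session interruptions (phone calls, Siri, alarms);
// keep the token so the observer can be removed during cleanup
let interruptionObserver = NotificationCenter.default.addObserver(
    forName: AVAudioSession.interruptionNotification,
    object: AVAudioSession.sharedInstance(),
    queue: .main
) { notification in
    guard let info = notification.userInfo,
          let typeValue = info[AVAudioSessionInterruptionTypeKey] as? UInt,
          let type = AVAudioSession.InterruptionType(rawValue: typeValue) else { return }

    switch type {
    case .began:
        // The system paused recording/playback; reflect that in the UI
        assistant.isListening = false
    case .ended:
        // Reactivate the session so the user can start a new turn
        try? AVAudioSession.sharedInstance().setActive(true)
    @unknown default:
        break
    }
}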