Overview
The Voice Agent handles:
- VAD: Detects when the user starts/stops speaking
- STT: Transcribes speech to text
- LLM: Generates AI responses
- TTS: Synthesizes speech from responses
Quick Start
// Initialize the voice agent
try await RunAnywhere.initializeVoiceAgent(
    VoiceAgentConfiguration(
        sttModelId: "whisper-base-onnx",
        llmModelId: "llama-3.2-1b-instruct-q4",
        ttsVoice: "piper-en-us-amy"
    )
)

// Process a voice turn
let audioData: Data = // Recorded user speech
let result = try await RunAnywhere.processVoiceTurn(audioData)

print("User said: \(result.transcription ?? "")")
print("AI response: \(result.response ?? "")")

// Play the synthesized audio
if let audio = result.synthesizedAudio {
    playAudio(audio)
}
Configuration
VoiceAgentConfiguration
public struct VoiceAgentConfiguration: Sendable {
    public let sttModelId: String?        // STT model (e.g., "whisper-base-onnx")
    public let llmModelId: String?        // LLM model (e.g., "llama-3.2-1b-instruct-q4")
    public let ttsVoice: String?          // TTS voice (e.g., "piper-en-us-amy")
    public let vadSampleRate: Int         // VAD sample rate (default: 16000)
    public let vadFrameLength: Float      // VAD frame length (default: 0.032)
    public let vadEnergyThreshold: Float  // VAD sensitivity (default: 0.5)

    public init(
        sttModelId: String? = nil,
        llmModelId: String? = nil,
        ttsVoice: String? = nil,
        vadSampleRate: Int = 16000,
        vadFrameLength: Float = 0.032,
        vadEnergyThreshold: Float = 0.5
    )
}
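Every field is optional or has a default, so you only set the pieces you care about. For example, a configuration that raises the VAD threshold for a noisy environment might look like this (the threshold value is illustrative, not a recommendation):

// Illustrative: raise the VAD energy threshold so quiet background noise is ignored
let config = VoiceAgentConfiguration(
    sttModelId: "whisper-base-onnx",
    llmModelId: "llama-3.2-1b-instruct-q4",
    ttsVoice: "piper-en-us-amy",
    vadSampleRate: 16000,      // matches the 16 kHz recording used in the examples below
    vadEnergyThreshold: 0.7    // higher = less sensitive (default is 0.5)
)
try await RunAnywhere.initializeVoiceAgent(config)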
Initialize with Pre-loaded Models
If you’ve already loaded models, use:
// Load models separately
try await RunAnywhere.loadSTTModel("whisper-base-onnx")
try await RunAnywhere.loadModel("llama-3.2-1b-instruct-q4")
try await RunAnywhere.loadTTSVoice("piper-en-us-amy")
// Initialize voice agent with loaded models
try await RunAnywhere.initializeVoiceAgentWithLoadedModels()
VoiceAgentResult
public struct VoiceAgentResult: Sendable {
    public let speechDetected: Bool     // Was speech detected?
    public let transcription: String?   // What the user said
    public let response: String?        // AI response text
    public let synthesizedAudio: Data?  // Audio of response
}
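A typical caller checks speechDetected before using the optional fields. The sketch below assumes a hypothetical handleVoiceTurn helper; playAudio stands in for your own playback code, as in the Quick Start:

func handleVoiceTurn(_ audioData: Data) async throws {
    let result = try await RunAnywhere.processVoiceTurn(audioData)

    // Nothing to do if VAD found no speech in the recording
    guard result.speechDetected else { return }

    if let text = result.transcription {
        print("User said: \(text)")
    }
    if let audio = result.synthesizedAudio {
        playAudio(audio)  // your own playback helper, e.g. AVAudioPlayer
    }
}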
Component State
Check readiness of individual components:
// Check all components
let states = await RunAnywhere.getVoiceAgentComponentStates()
print("STT: \(states.stt)")
print("LLM: \(states.llm)")
print("TTS: \(states.tts)")
// Check if everything is ready
if await RunAnywhere.areAllVoiceComponentsReady {
    // Ready for voice interaction
}
// Check if voice agent is initialized
let isReady = await RunAnywhere.isVoiceAgentReady
ComponentLoadState
public enum ComponentLoadState: Sendable {
    case notLoaded
    case loading
    case loaded(modelId: String)
    case failed(Error)
}
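For example, you might map a component's state to a status message (a sketch assuming the states.stt value from getVoiceAgentComponentStates() above is a ComponentLoadState):

switch states.stt {
case .notLoaded:
    print("STT: not loaded")
case .loading:
    print("STT: loading...")
case .loaded(let modelId):
    print("STT: ready (\(modelId))")
case .failed(let error):
    print("STT: failed - \(error.localizedDescription)")
}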
Individual Operations
Use components separately when needed:
// Just transcribe
let transcription = try await RunAnywhere.voiceAgentTranscribe(audioData)
// Just generate response
let response = try await RunAnywhere.voiceAgentGenerateResponse(transcription)
// Just synthesize speech
let audio = try await RunAnywhere.voiceAgentSynthesizeSpeech(response)
Complete Voice Assistant
import SwiftUI
import AVFoundation

@Observable
class VoiceAssistant {
    var isListening = false
    var isProcessing = false
    var transcription = ""
    var response = ""
    var conversationHistory: [(role: String, text: String)] = []

    private var audioEngine: AVAudioEngine?
    private var audioRecorder: AVAudioRecorder?
    private var audioPlayer: AVAudioPlayer?

    func initialize() async throws {
        // Initialize voice agent
        try await RunAnywhere.initializeVoiceAgent(
            VoiceAgentConfiguration(
                sttModelId: "whisper-base-onnx",
                llmModelId: "llama-3.2-1b-instruct-q4",
                ttsVoice: "piper-en-us-amy",
                vadEnergyThreshold: 0.5
            )
        )
    }

    func startListening() async throws {
        let session = AVAudioSession.sharedInstance()
        try session.setCategory(.playAndRecord, mode: .default)
        try session.setActive(true)

        let url = FileManager.default.temporaryDirectory.appendingPathComponent("voice.wav")
        let settings: [String: Any] = [
            AVFormatIDKey: Int(kAudioFormatLinearPCM),
            AVSampleRateKey: 16000,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false
        ]

        audioRecorder = try AVAudioRecorder(url: url, settings: settings)
        audioRecorder?.record()

        await MainActor.run { isListening = true }
    }

    func stopAndProcess() async throws {
        guard let recorder = audioRecorder else { return }
        recorder.stop()

        await MainActor.run {
            isListening = false
            isProcessing = true
        }

        let audioData = try Data(contentsOf: recorder.url)

        // Process voice turn
        let result = try await RunAnywhere.processVoiceTurn(audioData)

        await MainActor.run {
            transcription = result.transcription ?? ""
            response = result.response ?? ""
            if !transcription.isEmpty {
                conversationHistory.append((role: "user", text: transcription))
            }
            if !response.isEmpty {
                conversationHistory.append((role: "assistant", text: response))
            }
            isProcessing = false
        }

        // Play response
        if let audio = result.synthesizedAudio {
            audioPlayer = try AVAudioPlayer(data: audio)
            audioPlayer?.play()
        }
    }

    func cleanup() async {
        await RunAnywhere.cleanupVoiceAgent()
    }
}
struct VoiceAssistantView: View {
    @State private var assistant = VoiceAssistant()
    @State private var isInitialized = false
    @State private var error: String?

    var body: some View {
        VStack(spacing: 20) {
            // Conversation history
            ScrollView {
                LazyVStack(alignment: .leading, spacing: 12) {
                    ForEach(Array(assistant.conversationHistory.enumerated()), id: \.offset) { _, message in
                        HStack {
                            if message.role == "user" { Spacer() }
                            VStack(alignment: message.role == "user" ? .trailing : .leading) {
                                Text(message.role == "user" ? "You" : "Assistant")
                                    .font(.caption)
                                    .foregroundColor(.secondary)
                                Text(message.text)
                                    .padding()
                                    .background(message.role == "user" ? Color.blue : Color.gray.opacity(0.2))
                                    .foregroundColor(message.role == "user" ? .white : .primary)
                                    .cornerRadius(16)
                            }
                            if message.role == "assistant" { Spacer() }
                        }
                    }
                }
                .padding()
            }

            // Voice button
            Button(action: toggleVoice) {
                ZStack {
                    Circle()
                        .fill(buttonColor)
                        .frame(width: 80, height: 80)
                    if assistant.isProcessing {
                        ProgressView()
                            .tint(.white)
                    } else {
                        Image(systemName: assistant.isListening ? "stop.fill" : "mic.fill")
                            .font(.title)
                            .foregroundColor(.white)
                    }
                }
            }
            .disabled(!isInitialized || assistant.isProcessing)

            Text(statusText)
                .font(.caption)
                .foregroundColor(.secondary)
        }
        .task {
            do {
                try await assistant.initialize()
                isInitialized = true
            } catch {
                self.error = error.localizedDescription
            }
        }
        .alert("Error", isPresented: .constant(error != nil)) {
            Button("OK") { error = nil }
        } message: {
            Text(error ?? "")
        }
    }

    var buttonColor: Color {
        if !isInitialized { return .gray }
        if assistant.isProcessing { return .orange }
        if assistant.isListening { return .red }
        return .blue
    }

    var statusText: String {
        if !isInitialized { return "Initializing..." }
        if assistant.isProcessing { return "Processing..." }
        if assistant.isListening { return "Listening... tap to stop" }
        return "Tap to speak"
    }

    func toggleVoice() {
        Task {
            if assistant.isListening {
                try? await assistant.stopAndProcess()
            } else {
                try? await assistant.startListening()
            }
        }
    }
}
Conversation Context
Maintain conversation history for context-aware responses:
class ConversationalVoiceAgent {
    private var conversationContext: [Message] = []

    struct Message {
        let role: String
        let content: String
    }

    func processWithContext(_ audioData: Data) async throws -> VoiceAgentResult {
        // Transcribe
        let transcription = try await RunAnywhere.voiceAgentTranscribe(audioData)

        // Add to context
        conversationContext.append(Message(role: "user", content: transcription))

        // Build context prompt
        let contextPrompt = conversationContext
            .suffix(10) // Last 10 messages
            .map { "\($0.role): \($0.content)" }
            .joined(separator: "\n")

        // Generate response with context
        let response = try await RunAnywhere.voiceAgentGenerateResponse(contextPrompt)

        // Add response to context
        conversationContext.append(Message(role: "assistant", content: response))

        // Synthesize
        let audio = try await RunAnywhere.voiceAgentSynthesizeSpeech(response)

        return VoiceAgentResult(
            speechDetected: true,
            transcription: transcription,
            response: response,
            synthesizedAudio: audio
        )
    }

    func clearContext() {
        conversationContext.removeAll()
    }
}
Error Handling
do {
    try await RunAnywhere.initializeVoiceAgent(config)
    let result = try await RunAnywhere.processVoiceTurn(audioData)
} catch let error as SDKError {
    switch error.code {
    case .notInitialized:
        print("Initialize voice agent first")
    case .modelNotFound:
        print("Required model not loaded")
    case .processingFailed:
        print("Processing failed: \(error.message)")
    default:
        print("Error: \(error)")
    }
}
Cleanup
Always clean up when done:
await RunAnywhere.cleanupVoiceAgent()
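For example, a SwiftUI screen could release the agent when it disappears (a sketch; VoiceScreen is a hypothetical wrapper around the view from the complete example):

struct VoiceScreen: View {
    var body: some View {
        VoiceAssistantView()
            .onDisappear {
                // Free models and audio resources when leaving the voice UI
                Task { await RunAnywhere.cleanupVoiceAgent() }
            }
    }
}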
Best Practices
Pre-load models
Load all models during app initialization to avoid delays during conversation.
Handle interruptions
Properly handle audio session interruptions (phone calls, etc.); see the sketch after this list.
Visual feedback
Show clear visual indicators for listening, processing, and speaking states.
Error recovery
Implement graceful error recovery to maintain conversation flow.
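A minimal sketch of interruption handling using standard AVFoundation notifications; wire the print statements to your own state handling (e.g. stopping the recorder in VoiceAssistant):

import AVFoundation

// Keep the token so you can remove the observer later
let interruptionObserver = NotificationCenter.default.addObserver(
    forName: AVAudioSession.interruptionNotification,
    object: AVAudioSession.sharedInstance(),
    queue: .main
) { notification in
    guard let info = notification.userInfo,
          let rawType = info[AVAudioSessionInterruptionTypeKey] as? UInt,
          let type = AVAudioSession.InterruptionType(rawValue: rawType) else { return }

    switch type {
    case .began:
        // The system stopped recording/playback (phone call, Siri, alarm)
        print("Audio session interrupted - stop listening and update UI")
    case .ended:
        // Resume only if the system indicates it is appropriate
        if let rawOptions = info[AVAudioSessionInterruptionOptionKey] as? UInt,
           AVAudioSession.InterruptionOptions(rawValue: rawOptions).contains(.shouldResume) {
            print("Interruption ended - safe to resume listening")
        }
    @unknown default:
        break
    }
}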