The synthesize() method converts text to natural-sounding speech using on-device neural voice models.

Basic Usage

import AVFoundation
import RunAnywhere

// Synthesize text to audio
let output = try await RunAnywhere.synthesize("Hello! Welcome to RunAnywhere.")

// Play the audio (AVAudioPlayer comes from AVFoundation)
let player = try AVAudioPlayer(data: output.audioData)
player.play()

Setup

Before synthesizing, register the ONNX module and load a TTS voice:
import RunAnywhere
import ONNXRuntime

// Register ONNX module (at app launch)
@MainActor
func setup() {
    ONNX.register()
}

// Load TTS voice
try await RunAnywhere.loadTTSVoice("piper-en-us-amy")
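
If your setup path can run more than once, guarding the load with the isTTSVoiceLoaded check (see Voice Management below) avoids reloading a voice that is already resident. A minimal sketch:

// Load the voice only if none is loaded yet
if !(await RunAnywhere.isTTSVoiceLoaded) {
    try await RunAnywhere.loadTTSVoice("piper-en-us-amy")
}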

Method Signatures

Basic Synthesis

public static func synthesize(
    _ text: String,
    options: TTSOptions = TTSOptions()
) async throws -> TTSOutput

Speak (with automatic playback)

public static func speak(
    _ text: String,
    options: TTSOptions = TTSOptions()
) async throws -> TTSSpeakResult

TTSOutput

public struct TTSOutput: Sendable {
    public let audioData: Data                    // Audio bytes
    public let format: AudioFormat                // WAV, PCM, etc.
    public let duration: TimeInterval             // Audio duration
    public let phonemeTimestamps: [PhonemeTimestamp]? // Phoneme timing
    public let metadata: TTSSynthesisMetadata?    // Processing info
}
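
A quick way to inspect what a synthesis call produced; note that phonemeTimestamps and metadata are optional and may be nil:

let output = try await RunAnywhere.synthesize("Inspect the output")

print("Bytes: \(output.audioData.count)")
print("Format: \(output.format)")
print("Duration: \(output.duration)s")
print("Phoneme timestamps: \(output.phonemeTimestamps?.count ?? 0)")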

TTSOptions

public struct TTSOptions: Sendable {
    public let rate: Float          // Speed (0.5-2.0, default: 1.0)
    public let pitch: Float         // Pitch (0.5-2.0, default: 1.0)
    public let volume: Float        // Volume (0.0-1.0, default: 1.0)
    public let language: String     // Language code
    public let sampleRate: Int      // Output sample rate
    public let audioFormat: AudioFormat  // Output format

    public init(
        rate: Float = 1.0,
        pitch: Float = 1.0,
        volume: Float = 1.0,
        language: String = "en-US",
        sampleRate: Int = 22050,
        audioFormat: AudioFormat = .wav
    )
}
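
The first three parameters shape delivery and are shown under Examples below; the remaining ones control the output encoding. A sketch requesting PCM at a higher sample rate (the .pcm case is an assumption, inferred from the AudioFormat comment on TTSOutput above):

// Request 44.1 kHz PCM output (.pcm is assumed, not confirmed by the API reference)
let options = TTSOptions(
    language: "en-US",
    sampleRate: 44100,
    audioFormat: .pcm
)

let output = try await RunAnywhere.synthesize(
    "High sample rate output",
    options: options
)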

Examples

Simple Text-to-Speech

let output = try await RunAnywhere.synthesize("Hello world!")
print("Generated \(output.duration) seconds of audio")

With Custom Options

let options = TTSOptions(
    rate: 0.9,      // Slightly slower
    pitch: 1.1,     // Slightly higher pitch
    volume: 0.8     // 80% volume
)

let output = try await RunAnywhere.synthesize(
    "This is a customized voice.",
    options: options
)

Speak (Automatic Playback)

// Synthesize and play automatically
try await RunAnywhere.speak("Hello! How can I help you today?")

// Check if still speaking
let isSpeaking = await RunAnywhere.isSpeaking

// Stop playback
await RunAnywhere.stopSpeaking()

Playing Audio Manually

import AVFoundation

class SpeechPlayer {
    private var audioPlayer: AVAudioPlayer?

    func speak(_ text: String) async throws {
        let output = try await RunAnywhere.synthesize(text)

        audioPlayer = try AVAudioPlayer(data: output.audioData)
        audioPlayer?.play()
    }

    func stop() {
        audioPlayer?.stop()
    }
}
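
The class keeps the AVAudioPlayer in a property because a player that is deallocated stops producing sound; a caller only needs to keep the SpeechPlayer itself alive:

let speechPlayer = SpeechPlayer()   // retain for the lifetime of playback
try await speechPlayer.speak("Playing through AVAudioPlayer")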

SwiftUI Voice Output

struct TextToSpeechView: View {
    @State private var text = ""
    @State private var isSpeaking = false
    @State private var rate: Float = 1.0

    var body: some View {
        VStack(spacing: 20) {
            // Text input
            TextEditor(text: $text)
                .frame(height: 150)
                .border(Color.gray.opacity(0.3))

            // Rate slider
            VStack(alignment: .leading) {
                Text("Speed: \(String(format: "%.1f", rate))x")
                Slider(value: $rate, in: 0.5...2.0)
            }

            // Control buttons
            HStack(spacing: 20) {
                Button(action: speak) {
                    Label("Speak", systemImage: "play.fill")
                        .padding()
                        .background(Color.blue)
                        .foregroundColor(.white)
                        .cornerRadius(10)
                }
                .disabled(text.isEmpty || isSpeaking)

                Button(action: stop) {
                    Label("Stop", systemImage: "stop.fill")
                        .padding()
                        .background(Color.red)
                        .foregroundColor(.white)
                        .cornerRadius(10)
                }
                .disabled(!isSpeaking)
            }
        }
        .padding()
    }

    func speak() {
        isSpeaking = true

        Task {
            do {
                try await RunAnywhere.speak(
                    text,
                    options: TTSOptions(rate: rate)
                )
                await MainActor.run {
                    isSpeaking = false
                }
            } catch {
                print("Speech failed: \(error)")
                await MainActor.run {
                    isSpeaking = false
                }
            }
        }
    }

    func stop() {
        Task {
            await RunAnywhere.stopSpeaking()
            await MainActor.run {
                isSpeaking = false
            }
        }
    }
}

Save to File

func saveToFile(text: String, filename: String) async throws -> URL {
    let output = try await RunAnywhere.synthesize(text)

    let documentsPath = FileManager.default.urls(
        for: .documentDirectory,
        in: .userDomainMask
    ).first!

    // TTSOptions defaults to WAV output, so a .wav extension matches
    let fileURL = documentsPath.appendingPathComponent("\(filename).wav")
    try output.audioData.write(to: fileURL)

    return fileURL
}

// Usage
let url = try await saveToFile(text: "Hello world", filename: "greeting")
print("Saved to: \(url.path)")

Voice Management

// Load a voice
try await RunAnywhere.loadTTSVoice("piper-en-us-amy")

// Check if voice is loaded
let isLoaded = await RunAnywhere.isTTSVoiceLoaded

// Get current voice
let voiceId = await RunAnywhere.currentTTSVoiceId

// List available voices
let voices = await RunAnywhere.availableTTSVoices
for voice in voices {
    print(voice)
}

// Unload voice
try await RunAnywhere.unloadTTSVoice()
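
These calls compose into a small helper that switches voices only when the requested one is not already active; a sketch (the voice ID passed at the end is illustrative):

// Sketch: load a different voice, skipping the work if it is already active
func switchVoice(to voiceId: String) async throws {
    let current = await RunAnywhere.currentTTSVoiceId
    guard current != voiceId else { return }

    if await RunAnywhere.isTTSVoiceLoaded {
        try await RunAnywhere.unloadTTSVoice()
    }
    try await RunAnywhere.loadTTSVoice(voiceId)
}

try await switchVoice(to: "piper-en-gb-alan")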

Synthesis Metadata

let output = try await RunAnywhere.synthesize(text)

if let metadata = output.metadata {
    print("Voice: \(metadata.voice)")
    print("Processing time: \(metadata.processingTime)s")
    print("Characters: \(metadata.characterCount)")

    let charsPerSecond = Double(metadata.characterCount) / metadata.processingTime
    print("Speed: \(String(format: "%.0f", charsPerSecond)) chars/sec")
}
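
The processing time can also be compared against the audio duration to get a real-time factor, a useful benchmark when comparing voices or devices:

if let metadata = output.metadata, output.duration > 0 {
    // Real-time factor < 1 means synthesis is faster than playback
    let rtf = metadata.processingTime / output.duration
    print(String(format: "Real-time factor: %.2f", rtf))
}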

Error Handling

do {
    let output = try await RunAnywhere.synthesize(text)
    playAudio(output.audioData)
} catch let error as SDKError {
    switch error.code {
    case .modelNotFound:
        print("Load a TTS voice first")
    case .processingFailed:
        print("Synthesis failed: \(error.message)")
    case .invalidInput:
        print("Invalid text input")
    default:
        print("Error: \(error.localizedDescription)")
    }
}
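
For the modelNotFound case, a common recovery is to load a voice and retry once; a sketch (the voice ID is illustrative):

func synthesizeWithRecovery(_ text: String) async throws -> TTSOutput {
    do {
        return try await RunAnywhere.synthesize(text)
    } catch let error as SDKError {
        // Retry once after loading a voice; rethrow anything else
        guard case .modelNotFound = error.code else { throw error }
        try await RunAnywhere.loadTTSVoice("piper-en-us-amy")
        return try await RunAnywhere.synthesize(text)
    }
}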

Best Practices

- For long text, use streaming synthesis or split the input into sentences to reduce latency.
- Load TTS voices at app startup to avoid delays when synthesizing.
- Configure AVAudioSession appropriately for your app’s audio needs (a combined sketch follows below).
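
A sketch combining the first and last points: configure the shared audio session for spoken playback, then speak long text one sentence at a time so the first audio arrives sooner (the naive sentence split is illustrative):

import AVFoundation
import RunAnywhere

// Configure the audio session for spoken playback, then speak long text
// sentence by sentence so audio starts sooner.
func speakLongText(_ text: String) async throws {
    let session = AVAudioSession.sharedInstance()
    try session.setCategory(.playback, mode: .spokenAudio)
    try session.setActive(true)

    // Naive sentence split; a real app may want a smarter tokenizer
    let sentences = text
        .components(separatedBy: CharacterSet(charactersIn: ".!?"))
        .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
        .filter { !$0.isEmpty }

    for sentence in sentences {
        try await RunAnywhere.speak(sentence)
    }
}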