Streaming TTS generates audio in chunks, enabling playback to begin before the entire text is synthesized. This is ideal for long content where you want immediate audio feedback.

Basic Usage

try await RunAnywhere.synthesizeStream(
    "This is a long piece of text that will be synthesized in chunks...",
    options: TTSOptions(rate: 1.0),
    onAudioChunk: { chunk in
        // Play or buffer each audio chunk
        audioPlayer.enqueue(chunk)
    }
)

Method Signature

public static func synthesizeStream(
    _ text: String,
    options: TTSOptions = TTSOptions(),
    onAudioChunk: @escaping (Data) -> Void
) async throws -> TTSOutput

Parameters

| Parameter | Type | Description |
| --- | --- | --- |
| text | String | Text to synthesize |
| options | TTSOptions | Synthesis options |
| onAudioChunk | (Data) -> Void | Callback invoked with each audio chunk |

Returns

TTSOutput with the complete synthesized audio after streaming completes.
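
The return value lets you keep the full audio after streaming, for example to cache it for replay. A brief sketch, using the audioData and duration properties shown later on this page:

let output = try await RunAnywhere.synthesizeStream(
    text,
    options: TTSOptions(rate: 1.0),
    onAudioChunk: { audioPlayer.enqueue($0) }
)

// Persist the complete audio so it can be replayed without re-synthesizing
let cacheURL = FileManager.default.temporaryDirectory
    .appendingPathComponent("tts-cache.pcm")
try output.audioData.write(to: cacheURL)
print("Synthesized \(output.duration)s of audio")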

Streaming Audio Player

Build a player that handles streaming audio chunks:

import AVFoundation

class StreamingAudioPlayer {
    private var audioEngine: AVAudioEngine?
    private var playerNode: AVAudioPlayerNode?
    private var audioFormat: AVAudioFormat?

    func prepare(sampleRate: Double = 22050) throws {
        audioEngine = AVAudioEngine()
        playerNode = AVAudioPlayerNode()

        guard let engine = audioEngine, let player = playerNode else { return }

        audioFormat = AVAudioFormat(
            commonFormat: .pcmFormatFloat32,
            sampleRate: sampleRate,
            channels: 1,
            interleaved: false
        )

        engine.attach(player)
        engine.connect(player, to: engine.mainMixerNode, format: audioFormat)

        try engine.start()
        player.play()
    }

    func enqueue(_ audioData: Data) {
        guard let format = audioFormat,
              let player = playerNode else { return }

        // Interpret the chunk as mono 32-bit float PCM samples
        let frameCount = UInt32(audioData.count) / UInt32(MemoryLayout<Float>.size)
        guard frameCount > 0,
              let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else { return }

        buffer.frameLength = frameCount
        audioData.withUnsafeBytes { ptr in
            if let address = ptr.baseAddress?.assumingMemoryBound(to: Float.self) {
                buffer.floatChannelData?[0].update(from: address, count: Int(frameCount))
            }
        }

        // Schedule the buffer; AVAudioPlayerNode plays queued buffers back to back
        player.scheduleBuffer(buffer)
    }

    func stop() {
        playerNode?.stop()
        audioEngine?.stop()
    }
}
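
A minimal sketch wiring the player into the streaming call (assuming 22050 Hz mono float output, matching prepare's default):

let player = StreamingAudioPlayer()
try player.prepare(sampleRate: 22050)

try await RunAnywhere.synthesizeStream(
    "Streaming playback starts as soon as the first chunk arrives.",
    options: TTSOptions(rate: 1.0),
    onAudioChunk: { chunk in
        player.enqueue(chunk)
    }
)

// Note: synthesizeStream returns when synthesis finishes, but scheduled
// buffers may still be playing; delay stop() until playback has drained.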

SwiftUI Integration

struct StreamingTTSView: View {
    @State private var text = "Hello! This is a demonstration of streaming text-to-speech synthesis. The audio will begin playing before the entire text is processed, providing a more responsive experience for longer content."
    @State private var isSynthesizing = false
    @State private var progress: Double = 0
    @StateObject private var player = StreamingAudioPlayerWrapper()

    var body: some View {
        VStack(spacing: 20) {
            // Text input
            TextEditor(text: $text)
                .frame(height: 150)
                .border(Color.gray.opacity(0.3))

            // Progress indicator
            if isSynthesizing {
                ProgressView(value: progress)
                    .progressViewStyle(.linear)
            }

            // Control button
            Button(action: synthesize) {
                Label(
                    isSynthesizing ? "Synthesizing..." : "Speak",
                    systemImage: isSynthesizing ? "waveform" : "play.fill"
                )
                .frame(maxWidth: .infinity)
                .padding()
                .background(isSynthesizing ? Color.gray : Color.blue)
                .foregroundColor(.white)
                .cornerRadius(10)
            }
            .disabled(text.isEmpty || isSynthesizing)
        }
        .padding()
    }

    func synthesize() {
        isSynthesizing = true
        progress = 0

        Task {
            do {
                try player.prepare()

                // Estimate total chunks from text length (~100 characters per chunk)
                let totalChunks = max(1, text.count / 100)
                var processedChunks = 0

                let _ = try await RunAnywhere.synthesizeStream(
                    text,
                    options: TTSOptions(rate: 1.0),
                    onAudioChunk: { chunk in
                        player.enqueue(chunk)
                        processedChunks += 1

                        Task { @MainActor in
                            progress = min(1.0, Double(processedChunks) / Double(totalChunks))
                        }
                    }
                )

                await MainActor.run {
                    isSynthesizing = false
                    progress = 1.0
                }

            } catch {
                print("Synthesis failed: \(error)")
                await MainActor.run {
                    isSynthesizing = false
                }
            }
        }
    }
}

@MainActor
class StreamingAudioPlayerWrapper: ObservableObject {
    private let player = StreamingAudioPlayer()

    func prepare() throws {
        try player.prepare()
    }

    func enqueue(_ data: Data) {
        player.enqueue(data)
    }

    func stop() {
        player.stop()
    }
}

Sentence-by-Sentence Streaming

For natural pauses, split text into sentences:

func speakSentenceBySentence(_ text: String) async throws {
    let sentences = text.components(separatedBy: CharacterSet(charactersIn: ".!?"))
        .map { $0.trimmingCharacters(in: .whitespaces) }
        .filter { !$0.isEmpty }

    for sentence in sentences {
        // The terminator is lost in the split, so a period is re-appended;
        // track the original punctuation separately if intonation matters.
        let output = try await RunAnywhere.synthesize(
            sentence + ".",
            options: TTSOptions(rate: 1.0)
        )

        // Play the sentence and wait (approximately) for playback to finish
        let player = try AVAudioPlayer(data: output.audioData)
        player.play()
        try await Task.sleep(for: .seconds(output.duration))
    }
}
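
The Task.sleep call only approximates when playback ends. If you need exact completion timing, you can bridge AVAudioPlayerDelegate into async/await with a continuation. A minimal sketch (the PlaybackWaiter type is illustrative, not part of the SDK):

import AVFoundation

final class PlaybackWaiter: NSObject, AVAudioPlayerDelegate {
    private var continuation: CheckedContinuation<Void, Never>?

    /// Plays the audio and suspends until the player reports completion.
    func play(_ player: AVAudioPlayer) async {
        player.delegate = self
        await withCheckedContinuation { continuation in
            self.continuation = continuation
            player.play()
        }
    }

    func audioPlayerDidFinishPlaying(_ player: AVAudioPlayer, successfully flag: Bool) {
        continuation?.resume()
        continuation = nil
    }
}

Holding the waiter and the player in local variables keeps both alive for the duration of the await, replacing the Task.sleep line above.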

Buffered Streaming

Buffer chunks for smoother playback:

class BufferedStreamingPlayer {
    // Note: for brevity this example is not thread-safe; in production,
    // guard `buffer` (e.g. with an actor or a lock), since chunks arrive
    // from the synthesis callback while the playback task drains them.
    private var buffer: [Data] = []
    private var isPlaying = false
    private let minimumBufferSize = 3  // Start playing after 3 chunks

    func addChunk(_ chunk: Data) {
        buffer.append(chunk)

        if buffer.count >= minimumBufferSize && !isPlaying {
            startPlayback()
        }
    }

    private func startPlayback() {
        isPlaying = true

        Task {
            while !buffer.isEmpty || isPlaying {
                if let chunk = buffer.first {
                    buffer.removeFirst()
                    await playChunk(chunk)
                } else {
                    try? await Task.sleep(for: .milliseconds(50))
                }
            }
        }
    }

    private func playChunk(_ data: Data) async {
        // Play audio chunk
    }

    func finish() {
        isPlaying = false
    }
}
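
Feeding the buffered player from the streaming callback looks like this (a sketch; call finish() after synthesizeStream returns so the playback loop drains the remaining buffer):

let buffered = BufferedStreamingPlayer()

try await RunAnywhere.synthesizeStream(
    text,
    options: TTSOptions(rate: 1.0),
    onAudioChunk: { chunk in
        buffered.addChunk(chunk)
    }
)

// Signal that no more chunks are coming; the loop then drains the buffer
buffered.finish()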

Use Cases

Stream long-form content with immediate playback:

func readChapter(_ text: String) async throws {
    try await RunAnywhere.synthesizeStream(
        text,
        options: TTSOptions(rate: 0.9),
        onAudioChunk: { audioPlayer.enqueue($0) }
    )
}

Stream LLM responses as they’re generated:

let llmResult = try await RunAnywhere.generateStream(prompt)
var accumulatedText = ""

for try await token in llmResult.stream {
    accumulatedText += token
    
    // Synthesize complete sentences
    if token.contains(".") || token.contains("!") || token.contains("?") {
        try await RunAnywhere.synthesizeStream(accumulatedText) { chunk in
            player.enqueue(chunk)
        }
        accumulatedText = ""
    }
}

Read screen content aloud as the user navigates:

import SwiftUI

struct AccessibleContentView: View {
    @State private var isSpeaking = false
    
    var body: some View {
        VStack {
            Text("Welcome to the app")
                .accessibilityLabel("Welcome to the app")
            
            Button(isSpeaking ? "Stop Reading" : "Read Aloud") {
                Task {
                    if isSpeaking {
                        await RunAnywhere.stopSpeaking()
                        isSpeaking = false
                    } else {
                        isSpeaking = true
                        try? await RunAnywhere.synthesizeStream(
                            "Welcome to the app. Navigate using swipe gestures.",
                            options: TTSOptions(rate: 0.9),
                            onAudioChunk: { chunk in
                                AudioPlayer.shared.enqueue(chunk)
                            }
                        )
                        isSpeaking = false
                    }
                }
            }
            .accessibilityHint("Reads the current screen content aloud")
        }
    }
}

Error Handling

do {
    try await RunAnywhere.synthesizeStream(
        text,
        options: TTSOptions(),
        onAudioChunk: { chunk in
            player.enqueue(chunk)
        }
    )
} catch let error as SDKError {
    switch error.code {
    case .modelNotFound:
        print("Load a TTS voice first")
    case .processingFailed:
        print("Streaming synthesis failed")
    default:
        print("Error: \(error)")
    }

    // Stop player on error
    player.stop()
}

Performance Tips

- Buffer before playing: Wait for a few chunks before starting playback to avoid stuttering.
- Match sample rates: Ensure your audio player’s sample rate matches the TTS output (typically 22050 Hz); see the sketch below.
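
On iOS, it also helps to activate an audio session for playback before starting the engine. A minimal sketch using standard AVFoundation APIs, reusing the StreamingAudioPlayer from above:

import AVFoundation

// Configure the shared audio session for spoken-audio playback (iOS)
let session = AVAudioSession.sharedInstance()
try session.setCategory(.playback, mode: .spokenAudio)
try session.setActive(true)

// Match the player's sample rate to the TTS output (typically 22050 Hz)
let player = StreamingAudioPlayer()
try player.prepare(sampleRate: 22050)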