Stream transcription delivers partial results in real time while audio is still being processed, enabling responsive voice interfaces such as live captions and voice commands.

Basic Usage

let output = try await RunAnywhere.transcribeStream(
    audioData: audioData,
    options: STTOptions(language: "en"),
    onPartialResult: { result in
        print("Partial: \(result.transcript)")
    }
)

print("Final: \(output.text)")

Method Signature

public static func transcribeStream(
    audioData: Data,
    options: STTOptions = STTOptions(),
    onPartialResult: @escaping (STTTranscriptionResult) -> Void
) async throws -> STTOutput

Parameters

| Parameter | Type | Description |
|---|---|---|
| audioData | Data | Audio data to transcribe |
| options | STTOptions | Transcription options |
| onPartialResult | (STTTranscriptionResult) -> Void | Callback for partial results |

Returns

STTOutput with the final complete transcription.

STTTranscriptionResult

public struct STTTranscriptionResult: Sendable {
    public let transcript: String           // Current transcription
    public let confidence: Float?           // Confidence score
    public let timestamps: [WordTimestamp]? // Word-level timing
    public let language: String?            // Detected language
    public let alternatives: [STTAlternative]? // Alternative transcriptions
}
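
Each partial callback receives the full current hypothesis, so the optional fields can be inspected on every update. A minimal sketch (the 0.5 confidence threshold is just an illustrative value, not an SDK recommendation):

let output = try await RunAnywhere.transcribeStream(
    audioData: audioData,
    options: STTOptions(language: "en"),
    onPartialResult: { result in
        // Flag low-confidence hypotheses; confidence may be nil
        if let confidence = result.confidence, confidence < 0.5 {
            print("(uncertain) \(result.transcript)")
        } else {
            print(result.transcript)
        }

        // Word-level timing is only present when the loaded model provides it
        if let timestamps = result.timestamps {
            print("Received \(timestamps.count) word timestamps")
        }
    }
)
print("Final: \(output.text)")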

Live Transcription Example

import AVFoundation
import Combine

class LiveTranscriber: ObservableObject {
    @Published var partialText = ""
    @Published var finalText = ""
    @Published var isTranscribing = false

    private var audioEngine: AVAudioEngine?
    private var audioBuffer = Data()
    private var converter: AVAudioConverter?

    // Target format for transcription: 16 kHz, mono, Float32
    private let targetFormat = AVAudioFormat(
        commonFormat: .pcmFormatFloat32,
        sampleRate: 16000,
        channels: 1,
        interleaved: false
    )!

    func startLiveTranscription() async throws {
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(.playAndRecord, mode: .measurement)
        try audioSession.setActive(true)

        audioEngine = AVAudioEngine()
        guard let audioEngine = audioEngine else { return }

        let inputNode = audioEngine.inputNode

        // The tap must use the input node's native format; requesting a
        // mismatched sample rate raises a runtime exception. Audio is
        // resampled to 16 kHz afterwards with an AVAudioConverter.
        let inputFormat = inputNode.outputFormat(forBus: 0)
        converter = AVAudioConverter(from: inputFormat, to: targetFormat)

        await MainActor.run { isTranscribing = true }

        inputNode.installTap(onBus: 0, bufferSize: 4096, format: inputFormat) { [weak self] buffer, _ in
            self?.processAudioBuffer(buffer)
        }

        audioEngine.prepare()
        try audioEngine.start()
    }

    private func processAudioBuffer(_ buffer: AVAudioPCMBuffer) {
        // Resample the hardware buffer to 16 kHz mono Float32
        guard let converter = converter else { return }

        let ratio = targetFormat.sampleRate / buffer.format.sampleRate
        let capacity = AVAudioFrameCount((Double(buffer.frameLength) * ratio).rounded(.up))
        guard let converted = AVAudioPCMBuffer(pcmFormat: targetFormat, frameCapacity: capacity) else { return }

        var consumed = false
        var conversionError: NSError?
        let status = converter.convert(to: converted, error: &conversionError) { _, outStatus in
            if consumed {
                outStatus.pointee = .noDataNow
                return nil
            }
            consumed = true
            outStatus.pointee = .haveData
            return buffer
        }
        guard status != .error, conversionError == nil,
              let channelData = converted.floatChannelData?[0] else { return }

        // Accumulate the converted samples
        let frameCount = Int(converted.frameLength)
        audioBuffer.append(Data(bytes: channelData, count: frameCount * MemoryLayout<Float>.size))

        // Process every ~1 second of audio (16,000 Float32 samples = 64 KB)
        if audioBuffer.count >= 16000 * MemoryLayout<Float>.size {
            let chunk = audioBuffer
            audioBuffer = Data()

            Task {
                await transcribeChunk(chunk)
            }
        }
    }

    private func transcribeChunk(_ data: Data) async {
        do {
            let output = try await RunAnywhere.transcribeStream(
                audioData: data,
                options: STTOptions(language: "en"),
                onPartialResult: { [weak self] result in
                    Task { @MainActor in
                        self?.partialText = result.transcript
                    }
                }
            )

            await MainActor.run {
                finalText += " " + output.text
                partialText = ""
            }
        } catch {
            print("Transcription error: \(error)")
        }
    }

    func stop() {
        audioEngine?.stop()
        audioEngine?.inputNode.removeTap(onBus: 0)
        audioEngine = nil
        isTranscribing = false
    }
}
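
The example above assumes the app already has microphone access. Recording requires an NSMicrophoneUsageDescription entry in Info.plist and an explicit permission request; a small helper (not part of the SDK) might look like this:

import AVFoundation

// Ask for microphone access before calling startLiveTranscription().
func requestMicrophoneAccess() async -> Bool {
    await withCheckedContinuation { continuation in
        AVAudioSession.sharedInstance().requestRecordPermission { granted in
            continuation.resume(returning: granted)
        }
    }
}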

SwiftUI Integration

import SwiftUI

struct LiveTranscriptionView: View {
    @StateObject private var transcriber = LiveTranscriber()

    var body: some View {
        VStack(spacing: 20) {
            // Final transcription
            ScrollView {
                Text(transcriber.finalText)
                    .frame(maxWidth: .infinity, alignment: .leading)
            }
            .frame(height: 200)
            .padding()
            .background(Color.gray.opacity(0.1))
            .cornerRadius(12)

            // Partial (in-progress) transcription
            if !transcriber.partialText.isEmpty {
                Text(transcriber.partialText)
                    .foregroundColor(.secondary)
                    .italic()
            }

            // Recording indicator
            HStack {
                if transcriber.isTranscribing {
                    Circle()
                        .fill(Color.red)
                        .frame(width: 12, height: 12)
                    Text("Listening...")
                }
            }

            // Control buttons
            HStack(spacing: 20) {
                Button(action: startTranscription) {
                    Label("Start", systemImage: "mic.fill")
                        .padding()
                        .background(Color.green)
                        .foregroundColor(.white)
                        .cornerRadius(10)
                }
                .disabled(transcriber.isTranscribing)

                Button(action: stopTranscription) {
                    Label("Stop", systemImage: "stop.fill")
                        .padding()
                        .background(Color.red)
                        .foregroundColor(.white)
                        .cornerRadius(10)
                }
                .disabled(!transcriber.isTranscribing)
            }
        }
        .padding()
    }

    func startTranscription() {
        Task {
            do {
                try await transcriber.startLiveTranscription()
            } catch {
                print("Failed to start transcription: \(error)")
            }
        }
    }

    func stopTranscription() {
        transcriber.stop()
    }
}

Voice Command Detection

Use streaming STT to detect commands in real-time:
class VoiceCommandDetector {
    private let commands = ["play", "pause", "stop", "next", "previous"]
    var onCommandDetected: ((String) -> Void)?

    func processAudio(_ data: Data) async {
        do {
            let _ = try await RunAnywhere.transcribeStream(
                audioData: data,
                options: STTOptions(language: "en"),
                onPartialResult: { [weak self] result in
                    self?.checkForCommands(result.transcript)
                }
            )
        } catch {
            print("Error: \(error)")
        }
    }

    private func checkForCommands(_ text: String) {
        let lowercased = text.lowercased()
        for command in commands {
            if lowercased.contains(command) {
                onCommandDetected?(command)
                break
            }
        }
    }
}

// Usage (`player` below stands for your app's playback controller)
let detector = VoiceCommandDetector()
detector.onCommandDetected = { command in
    switch command {
    case "play":
        player.play()
    case "pause":
        player.pause()
    default:
        break
    }
}
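
Substring matching can misfire on longer words ("playground" contains "play"). A stricter variant, sketched below, splits the transcript into words before matching:

// Drop-in replacement for checkForCommands that matches whole words only
private func checkForCommands(_ text: String) {
    let words = text.lowercased()
        .components(separatedBy: CharacterSet.alphanumerics.inverted)
    if let command = commands.first(where: { words.contains($0) }) {
        onCommandDetected?(command)
    }
}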

Performance Tips

- Process audio in chunks of 1-2 seconds for the best balance of latency and accuracy (see the sketch below).
- Clear audio buffers after processing to prevent memory growth during long sessions.
- Transcription runs on background threads automatically; update UI state on the MainActor.
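
A minimal sketch of the chunking check from LiveTranscriber above, with the chunk duration pulled out as a constant (1.5 s is just an illustrative value within the 1-2 s range):

// Assumes 16 kHz mono Float32 audio, as in the examples above
let sampleRate = 16_000.0
let chunkSeconds = 1.5
let chunkBytes = Int(sampleRate * chunkSeconds) * MemoryLayout<Float>.size

if audioBuffer.count >= chunkBytes {
    let chunk = audioBuffer
    audioBuffer = Data()   // clear the buffer to avoid unbounded growth
    Task { await transcribeChunk(chunk) }
}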

Error Handling

do {
    let output = try await RunAnywhere.transcribeStream(
        audioData: data,
        onPartialResult: { _ in }
    )
    print(output.text)
} catch let error as SDKError {
    switch error.code {
    case .modelNotFound:
        print("Load STT model first")
    case .emptyAudioBuffer:
        print("No audio data")
    case .processingFailed:
        print("Streaming failed: \(error.message)")
    default:
        print("Error: \(error)")
    }
} catch {
    print("Unexpected error: \(error)")
}