The transcribe() method converts audio data to text using on-device speech recognition models like Whisper.

Basic Usage

// Simple transcription
let text = try await RunAnywhere.transcribe(audioData)
print("You said: \(text)")

Setup

Before transcribing, register the ONNX module and load an STT model:
import RunAnywhere
import ONNXRuntime

// Register ONNX module (at app launch)
@MainActor
func setup() {
    ONNX.register()
}

// Load STT model
try await RunAnywhere.loadSTTModel("whisper-base-onnx")
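
In a SwiftUI app, one natural place for both steps is the root App type. A minimal sketch, assuming placeholder MyApp/ContentView names and that deferring the model load to a task is acceptable:

import SwiftUI
import RunAnywhere
import ONNXRuntime

@main
struct MyApp: App {  // placeholder app type
    var body: some Scene {
        WindowGroup {
            ContentView()  // placeholder root view
                .task {
                    // .task inherits the main actor, so the
                    // @MainActor register call is safe here.
                    ONNX.register()
                    // Load the model off the critical launch path.
                    try? await RunAnywhere.loadSTTModel("whisper-base-onnx")
                }
        }
    }
}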

Method Signatures

Simple Transcription

public static func transcribe(_ audioData: Data) async throws -> String
Returns just the transcribed text.

Transcription with Options

public static func transcribeWithOptions(
    _ audioData: Data,
    options: STTOptions
) async throws -> STTOutput
Returns detailed output including confidence and timestamps.

Buffer Transcription

public static func transcribeBuffer(
    _ buffer: AVAudioPCMBuffer,
    language: String? = nil
) async throws -> STTOutput
Transcribes audio directly from an AVAudioPCMBuffer.
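
This is convenient when you are already tapping the microphone with AVAudioEngine. A minimal sketch; handleCapturedBuffer is a hypothetical helper, and the buffer is assumed to already match the format listed under Audio Requirements below:

import AVFoundation

// Sketch: transcribe a buffer from an AVAudioEngine input tap.
// Assumes the buffer is already 16 kHz mono Float32; otherwise
// convert it first (see the AVAudioConverter sketch below).
func handleCapturedBuffer(_ buffer: AVAudioPCMBuffer) async {
    do {
        let output = try await RunAnywhere.transcribeBuffer(buffer, language: "en")
        print(output.text)
    } catch {
        print("Buffer transcription failed: \(error)")
    }
}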

Audio Requirements

Property     Requirement
Sample Rate  16,000 Hz (recommended)
Channels     Mono (1 channel)
Format       Float32 or Int16 PCM
Duration     Up to 30 seconds per call (Whisper limitation)
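
If your source audio does not match these requirements, AVAudioConverter (from AVFoundation) can resample and downmix it before you call the SDK. A minimal sketch with trimmed error handling; convertTo16kMono is a hypothetical helper, not part of RunAnywhere:

import AVFoundation

// Sketch: convert an arbitrary PCM buffer to 16 kHz mono Float32.
func convertTo16kMono(_ input: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    guard let target = AVAudioFormat(commonFormat: .pcmFormatFloat32,
                                     sampleRate: 16_000,
                                     channels: 1,
                                     interleaved: false),
          let converter = AVAudioConverter(from: input.format, to: target) else {
        return nil
    }
    let ratio = target.sampleRate / input.format.sampleRate
    let capacity = AVAudioFrameCount(Double(input.frameLength) * ratio) + 1
    guard let output = AVAudioPCMBuffer(pcmFormat: target, frameCapacity: capacity) else {
        return nil
    }
    var consumed = false
    let status = converter.convert(to: output, error: nil) { _, inputStatus in
        if consumed {
            inputStatus.pointee = .endOfStream
            return nil
        }
        consumed = true
        inputStatus.pointee = .haveData
        return input
    }
    return status == .error ? nil : output
}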

STTOutput

public struct STTOutput: Sendable {
    public let text: String                       // Transcribed text
    public let confidence: Float?                 // Confidence score (0-1)
    public let wordTimestamps: [WordTimestamp]?   // Per-word timing
    public let detectedLanguage: String?          // Detected language code
    public let alternatives: [STTAlternative]?    // Alternative transcriptions
    public let metadata: TranscriptionMetadata?   // Processing info
}
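
The optional fields are useful for branching on result quality. A sketch that flags low-confidence results for user confirmation (the 0.6 threshold is illustrative, not an SDK recommendation):

let output = try await RunAnywhere.transcribeWithOptions(audioData, options: STTOptions())

if let confidence = output.confidence, confidence < 0.6 {
    // Illustrative threshold; tune for your use case.
    print("Not sure I heard that right: \"\(output.text)\" (confidence \(confidence))")
} else {
    print(output.text)
}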

Examples

Recording and Transcribing

import AVFoundation

class AudioRecorder: ObservableObject {
    private var audioRecorder: AVAudioRecorder?
    private let audioSession = AVAudioSession.sharedInstance()

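    // Recording requires the NSMicrophoneUsageDescription key in
    // Info.plist and granted microphone permission (see
    // AVAudioSession.requestRecordPermission).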
    func startRecording() async throws {
        try audioSession.setCategory(.playAndRecord, mode: .default)
        try audioSession.setActive(true)

        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("recording.wav")

        let settings: [String: Any] = [
            AVFormatIDKey: Int(kAudioFormatLinearPCM),
            AVSampleRateKey: 16000,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false
        ]

        audioRecorder = try AVAudioRecorder(url: url, settings: settings)
        audioRecorder?.record()
    }

    func stopAndTranscribe() async throws -> String {
        guard let recorder = audioRecorder else { return "" }

        recorder.stop()
        let audioData = try Data(contentsOf: recorder.url)

        return try await RunAnywhere.transcribe(audioData)
    }
}

With Timestamps

let output = try await RunAnywhere.transcribeWithOptions(
    audioData,
    options: STTOptions(
        language: "en",
        enableWordTimestamps: true
    )
)

print("Full text: \(output.text)")

if let timestamps = output.wordTimestamps {
    for word in timestamps {
        print("\(word.word): \(word.startTime)s - \(word.endTime)s")
    }
}

Multi-Language Support

// Auto-detect language
let output = try await RunAnywhere.transcribeWithOptions(
    audioData,
    options: STTOptions()
)
print("Detected language: \(output.detectedLanguage ?? "unknown")")

// Force specific language
let spanishOutput = try await RunAnywhere.transcribeWithOptions(
    audioData,
    options: STTOptions(language: "es")
)

SwiftUI Voice Input

struct VoiceInputView: View {
    @StateObject private var recorder = AudioRecorder()
    @State private var transcription = ""
    @State private var isRecording = false
    @State private var isProcessing = false

    var body: some View {
        VStack(spacing: 20) {
            // Transcription display
            Text(transcription)
                .frame(maxWidth: .infinity, minHeight: 100, alignment: .topLeading)
                .padding()
                .background(Color.gray.opacity(0.1))
                .cornerRadius(12)

            // Record button
            Button(action: toggleRecording) {
                ZStack {
                    Circle()
                        .fill(isRecording ? Color.red : Color.blue)
                        .frame(width: 80, height: 80)

                    if isProcessing {
                        ProgressView()
                            .tint(.white)
                    } else {
                        Image(systemName: isRecording ? "stop.fill" : "mic.fill")
                            .font(.title)
                            .foregroundColor(.white)
                    }
                }
            }
            .disabled(isProcessing)

            Text(isRecording ? "Tap to stop" : "Tap to record")
                .font(.caption)
                .foregroundColor(.secondary)
        }
        .padding()
    }

    func toggleRecording() {
        if isRecording {
            stopRecording()
        } else {
            startRecording()
        }
    }

    func startRecording() {
        Task {
            do {
                try await recorder.startRecording()
                // Hop to the main actor before mutating view state,
                // matching stopRecording() below.
                await MainActor.run { isRecording = true }
            } catch {
                await MainActor.run {
                    transcription = "Failed to start recording: \(error)"
                }
            }
        }
    }

    func stopRecording() {
        isRecording = false
        isProcessing = true

        Task {
            do {
                let text = try await recorder.stopAndTranscribe()
                await MainActor.run {
                    transcription = text
                    isProcessing = false
                }
            } catch {
                await MainActor.run {
                    transcription = "Transcription failed: \(error)"
                    isProcessing = false
                }
            }
        }
    }
}

Model Management

// Load a model
try await RunAnywhere.loadSTTModel("whisper-base-onnx")

// Check if loaded
let isLoaded = await RunAnywhere.isSTTModelLoaded

// Get current model
let model = await RunAnywhere.currentSTTModel

// Unload when done
try await RunAnywhere.unloadSTTModel()

Available Models

Model ID            Size    Quality  Speed
whisper-tiny-onnx   ~40MB   Good     Fastest
whisper-base-onnx   ~150MB  Better   Fast
whisper-small-onnx  ~500MB  Best     Slower
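
Which variant to load is an app-level choice. A sketch that picks a tier from the device's physical memory (the byte thresholds are illustrative, not SDK guidance):

import Foundation

// Sketch: choose a Whisper variant by device RAM.
let ram = ProcessInfo.processInfo.physicalMemory  // in bytes
let modelID: String
switch ram {
case ..<3_000_000_000: modelID = "whisper-tiny-onnx"
case ..<6_000_000_000: modelID = "whisper-base-onnx"
default:               modelID = "whisper-small-onnx"
}
try await RunAnywhere.loadSTTModel(modelID)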

Error Handling

do {
    let text = try await RunAnywhere.transcribe(audioData)
    print(text)
} catch let error as SDKError {
    switch error.code {
    case .modelNotFound:
        print("Load an STT model first")
    case .emptyAudioBuffer:
        print("Audio data is empty")
    case .processingFailed:
        print("Transcription failed: \(error.message)")
    default:
        print("Error: \(error.localizedDescription)")
    }
}
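
A common recovery for .modelNotFound is to load a model and retry once. A sketch, assuming SDKError codes are equatable (the switch above suggests they are):

// Sketch: load a model and retry once on .modelNotFound.
func transcribeWithRecovery(_ audioData: Data) async throws -> String {
    do {
        return try await RunAnywhere.transcribe(audioData)
    } catch let error as SDKError where error.code == .modelNotFound {
        try await RunAnywhere.loadSTTModel("whisper-base-onnx")
        return try await RunAnywhere.transcribe(audioData)
    }
}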