The transcribe() method converts audio data to text using on-device speech recognition models like Whisper.

Basic Usage

// Simple transcription
let text = try await RunAnywhere.transcribe(audioData)
print("You said: \(text)")

Setup

Before transcribing, register the ONNX module and load an STT model:
import RunAnywhere
import ONNXRuntime

// Register ONNX module (at app launch)
@MainActor
func setup() {
    ONNX.register()
}

// Load STT model
try await RunAnywhere.loadSTTModel("whisper-base-onnx")
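
In a SwiftUI app, one natural place for both steps is the root App type. A minimal sketch, assuming placeholder MyApp/ContentView names and that deferring the model load to a task is acceptable:

import SwiftUI
import RunAnywhere
import ONNXRuntime

@main
struct MyApp: App {  // placeholder app type
    var body: some Scene {
        WindowGroup {
            ContentView()  // placeholder root view
                .task {
                    // .task inherits the main actor, so the
                    // @MainActor register call is safe here.
                    ONNX.register()
                    // Load the model off the critical launch path.
                    try? await RunAnywhere.loadSTTModel("whisper-base-onnx")
                }
        }
    }
}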

Method Signatures

Simple Transcription

public static func transcribe(_ audioData: Data) async throws -> String
Returns just the transcribed text.

Transcription with Options

public static func transcribeWithOptions(
    _ audioData: Data,
    options: STTOptions
) async throws -> STTOutput
Returns detailed output including confidence and timestamps.

Buffer Transcription

public static func transcribeBuffer(
    _ buffer: AVAudioPCMBuffer,
    language: String? = nil
) async throws -> STTOutput
Transcribes audio directly from an AVAudioPCMBuffer.
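
This is convenient when you are already tapping the microphone with AVAudioEngine. A minimal sketch; handleCapturedBuffer is a hypothetical helper, and the buffer is assumed to already match the format listed under Audio Requirements below:

import AVFoundation

// Sketch: transcribe a buffer from an AVAudioEngine input tap.
// Assumes the buffer is already 16 kHz mono Float32; otherwise
// convert it first (see the AVAudioConverter sketch below).
func handleCapturedBuffer(_ buffer: AVAudioPCMBuffer) async {
    do {
        let output = try await RunAnywhere.transcribeBuffer(buffer, language: "en")
        print(output.text)
    } catch {
        print("Buffer transcription failed: \(error)")
    }
}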

Audio Requirements

Property     Requirement
Sample Rate  16,000 Hz (recommended)
Channels     Mono (1 channel)
Format       Float32 or Int16 PCM
Duration     Up to 30 seconds per call (Whisper limitation)
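
If your source audio does not match these requirements, AVAudioConverter (from AVFoundation) can resample and downmix it before you call the SDK. A minimal sketch with trimmed error handling; convertTo16kMono is a hypothetical helper, not part of RunAnywhere:

import AVFoundation

// Sketch: convert an arbitrary PCM buffer to 16 kHz mono Float32.
func convertTo16kMono(_ input: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    guard let target = AVAudioFormat(commonFormat: .pcmFormatFloat32,
                                     sampleRate: 16_000,
                                     channels: 1,
                                     interleaved: false),
          let converter = AVAudioConverter(from: input.format, to: target) else {
        return nil
    }
    let ratio = target.sampleRate / input.format.sampleRate
    let capacity = AVAudioFrameCount(Double(input.frameLength) * ratio) + 1
    guard let output = AVAudioPCMBuffer(pcmFormat: target, frameCapacity: capacity) else {
        return nil
    }
    var consumed = false
    let status = converter.convert(to: output, error: nil) { _, inputStatus in
        if consumed {
            inputStatus.pointee = .endOfStream
            return nil
        }
        consumed = true
        inputStatus.pointee = .haveData
        return input
    }
    return status == .error ? nil : output
}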

STTOutput

public struct STTOutput: Sendable {
    public let text: String                       // Transcribed text
    public let confidence: Float?                 // Confidence score (0-1)
    public let wordTimestamps: [WordTimestamp]?   // Per-word timing
    public let detectedLanguage: String?          // Detected language code
    public let alternatives: [STTAlternative]?    // Alternative transcriptions
    public let metadata: TranscriptionMetadata?   // Processing info
}
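
The optional fields are useful for branching on result quality. A sketch that flags low-confidence results for user confirmation (the 0.6 threshold is illustrative, not an SDK recommendation):

let output = try await RunAnywhere.transcribeWithOptions(audioData, options: STTOptions())

if let confidence = output.confidence, confidence < 0.6 {
    // Illustrative threshold; tune for your use case.
    print("Not sure I heard that right: \"\(output.text)\" (confidence \(confidence))")
} else {
    print(output.text)
}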

Examples

Recording and Transcribing

import AVFoundation

class AudioRecorder: ObservableObject {
    private var audioRecorder: AVAudioRecorder?
    private let audioSession = AVAudioSession.sharedInstance()

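    // Recording requires the NSMicrophoneUsageDescription key in
    // Info.plist and granted microphone permission (see
    // AVAudioSession.requestRecordPermission).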
    func startRecording() async throws {
        try audioSession.setCategory(.playAndRecord, mode: .default)
        try audioSession.setActive(true)

        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("recording.wav")

        let settings: [String: Any] = [
            AVFormatIDKey: Int(kAudioFormatLinearPCM),
            AVSampleRateKey: 16000,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false
        ]

        audioRecorder = try AVAudioRecorder(url: url, settings: settings)
        audioRecorder?.record()
    }

    func stopAndTranscribe() async throws -> String {
        guard let recorder = audioRecorder else { return "" }

        recorder.stop()
        let audioData = try Data(contentsOf: recorder.url)

        return try await RunAnywhere.transcribe(audioData)
    }
}

With Timestamps

let output = try await RunAnywhere.transcribeWithOptions(
    audioData,
    options: STTOptions(
        language: "en",
        enableWordTimestamps: true
    )
)

print("Full text: \(output.text)")

if let timestamps = output.wordTimestamps {
    for word in timestamps {
        print("\(word.word): \(word.startTime)s - \(word.endTime)s")
    }
}

Multi-Language Support

// Auto-detect language
let output = try await RunAnywhere.transcribeWithOptions(
    audioData,
    options: STTOptions()
)
print("Detected language: \(output.detectedLanguage ?? "unknown")")

// Force specific language
let spanishOutput = try await RunAnywhere.transcribeWithOptions(
    audioData,
    options: STTOptions(language: "es")
)

SwiftUI Voice Input

struct VoiceInputView: View {
    @StateObject private var recorder = AudioRecorder()
    @State private var transcription = ""
    @State private var isRecording = false
    @State private var isProcessing = false

    var body: some View {
        VStack(spacing: 20) {
            // Transcription display
            Text(transcription)
                .frame(maxWidth: .infinity, minHeight: 100, alignment: .topLeading)
                .padding()
                .background(Color.gray.opacity(0.1))
                .cornerRadius(12)

            // Record button
            Button(action: toggleRecording) {
                ZStack {
                    Circle()
                        .fill(isRecording ? Color.red : Color.blue)
                        .frame(width: 80, height: 80)

                    if isProcessing {
                        ProgressView()
                            .tint(.white)
                    } else {
                        Image(systemName: isRecording ? "stop.fill" : "mic.fill")
                            .font(.title)
                            .foregroundColor(.white)
                    }
                }
            }
            .disabled(isProcessing)

            Text(isRecording ? "Tap to stop" : "Tap to record")
                .font(.caption)
                .foregroundColor(.secondary)
        }
        .padding()
    }

    func toggleRecording() {
        if isRecording {
            stopRecording()
        } else {
            startRecording()
        }
    }

    func startRecording() {
        Task {
            do {
                try await recorder.startRecording()
                // Hop to the main actor before mutating view state,
                // matching stopRecording() below.
                await MainActor.run { isRecording = true }
            } catch {
                await MainActor.run {
                    transcription = "Failed to start recording: \(error)"
                }
            }
        }
    }

    func stopRecording() {
        isRecording = false
        isProcessing = true

        Task {
            do {
                let text = try await recorder.stopAndTranscribe()
                await MainActor.run {
                    transcription = text
                    isProcessing = false
                }
            } catch {
                await MainActor.run {
                    transcription = "Transcription failed: \(error)"
                    isProcessing = false
                }
            }
        }
    }
}

Model Management

// Load a model
try await RunAnywhere.loadSTTModel("whisper-base-onnx")

// Check if loaded
let isLoaded = await RunAnywhere.isSTTModelLoaded

// Get current model
let model = await RunAnywhere.currentSTTModel

// Unload when done
try await RunAnywhere.unloadSTTModel()

Available Models

Model ID            Size    Quality  Speed
whisper-tiny-onnx   ~40MB   Good     Fastest
whisper-base-onnx   ~150MB  Better   Fast
whisper-small-onnx  ~500MB  Best     Slower
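
Which variant to load is an app-level choice. A sketch that picks a tier from the device's physical memory (the byte thresholds are illustrative, not SDK guidance):

import Foundation

// Sketch: choose a Whisper variant by device RAM.
let ram = ProcessInfo.processInfo.physicalMemory  // in bytes
let modelID: String
switch ram {
case ..<3_000_000_000: modelID = "whisper-tiny-onnx"
case ..<6_000_000_000: modelID = "whisper-base-onnx"
default:               modelID = "whisper-small-onnx"
}
try await RunAnywhere.loadSTTModel(modelID)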

Error Handling

do {
    let text = try await RunAnywhere.transcribe(audioData)
    print(text)
} catch let error as SDKError {
    switch error.code {
    case .modelNotFound:
        print("Load an STT model first")
    case .emptyAudioBuffer:
        print("Audio data is empty")
    case .processingFailed:
        print("Transcription failed: \(error.message)")
    default:
        print("Error: \(error.localizedDescription)")
    }
}
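
A common recovery for .modelNotFound is to load a model and retry once. A sketch, assuming SDKError codes are equatable (the switch above suggests they are):

// Sketch: load a model and retry once on .modelNotFound.
func transcribeWithRecovery(_ audioData: Data) async throws -> String {
    do {
        return try await RunAnywhere.transcribe(audioData)
    } catch let error as SDKError where error.code == .modelNotFound {
        try await RunAnywhere.loadSTTModel("whisper-base-onnx")
        return try await RunAnywhere.transcribe(audioData)
    }
}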