The STTOptions struct allows you to customize speech-to-text behavior including language, timestamps, and audio format.

STTOptions

/// Configuration options for speech-to-text transcription.
///
/// The defaults target 16 kHz English audio with voice-activity detection
/// enabled and no per-word timestamps.
public struct STTOptions: Sendable {
    /// ISO 639-1 language code; defaults to "en". An empty string requests auto-detection.
    public let language: String
    /// Audio sample rate in Hz; the SDK expects 16000 by default.
    public let sampleRate: Int
    /// When true, the transcription result includes per-word start/end times.
    public let enableWordTimestamps: Bool
    /// When true, voice-activity detection filters silent segments before transcription.
    public let enableVAD: Bool

    /// Creates options; every parameter has a sensible default.
    public init(
        language: String = "en",
        sampleRate: Int = 16000,
        enableWordTimestamps: Bool = false,
        enableVAD: Bool = true
    )
}

Parameters

| Parameter              | Type     | Default | Description                         |
| ---------------------- | -------- | ------- | ----------------------------------- |
| `language`             | `String` | `"en"`  | Language code (ISO 639-1)           |
| `sampleRate`           | `Int`    | `16000` | Audio sample rate in Hz             |
| `enableWordTimestamps` | `Bool`   | `false` | Include per-word timing information |
| `enableVAD`            | `Bool`   | `true`  | Use VAD to filter silent segments   |

Language Support

Whisper models support 99 languages. Common language codes:

| Language | Code | Language   | Code |
| -------- | ---- | ---------- | ---- |
| English  | en   | French     | fr   |
| Spanish  | es   | German     | de   |
| Italian  | it   | Portuguese | pt   |
| Chinese  | zh   | Japanese   | ja   |
| Korean   | ko   | Russian    | ru   |
| Arabic   | ar   | Hindi      | hi   |

Auto-Detection

Set `language` to an empty string (`""`) for auto-detection. Note that the default is `"en"`, so a plain `STTOptions()` forces English rather than auto-detecting:
// Pass an empty language code to request auto-detection.
// (The default is "en", so STTOptions() would force English instead.)
let options = STTOptions(language: "")

let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
print("Detected language: \(output.detectedLanguage ?? "unknown")")

Force Language

// Force Spanish transcription (ISO 639-1 code "es"); auto-detection is bypassed.
let options = STTOptions(language: "es")
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

Word Timestamps

Enable word-level timing for subtitles, karaoke, or word highlighting:
// Request per-word timing alongside the transcript.
let options = STTOptions(
    language: "en",
    enableWordTimestamps: true
)

let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

// wordTimestamps is nil when timing was not requested or not produced.
if let timestamps = output.wordTimestamps {
    for word in timestamps {
        print("\(word.word): \(word.startTime)s - \(word.endTime)s")
    }
}

WordTimestamp Structure

/// Timing and confidence information for one transcribed word.
public struct WordTimestamp: Sendable {
    public let word: String       // The transcribed word
    public let startTime: Double  // Start time in seconds, relative to the audio start
    public let endTime: Double    // End time in seconds, relative to the audio start
    public let confidence: Float? // Word-level confidence; nil when the model does not report it
}

Subtitle Generation Example

/// Builds subtitle segments from a transcription of `audioData`.
///
/// Words are grouped into segments of at most `maxWordsPerSegment` words;
/// each segment spans from its first word's start time to its last word's
/// end time. When no word timestamps are available (nil or empty), the full
/// transcript is returned as a single 0–5 second subtitle.
///
/// - Parameter audioData: Raw audio bytes to transcribe.
/// - Returns: Ordered subtitle segments covering the transcript.
/// - Throws: Any error surfaced by `RunAnywhere.transcribeWithOptions`.
func generateSubtitles(from audioData: Data) async throws -> [Subtitle] {
    let maxWordsPerSegment = 7

    let options = STTOptions(
        language: "en",
        enableWordTimestamps: true
    )

    let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

    // Fall back to one coarse subtitle when per-word timing is missing.
    // (Previously an empty-but-non-nil timestamp array produced no subtitles
    // at all, even though transcript text was available.)
    guard let timestamps = output.wordTimestamps, !timestamps.isEmpty else {
        return [Subtitle(text: output.text, start: 0, end: 5)]
    }

    // Walk the word list in fixed-size strides; each stride becomes one
    // subtitle. This replaces the duplicated "flush" logic for the final
    // partial segment and avoids force-unwrapping.
    var subtitles: [Subtitle] = []
    for chunkStart in stride(from: 0, to: timestamps.count, by: maxWordsPerSegment) {
        let chunkEnd = min(chunkStart + maxWordsPerSegment, timestamps.count)
        let chunk = timestamps[chunkStart..<chunkEnd]
        if let firstWord = chunk.first, let lastWord = chunk.last {
            subtitles.append(
                Subtitle(
                    text: chunk.map(\.word).joined(separator: " "),
                    start: firstWord.startTime,
                    end: lastWord.endTime
                )
            )
        }
    }

    return subtitles
}

/// One subtitle segment: display text plus its start and end times in seconds.
struct Subtitle {
    let text: String    // Words shown for this segment
    let start: Double   // Segment start, seconds from the audio start
    let end: Double     // Segment end, seconds from the audio start
}

Voice Activity Detection (VAD)

VAD filters out silent segments before transcription:
// With VAD (default) - silent parts are skipped before transcription
let withVAD = STTOptions(enableVAD: true)

// Without VAD - the entire audio, including silence, is processed
let withoutVAD = STTOptions(enableVAD: false)
Enable VAD for real-time transcription to reduce processing time and improve accuracy by ignoring silence.

Sample Rate

The SDK expects audio at 16kHz by default. If your audio has a different sample rate, specify it:
// For 44.1kHz audio (resampling will be applied)
let options44k = STTOptions(sampleRate: 44100)

// For 48kHz audio
// (Distinct names: the original snippet redeclared `let options` twice
// in the same scope, which does not compile in Swift.)
let options48k = STTOptions(sampleRate: 48000)
For best results, record audio at 16kHz mono. While the SDK can resample, native 16kHz audio produces better accuracy.

TranscriptionMetadata

Get information about the transcription process:
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

// Metadata is optional; it is absent when the backend does not report it.
if let metadata = output.metadata {
    print("Model: \(metadata.modelId)")
    print("Processing time: \(metadata.processingTime)s")
    print("Audio duration: \(metadata.audioLength)s")

    // A factor below 1.0 means transcription ran faster than real time.
    // NOTE(review): assumes audioLength > 0 — verify for empty audio.
    let realTimeFactor = metadata.processingTime / metadata.audioLength
    print("Real-time factor: \(String(format: "%.2f", realTimeFactor))x")
}

Alternative Transcriptions

Access alternative interpretations:
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

// `text` holds the top hypothesis.
print("Best: \(output.text)")

// Alternative interpretations may be nil when the backend returns only one.
if let alternatives = output.alternatives {
    for (i, alt) in alternatives.enumerated() {
        print("Alt \(i + 1): \(alt.text) (confidence: \(alt.confidence ?? 0))")
    }
}

Complete Example

/// High-level convenience wrapper around the `RunAnywhere` speech-to-text API.
final class TranscriptionService {
    /// Transcribes an audio file on disk.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file to load.
    ///   - language: ISO 639-1 code to force; `nil` enables auto-detection.
    ///   - includeTimestamps: When `true`, per-word timing is included.
    /// - Returns: The transcript plus confidence, language, and timing metadata.
    /// - Throws: File-loading errors or any error thrown by the SDK.
    func transcribe(
        audioURL: URL,
        language: String? = nil,
        includeTimestamps: Bool = false
    ) async throws -> TranscriptionResult {
        // NOTE(review): Data(contentsOf:) is synchronous I/O inside an async
        // function; consider an async read for large files.
        let audioData = try Data(contentsOf: audioURL)

        // An empty language code requests auto-detection (see STTOptions docs).
        // Previously `nil` was coerced to "en", silently forcing English and
        // defeating the documented auto-detection behavior.
        let options = STTOptions(
            language: language ?? "",
            sampleRate: 16000,
            enableWordTimestamps: includeTimestamps,
            enableVAD: true
        )

        let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

        return TranscriptionResult(
            text: output.text,
            confidence: output.confidence,
            // Prefer what the model detected, then the caller's request.
            language: output.detectedLanguage ?? language ?? "en",
            timestamps: output.wordTimestamps,
            processingTime: output.metadata?.processingTime
        )
    }
}

/// Aggregated outcome of `TranscriptionService.transcribe`.
struct TranscriptionResult {
    let text: String                    // Full transcript text
    let confidence: Float?              // Overall confidence; nil when not reported
    let language: String                // Detected or requested language code
    let timestamps: [WordTimestamp]?    // Per-word timing; nil unless requested/produced
    let processingTime: TimeInterval?   // Seconds spent transcribing; nil when metadata is absent
}