The STTOptions struct allows you to customize speech-to-text behavior including language, timestamps, and audio format.

STTOptions

/// Configuration options for speech-to-text transcription.
///
/// The defaults target 16 kHz English audio with voice-activity detection
/// enabled and no per-word timestamps.
public struct STTOptions: Sendable {
    /// ISO 639-1 language code; defaults to "en". An empty string requests auto-detection.
    public let language: String
    /// Audio sample rate in Hz; the SDK expects 16000 by default.
    public let sampleRate: Int
    /// When true, the transcription result includes per-word start/end times.
    public let enableWordTimestamps: Bool
    /// When true, voice-activity detection filters silent segments before transcription.
    public let enableVAD: Bool

    /// Creates options; every parameter has a sensible default.
    public init(
        language: String = "en",
        sampleRate: Int = 16000,
        enableWordTimestamps: Bool = false,
        enableVAD: Bool = true
    )
}

Parameters

| Parameter              | Type     | Default | Description                         |
| ---------------------- | -------- | ------- | ----------------------------------- |
| `language`             | `String` | `"en"`  | Language code (ISO 639-1)           |
| `sampleRate`           | `Int`    | `16000` | Audio sample rate in Hz             |
| `enableWordTimestamps` | `Bool`   | `false` | Include per-word timing information |
| `enableVAD`            | `Bool`   | `true`  | Use VAD to filter silent segments   |

Language Support

Whisper models support 99 languages. Common language codes:

| Language | Code | Language   | Code |
| -------- | ---- | ---------- | ---- |
| English  | en   | French     | fr   |
| Spanish  | es   | German     | de   |
| Italian  | it   | Portuguese | pt   |
| Chinese  | zh   | Japanese   | ja   |
| Korean   | ko   | Russian    | ru   |
| Arabic   | ar   | Hindi      | hi   |

Auto-Detection

Set `language` to an empty string (`""`) for auto-detection. Note that the default is `"en"`, so a plain `STTOptions()` forces English rather than auto-detecting:
// Pass an empty language code to request auto-detection.
// (The default is "en", so STTOptions() would force English instead.)
let options = STTOptions(language: "")

let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
print("Detected language: \(output.detectedLanguage ?? "unknown")")

Force Language

// Force Spanish transcription (ISO 639-1 code "es"); auto-detection is bypassed.
let options = STTOptions(language: "es")
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

Word Timestamps

Enable word-level timing for subtitles, karaoke, or word highlighting:
// Request per-word timing alongside the transcript.
let options = STTOptions(
    language: "en",
    enableWordTimestamps: true
)

let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

// wordTimestamps is nil when timing was not requested or not produced.
if let timestamps = output.wordTimestamps {
    for word in timestamps {
        print("\(word.word): \(word.startTime)s - \(word.endTime)s")
    }
}

WordTimestamp Structure

/// Timing and confidence information for one transcribed word.
public struct WordTimestamp: Sendable {
    public let word: String       // The transcribed word
    public let startTime: Double  // Start time in seconds, relative to the audio start
    public let endTime: Double    // End time in seconds, relative to the audio start
    public let confidence: Float? // Word-level confidence; nil when the model does not report it
}

Subtitle Generation Example

/// Builds subtitle segments from a transcription of `audioData`.
///
/// Words are grouped into segments of at most `maxWordsPerSegment` words;
/// each segment spans from its first word's start time to its last word's
/// end time. When no word timestamps are available (nil or empty), the full
/// transcript is returned as a single 0–5 second subtitle.
///
/// - Parameter audioData: Raw audio bytes to transcribe.
/// - Returns: Ordered subtitle segments covering the transcript.
/// - Throws: Any error surfaced by `RunAnywhere.transcribeWithOptions`.
func generateSubtitles(from audioData: Data) async throws -> [Subtitle] {
    let maxWordsPerSegment = 7

    let options = STTOptions(
        language: "en",
        enableWordTimestamps: true
    )

    let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

    // Fall back to one coarse subtitle when per-word timing is missing.
    // (Previously an empty-but-non-nil timestamp array produced no subtitles
    // at all, even though transcript text was available.)
    guard let timestamps = output.wordTimestamps, !timestamps.isEmpty else {
        return [Subtitle(text: output.text, start: 0, end: 5)]
    }

    // Walk the word list in fixed-size strides; each stride becomes one
    // subtitle. This replaces the duplicated "flush" logic for the final
    // partial segment and avoids force-unwrapping.
    var subtitles: [Subtitle] = []
    for chunkStart in stride(from: 0, to: timestamps.count, by: maxWordsPerSegment) {
        let chunkEnd = min(chunkStart + maxWordsPerSegment, timestamps.count)
        let chunk = timestamps[chunkStart..<chunkEnd]
        if let firstWord = chunk.first, let lastWord = chunk.last {
            subtitles.append(
                Subtitle(
                    text: chunk.map(\.word).joined(separator: " "),
                    start: firstWord.startTime,
                    end: lastWord.endTime
                )
            )
        }
    }

    return subtitles
}

/// One subtitle segment: display text plus its start and end times in seconds.
struct Subtitle {
    let text: String    // Words shown for this segment
    let start: Double   // Segment start, seconds from the audio start
    let end: Double     // Segment end, seconds from the audio start
}

Voice Activity Detection (VAD)

VAD filters out silent segments before transcription:
// With VAD (default) - silent parts are skipped before transcription
let withVAD = STTOptions(enableVAD: true)

// Without VAD - the entire audio, including silence, is processed
let withoutVAD = STTOptions(enableVAD: false)
Enable VAD for real-time transcription to reduce processing time and improve accuracy by ignoring silence.

Sample Rate

The SDK expects audio at 16kHz by default. If your audio has a different sample rate, specify it:
// For 44.1kHz audio (resampling will be applied)
let options44k = STTOptions(sampleRate: 44100)

// For 48kHz audio
// (Distinct names: the original snippet redeclared `let options` twice
// in the same scope, which does not compile in Swift.)
let options48k = STTOptions(sampleRate: 48000)
For best results, record audio at 16kHz mono. While the SDK can resample, native 16kHz audio produces better accuracy.

TranscriptionMetadata

Get information about the transcription process:
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

// Metadata is optional; it is absent when the backend does not report it.
if let metadata = output.metadata {
    print("Model: \(metadata.modelId)")
    print("Processing time: \(metadata.processingTime)s")
    print("Audio duration: \(metadata.audioLength)s")

    // A factor below 1.0 means transcription ran faster than real time.
    // NOTE(review): assumes audioLength > 0 — verify for empty audio.
    let realTimeFactor = metadata.processingTime / metadata.audioLength
    print("Real-time factor: \(String(format: "%.2f", realTimeFactor))x")
}

Alternative Transcriptions

Access alternative interpretations:
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

// `text` holds the top hypothesis.
print("Best: \(output.text)")

// Alternative interpretations may be nil when the backend returns only one.
if let alternatives = output.alternatives {
    for (i, alt) in alternatives.enumerated() {
        print("Alt \(i + 1): \(alt.text) (confidence: \(alt.confidence ?? 0))")
    }
}

Complete Example

/// High-level convenience wrapper around the `RunAnywhere` speech-to-text API.
final class TranscriptionService {
    /// Transcribes an audio file on disk.
    ///
    /// - Parameters:
    ///   - audioURL: Location of the audio file to load.
    ///   - language: ISO 639-1 code to force; `nil` enables auto-detection.
    ///   - includeTimestamps: When `true`, per-word timing is included.
    /// - Returns: The transcript plus confidence, language, and timing metadata.
    /// - Throws: File-loading errors or any error thrown by the SDK.
    func transcribe(
        audioURL: URL,
        language: String? = nil,
        includeTimestamps: Bool = false
    ) async throws -> TranscriptionResult {
        // NOTE(review): Data(contentsOf:) is synchronous I/O inside an async
        // function; consider an async read for large files.
        let audioData = try Data(contentsOf: audioURL)

        // An empty language code requests auto-detection (see STTOptions docs).
        // Previously `nil` was coerced to "en", silently forcing English and
        // defeating the documented auto-detection behavior.
        let options = STTOptions(
            language: language ?? "",
            sampleRate: 16000,
            enableWordTimestamps: includeTimestamps,
            enableVAD: true
        )

        let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)

        return TranscriptionResult(
            text: output.text,
            confidence: output.confidence,
            // Prefer what the model detected, then the caller's request.
            language: output.detectedLanguage ?? language ?? "en",
            timestamps: output.wordTimestamps,
            processingTime: output.metadata?.processingTime
        )
    }
}

/// Aggregated outcome of `TranscriptionService.transcribe`.
struct TranscriptionResult {
    let text: String                    // Full transcript text
    let confidence: Float?              // Overall confidence; nil when not reported
    let language: String                // Detected or requested language code
    let timestamps: [WordTimestamp]?    // Per-word timing; nil unless requested/produced
    let processingTime: TimeInterval?   // Seconds spent transcribing; nil when metadata is absent
}