The STTOptions struct allows you to customize speech-to-text behavior, including language, timestamps, and audio format.
STTOptions
public struct STTOptions: Sendable {
    public let language: String
    public let sampleRate: Int
    public let enableWordTimestamps: Bool
    public let enableVAD: Bool

    public init(
        language: String = "en",
        sampleRate: Int = 16000,
        enableWordTimestamps: Bool = false,
        enableVAD: Bool = true
    )
}
Parameters
| Parameter | Type | Default | Description |
|---|---|---|---|
| language | String | "en" | Language code (ISO 639-1) |
| sampleRate | Int | 16000 | Audio sample rate in Hz |
| enableWordTimestamps | Bool | false | Include per-word timing information |
| enableVAD | Bool | true | Use VAD to filter silent segments |
Language Support
Whisper models support 99 languages. Common language codes:
| Language | Code | Language | Code |
|---|---|---|---|
| English | en | French | fr |
| Spanish | es | German | de |
| Italian | it | Portuguese | pt |
| Chinese | zh | Japanese | ja |
| Korean | ko | Russian | ru |
| Arabic | ar | Hindi | hi |
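If you want to match the user's device language rather than hard-coding a code, you can derive an ISO 639-1 code from the current Locale. A minimal sketch (the Locale.Language API is standard Foundation on iOS 16+/macOS 13+; the fallback to "en" is an assumption, not documented SDK behavior):
import Foundation

// Derive an ISO 639-1 code from the device locale; fall back to English.
let deviceLanguage = Locale.current.language.languageCode?.identifier ?? "en"
let options = STTOptions(language: deviceLanguage)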
Auto-Detection
Set language to an empty string to enable auto-detection (note that the default is "en", so a plain STTOptions() transcribes as English):
let options = STTOptions(language: "") // Auto-detect language
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
print("Detected language: \(output.detectedLanguage ?? "unknown")")
Force Language
// Force Spanish transcription
let options = STTOptions(language: "es")
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
Word Timestamps
Enable word-level timing for subtitles, karaoke, or word highlighting:
let options = STTOptions(
    language: "en",
    enableWordTimestamps: true
)
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
if let timestamps = output.wordTimestamps {
    for word in timestamps {
        print("\(word.word): \(word.startTime)s - \(word.endTime)s")
    }
}
WordTimestamp Structure
public struct WordTimestamp: Sendable {
    public let word: String        // The word
    public let startTime: Double   // Start time in seconds
    public let endTime: Double     // End time in seconds
    public let confidence: Float?  // Word-level confidence
}
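For the word-highlighting and karaoke use cases mentioned above, a lookup for the word active at the current playback time is often all you need. A minimal sketch (activeWord is a hypothetical helper built on the fields shown here, not part of the SDK):
// Return the word being spoken at a given playback time, if any.
func activeWord(at time: Double, in timestamps: [WordTimestamp]) -> WordTimestamp? {
    timestamps.first { time >= $0.startTime && time < $0.endTime }
}

// Example: which word should be highlighted 2.5 seconds into playback?
if let word = activeWord(at: 2.5, in: output.wordTimestamps ?? []) {
    print("Highlight: \(word.word)")
}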
Subtitle Generation Example
func generateSubtitles(from audioData: Data) async throws -> [Subtitle] {
    let options = STTOptions(
        language: "en",
        enableWordTimestamps: true
    )
    let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
    guard let timestamps = output.wordTimestamps else {
        return [Subtitle(text: output.text, start: 0, end: 5)]
    }
    // Group words into subtitle segments (max 7 words per segment)
    var subtitles: [Subtitle] = []
    var currentWords: [WordTimestamp] = []
    for word in timestamps {
        currentWords.append(word)
        if currentWords.count >= 7 {
            let text = currentWords.map(\.word).joined(separator: " ")
            let start = currentWords.first!.startTime
            let end = currentWords.last!.endTime
            subtitles.append(Subtitle(text: text, start: start, end: end))
            currentWords = []
        }
    }
    // Add remaining words
    if !currentWords.isEmpty {
        let text = currentWords.map(\.word).joined(separator: " ")
        let start = currentWords.first!.startTime
        let end = currentWords.last!.endTime
        subtitles.append(Subtitle(text: text, start: start, end: end))
    }
    return subtitles
}
struct Subtitle {
    let text: String
    let start: Double
    let end: Double
}
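To write these segments out as a standard subtitle file, a small formatter is enough. A sketch that renders the Subtitle values above as SRT (the timecode format comes from the SRT spec, not from the SDK):
import Foundation

// Format seconds as an SRT timecode: HH:MM:SS,mmm
func srtTimecode(_ seconds: Double) -> String {
    let totalMillis = Int((seconds * 1000).rounded())
    let h = totalMillis / 3_600_000
    let m = (totalMillis / 60_000) % 60
    let s = (totalMillis / 1000) % 60
    let ms = totalMillis % 1000
    return String(format: "%02d:%02d:%02d,%03d", h, m, s, ms)
}

// Render numbered SRT cues separated by blank lines.
func renderSRT(_ subtitles: [Subtitle]) -> String {
    subtitles.enumerated().map { index, subtitle in
        "\(index + 1)\n\(srtTimecode(subtitle.start)) --> \(srtTimecode(subtitle.end))\n\(subtitle.text)\n"
    }.joined(separator: "\n")
}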
Voice Activity Detection (VAD)
VAD filters out silent segments before transcription:
// With VAD (default) - silent parts are skipped
let withVAD = STTOptions(enableVAD: true)
// Without VAD - entire audio is processed
let withoutVAD = STTOptions(enableVAD: false)
Enable VAD for real-time transcription to reduce processing time and improve accuracy by ignoring silence.
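To see what VAD buys you on your own recordings, you can compare the real-time factor with and without it, using the metadata fields shown in the Transcription Metadata section below. A rough sketch, assuming audioData is already loaded:
import Foundation

// Transcribe the same clip twice and compare real-time factors.
for enableVAD in [true, false] {
    let options = STTOptions(enableVAD: enableVAD)
    let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
    if let metadata = output.metadata {
        let rtf = metadata.processingTime / metadata.audioLength
        print("VAD \(enableVAD ? "on" : "off"): \(String(format: "%.2f", rtf))x real-time")
    }
}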
Sample Rate
The SDK expects audio at 16kHz by default. If your audio has a different sample rate, specify it:
// For 44.1kHz audio (resampling will be applied)
let options = STTOptions(sampleRate: 44100)
// For 48kHz audio
let options = STTOptions(sampleRate: 48000)
For best results, record audio at 16kHz mono. While the SDK can resample, native 16kHz audio produces better accuracy.
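If you control audio capture, you can record 16kHz mono directly with AVFoundation so that no resampling is needed. A minimal sketch using AVAudioRecorder (audio session configuration and error handling omitted):
import AVFoundation

// Record 16kHz mono 16-bit linear PCM, matching the SDK's preferred input.
let settings: [String: Any] = [
    AVFormatIDKey: Int(kAudioFormatLinearPCM),
    AVSampleRateKey: 16_000.0,
    AVNumberOfChannelsKey: 1,
    AVLinearPCMBitDepthKey: 16,
    AVLinearPCMIsFloatKey: false
]
let url = FileManager.default.temporaryDirectory.appendingPathComponent("recording.wav")
let recorder = try AVAudioRecorder(url: url, settings: settings)
recorder.record()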
Transcription Metadata
Get information about the transcription process:
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
if let metadata = output.metadata {
    print("Model: \(metadata.modelId)")
    print("Processing time: \(metadata.processingTime)s")
    print("Audio duration: \(metadata.audioLength)s")
    let realTimeFactor = metadata.processingTime / metadata.audioLength
    print("Real-time factor: \(String(format: "%.2f", realTimeFactor))x")
}
Alternative Transcriptions
Access alternative interpretations of the audio when the model provides them:
let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
print("Best: \(output.text)")
if let alternatives = output.alternatives {
    for (i, alt) in alternatives.enumerated() {
        print("Alt \(i + 1): \(alt.text) (confidence: \(alt.confidence ?? 0))")
    }
}
Complete Example
class TranscriptionService {
    func transcribe(
        audioURL: URL,
        language: String? = nil,
        includeTimestamps: Bool = false
    ) async throws -> TranscriptionResult {
        let audioData = try Data(contentsOf: audioURL)
        let options = STTOptions(
            language: language ?? "en",
            sampleRate: 16000,
            enableWordTimestamps: includeTimestamps,
            enableVAD: true
        )
        let output = try await RunAnywhere.transcribeWithOptions(audioData, options: options)
        return TranscriptionResult(
            text: output.text,
            confidence: output.confidence,
            language: output.detectedLanguage ?? language ?? "en",
            timestamps: output.wordTimestamps,
            processingTime: output.metadata?.processingTime
        )
    }
}
struct TranscriptionResult {
    let text: String
    let confidence: Float?
    let language: String
    let timestamps: [WordTimestamp]?
    let processingTime: TimeInterval?
}