Overview
Real-time STT streaming allows you to transcribe audio as it’s being recorded, providing immediate feedback to users. This is essential for voice interfaces where low latency is critical.

Basic Usage
Copy
Ask AI
import { RunAnywhere } from '@runanywhere/core'

// Open a live transcription session; results arrive through the callbacks.
const stream = await RunAnywhere.startSTTStream({
  language: 'en',
  onPartialResult: (result) => console.log('Partial:', result.text),
  onFinalResult: (result) => console.log('Final:', result.text),
})

// Push audio into the session as it is captured (Float32 samples).
stream.feedAudio(audioChunk)

// Close the session and collect the complete transcript.
const result = await stream.stop()
console.log('Complete transcription:', result.text)
API Reference
startSTTStream
Start a streaming transcription session.
Copy
Ask AI
await RunAnywhere.startSTTStream(
config: STTStreamConfig
): Promise<STTStreamSession>
Configuration
Copy
Ask AI
/**
 * Options accepted by RunAnywhere.startSTTStream.
 * All fields are optional; the callbacks fire as audio fed via
 * feedAudio is transcribed.
 */
interface STTStreamConfig {
  /** Language code (e.g., 'en', 'es') */
  language?: string
  /** Sample rate of incoming audio (default: 16000) */
  sampleRate?: number
  /** Callback for partial results — fires repeatedly while a segment is still being transcribed */
  onPartialResult?: (result: STTPartialResult) => void
  /** Callback for final segment results (per the examples, fires once per completed segment) */
  onFinalResult?: (result: STTResult) => void
  /** Callback for errors */
  onError?: (error: Error) => void
  /** Enable Voice Activity Detection (automatic speech-segment boundaries) */
  enableVAD?: boolean
}
Session Methods
Copy
Ask AI
/**
 * Handle returned by startSTTStream for driving an active streaming session.
 */
interface STTStreamSession {
  /** Feed audio samples to the stream (Float32 values — presumably in [-1, 1], as produced by the examples' PCM conversion) */
  feedAudio(samples: number[]): void
  /** Pause transcription */
  pause(): void
  /** Resume transcription */
  resume(): void
  /** Stop and get final result */
  stop(): Promise<STTResult>
  /** Check if session is active */
  isActive: boolean
}
Examples
Real-Time Microphone Transcription
LiveTranscription.tsx
Copy
Ask AI
import React, { useState, useCallback, useRef, useEffect } from 'react'
import { View, Button, Text } from 'react-native'
import { RunAnywhere, STTStreamSession } from '@runanywhere/core'
import AudioRecord from 'react-native-audio-record'
/**
 * Live microphone transcription demo.
 *
 * Streams mic audio (16 kHz mono PCM from react-native-audio-record) into a
 * RunAnywhere STT session, showing finalized text plus the in-flight partial.
 */
export function LiveTranscription() {
  const [isListening, setIsListening] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [partial, setPartial] = useState('')
  const sessionRef = useRef<STTStreamSession | null>(null)

  useEffect(() => {
    // Initialize audio recording once per mount.
    AudioRecord.init({
      sampleRate: 16000,
      channels: 1,
      bitsPerSample: 16,
      audioSource: 6, // presumably Android VOICE_RECOGNITION — TODO confirm
    })

    // BUG FIX: register the 'data' listener ONCE here, not inside
    // startListening. Registering per-start added a new listener on every
    // tap of Start, so each audio chunk was fed to the stream N times
    // after N starts. While no session is active, sessionRef.current is
    // null and chunks are silently dropped via optional chaining.
    AudioRecord.on('data', (data: string) => {
      // Convert base64 PCM to float32 samples before feeding the stream.
      const samples = base64ToFloat32(data)
      sessionRef.current?.feedAudio(samples)
    })

    return () => {
      // Stop any in-flight session when the component unmounts.
      if (sessionRef.current?.isActive) {
        sessionRef.current.stop()
      }
    }
  }, [])

  const startListening = useCallback(async () => {
    setIsListening(true)
    setTranscript('')
    setPartial('')

    // Start the STT stream; audio flows in through the 'data' listener above.
    sessionRef.current = await RunAnywhere.startSTTStream({
      language: 'en',
      enableVAD: true,
      onPartialResult: (result) => {
        setPartial(result.text)
      },
      onFinalResult: (result) => {
        // Append the finished segment and clear the partial overlay.
        setTranscript((prev) => prev + result.text + ' ')
        setPartial('')
      },
    })

    AudioRecord.start()
  }, [])

  const stopListening = useCallback(async () => {
    setIsListening(false)
    AudioRecord.stop()
    if (sessionRef.current) {
      const finalResult = await sessionRef.current.stop()
      // Clear the ref so the unmount cleanup (or a rapid second tap)
      // cannot call stop() on an already-stopped session.
      sessionRef.current = null
      setTranscript((prev) => prev + finalResult.text)
      setPartial('')
    }
  }, [])

  return (
    <View style={{ padding: 16 }}>
      <Button
        title={isListening ? '🔴 Stop' : '🎤 Start'}
        onPress={isListening ? stopListening : startListening}
      />
      <View style={{ marginTop: 16 }}>
        <Text style={{ fontSize: 16 }}>
          {transcript}
          <Text style={{ color: '#888' }}>{partial}</Text>
        </Text>
      </View>
    </View>
  )
}
// Helper to convert base64 audio to float32 samples
function base64ToFloat32(base64: string): number[] {
const binary = atob(base64)
const int16Array = new Int16Array(binary.length / 2)
for (let i = 0; i < int16Array.length; i++) {
int16Array[i] = binary.charCodeAt(i * 2) | (binary.charCodeAt(i * 2 + 1) << 8)
}
return Array.from(int16Array).map((x) => x / 32768.0)
}
With VAD Integration
When VAD is enabled, the stream automatically detects speech segments:

Copy
Ask AI
const session = await RunAnywhere.startSTTStream({
  language: 'en',
  enableVAD: true,
  // Fires repeatedly while speech is in progress.
  onPartialResult: (result) => updateUI(result.text),
  // Fires once VAD decides the speaker has stopped.
  onFinalResult: (result) => {
    console.log('User finished speaking:', result.text)
  },
})

// Just keep feeding audio — VAD takes care of segmentation.
session.feedAudio(audioSamples)
Custom Hook
useSTTStream.ts
Copy
Ask AI
import { useState, useRef, useCallback } from 'react'
import { RunAnywhere, STTStreamSession, STTResult } from '@runanywhere/core'
/**
 * React hook wrapping a RunAnywhere STT streaming session.
 *
 * Exposes the live partial text, the accumulated transcript, and
 * start/feedAudio/stop/reset controls. Only one session is active at a time.
 */
export function useSTTStream() {
  const [isStreaming, setIsStreaming] = useState(false)
  const [partial, setPartial] = useState('')
  const [transcript, setTranscript] = useState('')
  const sessionRef = useRef<STTStreamSession | null>(null)

  const start = useCallback(async (language = 'en') => {
    // BUG FIX: close any session left over from a previous start so we
    // never hold two live streams (the old one used to leak here).
    if (sessionRef.current) {
      await sessionRef.current.stop()
      sessionRef.current = null
    }
    setIsStreaming(true)
    setPartial('')
    sessionRef.current = await RunAnywhere.startSTTStream({
      language,
      enableVAD: true,
      onPartialResult: (result) => setPartial(result.text),
      onFinalResult: (result) => {
        // Append the finished segment and clear the partial overlay.
        setTranscript((prev) => prev + result.text + ' ')
        setPartial('')
      },
    })
  }, [])

  const feedAudio = useCallback((samples: number[]) => {
    sessionRef.current?.feedAudio(samples)
  }, [])

  const stop = useCallback(async (): Promise<STTResult | null> => {
    setIsStreaming(false)
    const session = sessionRef.current
    if (!session) {
      return null
    }
    // BUG FIX: clear the ref before awaiting so a concurrent or repeated
    // stop() cannot call stop() on an already-stopped session.
    sessionRef.current = null
    const result = await session.stop()
    setTranscript((prev) => prev + result.text)
    setPartial('')
    return result
  }, [])

  const reset = useCallback(() => {
    setTranscript('')
    setPartial('')
  }, [])

  return {
    isStreaming,
    partial,
    transcript,
    start,
    feedAudio,
    stop,
    reset,
  }
}
Performance Considerations
Feed audio in chunks of 100–500 ms for an optimal balance between latency and accuracy. Chunks that
are too small increase per-call overhead; chunks that are too large increase perceived latency.
Optimal Chunk Size
Copy
Ask AI
// Chunk sizing for 16 kHz streaming audio: ~200 ms per chunk balances
// per-call overhead against perceived latency.
const CHUNK_DURATION_MS = 200
const SAMPLE_RATE = 16000
const CHUNK_SIZE = SAMPLE_RATE * (CHUNK_DURATION_MS / 1000) // 3200 samples per chunk
Buffer Management
Copy
Ask AI
// Buffer audio to send in optimal chunks
class AudioBuffer {
private buffer: number[] = []
private chunkSize: number
private onChunkReady: (chunk: number[]) => void
constructor(chunkSize: number, onChunkReady: (chunk: number[]) => void) {
this.chunkSize = chunkSize
this.onChunkReady = onChunkReady
}
push(samples: number[]) {
this.buffer.push(...samples)
while (this.buffer.length >= this.chunkSize) {
const chunk = this.buffer.splice(0, this.chunkSize)
this.onChunkReady(chunk)
}
}
flush() {
if (this.buffer.length > 0) {
this.onChunkReady(this.buffer)
this.buffer = []
}
}
}
Error Handling
Copy
Ask AI
const session = await RunAnywhere.startSTTStream({
  language: 'en',
  onError: (error) => {
    // Stream-level failure: log it, then recover.
    console.error('STT stream error:', error.message)
    // Handle gracefully - maybe restart the stream
  },
})

// feedAudio itself can throw on a bad chunk — guard each call so one
// malformed buffer does not take down the whole pipeline.
try {
  session.feedAudio(samples)
} catch (error) {
  console.error('Failed to feed audio:', error)
}