Overview
Real-time STT streaming allows you to transcribe audio as it’s being recorded, providing immediate feedback to users. This is essential for voice interfaces where low latency is critical.
Basic Usage
import { RunAnywhere } from '@runanywhere/core'
// Start streaming transcription
const session = await RunAnywhere.startSTTStream({
language: 'en',
onPartialResult: (partial) => {
console.log('Partial:', partial.text)
},
onFinalResult: (final) => {
console.log('Final:', final.text)
},
})
// Feed audio chunks as they arrive
session.feedAudio(audioChunk) // Float32 samples
// When done
const result = await session.stop()
console.log('Complete transcription:', result.text)
API Reference
startSTTStream
Start a streaming transcription session.
await RunAnywhere.startSTTStream(
config: STTStreamConfig
): Promise<STTStreamSession>
Configuration
/** Options accepted by RunAnywhere.startSTTStream(). All fields are optional. */
interface STTStreamConfig {
/** Language code (e.g., 'en', 'es') */
language?: string
/** Sample rate in Hz of incoming audio fed via feedAudio (default: 16000) */
sampleRate?: number
/** Callback for partial (in-progress) results; may fire repeatedly while the user is speaking */
onPartialResult?: (result: STTPartialResult) => void
/** Callback for final segment results; with VAD enabled, fires when end of speech is detected */
onFinalResult?: (result: STTResult) => void
/** Callback for errors raised by the stream */
onError?: (error: Error) => void
/** Enable Voice Activity Detection (automatic speech-segment detection) */
enableVAD?: boolean
}
Session Methods
/** Handle returned by startSTTStream; used to feed audio and control the session. */
interface STTStreamSession {
/** Feed audio samples to the stream (Float32 samples, normalized — see feedAudio usage above) */
feedAudio(samples: number[]): void
/** Pause transcription */
pause(): void
/** Resume transcription */
resume(): void
/** Stop the session and resolve with the complete transcription result */
stop(): Promise<STTResult>
/** Check if session is active (false once stopped) */
isActive: boolean
}
Examples
Real-Time Microphone Transcription
LiveTranscription.tsx
import React, { useState, useCallback, useRef, useEffect } from 'react'
import { View, Button, Text } from 'react-native'
import { RunAnywhere, STTStreamSession } from '@runanywhere/core'
import AudioRecord from 'react-native-audio-record'
/**
 * Live microphone transcription demo.
 *
 * Streams microphone audio into a RunAnywhere STT session, showing the
 * confirmed transcript plus the current partial hypothesis in grey.
 *
 * Fixes over the naive version:
 * - The AudioRecord 'data' listener is registered ONCE on mount. Registering
 *   it inside startListening added a duplicate listener on every start/stop
 *   cycle, feeding each audio chunk to the session multiple times.
 * - Unmount cleanup stops the recorder as well as any in-flight session.
 * - stopListening clears the session ref so a later unmount cannot stop the
 *   same session twice.
 */
export function LiveTranscription() {
  const [isListening, setIsListening] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [partial, setPartial] = useState('')
  const sessionRef = useRef<STTStreamSession | null>(null)

  useEffect(() => {
    // Initialize audio recording once per mount.
    AudioRecord.init({
      sampleRate: 16000, // matches the STT stream's default expected rate
      channels: 1,
      bitsPerSample: 16,
      audioSource: 6, // NOTE(review): presumably Android VOICE_RECOGNITION — confirm against react-native-audio-record docs
    })

    // Register the data listener exactly once. When no session is active,
    // sessionRef.current is null and the optional chain makes this a no-op.
    AudioRecord.on('data', (data: string) => {
      // Convert base64-encoded PCM16 to float32 samples.
      const samples = base64ToFloat32(data)
      sessionRef.current?.feedAudio(samples)
    })

    return () => {
      // Stop capture and close any in-flight session on unmount.
      AudioRecord.stop()
      if (sessionRef.current?.isActive) {
        sessionRef.current.stop()
      }
    }
  }, [])

  const startListening = useCallback(async () => {
    setIsListening(true)
    setTranscript('')
    setPartial('')

    // Start the STT stream; audio is piped in by the mount-time listener.
    sessionRef.current = await RunAnywhere.startSTTStream({
      language: 'en',
      enableVAD: true,
      onPartialResult: (result) => {
        setPartial(result.text)
      },
      onFinalResult: (result) => {
        // Append each finalized segment and clear the partial overlay.
        setTranscript((prev) => prev + result.text + ' ')
        setPartial('')
      },
    })

    AudioRecord.start()
  }, [])

  const stopListening = useCallback(async () => {
    setIsListening(false)
    AudioRecord.stop()
    const session = sessionRef.current
    if (session) {
      // Clear the ref first so unmount cleanup cannot double-stop.
      sessionRef.current = null
      const finalResult = await session.stop()
      setTranscript((prev) => prev + finalResult.text)
      setPartial('')
    }
  }, [])

  return (
    <View style={{ padding: 16 }}>
      <Button
        title={isListening ? '🔴 Stop' : '🎤 Start'}
        onPress={isListening ? stopListening : startListening}
      />
      <View style={{ marginTop: 16 }}>
        <Text style={{ fontSize: 16 }}>
          {transcript}
          <Text style={{ color: '#888' }}>{partial}</Text>
        </Text>
      </View>
    </View>
  )
}
// Helper to convert base64 audio to float32 samples
function base64ToFloat32(base64: string): number[] {
const binary = atob(base64)
const int16Array = new Int16Array(binary.length / 2)
for (let i = 0; i < int16Array.length; i++) {
int16Array[i] = binary.charCodeAt(i * 2) | (binary.charCodeAt(i * 2 + 1) << 8)
}
return Array.from(int16Array).map((x) => x / 32768.0)
}
With VAD Integration
When VAD is enabled, the stream automatically detects speech segments:
const session = await RunAnywhere.startSTTStream({
language: 'en',
enableVAD: true,
onPartialResult: (result) => {
// Updates while user is speaking
updateUI(result.text)
},
onFinalResult: (result) => {
// Called when VAD detects end of speech
console.log('User finished speaking:', result.text)
},
})
// Audio is processed, VAD handles segmentation automatically
session.feedAudio(audioSamples)
Custom Hook
useSTTStream.ts
import { useState, useRef, useCallback } from 'react'
import { RunAnywhere, STTStreamSession, STTResult } from '@runanywhere/core'
/**
 * React hook wrapping a RunAnywhere STT streaming session.
 *
 * Returns the streaming flag, the current partial hypothesis, the
 * accumulated transcript, and control functions.
 *
 * Fixes over the naive version:
 * - start() stops any still-active previous session instead of silently
 *   orphaning it (calling start twice leaked the first session).
 * - stop() clears the session ref before awaiting, so a concurrent second
 *   stop() cannot stop the same session twice.
 */
export function useSTTStream() {
  const [isStreaming, setIsStreaming] = useState(false)
  const [partial, setPartial] = useState('')
  const [transcript, setTranscript] = useState('')
  const sessionRef = useRef<STTStreamSession | null>(null)

  const start = useCallback(async (language = 'en') => {
    // Guard: don't leak a previous session when start() is called again.
    if (sessionRef.current?.isActive) {
      await sessionRef.current.stop()
    }
    setIsStreaming(true)
    setPartial('')
    sessionRef.current = await RunAnywhere.startSTTStream({
      language,
      enableVAD: true,
      onPartialResult: (result) => setPartial(result.text),
      onFinalResult: (result) => {
        // Append each finalized segment and clear the partial overlay.
        setTranscript((prev) => prev + result.text + ' ')
        setPartial('')
      },
    })
  }, [])

  const feedAudio = useCallback((samples: number[]) => {
    // No-op when no session is active.
    sessionRef.current?.feedAudio(samples)
  }, [])

  const stop = useCallback(async (): Promise<STTResult | null> => {
    setIsStreaming(false)
    const session = sessionRef.current
    if (!session) {
      return null
    }
    // Clear the ref first so a racing second stop() returns null
    // instead of stopping the same session again.
    sessionRef.current = null
    const result = await session.stop()
    setTranscript((prev) => prev + result.text)
    setPartial('')
    return result
  }, [])

  const reset = useCallback(() => {
    // Clears accumulated text without touching any active session.
    setTranscript('')
    setPartial('')
  }, [])

  return {
    isStreaming,
    partial,
    transcript,
    start,
    feedAudio,
    stop,
    reset,
  }
}
Performance Considerations
Feed audio in chunks of 100–500 ms for an optimal balance between latency and accuracy. Chunks
that are too small increase overhead; chunks that are too large increase perceived latency.
Optimal Chunk Size
// Recommended chunk sizes for 16kHz audio
const CHUNK_DURATION_MS = 200
const SAMPLE_RATE = 16000
const CHUNK_SIZE = (SAMPLE_RATE * CHUNK_DURATION_MS) / 1000 // 3200 samples
Buffer Management
// Buffer audio to send in optimal chunks
/**
 * Accumulates raw audio samples and emits them in fixed-size chunks.
 *
 * Fix: the original `this.buffer.push(...samples)` spreads the incoming
 * array onto the call stack and throws a RangeError for large inputs
 * (V8's argument limit is on the order of 100k); `concat` has no such limit.
 */
class AudioBuffer {
  // Samples received but not yet emitted as a full chunk.
  private buffer: number[] = []

  /**
   * @param chunkSize    number of samples per emitted chunk
   * @param onChunkReady invoked with each full chunk (and the remainder on flush)
   */
  constructor(
    private readonly chunkSize: number,
    private readonly onChunkReady: (chunk: number[]) => void,
  ) {}

  /** Append samples; fires the callback once per complete chunk now available. */
  push(samples: number[]) {
    // concat instead of push(...samples): safe for arbitrarily large inputs.
    this.buffer = this.buffer.concat(samples)
    while (this.buffer.length >= this.chunkSize) {
      const chunk = this.buffer.splice(0, this.chunkSize)
      this.onChunkReady(chunk)
    }
  }

  /** Emit whatever remains as a final, possibly short, chunk. */
  flush() {
    if (this.buffer.length > 0) {
      this.onChunkReady(this.buffer)
      this.buffer = []
    }
  }
}
Error Handling
const session = await RunAnywhere.startSTTStream({
language: 'en',
onError: (error) => {
console.error('STT stream error:', error.message)
// Handle gracefully - maybe restart the stream
},
})
// Wrap feedAudio in try-catch for safety
try {
session.feedAudio(samples)
} catch (error) {
console.error('Failed to feed audio:', error)
}
Related
Transcribe
Batch transcription
STT Options
Configuration options
VAD
Voice Activity Detection
Voice Agent
Full voice pipeline