Overview

Real-time STT streaming allows you to transcribe audio as it’s being recorded, providing immediate feedback to users. This is essential for voice interfaces where low latency is critical.

Basic Usage

import { RunAnywhere } from '@runanywhere/core'

// Start streaming transcription
const session = await RunAnywhere.startSTTStream({
  language: 'en',
  onPartialResult: (partial) => {
    console.log('Partial:', partial.text)
  },
  onFinalResult: (final) => {
    console.log('Final:', final.text)
  },
})

// Feed audio chunks as they arrive
session.feedAudio(audioChunk) // Float32 samples, normalized to [-1, 1]

// When done
const result = await session.stop()
console.log('Complete transcription:', result.text)
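
Partial results are interim hypotheses for the in-progress segment and may be revised as more audio arrives; final results are committed text for a completed segment, which is why the examples below append finals and simply replace the partial display.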

API Reference

startSTTStream

Start a streaming transcription session.
RunAnywhere.startSTTStream(
  config: STTStreamConfig
): Promise<STTStreamSession>

Configuration

interface STTStreamConfig {
  /** Language code (e.g., 'en', 'es') */
  language?: string

  /** Sample rate of incoming audio (default: 16000) */
  sampleRate?: number

  /** Callback for partial results */
  onPartialResult?: (result: STTPartialResult) => void

  /** Callback for final segment results */
  onFinalResult?: (result: STTResult) => void

  /** Callback for errors */
  onError?: (error: Error) => void

  /** Enable Voice Activity Detection */
  enableVAD?: boolean
}
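
Putting the options together, a fully configured session might look like this (a sketch; renderPartial and appendTranscript are illustrative UI hooks):

const session = await RunAnywhere.startSTTStream({
  language: 'en',
  sampleRate: 16000,
  enableVAD: true,
  onPartialResult: (result) => renderPartial(result.text), // illustrative
  onFinalResult: (result) => appendTranscript(result.text), // illustrative
  onError: (error) => console.error('STT stream error:', error.message),
})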

Session Methods

interface STTStreamSession {
  /** Feed audio samples to the stream */
  feedAudio(samples: number[]): void

  /** Pause transcription */
  pause(): void

  /** Resume transcription */
  resume(): void

  /** Stop and get final result */
  stop(): Promise<STTResult>

  /** Check if session is active */
  isActive: boolean
}
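
pause() and resume() lend themselves to push-to-talk interactions. A minimal sketch (the button wiring is illustrative; session comes from startSTTStream):

// Pause transcription while the talk button is released
function setPushToTalk(session: STTStreamSession, held: boolean) {
  if (!session.isActive) return
  if (held) session.resume()
  else session.pause()
}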

Examples

Real-Time Microphone Transcription

LiveTranscription.tsx
import React, { useState, useCallback, useRef, useEffect } from 'react'
import { View, Button, Text } from 'react-native'
import { RunAnywhere, STTStreamSession } from '@runanywhere/core'
import AudioRecord from 'react-native-audio-record'

export function LiveTranscription() {
  const [isListening, setIsListening] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [partial, setPartial] = useState('')
  const sessionRef = useRef<STTStreamSession | null>(null)

  useEffect(() => {
    // Initialize audio recording
    AudioRecord.init({
      sampleRate: 16000,
      channels: 1,
      bitsPerSample: 16,
      audioSource: 6, // Android MediaRecorder.AudioSource.VOICE_RECOGNITION
    })

    return () => {
      if (sessionRef.current?.isActive) {
        sessionRef.current.stop()
      }
    }
  }, [])

  const startListening = useCallback(async () => {
    setIsListening(true)
    setTranscript('')
    setPartial('')

    // Start STT stream
    sessionRef.current = await RunAnywhere.startSTTStream({
      language: 'en',
      enableVAD: true,
      onPartialResult: (result) => {
        setPartial(result.text)
      },
      onFinalResult: (result) => {
        setTranscript((prev) => prev + result.text + ' ')
        setPartial('')
      },
    })

    // Register the data handler before starting so early chunks aren't missed
    AudioRecord.on('data', (data: string) => {
      // Convert base64-encoded 16-bit PCM to float samples
      const samples = base64ToFloat32(data)
      sessionRef.current?.feedAudio(samples)
    })
    AudioRecord.start()
  }, [])

  const stopListening = useCallback(async () => {
    setIsListening(false)
    AudioRecord.stop()

    if (sessionRef.current) {
      const finalResult = await sessionRef.current.stop()
      sessionRef.current = null // prevent a double stop from the unmount cleanup
      setTranscript((prev) => prev + finalResult.text)
      setPartial('')
    }
  }, [])

  return (
    <View style={{ padding: 16 }}>
      <Button
        title={isListening ? '🔴 Stop' : '🎤 Start'}
        onPress={isListening ? stopListening : startListening}
      />
      <View style={{ marginTop: 16 }}>
        <Text style={{ fontSize: 16 }}>
          {transcript}
          <Text style={{ color: '#888' }}>{partial}</Text>
        </Text>
      </View>
    </View>
  )
}

// Helper to convert base64-encoded 16-bit little-endian PCM to float32 samples
// Note: atob may need a polyfill on older React Native JS runtimes
function base64ToFloat32(base64: string): number[] {
  const binary = atob(base64)
  const int16Array = new Int16Array(binary.length / 2)
  for (let i = 0; i < int16Array.length; i++) {
    int16Array[i] = binary.charCodeAt(i * 2) | (binary.charCodeAt(i * 2 + 1) << 8)
  }
  return Array.from(int16Array).map((x) => x / 32768.0)
}
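
If you prefer to avoid atob, the same conversion can be done on already-decoded bytes with a DataView, which makes the little-endian read explicit (a sketch; how you obtain the Uint8Array is up to your base64 decoder):

// Alternative sketch: explicit little-endian PCM decoding via DataView
function pcm16BytesToFloat32(bytes: Uint8Array): number[] {
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  const samples = new Array<number>(bytes.byteLength >> 1)
  for (let i = 0; i < samples.length; i++) {
    samples[i] = view.getInt16(i * 2, true) / 32768 // true = little-endian
  }
  return samples
}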

With VAD Integration

When VAD is enabled, the stream automatically detects speech segments:
const session = await RunAnywhere.startSTTStream({
  language: 'en',
  enableVAD: true,
  onPartialResult: (result) => {
    // Updates while user is speaking
    updateUI(result.text)
  },
  onFinalResult: (result) => {
    // Called when VAD detects end of speech
    console.log('User finished speaking:', result.text)
  },
})

// Audio is processed, VAD handles segmentation automatically
session.feedAudio(audioSamples)

Custom Hook

useSTTStream.ts
import { useState, useRef, useCallback } from 'react'
import { RunAnywhere, STTStreamSession, STTResult } from '@runanywhere/core'

export function useSTTStream() {
  const [isStreaming, setIsStreaming] = useState(false)
  const [partial, setPartial] = useState('')
  const [transcript, setTranscript] = useState('')
  const sessionRef = useRef<STTStreamSession | null>(null)

  const start = useCallback(async (language = 'en') => {
    setIsStreaming(true)
    setPartial('')

    sessionRef.current = await RunAnywhere.startSTTStream({
      language,
      enableVAD: true,
      onPartialResult: (result) => setPartial(result.text),
      onFinalResult: (result) => {
        setTranscript((prev) => prev + result.text + ' ')
        setPartial('')
      },
    })
  }, [])

  const feedAudio = useCallback((samples: number[]) => {
    sessionRef.current?.feedAudio(samples)
  }, [])

  const stop = useCallback(async (): Promise<STTResult | null> => {
    setIsStreaming(false)
    if (sessionRef.current) {
      const result = await sessionRef.current.stop()
      setTranscript((prev) => prev + result.text)
      setPartial('')
      return result
    }
    return null
  }, [])

  const reset = useCallback(() => {
    setTranscript('')
    setPartial('')
  }, [])

  return {
    isStreaming,
    partial,
    transcript,
    start,
    feedAudio,
    stop,
    reset,
  }
}
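
Wiring the hook into a screen might look like this (a sketch; an audio recorder still needs to pipe samples into feedAudio, as in the microphone example above):

import React from 'react'
import { View, Button, Text } from 'react-native'
import { useSTTStream } from './useSTTStream'

export function DictationScreen() {
  const { isStreaming, partial, transcript, start, stop, reset } = useSTTStream()

  return (
    <View style={{ padding: 16 }}>
      <Button
        title={isStreaming ? 'Stop' : 'Start'}
        onPress={isStreaming ? () => stop() : () => start('en')}
      />
      <Button title="Clear" onPress={reset} />
      <Text>
        {transcript}
        <Text style={{ color: '#888' }}>{partial}</Text>
      </Text>
    </View>
  )
}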

Performance Considerations

Feed audio in chunks of 100-500 ms for the best balance between latency and accuracy. Chunks that are too small add per-call overhead; chunks that are too large increase perceived latency.

Optimal Chunk Size

// Recommended chunk sizes for 16kHz audio
const CHUNK_DURATION_MS = 200
const SAMPLE_RATE = 16000
const CHUNK_SIZE = (SAMPLE_RATE * CHUNK_DURATION_MS) / 1000 // 3200 samples

Buffer Management

// Buffer audio to send in optimal chunks
class AudioBuffer {
  private buffer: number[] = []
  private chunkSize: number
  private onChunkReady: (chunk: number[]) => void

  constructor(chunkSize: number, onChunkReady: (chunk: number[]) => void) {
    this.chunkSize = chunkSize
    this.onChunkReady = onChunkReady
  }

  push(samples: number[]) {
    this.buffer.push(...samples)

    while (this.buffer.length >= this.chunkSize) {
      const chunk = this.buffer.splice(0, this.chunkSize)
      this.onChunkReady(chunk)
    }
  }

  flush() {
    if (this.buffer.length > 0) {
      this.onChunkReady(this.buffer)
      this.buffer = []
    }
  }
}
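
Connecting the buffer between the recorder callback and the session keeps feedAudio calls at the recommended size (a sketch reusing CHUNK_SIZE and session from above):

const audioBuffer = new AudioBuffer(CHUNK_SIZE, (chunk) => {
  session.feedAudio(chunk)
})

// In the recorder's data callback:
audioBuffer.push(incomingSamples)

// Before stopping, flush the remainder so no trailing audio is lost
audioBuffer.flush()
const result = await session.stop()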

Error Handling

const session = await RunAnywhere.startSTTStream({
  language: 'en',
  onError: (error) => {
    console.error('STT stream error:', error.message)
    // Handle gracefully - maybe restart the stream
  },
})

// Wrap feedAudio in try-catch for safety
try {
  session.feedAudio(samples)
} catch (error) {
  console.error('Failed to feed audio:', error)
}
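
Building on the restart suggestion above, one recovery strategy is to drop the failed session and start a fresh one with the same configuration. A sketch (the class name and 500 ms backoff are illustrative, and it assumes STTStreamConfig is exported alongside STTStreamSession and STTResult):

import { RunAnywhere, STTStreamConfig, STTStreamSession, STTResult } from '@runanywhere/core'

class ResilientSTTStream {
  private session: STTStreamSession | null = null

  constructor(private config: STTStreamConfig) {}

  async start(): Promise<void> {
    this.session = await RunAnywhere.startSTTStream({
      ...this.config,
      onError: (error) => {
        console.error('STT stream error, restarting:', error.message)
        this.session = null
        setTimeout(() => this.start(), 500) // illustrative backoff
      },
    })
  }

  feedAudio(samples: number[]): void {
    // Safe to call while restarting; samples are simply dropped
    this.session?.feedAudio(samples)
  }

  async stop(): Promise<STTResult | null> {
    return this.session ? this.session.stop() : null
  }
}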