Early Beta — The Web SDK is in early beta. APIs may change between releases.

Overview

Token streaming allows you to display AI responses as they’re generated, token by token. This provides a much better user experience than waiting for the entire response, especially for longer outputs.

Basic Usage

import { TextGeneration } from '@runanywhere/web'

const { stream, result } = await TextGeneration.generateStream(
  'Write a short story about a robot',
  { maxTokens: 200 }
)

// Display tokens as they arrive
let fullResponse = ''
for await (const token of stream) {
  fullResponse += token
  document.getElementById('output')!.textContent = fullResponse
}

// Get final metrics
const finalResult = await result
console.log('Speed:', finalResult.tokensPerSecond.toFixed(1), 'tok/s')

API Reference

TextGeneration.generateStream(
  prompt: string,
  options?: LLMGenerationOptions
): Promise<LLMStreamingResult>

Parameters

Same as generate() — see Generation Options.
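
For example, a minimal sketch passing the two options used elsewhere on this page (the full set of fields is defined by LLMGenerationOptions):

// The same options object that generate() accepts; only maxTokens
// and temperature are shown here
const { stream } = await TextGeneration.generateStream('Summarize this page', {
  maxTokens: 150,
  temperature: 0.7,
})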

Returns

interface LLMStreamingResult {
  /** Async iterator yielding tokens one at a time */
  stream: AsyncIterable<string>

  /** Promise resolving to final result with metrics */
  result: Promise<LLMGenerationResult>

  /** Cancel the ongoing generation */
  cancel: () => void
}

Examples

React Component

StreamingChat.tsx
import { useState, useCallback } from 'react'
import { TextGeneration } from '@runanywhere/web'

export function StreamingChat() {
  const [response, setResponse] = useState('')
  const [isStreaming, setIsStreaming] = useState(false)
  const [metrics, setMetrics] = useState('')

  const handleStream = useCallback(async () => {
    setResponse('')
    setMetrics('')
    setIsStreaming(true)

    try {
      const { stream, result } = await TextGeneration.generateStream(
        'Explain how neural networks work',
        { maxTokens: 300, temperature: 0.7 }
      )

      let fullText = ''
      for await (const token of stream) {
        fullText += token
        setResponse(fullText)
      }

      const final = await result
      setMetrics(
        `${final.tokensPerSecond.toFixed(1)} tok/s | ` +
          `${final.latencyMs}ms | ${final.tokensUsed} tokens`
      )
    } catch (error) {
      setResponse('Error: ' + (error as Error).message)
    } finally {
      setIsStreaming(false)
    }
  }, [])

  return (
    <div>
      <button onClick={handleStream} disabled={isStreaming}>
        {isStreaming ? 'Streaming...' : 'Start Streaming'}
      </button>

      <div style={{ marginTop: 16, fontSize: 16, lineHeight: 1.6 }}>
        {response}
        {isStreaming && <span style={{ opacity: 0.5 }}>|</span>}
      </div>

      {metrics && <div style={{ marginTop: 8, color: '#666', fontSize: 12 }}>{metrics}</div>}
    </div>
  )
}

Cancellable Streaming

const { stream, result, cancel } = await TextGeneration.generateStream(
  'Write a very long story...',
  { maxTokens: 1000 }
)

// Cancel after 3 seconds if generation is still running
const timeout = setTimeout(() => cancel(), 3000)

let text = ''
// Once cancelled, the stream ends and the loop exits with the
// tokens generated so far
for await (const token of stream) {
  text += token
}

clearTimeout(timeout)
console.log('Generated text:', text)
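
One of the performance tips below is to cancel streams when users navigate away. A minimal sketch of that cleanup, reusing the cancel function from this example (pagehide is a standard browser event, not part of the SDK):

// Cancel an in-flight generation when the page is hidden or
// unloaded, so the underlying WASM resources are freed early
window.addEventListener('pagehide', () => cancel(), { once: true })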

Custom Streaming Hook (React)

useStreamingGenerate.ts
import { useState, useCallback, useRef } from 'react'
import { TextGeneration, LLMGenerationOptions, LLMGenerationResult } from '@runanywhere/web'

export function useStreamingGenerate() {
  const [text, setText] = useState('')
  const [isStreaming, setIsStreaming] = useState(false)
  const [error, setError] = useState<Error | null>(null)
  const [metrics, setMetrics] = useState<LLMGenerationResult | null>(null)
  const cancelRef = useRef<(() => void) | null>(null)

  const generate = useCallback(async (prompt: string, options?: LLMGenerationOptions) => {
    setText('')
    setIsStreaming(true)
    setError(null)
    setMetrics(null)

    try {
      const { stream, result, cancel } = await TextGeneration.generateStream(prompt, options)
      cancelRef.current = cancel

      let accumulated = ''
      for await (const token of stream) {
        accumulated += token
        setText(accumulated)
      }

      const finalMetrics = await result
      setMetrics(finalMetrics)
      return finalMetrics
    } catch (err) {
      const e = err instanceof Error ? err : new Error('Streaming failed')
      setError(e)
      throw e
    } finally {
      setIsStreaming(false)
      cancelRef.current = null
    }
  }, [])

  const cancel = useCallback(() => {
    cancelRef.current?.()
  }, [])

  return { text, isStreaming, error, metrics, generate, cancel }
}
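
The hook can then drive a component in a few lines. A usage sketch (the component name and prompt are illustrative):

import { useStreamingGenerate } from './useStreamingGenerate'

export function StoryPanel() {
  const { text, isStreaming, error, generate, cancel } = useStreamingGenerate()

  return (
    <div>
      {/* The hook already captures failures in its error state,
          so the rejected promise can be ignored here */}
      <button
        onClick={() => generate('Tell me a story', { maxTokens: 200 }).catch(() => {})}
        disabled={isStreaming}
      >
        {isStreaming ? 'Streaming...' : 'Generate'}
      </button>
      {isStreaming && <button onClick={cancel}>Stop</button>}
      <div>{text}</div>
      {error && <div>Error: {error.message}</div>}
    </div>
  )
}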

Optimize UI Updates

For very fast generation, batch UI updates to avoid overwhelming the browser:

function createThrottledUpdater(element: HTMLElement) {
  let pending = ''
  let frameId: number | null = null

  return {
    append(token: string) {
      pending += token
      if (!frameId) {
        frameId = requestAnimationFrame(() => {
          element.textContent = pending
          frameId = null
        })
      }
    },
    reset() {
      pending = ''
      element.textContent = ''
      if (frameId) {
        cancelAnimationFrame(frameId)
        frameId = null
      }
    },
  }
}

// Usage
const updater = createThrottledUpdater(document.getElementById('output')!)
const { stream } = await TextGeneration.generateStream('Tell me a story', { maxTokens: 200 })

for await (const token of stream) {
  updater.append(token)
}

Performance Tips

Streaming has minimal overhead compared to non-streaming generation. The time-to-first-token (TTFT) is the same, and total generation time is nearly identical.
  • Use requestAnimationFrame to batch DOM updates for smoother rendering
  • Avoid setting React state on every token for very fast models — batch updates with a throttle (see the hook sketch after this list)
  • Cancel streams when users navigate away to free WASM resources
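
For React, the same requestAnimationFrame idea can be wrapped in a small hook. A sketch using only standard React APIs (useFrameBatchedText is a hypothetical name, not an SDK export):

import { useCallback, useEffect, useRef, useState } from 'react'

export function useFrameBatchedText() {
  const [text, setText] = useState('')
  const bufferRef = useRef('')
  const frameRef = useRef<number | null>(null)

  // Accumulate tokens in a ref and flush to React state at most
  // once per animation frame
  const append = useCallback((token: string) => {
    bufferRef.current += token
    if (frameRef.current === null) {
      frameRef.current = requestAnimationFrame(() => {
        setText(bufferRef.current)
        frameRef.current = null
      })
    }
  }, [])

  const reset = useCallback(() => {
    bufferRef.current = ''
    setText('')
    if (frameRef.current !== null) {
      cancelAnimationFrame(frameRef.current)
      frameRef.current = null
    }
  }, [])

  // Drop any pending frame on unmount
  useEffect(() => {
    return () => {
      if (frameRef.current !== null) cancelAnimationFrame(frameRef.current)
    }
  }, [])

  return { text, append, reset }
}

Inside a component, feed each streamed token to append instead of setting state directly, and render text as usual.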