Skip to main content
Early Beta — The Web SDK is in early beta. APIs may change between releases.

Complete Example

Here’s a complete example to get you started with on-device text generation in the browser using React + Vite:
runanywhere.ts
import {
  RunAnywhere,
  SDKEnvironment,
  ModelManager,
  ModelCategory,
  LLMFramework,
  type CompactModelDef,
} from '@runanywhere/web'

import { LlamaCPP } from '@runanywhere/web-llamacpp'

// Define your model catalog
// Define your model catalog
// NOTE(review): field semantics below are inferred from the surrounding guide —
// confirm against the CompactModelDef type docs.
const MODELS: CompactModelDef[] = [
  {
    id: 'lfm2-350m-q4_k_m', // catalog key passed to ModelManager.downloadModel/loadModel
    name: 'LFM2 350M Q4_K_M', // human-readable display name
    repo: 'LiquidAI/LFM2-350M-GGUF', // HuggingFace repo; SDK builds the download URL from it
    files: ['LFM2-350M-Q4_K_M.gguf'], // file(s) fetched from the repo
    framework: LLMFramework.LlamaCpp, // backend that will run this model
    modality: ModelCategory.Language, // text-generation model
    memoryRequirement: 250_000_000, // approx. bytes needed to load — TODO confirm unit
  },
]

// Cached init promise: makes initSDK() idempotent so multiple components can
// call it concurrently and share a single initialization.
let _initPromise: Promise<void> | null = null

/**
 * Initialize the RunAnywhere SDK exactly once.
 *
 * Safe to call from any number of components: concurrent callers share the
 * same promise. If initialization fails, the cached promise is cleared so a
 * later call can retry — the original cached the rejection forever, making
 * recovery impossible without a page reload.
 */
export async function initSDK(): Promise<void> {
  if (_initPromise) return _initPromise

  _initPromise = (async () => {
    // 1. Initialize core SDK (TypeScript-only, no WASM yet)
    await RunAnywhere.initialize({
      environment: SDKEnvironment.Development,
      debug: true,
    })

    // 2. Register the LlamaCpp backend (loads WASM automatically)
    await LlamaCPP.register()

    // 3. Register model catalog
    RunAnywhere.registerModels(MODELS)
  })()

  // On failure, drop the cached promise so the next call retries.
  // Callers holding the original promise still observe the rejection.
  _initPromise.catch(() => {
    _initPromise = null
  })

  return _initPromise
}

export { RunAnywhere, ModelManager, ModelCategory }
App.tsx
import { useEffect, useState, useCallback } from 'react'
import { ModelManager, ModelCategory, EventBus } from '@runanywhere/web'
import { TextGeneration } from '@runanywhere/web-llamacpp'
import { initSDK } from './runanywhere'

/**
 * Minimal demo: initialize the SDK, then download/load the first registered
 * language model and stream a text generation into the page.
 */
function App() {
  const [ready, setReady] = useState(false)
  const [response, setResponse] = useState('')

  useEffect(() => {
    // Surface init failures instead of silently leaving the button stuck
    // on "Loading SDK..." (the original ignored the rejection).
    initSDK()
      .then(() => setReady(true))
      .catch((error: unknown) => {
        setResponse(error instanceof Error ? error.message : String(error))
      })
  }, [])

  const handleGenerate = useCallback(async () => {
    // Pick the first registered language model.
    const models = ModelManager.getModels().filter((m) => m.modality === ModelCategory.Language)
    const model = models[0]

    // Guard: models[0] is undefined when no language model is registered —
    // the original crashed here reading model.status.
    if (!model) {
      setResponse('No language model registered.')
      return
    }

    // Download model if needed (persists in OPFS across reloads)
    if (model.status !== 'downloaded' && model.status !== 'loaded') {
      await ModelManager.downloadModel(model.id)
    }

    // Load model into the WASM engine
    await ModelManager.loadModel(model.id)

    // Stream tokens in real-time
    const { stream, result: resultPromise } = await TextGeneration.generateStream(
      'Explain quantum computing briefly.',
      { maxTokens: 200, temperature: 0.7 }
    )

    let text = ''
    for await (const token of stream) {
      text += token
      setResponse(text)
    }

    // Metrics resolve once the stream completes.
    const metrics = await resultPromise
    console.log(`${metrics.tokensPerSecond.toFixed(1)} tok/s | ${metrics.latencyMs}ms`)
  }, [])

  return (
    <div>
      <button onClick={handleGenerate} disabled={!ready}>
        {ready ? 'Generate' : 'Loading SDK...'}
      </button>
      <div>{response}</div>
    </div>
  )
}

Step-by-Step Guide

1. Install Packages

npm install @runanywhere/web @runanywhere/web-llamacpp
Add @runanywhere/web-onnx too if you need STT, TTS, or VAD. See Installation for the full list.

2. Initialize the SDK

Initialize RunAnywhere once when your app starts. This is a three-step process:
import { RunAnywhere, SDKEnvironment } from '@runanywhere/web'
import { LlamaCPP } from '@runanywhere/web-llamacpp'

// Step 1: Initialize core SDK (TypeScript-only, no WASM)
await RunAnywhere.initialize({
  environment: SDKEnvironment.Development, // see the Environment Options table below
  debug: true, // verbose logging; disable for production builds
})

// Step 2: Register backend(s) — this loads WASM automatically
await LlamaCPP.register()
Wrap initialization in an idempotent function (using a cached promise) so it’s safe to call from multiple components. See the complete example above.

Environment Options

| Environment | Enum Value                   | Log Level | Description                          |
| ----------- | ---------------------------- | --------- | ------------------------------------ |
| Development | `SDKEnvironment.Development` | Debug     | Full logging, local testing          |
| Staging     | `SDKEnvironment.Staging`     | Info      | Staging backend, moderate logging    |
| Production  | `SDKEnvironment.Production`  | Warning   | Production deployment, minimal logs  |

3. Register and Load a Model

Models are registered in a catalog, downloaded to OPFS, and then loaded into memory:
import { RunAnywhere, ModelManager, ModelCategory, LLMFramework, EventBus } from '@runanywhere/web'

// Register model catalog
RunAnywhere.registerModels([
  {
    id: 'lfm2-350m-q4_k_m', // key used for downloadModel/loadModel below
    name: 'LFM2 350M Q4_K_M',
    repo: 'LiquidAI/LFM2-350M-GGUF', // HuggingFace repo; URL is constructed automatically
    files: ['LFM2-350M-Q4_K_M.gguf'],
    framework: LLMFramework.LlamaCpp,
    modality: ModelCategory.Language,
    memoryRequirement: 250_000_000,
  },
])

// Track download progress
// NOTE(review): evt.progress appears to be a 0–1 fraction (hence * 100); confirm in EventBus docs.
EventBus.shared.on('model.downloadProgress', (evt) => {
  console.log(`Downloading ${evt.modelId}: ${((evt.progress ?? 0) * 100).toFixed(0)}%`)
})

// Download to OPFS (persists across page reloads)
await ModelManager.downloadModel('lfm2-350m-q4_k_m')

// Load into WASM engine for inference
await ModelManager.loadModel('lfm2-350m-q4_k_m')

4. Generate Text

import { TextGeneration } from '@runanywhere/web-llamacpp'

// Kick off a streaming generation. `cancel` can abort mid-stream; `result`
// resolves with final metrics once the stream finishes.
const generation = await TextGeneration.generateStream('What is 2+2?', {
  maxTokens: 50,
  temperature: 0.7,
})
const { stream, result: resultPromise, cancel } = generation

// Collect tokens as they arrive.
const tokens: string[] = []
for await (const token of stream) {
  tokens.push(token)
}
const fullResponse = tokens.join('')

// Await the final result and report stats.
const result = await resultPromise
console.log('Response:', result.text)
console.log('Tokens used:', result.tokensUsed)
console.log('Tokens/sec:', result.tokensPerSecond.toFixed(1))
console.log('Latency:', result.latencyMs, 'ms')

Generation Result

The result promise resolves with:
/** Shape of the value the `result` promise from `generateStream` resolves with. */
interface TextGenerationResult {
  /** The complete generated text. */
  text: string
  /** Number of tokens produced for this generation. */
  tokensUsed: number
  /** Generation throughput (tokens per second). */
  tokensPerSecond: number
  /** End-to-end generation time in milliseconds. */
  latencyMs: number
}

Model Management with OPFS

Models are downloaded from HuggingFace and stored in the browser’s OPFS:
import { RunAnywhere, ModelManager, ModelCategory, LLMFramework, EventBus } from '@runanywhere/web'

// Register models with HuggingFace repo paths
RunAnywhere.registerModels([
  {
    id: 'lfm2-350m-q4_k_m',
    name: 'LFM2 350M Q4_K_M',
    repo: 'LiquidAI/LFM2-350M-GGUF', // repo + files → download URL built by the SDK
    files: ['LFM2-350M-Q4_K_M.gguf'],
    framework: LLMFramework.LlamaCpp,
    modality: ModelCategory.Language,
    memoryRequirement: 250_000_000,
  },
  // Or use a direct URL
  {
    id: 'qwen-0.5b',
    name: 'Qwen 2.5 0.5B',
    url: 'https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_0.gguf',
    framework: LLMFramework.LlamaCpp,
    modality: ModelCategory.Language,
    memoryRequirement: 400_000_000,
  },
])

// Track download progress
EventBus.shared.on('model.downloadProgress', (evt) => {
  console.log(`Downloading: ${((evt.progress ?? 0) * 100).toFixed(0)}%`)
})

// Download to OPFS (persists across page reloads and browser restarts)
await ModelManager.downloadModel('lfm2-350m-q4_k_m')

// Load into memory for inference
await ModelManager.loadModel('lfm2-350m-q4_k_m')

// Check what's loaded — returns undefined if nothing is loaded for this category
const loaded = ModelManager.getLoadedModel(ModelCategory.Language)
console.log('Loaded model:', loaded?.id)
When you provide a repo field like 'LiquidAI/LFM2-350M-GGUF', the SDK automatically constructs the download URL: https://huggingface.co/LiquidAI/LFM2-350M-GGUF/resolve/main/LFM2-350M-Q4_K_M.gguf. See Model Sources for all available models.

Using in a React App

Chat.tsx
import { useState, useCallback } from 'react'
import { TextGeneration } from '@runanywhere/web-llamacpp'

/**
 * Simple chat UI: takes a prompt, streams the model's response into state,
 * and disables the button while a generation is in flight.
 * Assumes the SDK is initialized and a model is loaded before use.
 */
export function Chat() {
  const [prompt, setPrompt] = useState('')
  const [response, setResponse] = useState('')
  const [isStreaming, setIsStreaming] = useState(false)

  const handleGenerate = useCallback(async () => {
    if (!prompt.trim()) return

    setResponse('')
    setIsStreaming(true)

    try {
      const { stream } = await TextGeneration.generateStream(prompt, {
        maxTokens: 200,
        temperature: 0.7,
      })

      // Re-render on every token for a live streaming effect.
      let accumulated = ''
      for await (const token of stream) {
        accumulated += token
        setResponse(accumulated)
      }
    } catch (error) {
      // `error` is `unknown` in a catch clause — narrow before reading
      // .message. The original's `(error as Error).message` cast renders
      // "Error: undefined" when a non-Error value is thrown.
      const message = error instanceof Error ? error.message : String(error)
      setResponse('Error: ' + message)
    } finally {
      setIsStreaming(false)
    }
  }, [prompt])

  return (
    <div>
      <textarea
        value={prompt}
        onChange={(e) => setPrompt(e.target.value)}
        placeholder="Ask anything..."
      />
      <button onClick={handleGenerate} disabled={isStreaming}>
        {isStreaming ? 'Generating...' : 'Generate'}
      </button>
      <div>{response}</div>
    </div>
  )
}

Checking Acceleration Mode

After registering the LlamaCpp backend, you can check whether WebGPU is active:
import { LlamaCPP } from '@runanywhere/web-llamacpp'

// Only meaningful after LlamaCPP.register() has completed.
if (LlamaCPP.isRegistered) {
  console.log('Acceleration:', LlamaCPP.accelerationMode) // 'webgpu' or 'cpu'
}

What’s Next?