The generateStream() method enables real-time token streaming, perfect for building responsive chat interfaces where text appears progressively.

Basic Usage

let result = try await RunAnywhere.generateStream(
    "Tell me a story about a brave knight",
    options: LLMGenerationOptions(maxTokens: 500)
)

// Display tokens as they arrive
for try await token in result.stream {
    print(token, terminator: "")
}

// Get final metrics after streaming completes
let metrics = try await result.result.value
print("\n\nGenerated \(metrics.tokensUsed) tokens at \(metrics.tokensPerSecond) tok/s")

Method Signature

public static func generateStream(
    _ prompt: String,
    options: LLMGenerationOptions? = nil
) async throws -> LLMStreamingResult

Parameters

Parameter   Type                    Description
prompt      String                  The text prompt
options     LLMGenerationOptions?   Generation configuration (optional)

Returns

An LLMStreamingResult containing:
public struct LLMStreamingResult: Sendable {
    // Async stream of tokens
    public let stream: AsyncThrowingStream<String, Error>

    // Task that completes with final metrics
    public let result: Task<LLMGenerationResult, Error>
}
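
Because the stream and the metrics task are independent values, they can be handed to different parts of your code. A minimal sketch (render is a hypothetical UI hook, not part of the SDK):
let result = try await RunAnywhere.generateStream("Explain Swift actors")

// Hand the token stream to the UI layer...
Task {
    // Errors thrown here fail this task; handle them as needed
    for try await token in result.stream {
        await render(token)  // hypothetical UI hook
    }
}

// ...while a separate task records metrics when generation finishes
Task {
    if let metrics = try? await result.result.value {
        print("Generated \(metrics.tokensUsed) tokens")
    }
}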

SwiftUI Integration

Basic Streaming View

struct StreamingView: View {
    @State private var prompt = ""
    @State private var response = ""
    @State private var isStreaming = false
    @State private var tokensPerSecond: Double = 0

    var body: some View {
        VStack(spacing: 16) {
            // Prompt input
            TextField("Enter your prompt...", text: $prompt)
                .textFieldStyle(.roundedBorder)

            // Generate / stop button: the stop icon cancels an in-flight generation
            Button(action: {
                Task {
                    if isStreaming {
                        await RunAnywhere.cancelGeneration()
                    } else {
                        await generate()
                    }
                }
            }) {
                Label(
                    isStreaming ? "Stop" : "Generate",
                    systemImage: isStreaming ? "stop.fill" : "play.fill"
                )
            }
            .disabled(prompt.isEmpty && !isStreaming)

            // Streaming response
            ScrollView {
                Text(response)
                    .frame(maxWidth: .infinity, alignment: .leading)
            }
            .frame(maxHeight: 300)
            .background(Color.gray.opacity(0.1))
            .cornerRadius(8)

            // Metrics
            if tokensPerSecond > 0 {
                Text("\(String(format: "%.1f", tokensPerSecond)) tokens/sec")
                    .font(.caption)
                    .foregroundColor(.secondary)
            }
        }
        .padding()
    }

    func generate() async {
        response = ""
        isStreaming = true

        do {
            let result = try await RunAnywhere.generateStream(
                prompt,
                options: LLMGenerationOptions(maxTokens: 500)
            )

            // Stream tokens to UI
            for try await token in result.stream {
                await MainActor.run {
                    response += token
                }
            }

            // Get final metrics
            let metrics = try await result.result.value
            await MainActor.run {
                tokensPerSecond = metrics.tokensPerSecond
                isStreaming = false
            }

        } catch is CancellationError {
            // Keep any text that streamed before the user tapped Stop
            await MainActor.run { isStreaming = false }
        } catch {
            await MainActor.run {
                response = "Error: \(error.localizedDescription)"
                isStreaming = false
            }
        }
    }
}

With an Observable View Model

class StreamingViewModel: ObservableObject {
    @Published var displayedText = ""
    @Published var isStreaming = false
    @Published var metrics: LLMGenerationResult?

    private var fullText = ""

    func stream(prompt: String) async {
        await MainActor.run {
            displayedText = ""
            fullText = ""
            isStreaming = true
            metrics = nil
        }

        do {
            let result = try await RunAnywhere.generateStream(
                prompt,
                options: LLMGenerationOptions(maxTokens: 500, temperature: 0.7)
            )

            for try await token in result.stream {
                fullText += token

                // Update UI on main thread
                await MainActor.run {
                    displayedText = fullText
                }
            }

            let finalMetrics = try await result.result.value
            await MainActor.run {
                metrics = finalMetrics
                isStreaming = false
            }

        } catch {
            await MainActor.run {
                displayedText = "Error: \(error.localizedDescription)"
                isStreaming = false
            }
        }
    }

    func cancel() async {
        await RunAnywhere.cancelGeneration()
        await MainActor.run {
            isStreaming = false
        }
    }
}
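
A minimal view that drives this view model might look like the following (a sketch; StreamingScreen is a hypothetical name):
struct StreamingScreen: View {
    @StateObject private var viewModel = StreamingViewModel()
    @State private var prompt = ""

    var body: some View {
        VStack(spacing: 16) {
            TextField("Enter your prompt...", text: $prompt)
                .textFieldStyle(.roundedBorder)

            HStack {
                Button("Generate") {
                    Task { await viewModel.stream(prompt: prompt) }
                }
                .disabled(prompt.isEmpty || viewModel.isStreaming)

                if viewModel.isStreaming {
                    Button("Cancel") {
                        Task { await viewModel.cancel() }
                    }
                }
            }

            ScrollView {
                Text(viewModel.displayedText)
                    .frame(maxWidth: .infinity, alignment: .leading)
            }
        }
        .padding()
    }
}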

Cancellation

Cancel streaming mid-generation:
// Start streaming in a task
let streamTask = Task {
    let result = try await RunAnywhere.generateStream(prompt)
    for try await token in result.stream {
        print(token, terminator: "")
    }
}

// Cancel after 5 seconds
try await Task.sleep(for: .seconds(5))
await RunAnywhere.cancelGeneration()
streamTask.cancel()

Error Handling

do {
    let result = try await RunAnywhere.generateStream(prompt)

    for try await token in result.stream {
        // Handle each token
        await updateUI(with: token)
    }

    // Stream completed successfully
    let metrics = try await result.result.value

} catch is CancellationError {
    print("Generation was cancelled")
} catch let error as SDKError {
    switch error.code {
    case .generationFailed:
        print("Generation failed: \(error.message)")
    case .streamingNotSupported:
        print("Model doesn't support streaming")
    default:
        print("Error: \(error.localizedDescription)")
    }
} catch {
    // A do-catch must be exhaustive; handle any remaining errors here
    print("Unexpected error: \(error.localizedDescription)")
}

Check Streaming Support

Not all models support streaming. Check before calling:
if await RunAnywhere.supportsLLMStreaming {
    let result = try await RunAnywhere.generateStream(prompt)
    // ... handle stream
} else {
    // Fall back to non-streaming
    let result = try await RunAnywhere.generate(prompt)
    print(result.text)
}
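
If you want a single consumption path regardless of model support, one option is a small wrapper that exposes the non-streaming result as a one-token stream (a sketch; unifiedStream is a hypothetical helper, not part of the SDK):
func unifiedStream(_ prompt: String) async throws -> AsyncThrowingStream<String, Error> {
    if await RunAnywhere.supportsLLMStreaming {
        // Model streams natively; pass the SDK stream through
        return try await RunAnywhere.generateStream(prompt).stream
    }

    // Fall back: yield the complete response as a single chunk
    let result = try await RunAnywhere.generate(prompt)
    return AsyncThrowingStream { continuation in
        continuation.yield(result.text)
        continuation.finish()
    }
}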

Performance Tips

Instead of updating the UI for every token, batch updates:
var buffer = ""
var lastUpdate = Date()

for try await token in result.stream {
    buffer += token
    
    // Update UI every 50ms
    if Date().timeIntervalSince(lastUpdate) > 0.05 {
        await MainActor.run { displayedText += buffer }
        buffer = ""
        lastUpdate = Date()
    }
}

// Flush remaining buffer
if !buffer.isEmpty {
    await MainActor.run { displayedText += buffer }
}
For markdown rendering, consider re-parsing the complete text periodically rather than on every token, as sketched below.
For very long generations, consider trimming displayed history to prevent memory issues.
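
For example, you might throttle markdown parsing the same way tokens are buffered above (a minimal sketch; assumes Foundation's AttributedString(markdown:) and a hypothetical rendered property that the view displays):
var fullText = ""
var lastParse = Date()

for try await token in result.stream {
    fullText += token

    // Re-parse the full text at most every 200 ms, not on every token
    if Date().timeIntervalSince(lastParse) > 0.2 {
        let parsed = (try? AttributedString(markdown: fullText)) ?? AttributedString(fullText)
        await MainActor.run { rendered = parsed }  // `rendered` is a hypothetical displayed property
        lastParse = Date()
    }
}

// Parse once more after the stream completes so no text is dropped
let parsed = (try? AttributedString(markdown: fullText)) ?? AttributedString(fullText)
await MainActor.run { rendered = parsed }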

Complete Chat Example

struct ChatMessage: Identifiable {
    let id = UUID()
    let role: Role
    var content: String

    enum Role {
        case user, assistant
    }
}

// Keep all observable state on the main actor
@MainActor
@Observable
class ChatViewModel {
    var messages: [ChatMessage] = []
    var isStreaming = false

    func send(_ text: String) async {
        // Add user message
        messages.append(ChatMessage(role: .user, content: text))

        // Add placeholder for assistant
        let assistantMessage = ChatMessage(role: .assistant, content: "")
        messages.append(assistantMessage)
        let messageIndex = messages.count - 1

        isStreaming = true

        do {
            let result = try await RunAnywhere.generateStream(
                text,
                options: LLMGenerationOptions(maxTokens: 500)
            )

            for try await token in result.stream {
                // send() runs on the main actor, so this mutation is safe
                messages[messageIndex].content += token
            }

        } catch {
            messages[messageIndex].content = "Error: \(error.localizedDescription)"
        }

        isStreaming = false
    }
}
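
A matching SwiftUI view could drive this model like so (a sketch; ChatView is a hypothetical name, and @Observable requires iOS 17 / macOS 14):
struct ChatView: View {
    @State private var viewModel = ChatViewModel()
    @State private var input = ""

    var body: some View {
        VStack {
            // Message history, aligned by sender
            List(viewModel.messages) { message in
                Text(message.content)
                    .frame(
                        maxWidth: .infinity,
                        alignment: message.role == .user ? .trailing : .leading
                    )
            }

            // Input bar
            HStack {
                TextField("Message...", text: $input)
                    .textFieldStyle(.roundedBorder)

                Button("Send") {
                    let text = input
                    input = ""
                    Task { await viewModel.send(text) }
                }
                .disabled(input.isEmpty || viewModel.isStreaming)
            }
            .padding()
        }
    }
}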