Skip to main content

Documentation Index

Fetch the complete documentation index at: https://docs.runanywhere.ai/llms.txt

Use this file to discover all available pages before exploring further.

The generate() method provides complete control over text generation with detailed performance metrics and customizable options.

Basic Usage

// Configure the request up front: cap the output length and set the sampling temperature.
let options = LLMGenerationOptions(maxTokens: 200, temperature: 0.7)
let prompt = "Explain quantum computing in simple terms"

// generate() is async and throwing, hence `try await`.
let result = try await RunAnywhere.generate(prompt, options: options)

// The result carries the generated text together with speed metrics.
print("Response: \(result.text)")
print("Speed: \(result.tokensPerSecond) tok/s")

Method Signature

/// Generates text for `prompt` and returns the response together with performance metrics.
///
/// - Parameters:
///   - prompt: The text prompt.
///   - options: Generation configuration; optional, defaults to `nil`.
/// - Returns: An `LLMGenerationResult` containing the response and metrics.
/// - Throws: An error when generation fails (the Error Handling example on this page catches `SDKError`).
public static func generate(
    _ prompt: String,
    options: LLMGenerationOptions? = nil
) async throws -> LLMGenerationResult

Parameters

| Parameter | Type | Description |
| --- | --- | --- |
| prompt | String | The text prompt |
| options | LLMGenerationOptions? | Generation configuration (optional) |

Returns

An LLMGenerationResult containing the response and metrics.

LLMGenerationResult

/// Result of a `generate()` call: the generated text plus token counts and timing metrics.
public struct LLMGenerationResult: Sendable {
    public let text: String              // Generated text
    public let thinkingContent: String?  // Reasoning text emitted by thinking models; optional
    public let inputTokens: Int          // Prompt token count
    public let tokensUsed: Int           // Total output tokens
    public let modelUsed: String         // Model ID
    // Milliseconds, per the "Ms" suffix and the examples that print this value
    // followed by "ms" — even though TimeInterval conventionally denotes seconds.
    public let latencyMs: TimeInterval   // Total generation time
    public let framework: String?        // Backend framework used; optional
    public let tokensPerSecond: Double   // Generation speed in tokens per second
    public let timeToFirstTokenMs: Double? // Time to first token in milliseconds; optional
    public let thinkingTokens: Int?      // Reasoning token count; optional
    public let responseTokens: Int       // Response token count
}

LLMGenerationOptions

// Every generation option spelled out explicitly, each annotated with its default.
let options = LLMGenerationOptions(
    maxTokens: 100,           // Maximum tokens to generate (default: 100)
    temperature: 0.8,         // Randomness 0.0-2.0 (default: 0.8)
    topP: 1.0,                // Nucleus sampling threshold (default: 1.0)
    stopSequences: ["###"],   // Stop generation at these strings (default: [])
    streamingEnabled: false,  // Enable token-by-token streaming (default: false)
    preferredFramework: .llamaCpp,  // Preferred backend framework (default: nil)
    systemPrompt: "You are a helpful assistant."  // System prompt for behavior (default: nil)
)

Generation Parameters

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| maxTokens | Int | 100 | Maximum tokens to generate |
| temperature | Float | 0.8 | Controls randomness (0.0 = deterministic, 2.0 = very random) |
| topP | Float | 1.0 | Nucleus sampling threshold |
| stopSequences | [String] | [] | Stop generation at these strings |
| streamingEnabled | Bool | false | Enable token-by-token streaming |
| preferredFramework | InferenceFramework? | nil | Preferred backend framework |
| systemPrompt | String? | nil | System prompt for behavior |

Examples

Basic Generation

// No options argument is passed, so `options` takes its default of nil.
let haikuPrompt = "Write a haiku about programming"
let result = try await RunAnywhere.generate(haikuPrompt)

print(result.text)
print("Generated in \(result.latencyMs)ms")

With Custom Options

// A temperature above the 0.8 default for more creative output;
// generation stops if the model emits "THE END".
let creativeOptions = LLMGenerationOptions(
    maxTokens: 500,
    temperature: 1.2,
    topP: 0.9,
    stopSequences: ["THE END"]
)

let result = try await RunAnywhere.generate(
    "Write a creative story about a robot",
    options: creativeOptions
)

With System Prompt

// The system prompt sets the assistant's persona; the user prompt stays a plain question.
let chefPersona = "You are a professional chef. Suggest creative recipes with detailed instructions."
let chefOptions = LLMGenerationOptions(maxTokens: 200, systemPrompt: chefPersona)

let result = try await RunAnywhere.generate("What should I cook tonight?", options: chefOptions)

For Reasoning Models

Some models output their reasoning process. Extract it with thinkingContent:
let wordProblem = "Solve: If a train travels 60 mph for 2 hours, how far does it go?"
let result = try await RunAnywhere.generate(
    wordProblem,
    options: LLMGenerationOptions(maxTokens: 300)
)

// Final answer text
print("Answer: \(result.text)")

// Reasoning text is optional, so it is printed only when the result includes it
if let reasoning = result.thinkingContent {
    print("Reasoning: \(reasoning)")
}

// Token accounting: the thinking count is optional, the response count is always present
if let reasoningTokenCount = result.thinkingTokens {
    print("Thinking tokens: \(reasoningTokenCount)")
}
print("Response tokens: \(result.responseTokens)")

Performance Monitoring

// Any prompt works here; it is defined inline so the snippet is self-contained.
let prompt = "Summarize the benefits of on-device inference"
let result = try await RunAnywhere.generate(prompt)

// Performance metrics
print("Model: \(result.modelUsed)")
print("Input tokens: \(result.inputTokens)")
print("Output tokens: \(result.tokensUsed)")
// String(format:) is provided by Foundation.
print("Speed: \(String(format: "%.1f", result.tokensPerSecond)) tok/s")
print("Latency: \(String(format: "%.0f", result.latencyMs))ms")

// Time to first token is optional, so unwrap it before formatting.
if let ttft = result.timeToFirstTokenMs {
    print("Time to first token: \(String(format: "%.0f", ttft))ms")
}

Structured Output

Generate type-safe structured output using the Generatable protocol:
/// A recipe the model fills in as structured output.
struct Recipe: Generatable {
    /// JSON Schema for the object the model should produce; all four properties are required.
    static var jsonSchema: String {
        let schema = """
        {
          "type": "object",
          "properties": {
            "name": { "type": "string" },
            "ingredients": { "type": "array", "items": { "type": "string" } },
            "steps": { "type": "array", "items": { "type": "string" } },
            "cookingTime": { "type": "integer" }
          },
          "required": ["name", "ingredients", "steps", "cookingTime"]
        }
        """
        return schema
    }

    /// Display name of the dish.
    let name: String
    /// Ingredient list, one entry per ingredient.
    let ingredients: [String]
    /// Preparation steps, in order.
    let steps: [String]
    /// Cooking time; the usage example prints it in minutes.
    let cookingTime: Int
}

// The explicit `Recipe` annotation states the decoded type at the call site.
let recipe: Recipe = try await RunAnywhere.generateStructured(Recipe.self, prompt: "Create a simple pasta recipe")

// Flatten the ingredient array into one comma-separated line for display.
let ingredientList = recipe.ingredients.joined(separator: ", ")

print("Recipe: \(recipe.name)")
print("Ingredients: \(ingredientList)")
print("Cook time: \(recipe.cookingTime) minutes")

Error Handling

// `prompt` and `options` stand for the caller's own values.
do {
    let result = try await RunAnywhere.generate(prompt, options: options)
    print(result.text)
} catch let error as SDKError {
    // SDK failures carry a code that identifies what went wrong.
    switch error.code {
    case .notInitialized:
        print("SDK not initialized")
    case .modelNotFound:
        print("Model not loaded")
    case .generationFailed:
        print("Generation failed: \(error.message)")
    case .contextTooLong:
        print("Prompt too long for model's context window")
    default:
        print("Error: \(error.localizedDescription)")
    }
} catch {
    // Catch-all for anything that is not an SDKError. Without it the do-catch
    // is not exhaustive, and the enclosing scope would itself have to be throwing.
    print("Unexpected error: \(error)")
}

Temperature Guide

| Temperature | Use Case |
| --- | --- |
| 0.0 | Deterministic, factual answers |
| 0.3-0.5 | Focused, coherent responses |
| 0.7-0.8 | Balanced creativity (default: 0.8) |
| 1.0-1.2 | Creative writing, brainstorming |
| 1.5+ | Very random, experimental |

generateStream()

Stream tokens in real-time →

System Prompts

Control model behavior →