The generate() method runs a single text-generation request with customizable options and returns the generated text along with detailed performance metrics.

Basic Usage

let result = try await RunAnywhere.generate(
    "Explain quantum computing in simple terms",
    options: LLMGenerationOptions(
        maxTokens: 200,
        temperature: 0.7
    )
)

print("Response: \(result.text)")
print("Speed: \(result.tokensPerSecond) tok/s")

Method Signature

public static func generate(
    _ prompt: String,
    options: LLMGenerationOptions? = nil
) async throws -> LLMGenerationResult

Parameters

| Parameter | Type | Description |
| --- | --- | --- |
| prompt | String | The text prompt |
| options | LLMGenerationOptions? | Generation configuration (optional) |

Returns

An LLMGenerationResult containing the response and metrics.

LLMGenerationResult

public struct LLMGenerationResult: Sendable {
    public let text: String              // Generated text
    public let thinkingContent: String?  // Reasoning tokens (for thinking models)
    public let inputTokens: Int          // Prompt tokens
    public let tokensUsed: Int           // Total output tokens
    public let modelUsed: String         // Model ID
    public let latencyMs: TimeInterval   // Total generation time
    public let framework: String?        // Backend framework used
    public let tokensPerSecond: Double   // Generation speed
    public let timeToFirstTokenMs: Double? // Time to first token
    public let thinkingTokens: Int?      // Reasoning token count
    public let responseTokens: Int       // Response token count
}

LLMGenerationOptions

let options = LLMGenerationOptions(
    maxTokens: 100,           // Maximum tokens to generate (default: 100)
    temperature: 0.8,         // Randomness 0.0-2.0 (default: 0.8)
    topP: 1.0,                // Nucleus sampling (default: 1.0)
    stopSequences: ["###"],   // Stop generation at these strings
    streamingEnabled: false,  // Enable token streaming
    preferredFramework: .llamaCpp,  // Preferred backend
    systemPrompt: "You are a helpful assistant."
)

Generation Parameters

| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| maxTokens | Int | 100 | Maximum tokens to generate |
| temperature | Float | 0.8 | Controls randomness (0.0 = deterministic, 2.0 = very random) |
| topP | Float | 1.0 | Nucleus sampling threshold |
| stopSequences | [String] | [] | Stop generation at these strings |
| streamingEnabled | Bool | false | Enable token-by-token streaming |
| preferredFramework | InferenceFramework? | nil | Preferred backend framework |
| systemPrompt | String? | nil | System prompt guiding model behavior |
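
As a rough sketch of how these parameters combine, the snippet below requests a short, focused summary and pins the backend. The .llamaCpp case mirrors the example above (which backends are available depends on your app's configuration), and articleText is a placeholder for your own input string.

// A minimal sketch combining several options from the table above.
// InferenceFramework.llamaCpp follows the earlier example; backend
// availability depends on how your app is configured.
let articleText = "Long text to summarize..."  // placeholder input

let summaryOptions = LLMGenerationOptions(
    maxTokens: 150,
    temperature: 0.3,          // low temperature for focused output
    topP: 0.9,
    stopSequences: ["\n\n"],   // stop at the first blank line
    preferredFramework: .llamaCpp,
    systemPrompt: "Summarize the user's text in two sentences."
)

let summary = try await RunAnywhere.generate(articleText, options: summaryOptions)
print(summary.text)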

Examples

Basic Generation

let result = try await RunAnywhere.generate("Write a haiku about programming")
print(result.text)
print("Generated in \(result.latencyMs)ms")

With Custom Options

let result = try await RunAnywhere.generate(
    "Write a creative story about a robot",
    options: LLMGenerationOptions(
        maxTokens: 500,
        temperature: 1.2,  // More creative
        topP: 0.9,
        stopSequences: ["THE END"]
    )
)

With System Prompt

let result = try await RunAnywhere.generate(
    "What should I cook tonight?",
    options: LLMGenerationOptions(
        maxTokens: 200,
        systemPrompt: "You are a professional chef. Suggest creative recipes with detailed instructions."
    )
)

For Reasoning Models

Some models output their reasoning process. Extract it with thinkingContent:

let result = try await RunAnywhere.generate(
    "Solve: If a train travels 60 mph for 2 hours, how far does it go?",
    options: LLMGenerationOptions(maxTokens: 300)
)

// The main response
print("Answer: \(result.text)")

// The model's reasoning (if available)
if let thinking = result.thinkingContent {
    print("Reasoning: \(thinking)")
}

// Token breakdown
if let thinkingTokens = result.thinkingTokens {
    print("Thinking tokens: \(thinkingTokens)")
}
print("Response tokens: \(result.responseTokens)")

Performance Monitoring

let result = try await RunAnywhere.generate(prompt)

// Performance metrics
print("Model: \(result.modelUsed)")
print("Input tokens: \(result.inputTokens)")
print("Output tokens: \(result.tokensUsed)")
print("Speed: \(String(format: "%.1f", result.tokensPerSecond)) tok/s")
print("Latency: \(String(format: "%.0f", result.latencyMs))ms")

if let ttft = result.timeToFirstTokenMs {
    print("Time to first token: \(String(format: "%.0f", ttft))ms")
}

Structured Output

Generate type-safe structured output using the Generatable protocol:

struct Recipe: Generatable {
    let name: String
    let ingredients: [String]
    let steps: [String]
    let cookingTime: Int

    static var jsonSchema: String {
        """
        {
          "type": "object",
          "properties": {
            "name": { "type": "string" },
            "ingredients": { "type": "array", "items": { "type": "string" } },
            "steps": { "type": "array", "items": { "type": "string" } },
            "cookingTime": { "type": "integer" }
          },
          "required": ["name", "ingredients", "steps", "cookingTime"]
        }
        """
    }
}

let recipe: Recipe = try await RunAnywhere.generateStructured(
    Recipe.self,
    prompt: "Create a simple pasta recipe"
)

print("Recipe: \(recipe.name)")
print("Ingredients: \(recipe.ingredients.joined(separator: ", "))")
print("Cook time: \(recipe.cookingTime) minutes")

Error Handling

do {
    let result = try await RunAnywhere.generate(prompt, options: options)
    print(result.text)
} catch let error as SDKError {
    switch error.code {
    case .notInitialized:
        print("SDK not initialized")
    case .modelNotFound:
        print("Model not loaded")
    case .generationFailed:
        print("Generation failed: \(error.message)")
    case .contextTooLong:
        print("Prompt too long for model's context window")
    default:
        print("Error: \(error.localizedDescription)")
    }
}

Temperature Guide

| Temperature | Use Case |
| --- | --- |
| 0.0 | Deterministic, factual answers |
| 0.3-0.5 | Focused, coherent responses |
| 0.7-0.8 | Balanced creativity (default) |
| 1.0-1.2 | Creative writing, brainstorming |
| 1.5+ | Very random, experimental |
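
For instance, a deterministic setting suits extraction-style questions. This is a minimal sketch using the generate() call documented above:

// temperature 0.0: deterministic, factual answer (see the guide above)
let factual = try await RunAnywhere.generate(
    "What is the capital of France? Answer with only the city name.",
    options: LLMGenerationOptions(
        maxTokens: 20,
        temperature: 0.0
    )
)
print(factual.text)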