Memory Management

On-device AI models are memory-intensive. Proper memory management is critical for app stability.

Load Only What You Need

// ❌ Don't load multiple large models
RunAnywhere.loadLLMModel("model-3b")
RunAnywhere.loadSTTModel("whisper-large")

// ✅ Load one LLM at a time, use smaller models
RunAnywhere.loadLLMModel("model-0.5b")

Unload When Not Needed

// Unload models when switching tasks or backgrounding
override fun onStop() {
    super.onStop()
    lifecycleScope.launch {
        RunAnywhere.unloadLLMModel()
        RunAnywhere.unloadSTTModel()
    }
}

Monitor Memory Before Loading

val modelInfo = RunAnywhere.model(modelId)
val requiredMemory = modelInfo?.downloadSize ?: 0L
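// Note: download size understates the runtime footprint; a loaded model also
// needs working memory (KV cache, activations), hence the 1.5x headroom
// check below.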

val activityManager = getSystemService(Context.ACTIVITY_SERVICE) as ActivityManager
val memoryInfo = ActivityManager.MemoryInfo()
activityManager.getMemoryInfo(memoryInfo)

if (memoryInfo.availMem < requiredMemory * 1.5) {
    showWarning("Low memory - performance may be affected")
}

Performance Optimization

Use Quantized Models

Model Type    Size      Performance   Quality
Q8 (8-bit)    Larger    Slower        Best
Q4 (4-bit)    Smaller   Faster        Good
Q2 (2-bit)    Smallest  Fastest       Acceptable

// ✅ Use Q4 models for mobile devices
val model = RunAnywhere.registerModel(
    name = "Qwen 0.5B Q4",
    url = "...qwen2.5-0.5b-instruct-q4_0.gguf",
    framework = InferenceFramework.LLAMA_CPP
)
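
If you ship more than one quantization of the same model, the table above suggests a runtime choice based on device RAM. A minimal sketch; chooseModelVariant, the model IDs, and the 8 GB threshold are illustrative, not part of the SDK:

fun chooseModelVariant(context: Context): String {
    val am = context.getSystemService(Context.ACTIVITY_SERVICE) as ActivityManager
    val info = ActivityManager.MemoryInfo()
    am.getMemoryInfo(info)
    // Hypothetical policy: only pay the Q8 memory cost on high-RAM devices
    return if (info.totalMem >= 8L * 1024 * 1024 * 1024) "qwen-0.5b-q8" else "qwen-0.5b-q4"
}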

Use Streaming for Better UX

// ✅ Stream tokens for perceived faster responses
RunAnywhere.generateStream(prompt)
    .collect { token ->
        textView.append(token)
    }

// ❌ Waiting for full response feels slow
val result = RunAnywhere.generate(prompt)
textView.text = result.text
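
Because generateStream is collected as a Flow, the standard kotlinx.coroutines Flow operators apply. A sketch of a more defensive collect; markResponseComplete and showError are hypothetical UI helpers:

import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.flow.onCompletion

lifecycleScope.launch {
    RunAnywhere.generateStream(prompt)
        .onCompletion { cause -> if (cause == null) markResponseComplete() }
        .catch { e -> showError("Generation failed: ${e.message}") }
        .collect { token -> textView.append(token) }
}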

Set Appropriate Token Limits

// ✅ Match maxTokens to your use case
val shortAnswer = LLMGenerationOptions(maxTokens = 50)   // Quick Q&A
val mediumAnswer = LLMGenerationOptions(maxTokens = 200) // General chat
val longForm = LLMGenerationOptions(maxTokens = 500)     // Stories/articles
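
The options object is passed straight into the generation call; generate accepts an options argument, as the Measure Performance snippet later in this guide shows:

val result = RunAnywhere.generate(prompt, shortAnswer)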

App Lifecycle

Handle Background/Foreground

class MyApplication : Application(), LifecycleEventObserver {

    override fun onCreate() {
        super.onCreate()
        ProcessLifecycleOwner.get().lifecycle.addObserver(this)
        RunAnywhere.initialize(environment = SDKEnvironment.PRODUCTION)
    }

    override fun onStateChanged(source: LifecycleOwner, event: Lifecycle.Event) {
        when (event) {
            Lifecycle.Event.ON_STOP -> {
                // App backgrounded - release resources
                CoroutineScope(Dispatchers.Main).launch {
                    RunAnywhere.stopVoiceSession()
                    RunAnywhere.cleanup()
                }
            }
            Lifecycle.Event.ON_START -> {
                // App foregrounded - reinitialize if needed
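                // Sketch: if cleanup() released the model, re-load it here, e.g.
                //   CoroutineScope(Dispatchers.Main).launch {
                //       RunAnywhere.loadLLMModel("model-0.5b")
                //   }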
            }
            else -> {}
        }
    }
}

Preload Models at Launch

class SplashActivity : AppCompatActivity() {

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)

        lifecycleScope.launch {
            // Show loading UI
            showLoading()

            // Preload commonly used models
            val models = listOf("qwen-0.5b", "whisper-tiny")

            models.forEach { modelId ->
                if (!RunAnywhere.isModelDownloaded(modelId)) {
                    RunAnywhere.downloadModel(modelId).collect { progress ->
                        updateProgress(modelId, progress.progress)
                    }
                }
            }

            // Preload the primary model
            RunAnywhere.loadLLMModel("qwen-0.5b")

            // Navigate to main screen
            startActivity(Intent(this@SplashActivity, MainActivity::class.java))
            finish()
        }
    }
}
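
Sequential downloads keep the progress UI simple. If cold-start time matters more than per-model progress reporting, the forEach loop above can run the downloads concurrently with standard kotlinx.coroutines; a sketch, reusing the same SDK calls:

import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.coroutineScope

// Replaces the models.forEach loop inside lifecycleScope.launch
coroutineScope {
    models.map { modelId ->
        async {
            if (!RunAnywhere.isModelDownloaded(modelId)) {
                RunAnywhere.downloadModel(modelId).collect { progress ->
                    updateProgress(modelId, progress.progress)
                }
            }
        }
    }.awaitAll()
}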

Error Handling

Always Handle Errors

// ✅ Comprehensive error handling
lifecycleScope.launch {
    try {
        val result = RunAnywhere.generate(prompt)
        showResponse(result.text)
    } catch (e: SDKError) {
        when (e.category) {
            ErrorCategory.MODEL -> promptModelDownload()
            ErrorCategory.STORAGE -> promptStorageCleanup()
            else -> showGenericError(e.message)
        }
    }
}
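
Transient failures (a flaky download, a momentary resource shortage) are often worth one or two retries. A small SDK-agnostic helper with exponential backoff; withRetry and its defaults are illustrative, not part of the SDK:

import kotlinx.coroutines.delay

suspend fun <T> withRetry(
    attempts: Int = 3,
    initialDelayMs: Long = 500,
    block: suspend () -> T
): T {
    var backoffMs = initialDelayMs
    repeat(attempts - 1) {
        try {
            return block()
        } catch (e: Exception) {
            delay(backoffMs)   // wait before the next attempt
            backoffMs *= 2     // exponential backoff
        }
    }
    return block()             // final attempt; its exception propagates
}

// Usage: val result = withRetry { RunAnywhere.generate(prompt) }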

Provide User Feedback

// ✅ Show meaningful progress
RunAnywhere.downloadModel(modelId).collect { progress ->
    when (progress.state) {
        DownloadState.DOWNLOADING -> {
            progressBar.progress = (progress.progress * 100).toInt()
            statusText.text = "Downloading AI model..."
        }
        DownloadState.EXTRACTING -> {
            statusText.text = "Preparing model..."
        }
        DownloadState.COMPLETED -> {
            statusText.text = "Ready!"
        }
        DownloadState.ERROR -> {
            showError("Download failed: ${progress.error}")
        }
    }
}

Testing

Test on Real Devices

Simulators and emulators don't accurately reflect:

  • Memory constraints
  • CPU performance
  • Thermal throttling

Always test on physical devices before release.

Measure Performance

val result = RunAnywhere.generate(prompt, options)

Log.d("Performance", """
    Model: ${result.modelUsed}
    Tokens: ${result.tokensUsed}
    Speed: ${result.tokensPerSecond} tok/s
    Latency: ${result.latencyMs}ms
    TTFT: ${result.timeToFirstTokenMs}ms
""".trimIndent())

Security

Protect API Keys

// ❌ Don't hardcode API keys
RunAnywhere.initialize(apiKey = "sk-12345...")

// ✅ Use BuildConfig or secure storage
RunAnywhere.initialize(apiKey = BuildConfig.RUNANYWHERE_API_KEY)
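
One common way to populate BuildConfig.RUNANYWHERE_API_KEY is a Gradle property kept out of version control. A sketch for app/build.gradle.kts; the property name is illustrative:

// In a git-ignored gradle.properties: RUNANYWHERE_API_KEY=sk-...
val runanywhereKey = providers.gradleProperty("RUNANYWHERE_API_KEY").getOrElse("")

android {
    buildFeatures { buildConfig = true }
    defaultConfig {
        buildConfigField("String", "RUNANYWHERE_API_KEY", "\"$runanywhereKey\"")
    }
}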

Clear Sensitive Data

// Clear conversation history when appropriate
RunAnywhere.clearVoiceConversation()

// Reset SDK to clear all state
RunAnywhere.reset()

Summary Checklist

  • Use quantized models (Q4) for mobile devices
  • Unload models when backgrounding the app
  • Use streaming for long text generation
  • Handle all error categories appropriately
  • Test on physical devices, not simulators
  • Preload commonly used models at app startup
  • Monitor memory before loading large models
  • Secure API keys using BuildConfig or secure storage