Memory Management
On-device AI models are memory-intensive. Proper memory management is critical for app stability.
Load Only What You Need
// ❌ Don't load multiple large models
RunAnywhere.loadLLMModel("model-3b")
RunAnywhere.loadSTTModel("whisper-large")
// ✅ Load one LLM at a time, use smaller models
RunAnywhere.loadLLMModel("model-0.5b")
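If different screens need different models, swap rather than stack them. A minimal sketch, assuming the load/unload calls shown above; swapModel is a hypothetical helper, not an SDK API:
// Sketch: swap models instead of keeping both resident
suspend fun swapModel(newModelId: String) {
    RunAnywhere.unloadLLMModel()      // free the old model's memory first
    RunAnywhere.loadLLMModel(newModelId)
}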
Unload When Not Needed
// Unload models when switching tasks or backgrounding
override fun onStop() {
    super.onStop()
    lifecycleScope.launch {
        RunAnywhere.unloadLLMModel()
        RunAnywhere.unloadSTTModel()
    }
}
Monitor Memory Before Loading
val modelInfo = RunAnywhere.model(modelId)
// Download size is a rough lower bound on the model's in-memory footprint
val requiredMemory = modelInfo?.downloadSize ?: 0L

val activityManager = getSystemService(Context.ACTIVITY_SERVICE) as ActivityManager
val memoryInfo = ActivityManager.MemoryInfo()
activityManager.getMemoryInfo(memoryInfo)

if (memoryInfo.availMem < requiredMemory * 1.5) {
    showWarning("Low memory - performance may be affected")
}
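The check above can be folded into a reusable guard. A sketch; the 1.5x headroom factor is a heuristic, not an SDK requirement, and hasMemoryHeadroomFor is a hypothetical helper:
// Sketch: reusable memory guard built from the check above
fun Context.hasMemoryHeadroomFor(modelId: String): Boolean {
    val required = RunAnywhere.model(modelId)?.downloadSize ?: return false
    val activityManager = getSystemService(Context.ACTIVITY_SERVICE) as ActivityManager
    val memoryInfo = ActivityManager.MemoryInfo()
    activityManager.getMemoryInfo(memoryInfo)
    return memoryInfo.availMem >= required * 1.5  // heuristic headroom factor
}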
Use Quantized Models
| Quantization | Size | Speed | Quality |
|---|---|---|---|
| Q8 (8-bit) | Larger | Slower | Best |
| Q4 (4-bit) | Smaller | Faster | Good |
| Q2 (2-bit) | Smallest | Fastest | Acceptable |
// ✅ Use Q4 models for mobile devices
val model = RunAnywhere.registerModel(
    name = "Qwen 0.5B Q4",
    url = "...qwen2.5-0.5b-instruct-q4_0.gguf",
    framework = InferenceFramework.LLAMA_CPP
)
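If you ship several quantizations, you can pick a tier from total device RAM. A sketch; the thresholds and GGUF suffixes are illustrative assumptions, not SDK requirements:
// Sketch: choose a quantization tier from total RAM (thresholds illustrative)
fun pickQuantization(context: Context): String {
    val activityManager = context.getSystemService(Context.ACTIVITY_SERVICE) as ActivityManager
    val memoryInfo = ActivityManager.MemoryInfo()
    activityManager.getMemoryInfo(memoryInfo)
    val totalGb = memoryInfo.totalMem / (1024.0 * 1024.0 * 1024.0)
    return when {
        totalGb >= 8 -> "q8_0"  // enough headroom for higher quality
        totalGb >= 4 -> "q4_0"
        else -> "q2_k"
    }
}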
Use Streaming for Better UX
// ✅ Stream tokens for perceived faster responses
RunAnywhere.generateStream(prompt)
    .collect { token ->
        textView.append(token)
    }
// ❌ Waiting for full response feels slow
val result = RunAnywhere.generate(prompt)
textView.text = result.text
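If you also need the complete response after streaming (for history or persistence), accumulate tokens as they arrive. A sketch using the same generateStream call; saveToHistory is a hypothetical app function:
// Sketch: keep the full response while streaming to the UI
val fullResponse = StringBuilder()
RunAnywhere.generateStream(prompt)
    .collect { token ->
        fullResponse.append(token)
        textView.append(token)
    }
saveToHistory(fullResponse.toString())  // hypothetical persistence hook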
Set Appropriate Token Limits
// ✅ Match maxTokens to your use case
val shortAnswer = LLMGenerationOptions(maxTokens = 50) // Quick Q&A
val mediumAnswer = LLMGenerationOptions(maxTokens = 200) // General chat
val longForm = LLMGenerationOptions(maxTokens = 500) // Stories/articles
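Options are passed as the second argument to generate, matching the call used in the Testing section below:
val result = RunAnywhere.generate(prompt, shortAnswer)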
App Lifecycle
Handle Background/Foreground
class MyApplication : Application(), LifecycleEventObserver {
    override fun onCreate() {
        super.onCreate()
        ProcessLifecycleOwner.get().lifecycle.addObserver(this)
        RunAnywhere.initialize(environment = SDKEnvironment.PRODUCTION)
    }

    override fun onStateChanged(source: LifecycleOwner, event: Lifecycle.Event) {
        when (event) {
            Lifecycle.Event.ON_STOP -> {
                // App backgrounded - release resources
                CoroutineScope(Dispatchers.Main).launch {
                    RunAnywhere.stopVoiceSession()
                    RunAnywhere.cleanup()
                }
            }
            Lifecycle.Event.ON_START -> {
                // App foregrounded - reinitialize if needed
            }
            else -> {}
        }
    }
}
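What "reinitialize if needed" means is app-specific. One sketch tracks readiness with an app-side flag that the ON_STOP branch clears after cleanup(); the flag and helper are app code, not SDK APIs, and the model ID matches the preload example below:
// Sketch: app-tracked readiness flag; set sdkReady = false in the
// ON_STOP branch after RunAnywhere.cleanup(), check before inference
private var sdkReady = false

suspend fun ensureSdkReady() {
    if (!sdkReady) {
        RunAnywhere.initialize(environment = SDKEnvironment.PRODUCTION)
        RunAnywhere.loadLLMModel("qwen-0.5b")
        sdkReady = true
    }
}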
Preload Models at Launch
class SplashActivity : AppCompatActivity() {
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        lifecycleScope.launch {
            // Show loading UI
            showLoading()

            // Preload commonly used models
            val models = listOf("qwen-0.5b", "whisper-tiny")
            models.forEach { modelId ->
                if (!RunAnywhere.isModelDownloaded(modelId)) {
                    RunAnywhere.downloadModel(modelId).collect { progress ->
                        updateProgress(modelId, progress.progress)
                    }
                }
            }

            // Pre-load the primary model
            RunAnywhere.loadLLMModel("qwen-0.5b")

            // Navigate to main screen
            startActivity(Intent(this@SplashActivity, MainActivity::class.java))
            finish()
        }
    }
}
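The loop above downloads models one at a time. If the SDK tolerates parallel downloads (worth confirming before relying on it), the same work can run concurrently. A sketch for the body of the lifecycleScope.launch block above:
// Sketch: download models concurrently (assumes parallel downloads are safe)
coroutineScope {
    models.map { modelId ->
        async {
            if (!RunAnywhere.isModelDownloaded(modelId)) {
                RunAnywhere.downloadModel(modelId).collect { progress ->
                    updateProgress(modelId, progress.progress)
                }
            }
        }
    }.awaitAll()
}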
Error Handling
Always Handle Errors
// ✅ Comprehensive error handling
lifecycleScope.launch {
    try {
        val result = RunAnywhere.generate(prompt)
        showResponse(result.text)
    } catch (e: SDKError) {
        when (e.category) {
            ErrorCategory.MODEL -> promptModelDownload()
            ErrorCategory.STORAGE -> promptStorageCleanup()
            else -> showGenericError(e.message)
        }
    }
}
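Transient failures may be worth retrying with backoff instead of surfacing immediately. A generic sketch; the attempt count and delays are illustrative, and whether a given SDKError is retryable is an app-level judgment:
// Sketch: retry with exponential backoff for transient failures
suspend fun <T> withRetry(
    times: Int = 3,
    initialDelayMs: Long = 500,
    block: suspend () -> T
): T {
    var delayMs = initialDelayMs
    repeat(times - 1) {
        try {
            return block()
        } catch (e: SDKError) {
            delay(delayMs)   // kotlinx.coroutines.delay
            delayMs *= 2
        }
    }
    return block()  // final attempt propagates any error
}

val result = withRetry { RunAnywhere.generate(prompt) }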
Provide User Feedback
// ✅ Show meaningful progress
RunAnywhere.downloadModel(modelId).collect { progress ->
    when (progress.state) {
        DownloadState.DOWNLOADING -> {
            progressBar.progress = (progress.progress * 100).toInt()
            statusText.text = "Downloading AI model..."
        }
        DownloadState.EXTRACTING -> {
            statusText.text = "Preparing model..."
        }
        DownloadState.COMPLETED -> {
            statusText.text = "Ready!"
        }
        DownloadState.ERROR -> {
            showError("Download failed: ${progress.error}")
        }
    }
}
Testing
Test on Real Devices
// Emulators don't accurately reflect:
// - Memory constraints
// - CPU performance
// - Thermal throttling
// Always test on physical devices before release
val result = RunAnywhere.generate(prompt, options)
Log.d("Performance", """
    Model: ${result.modelUsed}
    Tokens: ${result.tokensUsed}
    Speed: ${result.tokensPerSecond} tok/s
    Latency: ${result.latencyMs}ms
    TTFT: ${result.timeToFirstTokenMs}ms
""".trimIndent())
Security
Protect API Keys
// ❌ Don't hardcode API keys
RunAnywhere.initialize(apiKey = "sk-12345...")
// ✅ Use BuildConfig or secure storage
RunAnywhere.initialize(apiKey = BuildConfig.RUNANYWHERE_API_KEY)
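One common way to populate BuildConfig is from local.properties, which stays out of version control. A Gradle Kotlin DSL sketch; the RUNANYWHERE_API_KEY property name is an assumed convention, and newer AGP versions may also require enabling buildConfig in buildFeatures:
// build.gradle.kts sketch: inject the key from local.properties
import java.util.Properties

val localProps = Properties().apply {
    val file = rootProject.file("local.properties")
    if (file.exists()) file.inputStream().use { load(it) }
}

android {
    defaultConfig {
        buildConfigField(
            "String",
            "RUNANYWHERE_API_KEY",
            "\"${localProps.getProperty("RUNANYWHERE_API_KEY", "")}\""
        )
    }
}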
Clear Sensitive Data
// Clear conversation history when appropriate
RunAnywhere.clearVoiceConversation()
// Reset SDK to clear all state
RunAnywhere.reset()
Summary Checklist