Documentation index: fetch the complete documentation index at https://docs.runanywhere.ai/llms.txt.
Use that index to discover all available pages before exploring further.
Voice Activity Detection (VAD) automatically detects speech in audio streams, enabling hands-free voice interfaces.
Overview
VAD determines when a user is speaking vs. silent, which is essential for:
- Knowing when to start/stop recording
- Triggering transcription at the right time
- Building push-to-talk or hands-free interfaces
The SDK provides energy-based VAD with Silero VAD for accurate speech detection.
Automatic VAD with Voice Session
The easiest way to use VAD is through the Voice Agent pipeline:
// Describe how speech is detected, then start the session with that config.
final vadConfig = VoiceSessionConfig(
  speechThreshold: 0.03, // Audio level threshold
  silenceDuration: 1.5, // Seconds of silence before processing
);
final session = await RunAnywhere.startVoiceSession(config: vadConfig);

// React to the events the session emits while it is running.
session.events.listen((voiceEvent) {
  switch (voiceEvent) {
    case VoiceSessionListening(:final audioLevel):
      // Show audio level meter
      updateAudioMeter(audioLevel);
    case VoiceSessionSpeechStarted():
      print('User started speaking');
    case VoiceSessionProcessing():
      print('Processing speech...');
    default:
      // Other event types are not handled in this example.
      break;
  }
});
VoiceSessionConfig for VAD
| Parameter | Type | Default | Description |
| --- | --- | --- | --- |
| `speechThreshold` | `double` | `0.03` | Audio level to trigger speech detection |
| `silenceDuration` | `double` | `1.5` | Seconds of silence before processing |
// Quiet environments: a lower threshold (more sensitive) combined with a
// shorter silence window.
final quietConfig = VoiceSessionConfig(
  speechThreshold: 0.01,
  silenceDuration: 1.0,
);

// Noisy environments: a higher threshold (less sensitive) combined with a
// longer silence window.
final noisyConfig = VoiceSessionConfig(
  speechThreshold: 0.1,
  silenceDuration: 2.0,
);
Building a Voice Level Indicator
/// Thin horizontal meter that visualizes the latest value of
/// [audioLevelStream].
///
/// The bar is drawn in green while the level is above [speechThreshold] and
/// in grey otherwise.
class VoiceLevelIndicator extends StatefulWidget {
  const VoiceLevelIndicator({
    super.key,
    required this.audioLevelStream,
    this.speechThreshold = 0.03,
  });

  /// Source of audio levels. Values are clamped to 0.0–1.0 for display.
  final Stream<double> audioLevelStream;

  /// Level above which the bar switches from grey to green.
  final double speechThreshold;

  @override
  State<VoiceLevelIndicator> createState() => _VoiceLevelIndicatorState();
}

class _VoiceLevelIndicatorState extends State<VoiceLevelIndicator> {
  double _level = 0.0;
  StreamSubscription<double>? _subscription;

  @override
  void initState() {
    super.initState();
    _subscribe();
  }

  @override
  void didUpdateWidget(VoiceLevelIndicator oldWidget) {
    super.didUpdateWidget(oldWidget);
    // The parent may rebuild this widget with a different stream; without
    // resubscribing, the meter would keep showing the old stream's levels.
    if (!identical(oldWidget.audioLevelStream, widget.audioLevelStream)) {
      _subscription?.cancel();
      _subscribe();
    }
  }

  /// Listens to the widget's current stream and stores each level it emits.
  void _subscribe() {
    _subscription = widget.audioLevelStream.listen((level) {
      setState(() => _level = level);
    });
  }

  @override
  Widget build(BuildContext context) {
    final barColor =
        _level > widget.speechThreshold ? Colors.green : Colors.grey;

    return SizedBox(
      height: 10,
      child: LinearProgressIndicator(
        // The indicator expects a fraction, so keep the level within 0–1.
        value: _level.clamp(0.0, 1.0),
        backgroundColor: Colors.grey[300],
        valueColor: AlwaysStoppedAnimation<Color>(barColor),
      ),
    );
  }

  @override
  void dispose() {
    _subscription?.cancel();
    super.dispose();
  }
}
Voice Session with VAD Events
/// Minimal voice assistant UI: a microphone badge, a status line, and a
/// Start/Stop button, all driven by voice session events.
class VoiceAssistant extends StatefulWidget {
  const VoiceAssistant({super.key});

  @override
  State<VoiceAssistant> createState() => _VoiceAssistantState();
}

class _VoiceAssistantState extends State<VoiceAssistant> {
  VoiceSessionHandle? _session;
  StreamSubscription<Object?>? _eventSubscription;
  double _audioLevel = 0.0;
  bool _isSpeaking = false;
  bool _isStarting = false;
  String _status = 'Ready';

  /// Starts a voice session and begins listening to its events.
  ///
  /// Does nothing if a session is already running or is still being started,
  /// so tapping Start twice cannot open two sessions.
  Future<void> _startSession() async {
    if (_isStarting || _session != null) return;
    _isStarting = true;
    try {
      final session = await RunAnywhere.startVoiceSession(
        config: VoiceSessionConfig(
          speechThreshold: 0.03,
          silenceDuration: 1.5,
          autoPlayTTS: true,
          continuousMode: true,
        ),
      );

      // The widget may have been removed while the session was starting.
      if (!mounted) {
        session.stop();
        return;
      }

      // Rebuild right away so the button switches to 'Stop'.
      setState(() => _session = session);
      _eventSubscription = session.events.listen(_handleEvent);
    } finally {
      _isStarting = false;
    }
  }

  /// Applies a single session event to the widget state.
  void _handleEvent(Object? event) {
    if (!mounted) return;
    setState(() {
      switch (event) {
        case VoiceSessionListening(:final audioLevel):
          _audioLevel = audioLevel;
          _status = 'Listening...';
        case VoiceSessionSpeechStarted():
          _isSpeaking = true;
          _status = 'Speaking detected';
        case VoiceSessionProcessing():
          _isSpeaking = false;
          _status = 'Processing...';
        case VoiceSessionTranscribed(:final text):
          _status = 'You: $text';
        case VoiceSessionResponded(:final text):
          _status = 'AI: $text';
        case VoiceSessionSpeaking():
          _status = 'AI speaking...';
        case VoiceSessionTurnCompleted():
          _status = 'Ready';
        case VoiceSessionStopped():
          // The session is over: release it so the button offers 'Start'.
          _detachSession();
          _isSpeaking = false;
          _audioLevel = 0.0;
          _status = 'Stopped';
        default:
          break;
      }
    });
  }

  /// Cancels the event subscription and forgets the current session.
  void _detachSession() {
    _eventSubscription?.cancel();
    _eventSubscription = null;
    _session = null;
  }

  /// Stops the running session, if any, and resets the UI to 'Stopped'.
  void _stopSession() {
    final session = _session;
    if (session == null) return;
    setState(() {
      _detachSession();
      _isSpeaking = false;
      _audioLevel = 0.0;
      _status = 'Stopped';
    });
    session.stop();
  }

  @override
  void dispose() {
    // Stop listening and stop the session so neither outlives the widget.
    _eventSubscription?.cancel();
    _session?.stop();
    super.dispose();
  }

  @override
  Widget build(BuildContext context) {
    final badgeColor = _isSpeaking ? Colors.green : Colors.blue;
    final hasSession = _session != null;

    return Column(
      mainAxisAlignment: MainAxisAlignment.center,
      children: [
        // Audio level indicator: the glow grows with the current level.
        Container(
          width: 100,
          height: 100,
          decoration: BoxDecoration(
            shape: BoxShape.circle,
            color: badgeColor,
            boxShadow: [
              BoxShadow(
                color: badgeColor.withValues(alpha: 0.5),
                blurRadius: _audioLevel * 50,
                spreadRadius: _audioLevel * 20,
              ),
            ],
          ),
          child: Icon(
            _isSpeaking ? Icons.mic : Icons.mic_none,
            color: Colors.white,
            size: 48,
          ),
        ),
        const SizedBox(height: 24),
        Text(_status, textAlign: TextAlign.center),
        const SizedBox(height: 24),
        ElevatedButton(
          onPressed: hasSession ? _stopSession : _startSession,
          child: Text(hasSession ? 'Stop' : 'Start'),
        ),
      ],
    );
  }
}
VAD Tuning Tips
- Speech threshold: Lower thresholds (0.01) detect quieter speech but may trigger on background noise. Higher thresholds (0.1) require louder speech but are more robust to noise.
- Silence duration: Shorter durations (0.5s) feel more responsive but may cut off pauses mid-sentence. Longer durations (2.0s) allow natural pauses but feel slower.
- Ambient noise: Consider measuring the ambient noise level and adjusting the threshold dynamically.
See Also
- Voice Agent: Complete voice pipeline
- Speech-to-Text: Audio transcription