Skip to content

Commit f98eadc

Browse files
authored
AudioHandler callback to modify AudioTrack and AudioRecord (#7496)
This PR adds a callback function `audioHandler` that applies the necessary configurations or modifications to the `AudioTrack` and `AudioRecord` objects used by the `startAudioConversation` function. This PR also adds a new configuration class, `LiveAudioConversationConfig`, which can be passed to the `startAudioConversation` function and allows users to specify different aspects of the conversation.
1 parent a0911a7 commit f98eadc

File tree

7 files changed

+243
-24
lines changed

7 files changed

+243
-24
lines changed

firebase-ai/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Unreleased
22

3+
- [changed] Added `LiveAudioConversationConfig` to control different aspects of the conversation
4+
while using the `startAudioConversation` function.
35
- [fixed] Fixed an issue causing streaming chat interactions to drop thought signatures. (#7562)
46

57
# 17.6.0

firebase-ai/api.txt

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ package com.google.firebase.ai.java {
165165
method public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> sendVideoRealtime(com.google.firebase.ai.type.InlineData video);
166166
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation();
167167
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(boolean enableInterruptions);
168+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig);
168169
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
169170
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, boolean enableInterruptions);
170171
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture<kotlin.Unit> startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler, boolean enableInterruptions);
@@ -874,6 +875,31 @@ package com.google.firebase.ai.type {
874875
public final class InvalidStateException extends com.google.firebase.ai.type.FirebaseAIException {
875876
}
876877

878+
@com.google.firebase.ai.type.PublicPreviewAPI public final class LiveAudioConversationConfig {
879+
field public static final com.google.firebase.ai.type.LiveAudioConversationConfig.Companion Companion;
880+
}
881+
882+
public static final class LiveAudioConversationConfig.Builder {
883+
ctor public LiveAudioConversationConfig.Builder();
884+
method public com.google.firebase.ai.type.LiveAudioConversationConfig build();
885+
method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setEnableInterruptions(boolean enableInterruptions);
886+
method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setFunctionCallHandler(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler);
887+
method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setInitializationHandler(kotlin.jvm.functions.Function2<? super android.media.AudioRecord.Builder,? super android.media.AudioTrack.Builder,kotlin.Unit>? initializationHandler);
888+
method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder setTranscriptHandler(kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler);
889+
field public boolean enableInterruptions;
890+
field public kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler;
891+
field public kotlin.jvm.functions.Function2<? super android.media.AudioRecord.Builder,? super android.media.AudioTrack.Builder,kotlin.Unit>? initializationHandler;
892+
field public kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler;
893+
}
894+
895+
public static final class LiveAudioConversationConfig.Companion {
896+
method public com.google.firebase.ai.type.LiveAudioConversationConfig.Builder builder();
897+
}
898+
899+
public final class LiveAudioConversationConfigKt {
900+
method public static com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.LiveAudioConversationConfig.Builder,kotlin.Unit> init);
901+
}
902+
877903
@com.google.firebase.ai.type.PublicPreviewAPI public final class LiveGenerationConfig {
878904
field public static final com.google.firebase.ai.type.LiveGenerationConfig.Companion Companion;
879905
}
@@ -958,6 +984,7 @@ package com.google.firebase.ai.type {
958984
method @Deprecated public suspend Object? sendMediaStream(java.util.List<com.google.firebase.ai.type.MediaData> mediaChunks, kotlin.coroutines.Continuation<? super kotlin.Unit>);
959985
method public suspend Object? sendTextRealtime(String text, kotlin.coroutines.Continuation<? super kotlin.Unit>);
960986
method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation<? super kotlin.Unit>);
987+
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig, kotlin.coroutines.Continuation<? super kotlin.Unit>);
961988
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);
962989
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.coroutines.Continuation<? super kotlin.Unit>);
963990
method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(kotlin.jvm.functions.Function1<? super com.google.firebase.ai.type.FunctionCallPart,com.google.firebase.ai.type.FunctionResponsePart>? functionCallHandler = null, kotlin.jvm.functions.Function2<? super com.google.firebase.ai.type.Transcription?,? super com.google.firebase.ai.type.Transcription?,kotlin.Unit>? transcriptHandler = null, boolean enableInterruptions = false, kotlin.coroutines.Continuation<? super kotlin.Unit>);

firebase-ai/gradle.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
version=17.6.1
15+
version=17.7.0
1616
latestReleasedVersion=17.6.0

firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import com.google.firebase.ai.type.Content
2424
import com.google.firebase.ai.type.FunctionCallPart
2525
import com.google.firebase.ai.type.FunctionResponsePart
2626
import com.google.firebase.ai.type.InlineData
27+
import com.google.firebase.ai.type.LiveAudioConversationConfig
2728
import com.google.firebase.ai.type.LiveServerMessage
2829
import com.google.firebase.ai.type.LiveSession
2930
import com.google.firebase.ai.type.MediaData
@@ -49,6 +50,18 @@ public abstract class LiveSessionFutures internal constructor() {
4950
@RequiresPermission(RECORD_AUDIO)
5051
public abstract fun startAudioConversation(): ListenableFuture<Unit>
5152

53+
/**
54+
* Starts an audio conversation with the model, which can only be stopped using
55+
* [stopAudioConversation].
56+
*
57+
* @param liveAudioConversationConfig A [LiveAudioConversationConfig] provided by the user to
58+
* control the various aspects of the conversation.
59+
*/
60+
@RequiresPermission(RECORD_AUDIO)
61+
public abstract fun startAudioConversation(
62+
liveAudioConversationConfig: LiveAudioConversationConfig
63+
): ListenableFuture<Unit>
64+
5265
/**
5366
* Starts an audio conversation with the model, which can only be stopped using
5467
* [stopAudioConversation] or [close].
@@ -297,6 +310,12 @@ public abstract class LiveSessionFutures internal constructor() {
297310
session.startAudioConversation(transcriptHandler = transcriptHandler)
298311
}
299312

313+
@RequiresPermission(RECORD_AUDIO)
314+
override fun startAudioConversation(liveAudioConversationConfig: LiveAudioConversationConfig) =
315+
SuspendToFutureAdapter.launchFuture {
316+
session.startAudioConversation(liveAudioConversationConfig)
317+
}
318+
300319
@RequiresPermission(RECORD_AUDIO)
301320
override fun startAudioConversation() =
302321
SuspendToFutureAdapter.launchFuture { session.startAudioConversation() }

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/AudioHelper.kt

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package com.google.firebase.ai.type
1919
import android.Manifest
2020
import android.media.AudioAttributes
2121
import android.media.AudioFormat
22-
import android.media.AudioManager
2322
import android.media.AudioRecord
2423
import android.media.AudioTrack
2524
import android.media.MediaRecorder
@@ -157,28 +156,39 @@ internal class AudioHelper(
157156
*
158157
* It also makes it easier to read, since the long initialization is separate from the
159158
* constructor.
159+
*
160+
* @param initializationHandler A callback that is invoked immediately following the successful
161+
* initialization of the associated [AudioRecord.Builder] and [AudioTrack.Builder] objects. This
162+
* offers a final opportunity to configure these objects, which will remain valid and effective
163+
* for the duration of the current audio session.
160164
*/
161165
@RequiresPermission(Manifest.permission.RECORD_AUDIO)
162-
fun build(): AudioHelper {
163-
val playbackTrack =
164-
AudioTrack(
165-
AudioAttributes.Builder()
166-
.setUsage(AudioAttributes.USAGE_MEDIA)
167-
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
168-
.build(),
166+
fun build(
167+
initializationHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)? = null
168+
): AudioHelper {
169+
val playTrackBuilder = AudioTrack.Builder()
170+
playTrackBuilder
171+
.setAudioFormat(
169172
AudioFormat.Builder()
170173
.setSampleRate(24000)
171174
.setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
172175
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
173-
.build(),
176+
.build()
177+
)
178+
.setAudioAttributes(
179+
AudioAttributes.Builder()
180+
.setUsage(AudioAttributes.USAGE_MEDIA)
181+
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
182+
.build()
183+
)
184+
.setBufferSizeInBytes(
174185
AudioTrack.getMinBufferSize(
175186
24000,
176187
AudioFormat.CHANNEL_OUT_MONO,
177188
AudioFormat.ENCODING_PCM_16BIT
178189
),
179-
AudioTrack.MODE_STREAM,
180-
AudioManager.AUDIO_SESSION_ID_GENERATE
181190
)
191+
.setTransferMode(AudioTrack.MODE_STREAM)
182192

183193
val bufferSize =
184194
AudioRecord.getMinBufferSize(
@@ -191,15 +201,22 @@ internal class AudioHelper(
191201
throw AudioRecordInitializationFailedException(
192202
"Audio Record buffer size is invalid ($bufferSize)"
193203
)
194-
195-
val recorder =
196-
AudioRecord(
197-
MediaRecorder.AudioSource.VOICE_COMMUNICATION,
198-
16000,
199-
AudioFormat.CHANNEL_IN_MONO,
200-
AudioFormat.ENCODING_PCM_16BIT,
201-
bufferSize
202-
)
204+
val recorderBuilder =
205+
AudioRecord.Builder()
206+
.setAudioSource(MediaRecorder.AudioSource.VOICE_COMMUNICATION)
207+
.setAudioFormat(
208+
AudioFormat.Builder()
209+
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
210+
.setSampleRate(16000)
211+
.setChannelMask(AudioFormat.CHANNEL_IN_MONO)
212+
.build()
213+
)
214+
.setBufferSizeInBytes(bufferSize)
215+
if (initializationHandler != null) {
216+
initializationHandler(recorderBuilder, playTrackBuilder)
217+
}
218+
val recorder = recorderBuilder.build()
219+
val playbackTrack = playTrackBuilder.build()
203220
if (recorder.state != AudioRecord.STATE_INITIALIZED)
204221
throw AudioRecordInitializationFailedException(
205222
"Audio Record initialization has failed. State: ${recorder.state}"
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
/*
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.firebase.ai.type
18+
19+
import android.media.AudioRecord
20+
import android.media.AudioTrack
21+
22+
/**
 * Configuration options that control an audio conversation started with
 * `startAudioConversation`.
 *
 * @property functionCallHandler A callback that is invoked whenever the model receives a function
 * call. The [FunctionResponsePart] that the callback function returns will be automatically sent to
 * the model.
 *
 * @property transcriptHandler A callback that is invoked whenever the model receives a transcript.
 * The first [Transcription] object is the input transcription, and the second is the output
 * transcription.
 *
 * @property initializationHandler A callback that is invoked immediately following the successful
 * initialization of the associated [AudioRecord.Builder] and [AudioTrack.Builder] objects. This
 * offers a final opportunity to configure these objects, which will remain valid and effective for
 * the duration of the current audio session.
 *
 * @property enableInterruptions If enabled, allows the user to speak over or interrupt the model's
 * ongoing reply.
 *
 * **WARNING**: The user interruption feature relies on device-specific support, and may not be
 * consistently available.
 */
@PublicPreviewAPI
public class LiveAudioConversationConfig
private constructor(
  internal val functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?,
  internal val initializationHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)?,
  internal val transcriptHandler: ((Transcription?, Transcription?) -> Unit)?,
  internal val enableInterruptions: Boolean
) {

  /**
   * Builder for creating a [LiveAudioConversationConfig].
   *
   * Mainly intended for Java interop. Kotlin consumers should use [liveAudioConversationConfig] for
   * a more idiomatic experience.
   *
   * @property functionCallHandler See [LiveAudioConversationConfig.functionCallHandler].
   *
   * @property initializationHandler See [LiveAudioConversationConfig.initializationHandler].
   *
   * @property transcriptHandler See [LiveAudioConversationConfig.transcriptHandler].
   *
   * @property enableInterruptions See [LiveAudioConversationConfig.enableInterruptions].
   */
  public class Builder {
    // @JvmField exposes these as plain fields for Java callers; the fluent setters below
    // mirror them for builder-style chaining.
    @JvmField public var functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)? = null
    @JvmField
    public var initializationHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)? = null
    @JvmField public var transcriptHandler: ((Transcription?, Transcription?) -> Unit)? = null
    @JvmField public var enableInterruptions: Boolean = false

    /** Sets the function-call callback; returns this builder for chaining. */
    public fun setFunctionCallHandler(
      functionCallHandler: ((FunctionCallPart) -> FunctionResponsePart)?
    ): Builder {
      this.functionCallHandler = functionCallHandler
      return this
    }

    /** Sets the audio-initialization callback; returns this builder for chaining. */
    public fun setInitializationHandler(
      initializationHandler: ((AudioRecord.Builder, AudioTrack.Builder) -> Unit)?
    ): Builder {
      this.initializationHandler = initializationHandler
      return this
    }

    /** Sets the transcription callback; returns this builder for chaining. */
    public fun setTranscriptHandler(
      transcriptHandler: ((Transcription?, Transcription?) -> Unit)?
    ): Builder {
      this.transcriptHandler = transcriptHandler
      return this
    }

    /** Enables or disables user interruptions; returns this builder for chaining. */
    public fun setEnableInterruptions(enableInterruptions: Boolean): Builder {
      this.enableInterruptions = enableInterruptions
      return this
    }

    /** Create a new [LiveAudioConversationConfig] with the attached arguments. */
    public fun build(): LiveAudioConversationConfig =
      LiveAudioConversationConfig(
        functionCallHandler = functionCallHandler,
        initializationHandler = initializationHandler,
        transcriptHandler = transcriptHandler,
        enableInterruptions = enableInterruptions
      )
  }

  public companion object {

    /**
     * Alternative casing for [LiveAudioConversationConfig.Builder]:
     * ```
     * val config = LiveAudioConversationConfig.builder()
     * ```
     */
    public fun builder(): Builder = Builder()
  }
}
111+
112+
/**
 * Helper method to construct a [LiveAudioConversationConfig] in a DSL-like manner.
 *
 * Example Usage:
 * ```
 * liveAudioConversationConfig {
 *   functionCallHandler = ...
 *   initializationHandler = ...
 *   ...
 * }
 * ```
 */
@OptIn(PublicPreviewAPI::class)
public fun liveAudioConversationConfig(
  init: LiveAudioConversationConfig.Builder.() -> Unit
): LiveAudioConversationConfig = LiveAudioConversationConfig.builder().apply(init).build()

firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,26 @@ internal constructor(
171171
transcriptHandler: ((Transcription?, Transcription?) -> Unit)? = null,
172172
enableInterruptions: Boolean = false,
173173
) {
174+
startAudioConversation(
175+
liveAudioConversationConfig {
176+
this.functionCallHandler = functionCallHandler
177+
this.transcriptHandler = transcriptHandler
178+
this.enableInterruptions = enableInterruptions
179+
}
180+
)
181+
}
182+
183+
/**
184+
* Starts an audio conversation with the model, which can only be stopped using
185+
* [stopAudioConversation] or [close].
186+
*
187+
* @param liveAudioConversationConfig A [LiveAudioConversationConfig] provided by the user to
188+
* control the various aspects of the conversation.
189+
*/
190+
@RequiresPermission(RECORD_AUDIO)
191+
public suspend fun startAudioConversation(
192+
liveAudioConversationConfig: LiveAudioConversationConfig
193+
) {
174194

175195
val context = firebaseApp.applicationContext
176196
if (
@@ -191,11 +211,14 @@ internal constructor(
191211
networkScope =
192212
CoroutineScope(blockingDispatcher + childJob() + CoroutineName("LiveSession Network"))
193213
audioScope = CoroutineScope(audioDispatcher + childJob() + CoroutineName("LiveSession Audio"))
194-
audioHelper = AudioHelper.build()
214+
audioHelper = AudioHelper.build(liveAudioConversationConfig.initializationHandler)
195215

196216
recordUserAudio()
197-
processModelResponses(functionCallHandler, transcriptHandler)
198-
listenForModelPlayback(enableInterruptions)
217+
processModelResponses(
218+
liveAudioConversationConfig.functionCallHandler,
219+
liveAudioConversationConfig.transcriptHandler
220+
)
221+
listenForModelPlayback(liveAudioConversationConfig.enableInterruptions)
199222
}
200223
}
201224

0 commit comments

Comments
 (0)