Skip to content

Commit 7b0ab86

Browse files
feat(stt): add new sad_module param to recognize functions
1 parent 1349c6e commit 7b0ab86

File tree

7 files changed

+117
-6
lines changed

7 files changed

+117
-6
lines changed

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/SpeechToText.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* (C) Copyright IBM Corp. 2016, 2024.
2+
* (C) Copyright IBM Corp. 2016, 2025.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
55
* the License. You may obtain a copy of the License at
@@ -12,7 +12,7 @@
1212
*/
1313

1414
/*
15-
* IBM OpenAPI SDK Code Generator Version: 3.97.0-0e90eab1-20241120-170029
15+
* IBM OpenAPI SDK Code Generator Version: 3.105.0-3c13b041-20250605-193116
1616
*/
1717

1818
package com.ibm.watson.speech_to_text.v1;
@@ -520,6 +520,9 @@ public ServiceCall<SpeechRecognitionResults> recognize(RecognizeOptions recogniz
520520
"speech_detector_sensitivity",
521521
String.valueOf(recognizeOptions.speechDetectorSensitivity()));
522522
}
523+
if (recognizeOptions.sadModule() != null) {
524+
builder.query("sad_module", String.valueOf(recognizeOptions.sadModule()));
525+
}
523526
if (recognizeOptions.backgroundAudioSuppression() != null) {
524527
builder.query(
525528
"background_audio_suppression",
@@ -854,6 +857,9 @@ public ServiceCall<RecognitionJob> createJob(CreateJobOptions createJobOptions)
854857
"speech_detector_sensitivity",
855858
String.valueOf(createJobOptions.speechDetectorSensitivity()));
856859
}
860+
if (createJobOptions.sadModule() != null) {
861+
builder.query("sad_module", String.valueOf(createJobOptions.sadModule()));
862+
}
857863
if (createJobOptions.backgroundAudioSuppression() != null) {
858864
builder.query(
859865
"background_audio_suppression",

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptions.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ public interface Events {
270270
protected Double endOfPhraseSilenceTime;
271271
protected Boolean splitTranscriptAtPhraseEnd;
272272
protected Float speechDetectorSensitivity;
273+
protected Long sadModule;
273274
protected Float backgroundAudioSuppression;
274275
protected Boolean lowLatency;
275276
protected Float characterInsertionBias;
@@ -306,6 +307,7 @@ public static class Builder {
306307
private Double endOfPhraseSilenceTime;
307308
private Boolean splitTranscriptAtPhraseEnd;
308309
private Float speechDetectorSensitivity;
310+
private Long sadModule;
309311
private Float backgroundAudioSuppression;
310312
private Boolean lowLatency;
311313
private Float characterInsertionBias;
@@ -346,6 +348,7 @@ private Builder(CreateJobOptions createJobOptions) {
346348
this.endOfPhraseSilenceTime = createJobOptions.endOfPhraseSilenceTime;
347349
this.splitTranscriptAtPhraseEnd = createJobOptions.splitTranscriptAtPhraseEnd;
348350
this.speechDetectorSensitivity = createJobOptions.speechDetectorSensitivity;
351+
this.sadModule = createJobOptions.sadModule;
349352
this.backgroundAudioSuppression = createJobOptions.backgroundAudioSuppression;
350353
this.lowLatency = createJobOptions.lowLatency;
351354
this.characterInsertionBias = createJobOptions.characterInsertionBias;
@@ -717,6 +720,17 @@ public Builder speechDetectorSensitivity(Float speechDetectorSensitivity) {
717720
return this;
718721
}
719722

723+
/**
724+
* Set the sadModule.
725+
*
726+
* @param sadModule the sadModule
727+
* @return the CreateJobOptions builder
728+
*/
729+
public Builder sadModule(long sadModule) {
730+
this.sadModule = sadModule;
731+
return this;
732+
}
733+
720734
/**
721735
* Set the backgroundAudioSuppression.
722736
*
@@ -797,6 +811,7 @@ protected CreateJobOptions(Builder builder) {
797811
endOfPhraseSilenceTime = builder.endOfPhraseSilenceTime;
798812
splitTranscriptAtPhraseEnd = builder.splitTranscriptAtPhraseEnd;
799813
speechDetectorSensitivity = builder.speechDetectorSensitivity;
814+
sadModule = builder.sadModule;
800815
backgroundAudioSuppression = builder.backgroundAudioSuppression;
801816
lowLatency = builder.lowLatency;
802817
characterInsertionBias = builder.characterInsertionBias;
@@ -1353,6 +1368,23 @@ public Float speechDetectorSensitivity() {
13531368
return speechDetectorSensitivity;
13541369
}
13551370

1371+
/**
1372+
* Gets the sadModule.
1373+
*
1374+
* <p>Detects speech boundaries within the audio stream with better performance, improved noise
1375+
* suppression, faster responsiveness, and increased accuracy.
1376+
*
1377+
* <p>Specify `sad_module: 2`
1378+
*
1379+
* <p>See [Speech Activity Detection
1380+
* (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
1381+
*
1382+
* @return the sadModule
1383+
*/
1384+
public Long sadModule() {
1385+
return sadModule;
1386+
}
1387+
13561388
/**
13571389
* Gets the backgroundAudioSuppression.
13581390
*

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptions.java

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,7 @@ public interface Model {
237237
protected Double endOfPhraseSilenceTime;
238238
protected Boolean splitTranscriptAtPhraseEnd;
239239
protected Float speechDetectorSensitivity;
240+
protected Long sadModule;
240241
protected Float backgroundAudioSuppression;
241242
protected Boolean lowLatency;
242243
protected Float characterInsertionBias;
@@ -268,6 +269,7 @@ public static class Builder {
268269
private Double endOfPhraseSilenceTime;
269270
private Boolean splitTranscriptAtPhraseEnd;
270271
private Float speechDetectorSensitivity;
272+
private Long sadModule;
271273
private Float backgroundAudioSuppression;
272274
private Boolean lowLatency;
273275
private Float characterInsertionBias;
@@ -303,6 +305,7 @@ private Builder(RecognizeOptions recognizeOptions) {
303305
this.endOfPhraseSilenceTime = recognizeOptions.endOfPhraseSilenceTime;
304306
this.splitTranscriptAtPhraseEnd = recognizeOptions.splitTranscriptAtPhraseEnd;
305307
this.speechDetectorSensitivity = recognizeOptions.speechDetectorSensitivity;
308+
this.sadModule = recognizeOptions.sadModule;
306309
this.backgroundAudioSuppression = recognizeOptions.backgroundAudioSuppression;
307310
this.lowLatency = recognizeOptions.lowLatency;
308311
this.characterInsertionBias = recognizeOptions.characterInsertionBias;
@@ -619,6 +622,17 @@ public Builder speechDetectorSensitivity(Float speechDetectorSensitivity) {
619622
return this;
620623
}
621624

625+
/**
626+
* Set the sadModule.
627+
*
628+
* @param sadModule the sadModule
629+
* @return the RecognizeOptions builder
630+
*/
631+
public Builder sadModule(long sadModule) {
632+
this.sadModule = sadModule;
633+
return this;
634+
}
635+
622636
/**
623637
* Set the backgroundAudioSuppression.
624638
*
@@ -694,6 +708,7 @@ protected RecognizeOptions(Builder builder) {
694708
endOfPhraseSilenceTime = builder.endOfPhraseSilenceTime;
695709
splitTranscriptAtPhraseEnd = builder.splitTranscriptAtPhraseEnd;
696710
speechDetectorSensitivity = builder.speechDetectorSensitivity;
711+
sadModule = builder.sadModule;
697712
backgroundAudioSuppression = builder.backgroundAudioSuppression;
698713
lowLatency = builder.lowLatency;
699714
characterInsertionBias = builder.characterInsertionBias;
@@ -759,7 +774,8 @@ public String model() {
759774
* when a speech activity is detected in the stream. This can be used both in standard and low
760775
* latency mode. This feature enables client applications to know that some words/speech has been
761776
* detected and the service is in the process of decoding. This can be used in lieu of interim
762-
* results in standard mode. See [Using speech recognition
777+
* results in standard mode. Use `sad_module: 2` to increase accuracy and performance in detecting
778+
* speech boundaries within the audio stream. See [Using speech recognition
763779
* parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
764780
*
765781
* @return the speechBeginEvent
@@ -1154,6 +1170,23 @@ public Float speechDetectorSensitivity() {
11541170
return speechDetectorSensitivity;
11551171
}
11561172

1173+
/**
1174+
* Gets the sadModule.
1175+
*
1176+
* <p>Detects speech boundaries within the audio stream with better performance, improved noise
1177+
* suppression, faster responsiveness, and increased accuracy.
1178+
*
1179+
* <p>Specify `sad_module: 2`
1180+
*
1181+
* <p>See [Speech Activity Detection
1182+
* (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
1183+
*
1184+
* @return the sadModule
1185+
*/
1186+
public Long sadModule() {
1187+
return sadModule;
1188+
}
1189+
11571190
/**
11581191
* Gets the backgroundAudioSuppression.
11591192
*

speech-to-text/src/main/java/com/ibm/watson/speech_to_text/v1/model/RecognizeWithWebsocketsOptions.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ public interface Model {
203203
protected Float backgroundAudioSuppression;
204204
protected Boolean lowLatency;
205205
protected Float characterInsertionBias;
206+
protected Long sadModule;
206207
private Boolean interimResults;
207208
private Boolean processingMetrics;
208209
private Float processingMetricsInterval;
@@ -236,6 +237,7 @@ public static class Builder {
236237
private Float backgroundAudioSuppression;
237238
private Boolean lowLatency;
238239
private Float characterInsertionBias;
240+
private Long sadModule;
239241
private Boolean interimResults;
240242
private Boolean processingMetrics;
241243
private Float processingMetricsInterval;
@@ -268,6 +270,7 @@ private Builder(RecognizeWithWebsocketsOptions recognizeWithWebsocketsOptions) {
268270
this.backgroundAudioSuppression = recognizeWithWebsocketsOptions.backgroundAudioSuppression;
269271
this.lowLatency = recognizeWithWebsocketsOptions.lowLatency;
270272
this.characterInsertionBias = recognizeWithWebsocketsOptions.characterInsertionBias;
273+
this.sadModule = recognizeWithWebsocketsOptions.sadModule;
271274
this.interimResults = recognizeWithWebsocketsOptions.interimResults;
272275
this.processingMetrics = recognizeWithWebsocketsOptions.processingMetrics;
273276
this.processingMetricsInterval = recognizeWithWebsocketsOptions.processingMetricsInterval;
@@ -606,6 +609,17 @@ public Builder characterInsertionBias(Float characterInsertionBias) {
606609
return this;
607610
}
608611

612+
/**
613+
* Set the sadModule.
614+
*
615+
* @param sadModule the sadModule
616+
* @return the RecognizeWithWebsocketsOptions builder
617+
*/
618+
public Builder sadModule(Long sadModule) {
619+
this.sadModule = sadModule;
620+
return this;
621+
}
622+
609623
/**
610624
* Set the interimResults.
611625
*
@@ -687,6 +701,7 @@ protected RecognizeWithWebsocketsOptions(Builder builder) {
687701
backgroundAudioSuppression = builder.backgroundAudioSuppression;
688702
lowLatency = builder.lowLatency;
689703
characterInsertionBias = builder.characterInsertionBias;
704+
sadModule = builder.sadModule;
690705
interimResults = builder.interimResults;
691706
processingMetrics = builder.processingMetrics;
692707
processingMetricsInterval = builder.processingMetricsInterval;
@@ -1176,6 +1191,23 @@ public Float characterInsertionBias() {
11761191
return characterInsertionBias;
11771192
}
11781193

1194+
/**
1195+
* Gets the sadModule.
1196+
*
1197+
* <p>Detects speech boundaries within the audio stream with better performance, improved noise
1198+
* suppression, faster responsiveness, and increased accuracy.
1199+
*
1200+
* <p>Specify `sad_module: 2`
1201+
*
1202+
* <p>See [Speech Activity Detection
1203+
* (SAD)](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#sad).
1204+
*
1205+
* @return the sadModule
1206+
*/
1207+
public Long sadModule() {
1208+
return sadModule;
1209+
}
1210+
11791211
/**
11801212
* Gets the interimResults.
11811213
*

speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/SpeechToTextTest.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* (C) Copyright IBM Corp. 2019, 2024.
2+
* (C) Copyright IBM Corp. 2019, 2025.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
55
* the License. You may obtain a copy of the License at
@@ -249,6 +249,7 @@ public void testRecognizeWOptions() throws Throwable {
249249
.endOfPhraseSilenceTime(Double.valueOf("0.8"))
250250
.splitTranscriptAtPhraseEnd(false)
251251
.speechDetectorSensitivity(Float.valueOf("0.5"))
252+
.sadModule(Long.valueOf("1"))
252253
.backgroundAudioSuppression(Float.valueOf("0.0"))
253254
.lowLatency(false)
254255
.characterInsertionBias(Float.valueOf("0.0"))
@@ -296,6 +297,7 @@ public void testRecognizeWOptions() throws Throwable {
296297
assertEquals(
297298
Boolean.valueOf(query.get("split_transcript_at_phrase_end")), Boolean.valueOf(false));
298299
assertEquals(Float.valueOf(query.get("speech_detector_sensitivity")), Float.valueOf("0.5"));
300+
assertEquals(Long.valueOf(query.get("sad_module")), Long.valueOf("1"));
299301
assertEquals(Float.valueOf(query.get("background_audio_suppression")), Float.valueOf("0.0"));
300302
assertEquals(Boolean.valueOf(query.get("low_latency")), Boolean.valueOf(false));
301303
assertEquals(Float.valueOf(query.get("character_insertion_bias")), Float.valueOf("0.0"));
@@ -470,6 +472,7 @@ public void testCreateJobWOptions() throws Throwable {
470472
.endOfPhraseSilenceTime(Double.valueOf("0.8"))
471473
.splitTranscriptAtPhraseEnd(false)
472474
.speechDetectorSensitivity(Float.valueOf("0.5"))
475+
.sadModule(Long.valueOf("1"))
473476
.backgroundAudioSuppression(Float.valueOf("0.0"))
474477
.lowLatency(false)
475478
.characterInsertionBias(Float.valueOf("0.0"))
@@ -522,6 +525,7 @@ public void testCreateJobWOptions() throws Throwable {
522525
assertEquals(
523526
Boolean.valueOf(query.get("split_transcript_at_phrase_end")), Boolean.valueOf(false));
524527
assertEquals(Float.valueOf(query.get("speech_detector_sensitivity")), Float.valueOf("0.5"));
528+
assertEquals(Long.valueOf(query.get("sad_module")), Long.valueOf("1"));
525529
assertEquals(Float.valueOf(query.get("background_audio_suppression")), Float.valueOf("0.0"));
526530
assertEquals(Boolean.valueOf(query.get("low_latency")), Boolean.valueOf(false));
527531
assertEquals(Float.valueOf(query.get("character_insertion_bias")), Float.valueOf("0.0"));

speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/CreateJobOptionsTest.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* (C) Copyright IBM Corp. 2020, 2024.
2+
* (C) Copyright IBM Corp. 2020, 2025.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
55
* the License. You may obtain a copy of the License at
@@ -63,6 +63,7 @@ public void testCreateJobOptions() throws Throwable {
6363
.endOfPhraseSilenceTime(Double.valueOf("0.8"))
6464
.splitTranscriptAtPhraseEnd(false)
6565
.speechDetectorSensitivity(Float.valueOf("0.5"))
66+
.sadModule(Long.valueOf("1"))
6667
.backgroundAudioSuppression(Float.valueOf("0.0"))
6768
.lowLatency(false)
6869
.characterInsertionBias(Float.valueOf("0.0"))
@@ -99,6 +100,7 @@ public void testCreateJobOptions() throws Throwable {
99100
assertEquals(createJobOptionsModel.endOfPhraseSilenceTime(), Double.valueOf("0.8"));
100101
assertEquals(createJobOptionsModel.splitTranscriptAtPhraseEnd(), Boolean.valueOf(false));
101102
assertEquals(createJobOptionsModel.speechDetectorSensitivity(), Float.valueOf("0.5"));
103+
assertEquals(createJobOptionsModel.sadModule(), Long.valueOf("1"));
102104
assertEquals(createJobOptionsModel.backgroundAudioSuppression(), Float.valueOf("0.0"));
103105
assertEquals(createJobOptionsModel.lowLatency(), Boolean.valueOf(false));
104106
assertEquals(createJobOptionsModel.characterInsertionBias(), Float.valueOf("0.0"));

speech-to-text/src/test/java/com/ibm/watson/speech_to_text/v1/model/RecognizeOptionsTest.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* (C) Copyright IBM Corp. 2020, 2024.
2+
* (C) Copyright IBM Corp. 2020, 2025.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
55
* the License. You may obtain a copy of the License at
@@ -58,6 +58,7 @@ public void testRecognizeOptions() throws Throwable {
5858
.endOfPhraseSilenceTime(Double.valueOf("0.8"))
5959
.splitTranscriptAtPhraseEnd(false)
6060
.speechDetectorSensitivity(Float.valueOf("0.5"))
61+
.sadModule(Long.valueOf("1"))
6162
.backgroundAudioSuppression(Float.valueOf("0.0"))
6263
.lowLatency(false)
6364
.characterInsertionBias(Float.valueOf("0.0"))
@@ -89,6 +90,7 @@ public void testRecognizeOptions() throws Throwable {
8990
assertEquals(recognizeOptionsModel.endOfPhraseSilenceTime(), Double.valueOf("0.8"));
9091
assertEquals(recognizeOptionsModel.splitTranscriptAtPhraseEnd(), Boolean.valueOf(false));
9192
assertEquals(recognizeOptionsModel.speechDetectorSensitivity(), Float.valueOf("0.5"));
93+
assertEquals(recognizeOptionsModel.sadModule(), Long.valueOf("1"));
9294
assertEquals(recognizeOptionsModel.backgroundAudioSuppression(), Float.valueOf("0.0"));
9395
assertEquals(recognizeOptionsModel.lowLatency(), Boolean.valueOf(false));
9496
assertEquals(recognizeOptionsModel.characterInsertionBias(), Float.valueOf("0.0"));

0 commit comments

Comments
 (0)