@@ -78,17 +78,63 @@ type ChatMessageImageURL struct {
7878 Detail ImageURLDetail `json:"detail,omitempty"`
7979}
8080
// AudioVoice names a voice the model can use when it responds with audio.
type AudioVoice string

// Voices that may be set on AudioOutput.Voice.
const (
	AudioVoiceAlloy   AudioVoice = "alloy"
	AudioVoiceAsh     AudioVoice = "ash"
	AudioVoiceBallad  AudioVoice = "ballad"
	AudioVoiceCoral   AudioVoice = "coral"
	AudioVoiceEcho    AudioVoice = "echo"
	AudioVoiceSage    AudioVoice = "sage"
	AudioVoiceShimmer AudioVoice = "shimmer"
	AudioVoiceVerse   AudioVoice = "verse"
)
93+
// AudioFormat names an encoding for audio data exchanged with the API.
type AudioFormat string

// Audio encodings. Output audio may use any of these; input audio
// currently supports only "wav" and "mp3".
const (
	AudioFormatWAV   AudioFormat = "wav"
	AudioFormatMP3   AudioFormat = "mp3"
	AudioFormatFLAC  AudioFormat = "flac"
	AudioFormatOPUS  AudioFormat = "opus"
	AudioFormatPCM16 AudioFormat = "pcm16"
)
103+
104+ type ChatMessageAudio struct {
105+ // Base64 encoded audio data.
106+ Data string `json:"data,omitempty"`
107+ // The format of the encoded audio data. Currently supports "wav" and "mp3".
108+ Format AudioFormat `json:"format,omitempty"`
109+ }
110+
// Modality names an output type the model can be asked to generate.
type Modality string

// Output modalities that can be listed in a request.
const (
	ModalityAudio Modality = "audio"
	ModalityText  Modality = "text"
)
117+
118+ type AudioOutput struct {
119+ // The voice the model uses to respond. Supported voices are alloy, ash, ballad, coral, echo, sage, shimmer, and verse.
120+ Voice AudioVoice `json:"voice"`
121+ // Specifies the output audio format. Must be one of wav, mp3, flac, opus, or pcm16.
122+ Format AudioFormat `json:"format"`
123+ }
124+
// ChatMessagePartType identifies the kind of content a ChatMessagePart holds.
type ChatMessagePartType string

// Part kinds; each corresponds to one payload field of ChatMessagePart.
const (
	ChatMessagePartTypeText       ChatMessagePartType = "text"
	ChatMessagePartTypeImageURL   ChatMessagePartType = "image_url"
	ChatMessagePartTypeInputAudio ChatMessagePartType = "input_audio"
)
87132
88133type ChatMessagePart struct {
89- Type ChatMessagePartType `json:"type,omitempty"`
90- Text string `json:"text,omitempty"`
91- ImageURL * ChatMessageImageURL `json:"image_url,omitempty"`
134+ Type ChatMessagePartType `json:"type,omitempty"`
135+ Text string `json:"text,omitempty"`
136+ ImageURL * ChatMessageImageURL `json:"image_url,omitempty"`
137+ InputAudio * ChatMessageAudio `json:"input_audio,omitempty"`
92138}
93139
94140type ChatCompletionMessage struct {
@@ -110,72 +156,74 @@ type ChatCompletionMessage struct {
110156
111157 // For Role=tool prompts this should be set to the ID given in the assistant's prior request to call a tool.
112158 ToolCallID string `json:"tool_call_id,omitempty"`
159+
160+ // If the audio output modality is requested, this object contains data about the audio response from the model.
161+ Audio * ChatCompletionAudio `json:"audio,omitempty"`
162+ }
163+
164+ type chatCompletionMessageMultiContent struct {
165+ Role string `json:"role"`
166+ Content string `json:"-"`
167+ Refusal string `json:"refusal,omitempty"`
168+ MultiContent []ChatMessagePart `json:"content,omitempty"`
169+ Name string `json:"name,omitempty"`
170+ FunctionCall * FunctionCall `json:"function_call,omitempty"`
171+ ToolCalls []ToolCall `json:"tool_calls,omitempty"`
172+ ToolCallID string `json:"tool_call_id,omitempty"`
173+ Audio * ChatCompletionAudio `json:"audio,omitempty"`
174+ }
175+
176+ type chatCompletionMessageSingleContent struct {
177+ Role string `json:"role"`
178+ Content string `json:"content"`
179+ Refusal string `json:"refusal,omitempty"`
180+ MultiContent []ChatMessagePart `json:"-"`
181+ Name string `json:"name,omitempty"`
182+ FunctionCall * FunctionCall `json:"function_call,omitempty"`
183+ ToolCalls []ToolCall `json:"tool_calls,omitempty"`
184+ ToolCallID string `json:"tool_call_id,omitempty"`
185+ Audio * ChatCompletionAudio `json:"audio,omitempty"`
113186}
114187
115188func (m ChatCompletionMessage ) MarshalJSON () ([]byte , error ) {
116189 if m .Content != "" && m .MultiContent != nil {
117190 return nil , ErrContentFieldsMisused
118191 }
119192 if len (m .MultiContent ) > 0 {
120- msg := struct {
121- Role string `json:"role"`
122- Content string `json:"-"`
123- Refusal string `json:"refusal,omitempty"`
124- MultiContent []ChatMessagePart `json:"content,omitempty"`
125- Name string `json:"name,omitempty"`
126- FunctionCall * FunctionCall `json:"function_call,omitempty"`
127- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
128- ToolCallID string `json:"tool_call_id,omitempty"`
129- }(m )
193+ msg := chatCompletionMessageMultiContent (m )
130194 return json .Marshal (msg )
131195 }
132196
133- msg := struct {
134- Role string `json:"role"`
135- Content string `json:"content"`
136- Refusal string `json:"refusal,omitempty"`
137- MultiContent []ChatMessagePart `json:"-"`
138- Name string `json:"name,omitempty"`
139- FunctionCall * FunctionCall `json:"function_call,omitempty"`
140- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
141- ToolCallID string `json:"tool_call_id,omitempty"`
142- }(m )
197+ msg := chatCompletionMessageSingleContent (m )
143198 return json .Marshal (msg )
144199}
145200
146201func (m * ChatCompletionMessage ) UnmarshalJSON (bs []byte ) error {
147- msg := struct {
148- Role string `json:"role"`
149- Content string `json:"content"`
150- Refusal string `json:"refusal,omitempty"`
151- MultiContent []ChatMessagePart
152- Name string `json:"name,omitempty"`
153- FunctionCall * FunctionCall `json:"function_call,omitempty"`
154- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
155- ToolCallID string `json:"tool_call_id,omitempty"`
156- }{}
202+ msg := chatCompletionMessageSingleContent {}
157203
158204 if err := json .Unmarshal (bs , & msg ); err == nil {
159205 * m = ChatCompletionMessage (msg )
160206 return nil
161207 }
162- multiMsg := struct {
163- Role string `json:"role"`
164- Content string
165- Refusal string `json:"refusal,omitempty"`
166- MultiContent []ChatMessagePart `json:"content"`
167- Name string `json:"name,omitempty"`
168- FunctionCall * FunctionCall `json:"function_call,omitempty"`
169- ToolCalls []ToolCall `json:"tool_calls,omitempty"`
170- ToolCallID string `json:"tool_call_id,omitempty"`
171- }{}
208+ multiMsg := chatCompletionMessageMultiContent {}
172209 if err := json .Unmarshal (bs , & multiMsg ); err != nil {
173210 return err
174211 }
175212 * m = ChatCompletionMessage (multiMsg )
176213 return nil
177214}
178215
// ChatCompletionAudio describes the audio response produced by the model when
// the audio output modality is requested.
type ChatCompletionAudio struct {
	// ID uniquely identifies this audio response.
	ID string `json:"id"`
	// ExpiresAt is the Unix timestamp, in seconds, after which this audio
	// response can no longer be accessed on the server for use in multi-turn
	// conversations.
	ExpiresAt int64 `json:"expires_at"`
	// Data holds the base64 encoded audio bytes generated by the model, in
	// the format specified in the request.
	Data string `json:"data"`
	// Transcript is the transcript of the generated audio.
	Transcript string `json:"transcript"`
}
226+
179227type ToolCall struct {
180228 // Index is not nil only in chat completion chunk object
181229 Index * int `json:"index,omitempty"`
@@ -260,6 +308,11 @@ type ChatCompletionRequest struct {
260308 Store bool `json:"store,omitempty"`
261309 // Metadata to store with the completion.
262310 Metadata map [string ]string `json:"metadata,omitempty"`
311+ // Output types that you would like the model to generate for this request. Most models are capable of generating text, which is the default: ["text"]
312+ // The gpt-4o-audio-preview model can also be used to generate audio. To request that this model generate both text and audio responses, you can use: ["text", "audio"]
313+ Modalities []Modality `json:"modalities,omitempty"`
314+ // Parameters for audio output. Required when audio output is requested with modalities: ["audio"]
315+ Audio * AudioOutput `json:"audio,omitempty"`
263316}
264317
265318type StreamOptions struct {
0 commit comments