@@ -23,6 +23,54 @@ def _normalize_audio_format(audio_format: str) -> str:
2323
2424
2525class Audio (Type ):
26+ """A type for representing audio data in DSPy.
27+
28+ The Audio class provides a standardized way to handle audio inputs for language models
29+ that support audio processing. Audio data is stored as base64-encoded strings along
30+ with format metadata.
31+
32+ Attributes:
33+ data: Base64-encoded audio data.
34+ audio_format: The audio format (e.g., "wav", "mp3", "flac").
35+
36+ Example:
37+ Basic usage with a local file:
38+
39+ ```python
40+ import dspy
41+
42+ dspy.configure(lm=dspy.LM("openai/gpt-4o-audio-preview"))
43+
44+ class TranscribeAudio(dspy.Signature):
45+ audio: dspy.Audio = dspy.InputField()
46+ transcription: str = dspy.OutputField()
47+
48+ # Create Audio from a local file
49+ audio = dspy.Audio.from_file("speech.wav")
50+
51+ predict = dspy.Predict(TranscribeAudio)
52+ result = predict(audio=audio)
53+ ```
54+
55+ Example:
56+ Creating Audio from different sources:
57+
58+ ```python
59+ import dspy
60+
61+ # From a URL
62+ audio = dspy.Audio.from_url("https://example.com/audio.mp3")
63+
64+ # From a local file path (auto-detected)
65+ audio = dspy.Audio("path/to/audio.wav")
66+
67+ # From a numpy array (requires soundfile)
68+ import numpy as np
69+ samples = np.random.randn(16000) # 1 second of audio at 16kHz
70+ audio = dspy.Audio.from_array(samples, sampling_rate=16000)
71+ ```
72+ """
73+
2674 data : str
2775 audio_format : str
2876
@@ -32,18 +80,20 @@ class Audio(Type):
3280 )
3381
3482 def format (self ) -> list [dict [str , Any ]]:
83+ """Format the audio data for consumption by language models.
84+
85+ Returns:
86+ A list containing the audio block in the format expected by
87+ audio-enabled language models.
88+
89+ Raises:
90+ ValueError: If the audio data cannot be formatted.
91+ """
3592 try :
3693 data = self .data
3794 except Exception as e :
3895 raise ValueError (f"Failed to format audio for DSPy: { e } " )
39- return [{
40- "type" : "input_audio" ,
41- "input_audio" : {
42- "data" : data ,
43- "format" : self .audio_format
44- }
45- }]
46-
96+ return [{"type" : "input_audio" , "input_audio" : {"data" : data , "format" : self .audio_format }}]
4797
4898 @pydantic .model_validator (mode = "before" )
4999 @classmethod
@@ -57,8 +107,20 @@ def validate_input(cls, values: Any) -> Any:
57107
58108 @classmethod
59109 def from_url (cls , url : str ) -> "Audio" :
60- """
61- Download an audio file from URL and encode it as base64.
110+ """Create an Audio instance by downloading from a URL.
111+
112+ Downloads the audio file from the specified URL, determines the format
113+ from the Content-Type header, and encodes the content as base64.
114+
115+ Args:
116+ url: The URL of the audio file to download.
117+
118+ Returns:
119+ An Audio instance containing the base64-encoded audio data.
120+
121+ Raises:
122+ ValueError: If the Content-Type is not an audio MIME type.
123+ requests.HTTPError: If the HTTP request fails.
62124 """
63125 response = requests .get (url )
64126 response .raise_for_status ()
@@ -74,8 +136,19 @@ def from_url(cls, url: str) -> "Audio":
74136
75137 @classmethod
76138 def from_file (cls , file_path : str ) -> "Audio" :
77- """
78- Read local audio file and encode it as base64.
139+ """Create an Audio instance from a local file.
140+
141+ Reads the audio file from disk, determines the format from the file
142+ extension, and encodes the content as base64.
143+
144+ Args:
145+ file_path: The path to the local audio file.
146+
147+ Returns:
148+ An Audio instance containing the base64-encoded audio data.
149+
150+ Raises:
151+ ValueError: If the path is not an existing regular file or has an unsupported MIME type.
79152 """
80153 if not os .path .isfile (file_path ):
81154 raise ValueError (f"File not found: { file_path } " )
@@ -95,11 +168,26 @@ def from_file(cls, file_path: str) -> "Audio":
95168 return cls (data = encoded_data , audio_format = audio_format )
96169
97170 @classmethod
98- def from_array (
99- cls , array : Any , sampling_rate : int , format : str = "wav"
100- ) -> "Audio" :
101- """
102- Process numpy-like array and encode it as base64. Uses sampling rate and audio format for encoding.
171+ def from_array (cls , array : Any , sampling_rate : int , format : str = "wav" ) -> "Audio" :
172+ """Create an Audio instance from a numpy-like array.
173+
174+ Converts a numpy-like array of audio samples into an Audio instance
175+ by encoding it with the specified format and sampling rate.
176+
177+ Args:
178+ array: A numpy-like array containing audio samples.
179+ sampling_rate: The sampling rate in Hz (e.g., 16000 for 16kHz).
180+ format: The output audio format. Defaults to "wav".
181+
182+ Returns:
183+ An Audio instance containing the base64-encoded audio data.
184+
185+ Raises:
186+ ImportError: If the soundfile library is not installed.
187+
188+ Note:
189+ This method requires the ``soundfile`` library to be installed.
190+ Install it with ``pip install soundfile``.
103191 """
104192 if not SF_AVAILABLE :
105193 raise ImportError ("soundfile is required to process audio arrays." )
@@ -122,11 +210,30 @@ def __repr__(self) -> str:
122210 length = len (self .data )
123211 return f"Audio(data=<AUDIO_BASE_64_ENCODED({ length } )>, audio_format='{ self .audio_format } ')"
124212
213+
125214def encode_audio (audio : Union [str , bytes , dict , "Audio" , Any ], sampling_rate : int = 16000 , format : str = "wav" ) -> dict :
126- """
127- Encode audio to a dict with 'data' and 'audio_format'.
128-
129- Accepts: local file path, URL, data URI, dict, Audio instance, numpy array, or bytes (with known format).
215+ """Encode audio from various sources into a standardized dictionary format.
216+
217+ This function accepts multiple input types and normalizes them into a dictionary
218+ containing base64-encoded audio data and format information.
219+
220+ Args:
221+ audio: The audio input. Supported types include:
222+
223+ - ``str``: Local file path, HTTP(S) URL, or data URI
224+ - ``bytes``: Raw audio bytes
225+ - ``dict``: Dictionary with "data" and "audio_format" keys
226+ - ``Audio``: An existing Audio instance
227+ - ``numpy.ndarray``: Audio samples as a numpy array (requires soundfile)
228+
229+ sampling_rate: The sampling rate in Hz for numpy array inputs. Defaults to 16000.
230+ format: The audio format for numpy array or bytes inputs. Defaults to "wav".
231+
232+ Returns:
233+ A dictionary with "data" (base64-encoded string) and "audio_format" keys.
234+
235+ Raises:
236+ ValueError: If the input type is unsupported or the data URI is malformed.
130237 """
131238 if isinstance (audio , dict ) and "data" in audio and "audio_format" in audio :
132239 return audio
0 commit comments