@@ -69,6 +69,9 @@ class AzureContentUnderstandingClient:
6969 ".heif" ,
7070 ] # Pro mode and Training for Standard mode only support document data
7171
72+ # Maximum number of pages to retrieve when following pagination links
73+ MAX_PAGINATION_PAGES : int = 1000
74+
7275 def __init__ (
7376 self ,
7477 endpoint : str ,
@@ -284,21 +287,58 @@ def get_all_analyzers(self) -> Dict[str, Any]:
284287 Retrieves a list of all available analyzers from the content understanding service.
285288
286289 This method sends a GET request to the service endpoint to fetch the list of analyzers.
290+ It automatically follows pagination links (nextLink) to retrieve all pages of results.
287291 It raises an HTTPError if the request fails.
288292
289293 Returns:
290294 dict: A dictionary containing the JSON response from the service, which includes
291- the list of available analyzers.
295+ the complete list of available analyzers across all pages in the "value" key; other top-level keys of the service response are not carried over.
292296
293297 Raises:
294298 requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
299+ RuntimeError: If a pagination link repeats, or if more than MAX_PAGINATION_PAGES pages are encountered (likely indicating a pagination loop).
300+ ValueError: If the API response contains an invalid 'value' field (not a list).
295301 """
296- response = requests .get (
297- url = self ._get_analyzer_list_url (self ._endpoint , self ._api_version ),
298- headers = self ._headers ,
299- )
300- self ._raise_for_status_with_detail (response )
301- return response .json ()
302+ all_analyzers = []
303+ url = self ._get_analyzer_list_url (self ._endpoint , self ._api_version )
304+ visited_urls = set ()
305+ page_count = 0
306+
307+ while url :
308+ # Prevent infinite loops from circular pagination links
309+ if url in visited_urls :
310+ raise RuntimeError (f"Circular pagination detected: { url } was already visited" )
311+
312+ visited_urls .add (url )
313+ page_count += 1
314+
315+ # Check page count after incrementing to properly enforce limit
316+ if page_count > self .MAX_PAGINATION_PAGES :
317+ raise RuntimeError (
318+ f"Maximum pagination limit ({ self .MAX_PAGINATION_PAGES } pages) exceeded. "
319+ f"This likely indicates a pagination loop or misconfiguration."
320+ )
321+
322+ response = requests .get (url = url , headers = self ._headers )
323+ self ._raise_for_status_with_detail (response )
324+ response_json = response .json ()
325+
326+ # Collect analyzers from current page
327+ analyzers = response_json .get ("value" , [])
328+ if not isinstance (analyzers , list ):
329+ # Include structure info without potentially sensitive response content
330+ structure_keys = list (response_json .keys ()) if isinstance (response_json , dict ) else []
331+ raise ValueError (
332+ f"Expected 'value' to be a list, got { type (analyzers ).__name__ } . "
333+ f"Response contains keys: { structure_keys } "
334+ )
335+ all_analyzers .extend (analyzers )
336+
337+ # Get the next page URL, if it exists
338+ url = response_json .get ("nextLink" )
339+
340+ # Return the merged analyzers under the same "value" key the service uses (only that key is included)
341+ return {"value" : all_analyzers }
302342
303343 def get_defaults (self ) -> Dict [str , Any ]:
304344 """
0 commit comments