Skip to content

Commit 2cc8a44

Browse files
Implement nextLink pagination in get_all_analyzers (#127)
* Initial plan
* Implement nextLink pagination in get_all_analyzers method
* Add safeguards for pagination: circular link detection and value type validation
* Refactor pagination safeguards: add class constant and improve error messages
* Fix pagination limit enforcement and sanitize error messages

---------

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: yungshinlintw <[email protected]>
1 parent f8f406b commit 2cc8a44

File tree

1 file changed

+47
-7
lines changed

1 file changed

+47
-7
lines changed

python/content_understanding_client.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ class AzureContentUnderstandingClient:
6969
".heif",
7070
] # Pro mode and Training for Standard mode only support document data
7171

# Maximum number of pages to retrieve when following pagination (nextLink)
# links before raising RuntimeError — a safety valve against pagination
# loops or a misbehaving service handing out endless nextLink values.
MAX_PAGINATION_PAGES: int = 1000
74+
7275
def __init__(
7376
self,
7477
endpoint: str,
def get_all_analyzers(self) -> Dict[str, Any]:
    """
    Retrieves a list of all available analyzers from the content understanding service.

    This method sends a GET request to the service endpoint to fetch the list of analyzers.
    It automatically follows pagination links (nextLink) to retrieve all pages of results.
    It raises an HTTPError if the request fails.

    Returns:
        dict: A dictionary containing the JSON response from the service, which includes
            the complete list of available analyzers across all pages in the "value" key.

    Raises:
        requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
        RuntimeError: If too many pages are encountered (likely indicating a pagination loop).
        ValueError: If the API response is not a JSON object, or contains an invalid
            'value' field (not a list).
    """
    all_analyzers: list = []
    url = self._get_analyzer_list_url(self._endpoint, self._api_version)
    visited_urls = set()
    page_count = 0

    while url:
        # Prevent infinite loops from circular pagination links. Only the
        # base URL (query string stripped) is echoed in the error so that
        # SAS tokens or API keys carried in the nextLink query string are
        # never leaked into error messages or logs.
        if url in visited_urls:
            sanitized_url = url.split("?", 1)[0]
            raise RuntimeError(
                f"Circular pagination detected: {sanitized_url} was already visited"
            )

        visited_urls.add(url)
        page_count += 1

        # Check page count after incrementing to properly enforce limit
        if page_count > self.MAX_PAGINATION_PAGES:
            raise RuntimeError(
                f"Maximum pagination limit ({self.MAX_PAGINATION_PAGES} pages) exceeded. "
                f"This likely indicates a pagination loop or misconfiguration."
            )

        response = requests.get(url=url, headers=self._headers)
        self._raise_for_status_with_detail(response)
        response_json = response.json()

        # Each page must be a JSON object; anything else (e.g. a bare list)
        # cannot carry 'value'/'nextLink' keys and would otherwise surface
        # as an unexpected AttributeError on .get() below.
        if not isinstance(response_json, dict):
            raise ValueError(
                f"Expected a JSON object per page, got {type(response_json).__name__}"
            )

        # Collect analyzers from current page
        analyzers = response_json.get("value", [])
        if not isinstance(analyzers, list):
            # Include structure info without potentially sensitive response content
            structure_keys = list(response_json.keys())
            raise ValueError(
                f"Expected 'value' to be a list, got {type(analyzers).__name__}. "
                f"Response contains keys: {structure_keys}"
            )
        all_analyzers.extend(analyzers)

        # Get the next page URL, if it exists
        url = response_json.get("nextLink")

    # Return in the same format as the original response
    return {"value": all_analyzers}
302342

303343
def get_defaults(self) -> Dict[str, Any]:
304344
"""

0 commit comments

Comments (0)