Skip to content

Commit 2cc8a44

Browse files
Implement nextLink pagination in get_all_analyzers (#127)
* Initial plan
* Implement nextLink pagination in get_all_analyzers method
* Add safeguards for pagination: circular link detection and value type validation
* Refactor pagination safeguards: add class constant and improve error messages
* Fix pagination limit enforcement and sanitize error messages

---------

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: yungshinlintw <[email protected]>
1 parent f8f406b commit 2cc8a44

File tree

1 file changed

+47
-7
lines changed

1 file changed

+47
-7
lines changed

python/content_understanding_client.py

Lines changed: 47 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ class AzureContentUnderstandingClient:
6969
".heif",
7070
] # Pro mode and Training for Standard mode only support document data
7171

# Maximum number of pages to retrieve when following pagination (nextLink)
# links before raising RuntimeError — a safety valve against pagination
# loops or a misbehaving service handing out endless nextLink values.
MAX_PAGINATION_PAGES: int = 1000
74+
7275
def __init__(
7376
self,
7477
endpoint: str,
def get_all_analyzers(self) -> Dict[str, Any]:
    """
    Retrieves a list of all available analyzers from the content understanding service.

    This method sends a GET request to the service endpoint to fetch the list of analyzers.
    It automatically follows pagination links (nextLink) to retrieve all pages of results.
    It raises an HTTPError if the request fails.

    Returns:
        dict: A dictionary containing the JSON response from the service, which includes
            the complete list of available analyzers across all pages in the "value" key.

    Raises:
        requests.exceptions.HTTPError: If the HTTP request returned an unsuccessful status code.
        RuntimeError: If too many pages are encountered (likely indicating a pagination loop).
        ValueError: If the API response is not a JSON object, or contains an invalid
            'value' field (not a list).
    """
    all_analyzers: list = []
    url = self._get_analyzer_list_url(self._endpoint, self._api_version)
    visited_urls = set()
    page_count = 0

    while url:
        # Prevent infinite loops from circular pagination links. Only the
        # base URL (query string stripped) is echoed in the error so that
        # SAS tokens or API keys carried in the nextLink query string are
        # never leaked into error messages or logs.
        if url in visited_urls:
            sanitized_url = url.split("?", 1)[0]
            raise RuntimeError(
                f"Circular pagination detected: {sanitized_url} was already visited"
            )

        visited_urls.add(url)
        page_count += 1

        # Check page count after incrementing to properly enforce limit
        if page_count > self.MAX_PAGINATION_PAGES:
            raise RuntimeError(
                f"Maximum pagination limit ({self.MAX_PAGINATION_PAGES} pages) exceeded. "
                f"This likely indicates a pagination loop or misconfiguration."
            )

        response = requests.get(url=url, headers=self._headers)
        self._raise_for_status_with_detail(response)
        response_json = response.json()

        # Each page must be a JSON object; anything else (e.g. a bare list)
        # cannot carry 'value'/'nextLink' keys and would otherwise surface
        # as an unexpected AttributeError on .get() below.
        if not isinstance(response_json, dict):
            raise ValueError(
                f"Expected a JSON object per page, got {type(response_json).__name__}"
            )

        # Collect analyzers from current page
        analyzers = response_json.get("value", [])
        if not isinstance(analyzers, list):
            # Include structure info without potentially sensitive response content
            structure_keys = list(response_json.keys())
            raise ValueError(
                f"Expected 'value' to be a list, got {type(analyzers).__name__}. "
                f"Response contains keys: {structure_keys}"
            )
        all_analyzers.extend(analyzers)

        # Get the next page URL, if it exists
        url = response_json.get("nextLink")

    # Return in the same format as the original response
    return {"value": all_analyzers}
302342

303343
def get_defaults(self) -> Dict[str, Any]:
304344
"""

0 commit comments

Comments (0)