Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import sys
import re
import os
import traceback
from typing import BinaryIO, Any, List
from enum import Enum
from warnings import warn

from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
Expand Down Expand Up @@ -174,6 +176,8 @@ def __init__(

if credential is None:
if os.environ.get("AZURE_API_KEY") is None:
# Let the user know if they forgot to set the key.
print(f"AZURE_API_KEY not found, creating default Azure credential")
credential = DefaultAzureCredential()
else:
credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])
Expand Down Expand Up @@ -240,14 +244,22 @@ def convert(
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
# Extract the text using Azure Document Intelligence
poller = self.doc_intel_client.begin_analyze_document(
model_id="prebuilt-layout",
body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
features=self._analysis_features(stream_info),
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
)
result: AnalyzeResult = poller.result()

try:
# Extract the text using Azure Document Intelligence
poller = self.doc_intel_client.begin_analyze_document(
model_id="prebuilt-layout",
body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
features=self._analysis_features(stream_info),
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
)
result: AnalyzeResult = poller.result()
except Exception:
# Let user know what is wrong before falling back to other converters.
tb = traceback.format_exc()
warn(f"Error calling Azure Document Intelligence client:\n{tb}")
raise


# remove comments from the markdown content generated by Doc Intelligence and append to markdown string
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
Expand Down