Skip to content

Commit 447c047

Browse files
authored
Test if mammoth resolves rlinks. (#1451)
1 parent 8a9d8f1 commit 447c047

File tree

3 files changed

+46
-4
lines changed

3 files changed

+46
-4
lines changed

packages/markitdown/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ all = [
5050
"azure-identity"
5151
]
5252
pptx = ["python-pptx"]
53-
docx = ["mammoth", "lxml"]
53+
docx = ["mammoth~=1.10.0", "lxml"]
5454
xlsx = ["pandas", "openpyxl"]
5555
xls = ["pandas", "xlrd"]
5656
pdf = ["pdfminer.six"]
13.4 KB
Binary file not shown.

packages/markitdown/tests/test_module_misc.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,47 @@ def test_input_as_strings() -> None:
288288
assert "# Test" in result.text_content
289289

290290

291+
def test_doc_rlink() -> None:
292+
# Test for: CVE-2025-11849
293+
markitdown = MarkItDown()
294+
295+
# Document with rlink
296+
docx_file = os.path.join(TEST_FILES_DIR, "rlink.docx")
297+
298+
# Directory containing the target rlink file
299+
rlink_tmp_dir = os.path.abspath(os.sep + "tmp")
300+
301+
# Ensure the tmp directory exists
302+
if not os.path.exists(rlink_tmp_dir):
303+
pytest.skip(f"Skipping rlink test; {rlink_tmp_dir} directory does not exist.")
304+
return
305+
306+
rlink_file_path = os.path.join(rlink_tmp_dir, "test_rlink.txt")
307+
rlink_content = "de658225-569e-4e3d-9ed2-cfb6abf927fc"
308+
b64_prefix = (
309+
"ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk" # base64 prefix of rlink_content
310+
)
311+
312+
if os.path.exists(rlink_file_path):
313+
with open(rlink_file_path, "r", encoding="utf-8") as f:
314+
existing_content = f.read()
315+
if existing_content != rlink_content:
316+
raise ValueError(
317+
f"Existing {rlink_file_path} content does not match expected content."
318+
)
319+
else:
320+
with open(rlink_file_path, "w", encoding="utf-8") as f:
321+
f.write(rlink_content)
322+
323+
try:
324+
result = markitdown.convert(docx_file, keep_data_uris=True).text_content
325+
assert (
326+
b64_prefix not in result
327+
) # Make sure the target file was NOT embedded in the output
328+
finally:
329+
os.remove(rlink_file_path)
330+
331+
291332
@pytest.mark.skipif(
292333
skip_remote,
293334
reason="do not run tests that query external urls",
@@ -301,9 +342,9 @@ def test_markitdown_remote() -> None:
301342
assert test_string in result.text_content
302343

303344
# Youtube
304-
result = markitdown.convert(YOUTUBE_TEST_URL)
305-
for test_string in YOUTUBE_TEST_STRINGS:
306-
assert test_string in result.text_content
345+
# result = markitdown.convert(YOUTUBE_TEST_URL)
346+
# for test_string in YOUTUBE_TEST_STRINGS:
347+
# assert test_string in result.text_content
307348

308349

309350
@pytest.mark.skipif(
@@ -452,6 +493,7 @@ def test_markitdown_llm() -> None:
452493
test_markitdown_remote,
453494
test_speech_transcription,
454495
test_exceptions,
496+
test_doc_rlink,
455497
test_markitdown_exiftool,
456498
test_markitdown_llm_parameters,
457499
test_markitdown_llm,

0 commit comments

Comments
 (0)