Merge pull request #451 from TeamMsgExtractor/next-release

TheElementalOfDestruction · web-flow · commit 207bfb9b2a56 · 2025-02-05T13:17:43.000-08:00
Version 0.53.1
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
     - uses: actions/checkout@v4
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,17 @@
+**v0.53.1**
+* Expanded allowable range for `red-black-tree-mod`.
+* Fix issue with `MessageBase.asEmailMessage()` that prevented embedded MSG files from being attached.
+* Expand allowable versions of `BeautifulSoup4`.
+
+**v0.53.0**
+* Added tests for many functions in `extract_msg.utils`.
+* Fix an issue in `extract_msg.utils.msgPathToString()` that prevented backslashes from being replaced with forward slashes.
+* Change the behavior of `extract_msg.utils.minutesToDurationStr()` to properly use plurals.
+* Fixed issue in `extract_msg.utils.unwrapMsg()` that would prevent it from working on signed messages due to an API change.
+* Added new exception `MimetypeFailureError`.
+* Modified the logic of `MessageBase.asEmailMessage()` to use `AttachmentBase/SignedAttachment.name` instead of `getFilename()`, which only exists on `AttachmentBase`.
+* Modified the logic of `MessageBase.htmlBodyPrepared()` to properly put the mimetype in image tags to ensure rendering. Logic was also modified to use `encode` instead of `prettify` to reduce computation and output size.
+
 **v0.52.0**
 * [[TeamMsgExtractor #444](https://github.com/TeamMsgExtractor/msg-extractor/issues/444)] Fix typo in string that prevented HTML body from generating from the plain text body properly.
 * Adjusted the behavior of `MSGFile.areStringsUnicode` to prioritize the property specified by the parent MSG files for MSG files that are embedded. Additionally, added a fallback to rely on whether or not there is a stream using the `001F` type to determine the property value if it is entirely missing.
diff --git a/README.rst b/README.rst
@@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
 .. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
    :target: LICENSE.txt
 
-.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.52.0-blue.svg
-   :target: https://pypi.org/project/extract-msg/0.52.0/
+.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.1-blue.svg
+   :target: https://pypi.org/project/extract-msg/0.53.1/
 
 .. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
    :target: https://www.python.org/downloads/release/python-3810/
diff --git a/extract_msg/__init__.py b/extract_msg/__init__.py
@@ -27,8 +27,8 @@
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 __author__ = 'Destiny Peterson & Matthew Walker'
-__date__ = '2024-10-22'
-__version__ = '0.52.0'
+__date__ = '2025-02-05'
+__version__ = '0.53.1'
 
 __all__ = [
     # Modules:
diff --git a/extract_msg/exceptions.py b/extract_msg/exceptions.py
@@ -92,6 +92,11 @@ class InvalidPropertyIdError(ExMsgBaseException):
     The provided property ID was invalid.
     """
 
+class MimetypeFailureError(ExMsgBaseException):
+    """
+    The mimetype was unable to be properly determined when it was mandatory.
+    """
+
 class NotWritableError(ExMsgBaseException):
     """
     Modification was attempted on an instance that is not writable.
diff --git a/extract_msg/msg_classes/message_base.py b/extract_msg/msg_classes/message_base.py
@@ -39,7 +39,8 @@
     )
 from ..exceptions import (
         ConversionError, DataNotFoundError, DeencapMalformedData,
-        DeencapNotEncapsulated, IncompatibleOptionsError, WKError
+        DeencapNotEncapsulated, IncompatibleOptionsError, MimetypeFailureError,
+        WKError
     )
 from .msg import MSGFile
 from ..structures.report_tag import ReportTag
@@ -178,13 +179,10 @@ def asEmailMessage(self) -> EmailMessage:
             if att.dataType:
                 if hasattr(att.dataType, 'asEmailMessage'):
                     # Replace the extension with '.eml'.
-                    filename = att.getFilename()
+                    filename = att.name or ''
                     if filename.lower().endswith('.msg'):
                         filename = filename[:-4] + '.eml'
-                    msgMain.add_attachment(
-                                        att.data.asEmailMessage(),
-                                        filename = filename,
-                                        cid = att.contentId)
+                    msgMain.attach(att.data.asEmailMessage())
                 else:
                     if issubclass(att.dataType, bytes):
                         data = att.data
@@ -1198,12 +1196,35 @@ def htmlBodyPrepared(self) -> Optional[bytes]:
         for tag in tags:
             # Iterate through the attachments until we get the right one.
             cid = tag['src'][4:]
-            data = next((attachment.data for attachment in self.attachments if attachment.cid == cid), None)
+            att = next((attachment for attachment in self.attachments if hasattr(attachment, 'cid') and attachment.cid == cid), None)
             # If we found anything, inject it.
-            if data:
-                tag['src'] = (b'data:image;base64,' + base64.b64encode(data)).decode('utf-8')
+            if att and isinstance(att.data, bytes):
+                # Try to get the mimetype. If we can't, see if the item has an
+                # extension and guess the mimetype for a few known ones.
+                mime = att.mimetype
+                if not mime:
+                    ext = (att.name or '').split('.')[-1].lower()
+                    if ext == 'png':
+                        mime = 'image/png'
+                    elif ext == 'jpg' or ext == 'jpeg':
+                        mime = 'image/jpeg'
+                    elif ext == 'gif':
+                        mime = 'image/gif'
+                    elif ext == 'tiff' or ext == 'tif':
+                        mime = 'image/tif'
+                    elif ext == 'bmp':
+                        mime = 'image/bmp'
+                    elif ext == 'svg':
+                        mime = 'image/svg+xml'
+                # Final check.
+                if mime:
+                    tag['src'] = (b'data:' + mime.encode() + b';base64,' + base64.b64encode(att.data)).decode('utf-8')
+                else:
+                    # We don't know what to actually put for this item, and we
+                    # really should never end up here, so throw an error.
+                    raise MimetypeFailureError('Could not get the mimetype to use for htmlBodyPrepared.')
 
-        return soup.prettify('utf-8')
+        return soup.encode('utf-8')
 
     @functools.cached_property
     def htmlInjectableHeader(self) -> str:
diff --git a/extract_msg/utils.py b/extract_msg/utils.py
@@ -696,8 +696,17 @@ def minutesToDurationStr(minutes: int) -> str:
         return '1 minute'
     elif minutes < 60:
         return f'{minutes} minutes'
+    elif minutes == 60:
+        return '1 hour'
     elif minutes % 60 == 0:
         return f'{minutes // 60} hours'
+    elif minutes < 120:
+        if minutes == 61:
+            return f'1 hour 1 minute'
+        else:
+            return f'1 hour {minutes - 60} minutes'
+    elif minutes % 60 == 1:
+        return f'{minutes // 60} hours 1 minute'
     else:
         return f'{minutes // 60} hours {minutes % 60} minutes'
 
@@ -709,8 +718,7 @@ def msgPathToString(inp: Union[str, Iterable[str]]) -> str:
     """
     if not isinstance(inp, str):
         inp = '/'.join(inp)
-    inp.replace('\\', '/')
-    return inp
+    return inp.replace('\\', '/')
 
 
 def parseType(_type: int, stream: Union[int, bytes], encoding: str, extras: Sequence[bytes]):
@@ -1094,7 +1102,7 @@ def unwrapMsg(msg: MSGFile) -> Dict[str, List]:
                 msgFiles.append(att.data)
                 toProcess.append(att.data)
         if isinstance(currentItem, MessageSignedBase):
-            raw += currentItem._rawAttachments
+            raw += currentItem.rawAttachments
 
     return {
         'attachments': attachments,
diff --git a/extract_msg_tests/__init__.py b/extract_msg_tests/__init__.py
@@ -4,11 +4,13 @@
     'OleWriterEditingTests',
     'OleWriterExportTests',
     'PropTests',
+    'UtilTests',
     'ValidationTests',
 ]
 
 from .attachment_tests import AttachmentTests
 from .cmd_line_tests import CommandLineTests
 from .ole_writer_tests import OleWriterEditingTests, OleWriterExportTests
 from .prop_tests import PropTests
+from .util_tests import UtilTests
 from .validation_tests import ValidationTests
diff --git a/extract_msg_tests/util_tests.py b/extract_msg_tests/util_tests.py
@@ -0,0 +1,61 @@
+__all__ = [
+    'UtilTests',
+]
+
+
+import unittest
+
+from extract_msg import utils
+
+
+class UtilTests(unittest.TestCase):
+    def test_dictGetCasedKey(self):
+        caseDict = {'hello': 1, 'HeUtQjWkW': 2}
+
+        self.assertEqual(utils.dictGetCasedKey(caseDict, 'Hello'), 'hello')
+        self.assertEqual(utils.dictGetCasedKey(caseDict, 'heutqjwkw'), 'HeUtQjWkW')
+        with self.assertRaises(KeyError):
+            utils.dictGetCasedKey(caseDict, 'jjjjj')
+
+    def test_divide(self):
+        inputString = '12345678901234567890'
+        expectedOutputs = {
+            1: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0'],
+            2: ['12', '34', '56', '78', '90', '12', '34', '56', '78', '90'],
+            3: ['123', '456', '789', '012', '345', '678', '90'],
+            4: ['1234', '5678', '9012', '3456', '7890'],
+            5: ['12345', '67890', '12345', '67890'],
+            6: ['123456', '789012', '345678', '90'],
+            7: ['1234567', '8901234', '567890'],
+            8: ['12345678', '90123456', '7890'],
+            9: ['123456789', '012345678', '90'],
+            10: ['1234567890', '1234567890'],
+            11: ['12345678901', '234567890'],
+        }
+
+        for divideBy, expectedResult in expectedOutputs.items():
+            self.assertListEqual(utils.divide(inputString, divideBy), expectedResult)
+
+    def test_makeWeakRef(self):
+        self.assertIsNone(utils.makeWeakRef(None))
+        class TestClass:
+            pass
+        self.assertIsNotNone(utils.makeWeakRef(TestClass()))
+
+    def test_minutesToDurationStr(self):
+        self.assertEqual(utils.minutesToDurationStr(0), '0 hours')
+        self.assertEqual(utils.minutesToDurationStr(1), '1 minute')
+        self.assertEqual(utils.minutesToDurationStr(2), '2 minutes')
+        self.assertEqual(utils.minutesToDurationStr(59), '59 minutes')
+        self.assertEqual(utils.minutesToDurationStr(60), '1 hour')
+        self.assertEqual(utils.minutesToDurationStr(61), '1 hour 1 minute')
+        self.assertEqual(utils.minutesToDurationStr(62), '1 hour 2 minutes')
+        self.assertEqual(utils.minutesToDurationStr(120), '2 hours')
+        self.assertEqual(utils.minutesToDurationStr(121), '2 hours 1 minute')
+        self.assertEqual(utils.minutesToDurationStr(122), '2 hours 2 minutes')
+
+    def test_msgPathToStr(self):
+        self.assertEqual(utils.msgPathToString('hello/world/one'), 'hello/world/one')
+        self.assertEqual(utils.msgPathToString('hello/world\\one'), 'hello/world/one')
+        self.assertEqual(utils.msgPathToString(['hello', 'world', 'one']), 'hello/world/one')
+        self.assertEqual(utils.msgPathToString(['hello\\world', 'one']), 'hello/world/one')
diff --git a/requirements.txt b/requirements.txt
@@ -6,6 +6,6 @@ olefile==0.47
 tzlocal>=4.2,<6
 compressed-rtf>=1.0.6,<2
 ebcdic>=1.1.1,<2
-beautifulsoup4>=4.11.1,<4.13
+beautifulsoup4>=4.11.1,<4.14
 RTFDE>=0.1.1,<0.2
-red-black-tree-mod==1.20
+red-black-tree-mod>=1.20, <=1.23