Skip to content

Commit 373f6c1

Browse files
Merge pull request #446 from TeamMsgExtractor/next-release
Version 0.52.0
2 parents 1302d6f + d2e321e commit 373f6c1

File tree

6 files changed

+57
-14
lines changed

6 files changed

+57
-14
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
**v0.52.0**
2+
* [[TeamMsgExtractor #444](https://github.com/TeamMsgExtractor/msg-extractor/issues/444)] Fix typo in string that prevented HTML body from generating from the plain text body properly.
3+
* Adjusted the behavior of `MSGFile.areStringsUnicode` to prioritize the property specified by the parent MSG files for MSG files that are embedded. Additionally, added a fallback to rely on whether or not there is a stream using the `001F` type to determine the property value if it is entirely missing.
4+
* Adjusted `OleWriter.fromMsg()` and `MSGFile.export()` to add the argument `allowBadEmbed` which helps to correct a few different issues that may appear in embedded MSG files. These corrections allow the embedded file to still be extracted and to open properly in Outlook.
5+
* In addition to the above, the errors that some of those corrections will suppress are now significantly more informative about what went wrong.
6+
17
**v0.51.1**
28
* Add class type added in last version to known class types.
39

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
260260
.. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
261261
:target: LICENSE.txt
262262

263-
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.51.1-blue.svg
264-
:target: https://pypi.org/project/extract-msg/0.51.1/
263+
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.52.0-blue.svg
264+
:target: https://pypi.org/project/extract-msg/0.52.0/
265265

266266
.. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
267267
:target: https://www.python.org/downloads/release/python-3810/

extract_msg/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
# along with this program. If not, see <http://www.gnu.org/licenses/>.
2828

2929
__author__ = 'Destiny Peterson & Matthew Walker'
30-
__date__ = '2024-10-11'
31-
__version__ = '0.51.1'
30+
__date__ = '2024-10-22'
31+
__version__ = '0.52.0'
3232

3333
__all__ = [
3434
# Modules:

extract_msg/msg_classes/message_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1170,7 +1170,7 @@ def htmlBody(self) -> Optional[bytes]:
11701170
# Convert the plain text body to html.
11711171
logger.info('HTML body was not found, attempting to generate from plain text body.')
11721172
correctedBody = html.escape(self.body).replace('\r', '').replace('\n', '<br />')
1173-
htmlBody = f'<html><body>{correctedBody}</body></head>'.encode('ascii', 'xmlreplace')
1173+
htmlBody = f'<html><body>{correctedBody}</body></head>'.encode('ascii', 'xmlcharrefreplace')
11741174

11751175
if not htmlBody:
11761176
logger.info('HTML body could not be found nor generated.')

extract_msg/msg_classes/msg.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ def existsTypedProperty(self, _id: str, location = None, _type = None, prefix: b
479479
foundNumber += 1
480480
return (foundNumber > 0), foundNumber
481481

482-
def export(self, path) -> None:
482+
def export(self, path, allowBadEmbed: bool = False) -> None:
483483
"""
484484
Exports the contents of this MSG file to a new MSG file specified by
485485
the path given.
@@ -492,21 +492,26 @@ def export(self, path) -> None:
492492
493493
:param path: A path-like object (including strings and ``pathlib.Path``
494494
objects) or an IO device with a write method which accepts bytes.
495+
:param allowBadEmbed: If True, attempts to skip steps that will fail if
496+
the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
495497
"""
496498
from ..ole_writer import OleWriter
497499

498500
# Create an instance of the class used for writing a new OLE file.
499501
writer = OleWriter()
500502
# Add all file and directory entries to it.
501-
writer.fromMsg(self)
503+
writer.fromMsg(self, allowBadEmbed = allowBadEmbed)
502504
writer.write(path)
503505

504-
def exportBytes(self) -> bytes:
506+
def exportBytes(self, allowBadEmbed: bool = False) -> bytes:
505507
"""
506508
Saves a new copy of the MSG file, returning the bytes.
509+
510+
:param allowBadEmbed: If True, attempts to skip steps that will fail if
511+
the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
507512
"""
508513
out = io.BytesIO()
509-
self.export(out)
514+
self.export(out, allowBadEmbed)
510515
return out.getvalue()
511516

512517
def fixPath(self, inp: MSG_PATH, prefix: bool = True) -> str:
@@ -843,7 +848,16 @@ def areStringsUnicode(self) -> bool:
843848
"""
844849
Whether the strings are Unicode encoded or not.
845850
"""
846-
return (self.getPropertyVal('340D0003', 0) & 0x40000) != 0
851+
val = self.getPropertyVal('340D0003')
852+
if val is None:
853+
# Try to get this value from the parent.
854+
if self.prefix:
855+
if self.__parentMsg and (msg := self.__parentMsg()) is not None:
856+
return msg.areStringsUnicode
857+
858+
# Final attempt: check the actual streams.
859+
return any(x[-1].upper().endswith('001F') for x in self.listDir())
860+
return (val & 0x40000) != 0
847861

848862
@functools.cached_property
849863
def attachments(self) -> Union[List[AttachmentBase], List[SignedAttachment]]:

extract_msg/ole_writer.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from . import constants
1919
from .constants import MSG_PATH
2020
from .enums import Color, DirectoryEntryType
21-
from .exceptions import TooManySectorsError
21+
from .exceptions import StandardViolationError, TooManySectorsError
2222
from .utils import ceilDiv, dictGetCasedKey, inputToMsgPath
2323
from olefile.olefile import OleDirectoryEntry, OleFileIO
2424
from red_black_dict_mod import RedBlackTree
@@ -804,9 +804,15 @@ def editEntry(self, path: MSG_PATH, **kwargs) -> None:
804804
# Send it to be modified using the arguments given.
805805
self.__modifyEntry(entry, **kwargs)
806806

807-
def fromMsg(self, msg: MSGFile) -> None:
807+
def fromMsg(self, msg: MSGFile, allowBadEmbed: bool = False) -> None:
808808
"""
809809
Copies the streams and stream information necessary from the MSG file.
810+
811+
:param allowBadEmbed: If True, attempts to skip steps that will fail if
812+
the embedded MSG file violates standards. It will also attempt to repair the data to try to ensure it can open in Outlook.
813+
814+
:raises StandardViolationError: Something about the embedded data has a
815+
fundamental issue that violates the standard.
810816
"""
811817
# Get the root OLE entry's CLSID.
812818
self.__rootEntry.clsid = _unClsid(msg._getOleEntry('/').clsid)
@@ -825,7 +831,17 @@ def fromMsg(self, msg: MSGFile) -> None:
825831
# specific place. So let's check if we are doing the properties
826832
# stream and then if we are embedded.
827833
if x[0] == '__properties_version1.0' and msg.prefixLen > 0:
828-
data = data[:24] + b'\x00\x00\x00\x00\x00\x00\x00\x00' + data[24:]
834+
if len(data) % 16 != 0:
835+
data = data[:24] + b'\x00\x00\x00\x00\x00\x00\x00\x00' + data[24:]
836+
elif not allowBadEmbed:
837+
# If we are not allowing bad data, throw an error.
838+
raise StandardViolationError('Embedded msg file attempted to be extracted that contains a top level properties stream.')
839+
if allowBadEmbed:
840+
# See if we need to fix the properties stream at all.
841+
if msg.getPropertyVal('340D0003') is None:
842+
if msg.areStringsUnicode:
843+
# We need to add a property to allow this file to open:
844+
data += b'\x03\x00\x0D\x34\x02\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00'
829845
self.addOleEntry(x, entry, data)
830846

831847
# Now check if it is an embedded file. If so, we need to copy the named
@@ -834,7 +850,14 @@ def fromMsg(self, msg: MSGFile) -> None:
834850
# Get the entry for the named properties directory and add it
835851
# immediately if it exists. If it doesn't exist, this whole
836852
# section will be skipped.
837-
self.addOleEntry('__nameid_version1.0', msg._getOleEntry('__nameid_version1.0', False), None)
853+
try:
854+
self.addOleEntry('__nameid_version1.0', msg._getOleEntry('__nameid_version1.0', False), None)
855+
except OSError as e:
856+
if str(e).startswith('Cannot add an entry'):
857+
if allowBadEmbed:
858+
return
859+
raise StandardViolationError('Embedded msg file attempted to be extracted that contains it\'s own named streams.')
860+
raise
838861

839862
# Now that we know it exists, grab all the file inside and copy
840863
# them to our root.

0 commit comments

Comments
 (0)