Skip to content

Commit 6eee65a

Browse files
Fix #2750: Handle invalid LINK frames when saving MP3 files
MP3 files with LINK frames that do not follow the ID3 specification cause mutagen exceptions when saving with ID3 v2.3 format. This fix handles invalid LINK frames to prevent errors during save operations.
1 parent 34f6e7f commit 6eee65a

File tree

3 files changed

+277
-2
lines changed

3 files changed

+277
-2
lines changed

picard/formats/id3.py

Lines changed: 212 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ class ID3File(File):
170170
'TSOT': 'titlesort',
171171
'WCOP': 'license',
172172
'WOAR': 'website',
173+
'WXXX': 'user_website',
173174
'COMM': 'comment',
174175
'TOAL': 'originalalbum',
175176
'TOPE': 'originalartist',
@@ -300,6 +301,8 @@ def _load(self, filename):
300301
for frame in tags.values():
301302
self._process_frame(frame, metadata, config_params)
302303

304+
self._process_link_frames_on_load(tags, metadata)
305+
303306
if 'date' in metadata:
304307
self._sanitize_date(metadata)
305308

@@ -614,6 +617,12 @@ def _get_tags(self, filename):
614617

615618
def _save_tags(self, tags, filename):
616619
config = get_config()
620+
621+
try:
622+
self._sanitize_id3_frames(tags)
623+
except Exception as e:
624+
log.error("Error sanitizing ID3 frames: %s", e)
625+
617626
if config.setting['write_id3v1']:
618627
v1 = 2
619628
else:
@@ -622,11 +631,153 @@ def _save_tags(self, tags, filename):
622631
if config.setting['write_id3v23']:
623632
tags.update_to_v23()
624633
separator = config.setting['id3v23_join_with']
625-
tags.save(filename, v2_version=3, v1=v1, v23_sep=separator)
634+
try:
635+
tags.save(filename, v2_version=3, v1=v1, v23_sep=separator)
636+
except ValueError as e:
637+
if str(e) == "Invalid frame ID":
638+
log.warning("Invalid frame ID error when saving. Removing all LINK frames as fallback...")
639+
for frame_id in list(tags.keys()):
640+
if frame_id.startswith('LINK'):
641+
del tags[frame_id]
642+
tags.save(filename, v2_version=3, v1=v1, v23_sep=separator)
643+
else:
644+
raise
626645
else:
627646
tags.update_to_v24()
628647
tags.save(filename, v2_version=4, v1=v1)
629648

649+
def _sanitize_id3_frames(self, tags):
650+
"""This method attempts to fix various issues with ID3 frames:
651+
1. Handle invalid LINK frames (convert to WXXX if possible)
652+
2. Remove null bytes and other illegal characters from frame IDs
653+
Note: In the future, we might consider prompting users about invalid frames."""
654+
to_remove = []
655+
to_add = []
656+
confirmed_problematic_frames = []
657+
extracted_urls = []
658+
for frame_id, frame in list(tags.items()):
659+
try:
660+
invalid_id = False
661+
if '\x00' in frame_id:
662+
invalid_id = True
663+
if frame_id.startswith('LINK'):
664+
is_malformed = False
665+
if ':' in frame_id and any(term in frame_id for term in ('http', 'www.', '://')):
666+
is_malformed = True
667+
668+
if hasattr(frame, 'data') and frame.data:
669+
frame_data = frame.data.decode('latin1', errors='ignore')
670+
# A valid frame should have: URL text string, null byte, then optional data
671+
has_url_pattern = any(term in frame_data for term in ('http', 'www.', '://'))
672+
if frame_data.startswith('\x00') and any(term in frame_data[1:] for term in ('http', 'www.', '://')):
673+
is_malformed = True
674+
elif has_url_pattern and '\x00' not in frame_data:
675+
is_malformed = True
676+
elif '\x00' in frame_data:
677+
parts = frame_data.split('\x00', 1)
678+
if len(parts) > 1 and any(term in parts[1] for term in ('http', 'www.', '://')):
679+
is_malformed = True
680+
681+
if is_malformed:
682+
invalid_id = True
683+
if self._extract_and_convert_link_frame(frame, frame_id, to_add):
684+
log.debug("Successfully converted malformed LINK frame %r to WXXX", frame_id)
685+
for item in to_add:
686+
if isinstance(item, mutagen.id3.WXXX):
687+
extracted_urls.append(item.url)
688+
689+
if invalid_id:
690+
confirmed_problematic_frames.append(frame_id)
691+
to_remove.append(frame_id)
692+
693+
except Exception as e:
694+
log.error("Error processing frame %r: %s", frame_id, e)
695+
696+
for frame_id in to_remove:
697+
del tags[frame_id]
698+
for frame in to_add:
699+
tags.add(frame)
700+
701+
if extracted_urls and hasattr(self, 'metadata') and hasattr(self, 'orig_metadata'):
702+
if 'user_website' not in self.metadata:
703+
self.metadata['user_website'] = extracted_urls
704+
if 'user_website' not in self.orig_metadata:
705+
self.orig_metadata['user_website'] = extracted_urls
706+
707+
return confirmed_problematic_frames
708+
709+
def _extract_and_convert_link_frame(self, frame, frame_id, to_add):
710+
"""Extract URL from a malformed LINK frame and convert it to a WXXX frame. Handles cases where URLs
711+
are split between frame ID and data or separated by null bytes (common for archive.org)"""
712+
713+
try:
714+
url_parts = []
715+
if hasattr(frame, 'data') and frame.data:
716+
frame_data = frame.data.decode('latin1', errors='ignore')
717+
if '\x00' in frame_data:
718+
parts = frame_data.split('\x00', 1)
719+
# Likely valid LINK frame if the first part looks like a URL and second part doesn't contain URL patterns
720+
if (parts[0] and any(term in parts[0] for term in ('http', 'www.', '://'))
721+
and len(parts) > 1 and not any(term in parts[1] for term in ('http', 'www.', '://'))):
722+
log.debug("Skipping valid LINK frame with URL %r", parts[0])
723+
return False
724+
725+
if ':' in frame_id:
726+
frame_id_parts = frame_id.split(':', 1)
727+
if frame_id_parts[0] == 'LINK':
728+
# Standard case where everything after LINK: is part of the URL
729+
url_parts.append(frame_id_parts[1])
730+
elif any(proto in frame_id_parts[0] for proto in ('LINKhttp', 'LINKhtt', 'LINKwww')):
731+
# Case where part of the URL protocol is in the frame_id
732+
prefix = 'LINK'
733+
protocol_part = frame_id_parts[0][len(prefix):]
734+
url_parts.append(protocol_part + ':' + frame_id_parts[1])
735+
736+
if hasattr(frame, 'data') and frame.data:
737+
frame_data = frame.data.decode('latin1', errors='ignore').strip()
738+
if frame_data:
739+
if '\x00' in frame_data:
740+
parts = frame_data.split('\x00')
741+
for part in parts:
742+
if part.strip() and any(term in part for term in ('http', 'www.', '://')):
743+
url_parts.append(part.strip())
744+
elif any(term in frame_data for term in ('http', 'www.', '://')):
745+
url_parts.append(frame_data)
746+
747+
combined_url = ''.join(url_parts).strip(':;,. \t\n\r')
748+
749+
if combined_url:
750+
url_match = re.search(r'(?:https?:?(?:/+|\\+)|www\.)[a-zA-Z0-9][-a-zA-Z0-9\.]+\.[a-zA-Z]{2,}(?:/[^\s:;,]*)?', combined_url)
751+
if url_match:
752+
raw_url = url_match.group(0)
753+
754+
if raw_url.startswith('www.'):
755+
final_url = 'http://' + raw_url
756+
elif raw_url.startswith(('http://', 'https://')):
757+
final_url = raw_url
758+
else:
759+
final_url = re.sub(r'^htt:?p', 'http', raw_url)
760+
if not final_url.startswith(('http://', 'https://')):
761+
if final_url.startswith(':'):
762+
final_url = 'http' + final_url
763+
elif final_url.startswith('//'):
764+
final_url = 'http:' + final_url
765+
elif final_url.startswith('p://'):
766+
final_url = 'http://' + final_url[4:]
767+
else:
768+
final_url = 'http://' + final_url
769+
770+
wxxx_frame = mutagen.id3.WXXX(encoding=Id3Encoding.LATIN1, desc="URL from malformed LINK frame", url=final_url)
771+
to_add.append(wxxx_frame)
772+
return True
773+
774+
log.warning("Could not extract URL from malformed LINK frame %r", frame_id)
775+
return False
776+
777+
except Exception as e:
778+
log.error("Failed to process LINK frame %r: %s", frame_id, e)
779+
return False
780+
630781
def format_specific_metadata(self, metadata, tag, settings=None):
631782
if not settings:
632783
settings = get_config().setting
@@ -1035,6 +1186,49 @@ def _remove_other_supported_tag(self, tags, real_name):
10351186
"""Remove other supported tag from ID3 frames."""
10361187
del tags[real_name]
10371188

1189+
def _process_link_frames_on_load(self, tags, metadata):
1190+
"""Process malformed LINK frames during initial file load and add them as website URLs."""
1191+
1192+
link_frames = []
1193+
for frame_id, frame in list(tags.items()):
1194+
if frame_id.startswith('LINK'):
1195+
is_potentially_malformed = False
1196+
1197+
if ':' in frame_id and any(term in frame_id for term in ('http', 'www.', '://')):
1198+
is_potentially_malformed = True
1199+
1200+
if hasattr(frame, 'data') and frame.data:
1201+
try:
1202+
frame_data = frame.data.decode('latin1', errors='ignore')
1203+
1204+
if frame_data.startswith('\x00') and any(term in frame_data[1:] for term in ('http', 'www.', '://')):
1205+
is_potentially_malformed = True
1206+
1207+
elif '\x00' in frame_data:
1208+
parts = frame_data.split('\x00', 1)
1209+
if len(parts) > 1 and any(term in parts[1] for term in ('http', 'www.', '://')):
1210+
is_potentially_malformed = True
1211+
1212+
elif any(term in frame_data for term in ('http', 'www.', '://')) and '\x00' not in frame_data:
1213+
is_potentially_malformed = True
1214+
except Exception:
1215+
is_potentially_malformed = True
1216+
1217+
if is_potentially_malformed:
1218+
link_frames.append((frame_id, frame))
1219+
1220+
if not link_frames:
1221+
return
1222+
1223+
to_add = []
1224+
for frame_id, frame in link_frames:
1225+
if self._extract_and_convert_link_frame(frame, frame_id, to_add):
1226+
log.debug("Extracted URL from malformed LINK frame %r during load", frame_id)
1227+
1228+
for frame in to_add:
1229+
if isinstance(frame, mutagen.id3.WXXX):
1230+
url = frame.url
1231+
metadata.add('user_website', url)
10381232

10391233
class MP3File(ID3File):
10401234

@@ -1080,10 +1274,26 @@ def _get_tags(self, filename):
10801274

10811275
def _save_tags(self, tags, filename):
10821276
config = get_config()
1277+
1278+
try:
1279+
self._sanitize_id3_frames(tags)
1280+
except Exception as e:
1281+
log.error("Error sanitizing ID3 frames: %s", e)
1282+
10831283
if config.setting['write_id3v23']:
10841284
compatid3.update_to_v23(tags)
10851285
separator = config.setting['id3v23_join_with']
1086-
tags.save(filename, v2_version=3, v23_sep=separator)
1286+
try:
1287+
tags.save(filename, v2_version=3, v23_sep=separator)
1288+
except ValueError as e:
1289+
if str(e) == "Invalid frame ID":
1290+
log.debug("Invalid frame ID error when saving. Removing all LINK frames as fallback...")
1291+
for frame_id in list(tags.keys()):
1292+
if frame_id.startswith('LINK'):
1293+
del tags[frame_id]
1294+
tags.save(filename, v2_version=3, v23_sep=separator)
1295+
else:
1296+
raise
10871297
else:
10881298
tags.update_to_v24()
10891299
tags.save(filename, v2_version=4)

picard/util/tags.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@
136136
'totaltracks': N_('Total Tracks'),
137137
'tracknumber': N_('Track Number'),
138138
'website': N_('Artist Website'),
139+
'user_website': N_('User Defined Website'),
139140
'work': N_('Work'),
140141
'writer': N_('Writer'),
141142
}

test/formats/test_id3.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -625,6 +625,70 @@ def test_releasedate_v24(self):
625625
raw_metadata = load_raw(self.filename)
626626
self.assertEqual(metadata['releasedate'], raw_metadata['TDRL'])
627627

628+
@skipUnlessTestfile
629+
def test_malformed_link_url_in_data(self):
630+
filename = self.copy_file_tmp(os.path.join('test', 'data', 'test.mp3'), '.mp3')
631+
tags = mutagen.id3.ID3()
632+
try:
633+
tags.load(filename)
634+
except mutagen.id3.ID3NoHeaderError:
635+
pass
636+
frame = mutagen.id3.Frames['LINK'](data=b'\x00http://example.org/null')
637+
tags.add(frame)
638+
tags.save(filename)
639+
metadata = load_metadata(filename)
640+
self.assertIn('user_website', metadata)
641+
website_url = metadata['user_website']
642+
self.assertTrue('http://example.org/null' in website_url)
643+
save_metadata(filename, metadata)
644+
raw_metadata = load_raw(filename)
645+
has_url_frame = False
646+
for key in raw_metadata:
647+
if key.startswith('WXXX:') or key == 'WXXX':
648+
has_url_frame = True
649+
self.assertTrue(website_url in key or website_url == raw_metadata[key])
650+
break
651+
self.assertTrue(has_url_frame, "No URL frame (WXXX) found in raw metadata")
652+
self.assertFalse(any(k.startswith('LINK') for k in raw_metadata))
653+
654+
@skipUnlessTestfile
655+
def test_archive_org_link_pattern(self):
656+
filename = self.copy_file_tmp(os.path.join('test', 'data', 'test.mp3'), '.mp3')
657+
658+
tags = mutagen.id3.ID3()
659+
try:
660+
tags.load(filename)
661+
except mutagen.id3.ID3NoHeaderError:
662+
pass
663+
664+
frame = mutagen.id3.Frames['LINK'](data=b'p://www.archive.org/details/test_item')
665+
tags.add(frame)
666+
tags.save(filename)
667+
668+
metadata = load_metadata(filename)
669+
self.assertIn('user_website', metadata, "URL was not extracted from archive.org pattern")
670+
website_url = metadata['user_website']
671+
self.assertIn('archive.org', website_url, "URL doesn't contain archive.org domain")
672+
self.assertIn('details/test_item', website_url, "URL path was not correctly extracted")
673+
674+
save_metadata(filename, metadata)
675+
raw_metadata = load_raw(filename)
676+
has_url_frame = False
677+
678+
for key in raw_metadata:
679+
if key.startswith('WXXX:') or key == 'WXXX':
680+
has_url_frame = True
681+
frame = raw_metadata[key]
682+
if hasattr(frame, 'url'):
683+
self.assertIn('archive.org', frame.url)
684+
self.assertIn('details/test_item', frame.url)
685+
else:
686+
self.assertIn('archive.org', frame)
687+
self.assertIn('details/test_item', frame)
688+
break
689+
690+
self.assertTrue(has_url_frame, "No URL frame (WXXX) found - LINK frame wasn't converted")
691+
self.assertFalse(any(k.startswith('LINK') for k in raw_metadata), "LINK frame was not removed after conversion")
628692

629693
class MP3Test(CommonId3Tests.Id3TestCase):
630694
testfile = 'test.mp3'

0 commit comments

Comments
 (0)