Fix horovod dataset partition for CombinedDataset with sampling_sizes

patrick-wilken · patrick-wilken · commit 0b73a90380b9 · 2021-10-13T06:41:31.000-04:00
diff --git a/returnn/datasets/basic.py b/returnn/datasets/basic.py
@@ -121,6 +121,7 @@ def __init__(self, name=None,
     self.random_seed_offset = random_seed_offset
     self.partition_epoch = partition_epoch or 1
     self.repeat_epoch = repeat_epoch or 1
+    self.disable_horovod_partition = False  # can be set by meta-dataset to handle multi-gpu partitioning on meta-level
     self.seq_tags_filter = set(self._load_seq_list_file(seq_list_filter_file)) if seq_list_filter_file else None
     self.unique_seq_tags = unique_seq_tags
     self._seq_order_seq_lens_file = seq_order_seq_lens_file
@@ -483,7 +484,8 @@ def get_seq_order_for_epoch(self, epoch, num_seqs, get_seq_len=None):
       seq_index = self._apply_partition_epoch(seq_index, partition_epoch, epoch)
     if repeat_epoch > 1:
       seq_index = seq_index * repeat_epoch
-    seq_index = self._apply_multi_gpu_partition(seq_index)
+    if not self.disable_horovod_partition:
+      seq_index = self._apply_multi_gpu_partition(seq_index)
     if self.seq_tags_filter is not None:
       # Note: This is as generic as possible, but requires that get_all_tags is implemented.
       assert seq_index
diff --git a/returnn/datasets/meta.py b/returnn/datasets/meta.py
@@ -913,6 +913,9 @@ def init_seq_order(self, epoch=None, seq_list=None, seq_order=None):
     # partition epoch of the individual sub-datasets is still supported. Later we will call init_seq_order again with a
     # sequence list to e.g. apply joint sorting or partition epoch of all sequences.
     for dataset in self.datasets.values():
+      if self.sampling_sizes:
+        # Partitioning does not make sense if we sample a fixed number of sequences anyway.
+        dataset.disable_horovod_partition = True
       dataset.init_seq_order(epoch=epoch)
 
     # noinspection PyBroadException
@@ -1076,6 +1079,8 @@ def _get_sampling_seq_order(self):
     # We want to additionally sort the sequences in the current sample. For this, create a sequence order on a
     # range of length of the number of sequences in the sample. Note that we have to map the indices to make use
     # of self._get_seq_length here.
+    # This get_seq_order_for_epoch call now also handles horovod_dataset_distribution = 'partition', which we
+    # disabled on sub-dataset level via 'disable_horovod_partition' above.
     seq_order_remapping = self.get_seq_order_for_epoch(
       epoch=epoch, num_seqs=len(seq_order), get_seq_len=lambda i: self._get_seq_length(seq_order[i]))