open-mmlab
diff --git a/‎configs/recognition/omnisource/slowonly_r50_16xb16-8x8x1-256e_imagenet-kinetics400-rgb.py‎
Lines changed: 171 additions & 0 deletions b/‎configs/recognition/omnisource/slowonly_r50_16xb16-8x8x1-256e_imagenet-kinetics400-rgb.py‎
Lines changed: 171 additions & 0 deletions
diff --git a/‎mmaction/datasets/transforms/__init__.py‎
Lines changed: 19 additions & 19 deletions b/‎mmaction/datasets/transforms/__init__.py‎
Lines changed: 19 additions & 19 deletions
diff --git a/‎mmaction/datasets/transforms/loading.py‎
Lines changed: 86 additions & 0 deletions b/‎mmaction/datasets/transforms/loading.py‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎mmaction/engine/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎mmaction/engine/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎mmaction/engine/runner/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎mmaction/engine/runner/__init__.py‎
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,171 @@
+_base_ = ['../../_base_/default_runtime.py']
+
+# model settings (the head carries separate image and video class counts)
+model = dict(
+    type='RecognizerOmni',
+    backbone=dict(type='OmniResNet'),
+    cls_head=dict(
+        type='OmniHead',
+        image_classes=1000,  # image source: ImageNet (see image_root)
+        video_classes=400,  # video source: Kinetics400 (see video_root)
+        in_channels=2048,
+        average_clips='prob'),
+    data_preprocessor=dict(
+        type='ActionDataPreprocessor',
+        mean=[123.675, 116.28, 103.53],
+        std=[58.395, 57.12, 57.375],
+        format_shape='MIX2d3d'))
+
+# dataset settings
+image_root = 'data/imagenet/'
+image_ann_train = 'meta/train.txt'  # used with data_root=image_root
+
+video_root = 'data/kinetics400/videos_train'
+video_root_val = 'data/kinetics400/videos_val'
+video_ann_train = 'data/kinetics400/kinetics400_train_list_videos.txt'
+video_ann_val = 'data/kinetics400/kinetics400_val_list_videos.txt'
+
+num_images = 1281167  # number of training samples in the ImageNet dataset
+num_videos = 240435  # number of training samples in the Kinetics400 dataset
+batchsize_video = 16  # per-GPU video batch size
+num_gpus = 8  # NOTE(review): file name and LR note say 16 GPUs; confirm
+num_iter = num_videos // (batchsize_video * num_gpus)  # video iters per epoch
+batchsize_image = num_images // (num_iter * num_gpus)  # per-GPU image batch
+
+train_pipeline = [  # transforms for the Kinetics400 training videos
+    dict(type='DecordInit'),
+    dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='PackActionInputs')
+]
+
+val_pipeline = [  # validation: one clip per video, center crop
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=1,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='PackActionInputs')
+]
+
+test_pipeline = [  # testing: 10 clips per video, each through ThreeCrop
+    dict(type='DecordInit'),
+    dict(
+        type='SampleFrames',
+        clip_len=8,
+        frame_interval=8,
+        num_clips=10,
+        test_mode=True),
+    dict(type='DecordDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='PackActionInputs')
+]
+
+train_dataloader = dict(
+    batch_size=batchsize_video,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='VideoDataset',
+        ann_file=video_ann_train,
+        data_prefix=dict(video=video_root),
+        pipeline=train_pipeline))
+
+val_dataloader = dict(
+    batch_size=16,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type='VideoDataset',
+        ann_file=video_ann_val,
+        data_prefix=dict(video=video_root_val),
+        pipeline=val_pipeline,
+        test_mode=True))
+
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=8,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type='VideoDataset',
+        ann_file=video_ann_val,  # testing reuses the validation split
+        data_prefix=dict(video=video_root_val),
+        pipeline=test_pipeline,
+        test_mode=True))
+
+imagenet_pipeline = [  # transforms for the ImageNet image loader
+    dict(type='LoadRGBFromFile'),
+    dict(type='mmcls.RandomResizedCrop', scale=224),
+    dict(type='mmcls.RandomFlip', prob=0.5, direction='horizontal'),
+    dict(type='mmcls.PackClsInputs'),
+]
+
+image_dataloader = dict(  # handed to train_cfg via other_loaders
+    batch_size=batchsize_image,
+    num_workers=8,
+    dataset=dict(
+        type='mmcls.ImageNet',
+        data_root=image_root,
+        ann_file=image_ann_train,
+        data_prefix='train',
+        pipeline=imagenet_pipeline),
+    sampler=dict(type='DefaultSampler', shuffle=True),
+)
+
+val_evaluator = dict(type='AccMetric')
+test_evaluator = val_evaluator  # testing reuses the validation metric
+
+train_cfg = dict(
+    type='MultiLoaderEpochBasedTrainLoop',
+    other_loaders=[image_dataloader],  # ImageNet loader defined above
+    max_epochs=256,
+    val_interval=4)
+
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+
+# learning policy: LinearLR for epochs 0-34, then CosineAnnealingLR to 256
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=0.1,
+        by_epoch=True,
+        begin=0,
+        end=34,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=222,  # 256 - 34: length of the cosine phase in epochs
+        eta_min=0,
+        by_epoch=True,
+        begin=34,
+        end=256,
+        convert_to_iter_based=True)
+]
+"""
+The learning rate is for total_batch_size = 16 x 16 (num_gpus x batch_size)
+If you want to use other batch size or number of GPU settings, please update
+the learning rate with the linear scaling rule.
+"""
+optim_wrapper = dict(
+    optimizer=dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001),
+    clip_grad=dict(max_norm=40, norm_type=2))
+
+# runtime settings (checkpoint interval matches val_interval=4 in train_cfg)
+default_hooks = dict(checkpoint=dict(interval=4, max_keep_ckpts=3))
@@ -6,9 +6,9 @@
                       DecordInit, DenseSampleFrames,
                       GenerateLocalizationLabels, ImageDecode,
                       LoadAudioFeature, LoadHVULabel, LoadLocalizationFeature,
-                      LoadProposals, OpenCVDecode, OpenCVInit, PIMSDecode,
-                      PIMSInit, PyAVDecode, PyAVDecodeMotionVector, PyAVInit,
-                      RawFrameDecode, SampleAVAFrames, SampleFrames,
+                      LoadProposals, LoadRGBFromFile, OpenCVDecode, OpenCVInit,
+                      PIMSDecode, PIMSInit, PyAVDecode, PyAVDecodeMotionVector,
+                      PyAVInit, RawFrameDecode, SampleAVAFrames, SampleFrames,
                       UniformSample, UntrimmedSampleFrames)
 from .pose_transforms import (GeneratePoseTarget, GenSkeFeat, JointToBone,
                               LoadKineticsPose, MergeSkeFeat, PadTo,
@@ -21,20 +21,20 @@
 from .wrappers import ImgAug, PytorchVideoWrapper, TorchVisionWrapper
 
 __all__ = [
-    'SampleFrames', 'PyAVDecode', 'DecordDecode', 'DenseSampleFrames',
-    'OpenCVDecode', 'MultiScaleCrop', 'RandomResizedCrop', 'RandomCrop',
-    'Resize', 'Flip', 'Fuse', 'ThreeCrop', 'CenterCrop', 'TenCrop',
-    'Transpose', 'FormatShape', 'GenerateLocalizationLabels',
-    'LoadLocalizationFeature', 'LoadProposals', 'DecordInit', 'OpenCVInit',
-    'PyAVInit', 'UntrimmedSampleFrames', 'RawFrameDecode', 'DecordInit',
-    'OpenCVInit', 'PyAVInit', 'ColorJitter', 'LoadHVULabel', 'SampleAVAFrames',
-    'AudioAmplify', 'MelSpectrogram', 'AudioDecode', 'FormatAudioShape',
-    'LoadAudioFeature', 'AudioFeatureSelector', 'AudioDecodeInit',
-    'ImageDecode', 'BuildPseudoClip', 'RandomRescale', 'PIMSDecode',
-    'PyAVDecodeMotionVector', 'UniformSampleFrames', 'PoseDecode',
-    'LoadKineticsPose', 'GeneratePoseTarget', 'PIMSInit', 'FormatGCNInput',
-    'PadTo', 'ArrayDecode', 'JointToBone', 'PackActionInputs',
-    'PackLocalizationInputs', 'ImgAug', 'TorchVisionWrapper',
-    'PytorchVideoWrapper', 'PoseCompact', 'PreNormalize3D', 'ToMotion',
-    'MergeSkeFeat', 'GenSkeFeat', 'PreNormalize2D', 'UniformSample'
+    'ArrayDecode', 'AudioAmplify', 'AudioDecode', 'AudioDecodeInit',
+    'AudioFeatureSelector', 'BuildPseudoClip', 'CenterCrop', 'ColorJitter',
+    'DecordDecode', 'DecordInit', 'DenseSampleFrames', 'Flip',
+    'FormatAudioShape', 'FormatGCNInput', 'FormatShape', 'Fuse', 'GenSkeFeat',
+    'GenerateLocalizationLabels', 'GeneratePoseTarget', 'ImageDecode',
+    'ImgAug', 'JointToBone', 'LoadAudioFeature', 'LoadHVULabel',
+    'LoadKineticsPose', 'LoadLocalizationFeature', 'LoadProposals',
+    'LoadRGBFromFile', 'MelSpectrogram', 'MergeSkeFeat', 'MultiScaleCrop',
+    'OpenCVDecode', 'OpenCVInit', 'PIMSDecode', 'PIMSInit',
+    'PackActionInputs', 'PackLocalizationInputs', 'PadTo', 'PoseCompact',
+    'PoseDecode', 'PreNormalize2D', 'PreNormalize3D', 'PyAVDecode',
+    'PyAVDecodeMotionVector', 'PyAVInit', 'PytorchVideoWrapper',
+    'RandomCrop', 'RandomRescale', 'RandomResizedCrop', 'RawFrameDecode',
+    'Resize', 'SampleAVAFrames', 'SampleFrames', 'TenCrop', 'ThreeCrop',
+    'ToMotion', 'TorchVisionWrapper', 'Transpose', 'UniformSample',
+    'UniformSampleFrames', 'UntrimmedSampleFrames'
 ]
@@ -16,6 +16,92 @@
 from mmaction.utils import get_random_string, get_shm_dir, get_thread_id
 
 
+@TRANSFORMS.register_module()
+class LoadRGBFromFile(BaseTransform):
+    """Load an image from file, decoded with RGB channel order.
+
+    Required Keys:
+
+    - img_path
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - ori_shape
+
+    Args:
+        to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. If set to False, the image is kept as decoded.
+            Defaults to False.
+        color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
+            Defaults to 'color'.
+        imdecode_backend (str): The image decoding backend type. The backend
+            argument for :func:`mmcv.imfrombytes`.
+            See :func:`mmcv.imfrombytes` for details.
+            Defaults to 'cv2'.
+        io_backend (str): IO backend where the image file is stored.
+            Defaults to 'disk'.
+        ignore_empty (bool): Whether to return None instead of raising when
+            the image cannot be fetched or decoded. Defaults to False.
+        kwargs (dict): Extra keyword arguments passed to ``FileClient``.
+    """
+
+    def __init__(self,
+                 to_float32: bool = False,
+                 color_type: str = 'color',
+                 imdecode_backend: str = 'cv2',
+                 io_backend: str = 'disk',
+                 ignore_empty: bool = False,
+                 **kwargs) -> None:
+        self.ignore_empty = ignore_empty
+        self.to_float32 = to_float32
+        self.color_type = color_type
+        self.imdecode_backend = imdecode_backend
+        self.file_client = FileClient(io_backend, **kwargs)
+        self.io_backend = io_backend  # kept so __repr__ can report it
+
+    def transform(self, results: dict) -> dict:
+        """Load the image referenced by ``results['img_path']``.
+
+        Args:
+            results (dict): Result dict; its ``img_path`` entry is read.
+
+        Returns:
+            dict: Updated ``results``; None if a failed load is ignored.
+        """
+
+        filename = results['img_path']
+        try:
+            img_bytes = self.file_client.get(filename)
+            img = mmcv.imfrombytes(
+                img_bytes,
+                flag=self.color_type,
+                channel_order='rgb',
+                backend=self.imdecode_backend)
+        except Exception as e:  # any fetch or decode failure lands here
+            if self.ignore_empty:
+                return None  # caller receives None instead of an error
+            else:
+                raise e
+        if self.to_float32:
+            img = img.astype(np.float32)
+
+        results['img'] = img
+        results['img_shape'] = img.shape[:2]  # first two dims of the array
+        results['ori_shape'] = img.shape[:2]  # equal to img_shape at load
+        return results
+
+    def __repr__(self):  # reports the stored settings (not **kwargs)
+        repr_str = (f'{self.__class__.__name__}('
+                    f'ignore_empty={self.ignore_empty}, '
+                    f'to_float32={self.to_float32}, '
+                    f"color_type='{self.color_type}', "
+                    f"imdecode_backend='{self.imdecode_backend}', "
+                    f"io_backend='{self.io_backend}')")
+        return repr_str
+
+
 @TRANSFORMS.register_module()
 class LoadHVULabel(BaseTransform):
     """Convert the HVU label from dictionaries to torch tensors.
 
@@ -2,3 +2,4 @@
 from .hooks import *  # noqa: F401, F403
 from .model import *  # noqa: F401, F403
 from .optimizers import *  # noqa: F401, F403
+from .runner import *  # noqa: F401, F403
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .multi_loop import MultiLoaderEpochBasedTrainLoop
+
+__all__ = ['MultiLoaderEpochBasedTrainLoop']  # names exposed to `import *`