datajuicer · cyruszhang · May 21, 2025 · May 22, 2025 · May 28, 2025 · Jun 10, 2025
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,9 @@
 outputs/
 assets/
 
+# logs
+**/logs/*
+
 # setup
 data_juicer.egg-info/
 py_data_juicer.egg-info/
@@ -16,6 +19,7 @@ wandb/
 __pycache__
 .vscode/
 .ipynb_checkpoints/
+performance_test_results*.json
 
 # label studio related
 label_studio_data/
@@ -31,3 +35,6 @@ tests/ops/data/*dup*
 tests/tools/tmp_*/
 tests/ops/deduplicator/chinese_dedup/
 tests/ops/deduplicator/english_dedup/
+
+# perf bench data
+perf_bench_data
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -68,6 +68,11 @@ eoc_special_token: '<|__dj__eoc|>'                          # the special token
 executor_type: default                                      # type of executor, support "default" or "ray" for now.
 ray_address: auto                                           # the address of the Ray cluster.
 
+# Core optimizer configuration
+enable_optimizer: false                                     # enable/disable core optimizer
+optimizer_strategies: ['op_reorder']                        # list of optimization strategies to apply
+                                                           # available strategies: op_reorder, filter_fusion, mapper_fusion
+
 # only for data analysis
 percentiles: [0.25, 0.5, 0.75]                              # percentiles to analyze the dataset distribution
 export_original_dataset: false                              # whether to export the original dataset with stats. If you only need the stats of the dataset, setting it to false could speed up the exporting.

diff --git a/configs/config_min.yaml b/configs/config_min.yaml
@@ -11,3 +11,8 @@ executor_type: default                                      # type of executor,
 ray_address: auto                                           # the address of the Ray cluster.
 suffixes: null
 add_suffix: false
+
+# Core optimizer configuration
+enable_optimizer: false                                     # enable/disable core optimizer
+optimizer_strategies: ['op_reorder']                        # list of optimization strategies to apply
+                                                           # available strategies: op_reorder, filter_fusion, mapper_fusion
diff --git a/configs/demo/fused_operations_demo.yaml b/configs/demo/fused_operations_demo.yaml
@@ -0,0 +1,92 @@
+# Fused Operations Demo Configuration
+# This config demonstrates how to use fused operations for optimal performance
+
+project_name: 'fused_operations_demo'
+dataset_path: 'path/to/your/dataset.jsonl'  # Replace with your dataset path
+export_path: 'output/fused_processed_dataset.jsonl'
+export_shard_size: 0
+export_in_parallel: false
+np: 4
+text_keys: 'text'
+suffixes: []
+turbo: false
+skip_op_error: true
+use_cache: true
+ds_cache_dir: null
+open_monitor: true
+use_checkpoint: false
+temp_dir: null
+open_tracer: false
+op_list_to_trace: []
+trace_num: 10
+
+# Enable fused operations for optimal performance
+op_fusion: true
+fusion_strategy: 'probe'  # Use probe strategy for optimal ordering
+cache_compress: null
+keep_stats_in_res_ds: false
+keep_hashes_in_res_ds: false
+adaptive_batch_size: false
+
+# For multimodal data processing
+image_key: 'images'
+image_special_token: '<__dj__image>'
+audio_key: 'audios'
+audio_special_token: '<__dj__audio>'
+video_key: 'videos'
+video_special_token: '<__dj__video>'
+eoc_special_token: '<|__dj__eoc|>'
+
+# Executor configuration
+executor_type: default
+ray_address: auto
+
+# Process pipeline with operations that will be automatically fused
+process:
+  # Phase 1: Text cleaning mappers (these run first)
+  - clean_html_mapper: {}                    # Remove HTML tags
+  - clean_links_mapper: {}                   # Remove URLs
+  - clean_email_mapper: {}                   # Remove email addresses
+  - clean_copyright_mapper: {}               # Remove copyright notices
+
+  # Phase 2: Text quality filters (these will be fused automatically)
+  # Basic text characteristics
+  - text_length_filter:                      # Filter by text length
+      min_len: 50
+      max_len: 2000
+  - words_num_filter:                        # Filter by word count
+      min_num: 10
+      max_num: 500
+  - character_repetition_filter:             # Filter repetitive characters
+      repetition_ratio: 0.8
+  - word_repetition_filter:                  # Filter repetitive words
+      min_ratio: 0.0
+      max_ratio: 0.5
+  - special_characters_filter:               # Filter special character ratio
+      min_ratio: 0.0
+      max_ratio: 0.3
+  - alphanumeric_filter:                     # Filter alphanumeric ratio
+      min_ratio: 0.3
+  - average_line_length_filter:              # Filter by average line length
+      min_len: 10
+      max_len: 100
+  - maximum_line_length_filter:              # Filter by maximum line length
+      min_len: 10
+      max_len: 200
+
+  # Phase 3: Content quality filters (these will also be fused)
+  - perplexity_filter:                       # Filter by language model perplexity
+      max_ppl: 1500
+  - stopwords_filter:                        # Filter by stopword ratio
+      min_ratio: 0.1
+  - flagged_words_filter:                    # Filter by flagged word ratio
+      max_ratio: 0.05
+  - language_id_score_filter:                # Filter by language confidence
+      lang: 'en'
+      min_score: 0.5
+      max_score: 1.0
+
+  # Phase 4: Text transformation mappers (these run after filtering)
+  - expand_macro_mapper: {}                  # Expand LaTeX macros
+  - chinese_convert_mapper:                  # Convert Chinese text
+      mode: 's2t'  # Simplified to Traditional
diff --git a/configs/optimization/op_reorder_showcase.yaml b/configs/optimization/op_reorder_showcase.yaml
@@ -0,0 +1,53 @@
+# Configuration to showcase operation reordering optimization
+# This config deliberately lists operations in a suboptimal order that the optimizer should reorder
+# GOAL: Show a dramatic performance difference by putting expensive operations first
+
+project_name: 'op-reorder-showcase'
+dataset_path: 'perf_bench_data/text/wiki-10k.jsonl'
+export_path: 'outputs/op_reorder_showcase/res.jsonl'
+np: 4
+use_cache: false
+
+process:
+  # VERY EXPENSIVE OPERATIONS (should be moved after filtering)
+  # These are resource-intensive operations that waste computation on samples the later filters would discard
+  - text_chunk_mapper:
+      chunk_size: 500  # Smaller chunks = more processing
+      text_key: 'text'
+      mem_required: '2GB'
+
+  - text_entity_dependency_filter:
+      min_score: 0.9  # Very strict filtering
+      text_key: 'text'
+      mem_required: '3GB'
+
+  - text_pair_similarity_filter:
+      min_score: 0.8
+      text_key: 'text'
+      mem_required: '2GB'
+
+  # LIGHT FILTERS (should be moved to front)
+  # These are fast filters that should run early to reduce data volume
+  - text_length_filter:
+      min_len: 50   # Less restrictive to keep more data
+      max_len: 5000
+      text_key: 'text'
+
+  - text_action_filter:
+      action_types: ['question', 'command', 'statement']  # Keep all types
+      text_key: 'text'
+
+  # DEPENDENCY CHAIN (must stay in order)
+  # language_id_score_filter must come before perplexity_filter
+  - language_id_score_filter:
+      lang: 'en'
+      min_score: 0.5  # Much less strict to keep more data
+      text_key: 'text'
+
+  - perplexity_filter:
+      lang: 'en'
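+      min_score: 0.1  # Much less strict to keep more data. NOTE(review): fused_operations_demo.yaml configures perplexity_filter with max_ppl; confirm min_score is an accepted parameter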
+      text_key: 'text'
+
+  # ADDITIONAL EXPENSIVE OPERATIONS
+  # (none listed here: text_pair_similarity_filter was moved up into the first section to replace text_embd_similarity_filter)
diff --git a/data_juicer/benchmark/__init__.py b/data_juicer/benchmark/__init__.py
@@ -0,0 +1,32 @@
+"""
+Data-Juicer Performance Benchmark Framework
+
+A comprehensive framework for A/B testing optimization strategies
+across different workloads, modalities, and operation complexities.
+"""
+
+from .core.benchmark_runner import BenchmarkConfig, BenchmarkRunner
+from .core.metrics_collector import MetricsCollector
+from .core.report_generator import ReportGenerator
+from .core.result_analyzer import ResultAnalyzer
+from .strategies.ab_test import ABTestConfig, StrategyABTest
+from .strategies.strategy_library import STRATEGY_LIBRARY, OptimizationStrategy
+from .utils.config_manager import ConfigManager
+from .workloads.workload_suite import WORKLOAD_SUITE, WorkloadDefinition, WorkloadSuite
+
+__version__ = "1.0.0"
+__all__ = [
+    "BenchmarkRunner",
+    "BenchmarkConfig",
+    "MetricsCollector",
+    "ResultAnalyzer",
+    "ReportGenerator",
+    "OptimizationStrategy",
+    "STRATEGY_LIBRARY",
+    "StrategyABTest",
+    "ABTestConfig",
+    "WorkloadSuite",
+    "WorkloadDefinition",
+    "WORKLOAD_SUITE",
+    "ConfigManager",
+]
diff --git a/data_juicer/benchmark/core/__init__.py b/data_juicer/benchmark/core/__init__.py
@@ -0,0 +1,15 @@
+"""Core benchmark framework components."""
+
+from .benchmark_runner import BenchmarkRunner
+from .metrics_collector import BenchmarkMetrics, MetricsCollector
+from .report_generator import ReportGenerator
+from .result_analyzer import ComparisonResult, ResultAnalyzer
+
+__all__ = [
+    "BenchmarkRunner",
+    "MetricsCollector",
+    "BenchmarkMetrics",
+    "ResultAnalyzer",
+    "ComparisonResult",
+    "ReportGenerator",
+]