microsoft · jingyuanlm · Jul 25, 2025 · Jul 25, 2025 · Jul 30, 2025 · Jul 30, 2025
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
@@ -41,6 +41,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     """The recommend time limit for running on full data"""
     full_timeout: int = 3600
     """The timeout limit for running on full data"""
+    ensemble_timeout: int = 3600*5
 
     ### specific feature
 
@@ -103,6 +104,10 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     sota_count_threshold: int = 1
     """The threshold for SOTA count"""
 
+    ensemble_with_merge: bool = False
+
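+    ratio_merge_or_ensemble: int = 99  # percentage for merge/ensemble; NOTE(review): this comment previously said 70% while the value is 99 — confirm the intended ratio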
+
     #### multi-trace: SOTA experiment selector
     sota_exp_selector_name: str = "rdagent.scenarios.data_science.proposal.exp_gen.select.submit.GlobalSOTASelector"
     """The name of the SOTA experiment selector to use"""
@@ -136,12 +141,43 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     show_hard_limit: bool = True
 
     #### hypothesis critique and rewrite
-    enable_hypo_critique_rewrite: bool = True
+    enable_hypo_critique_rewrite: bool = False
     """Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
     enable_scale_check: bool = False
 
+    #### hypothesis selection method
+    llm_select_hypothesis: bool = True
+    """Whether to use LLM to select hypothesis. If True, use LLM selection; if False, use the existing ranking method."""
     #### enable runner code change summary
-    runner_enable_code_change_summary: bool = True
+    runner_enable_code_change_summary: bool = False
+
+    # runner MCTS settings
+    switch_mcts_ratio : int = 50
+
+
+    enable_runner_mcts: bool = True
+    """Enable MCTS in runner for better code searching"""
+    mcts_max_iterations: int = 2
+    """The maximum number of MCTS iterations to perform.
+
+    - If running in single-process mode, this is set to the total number of nodes in the tree.
+    - If running in multi-process mode, this is set to the maximum depth of the tree.
+    """
+
+    mcts_exploration_constant: float = 1.4
+    """The exploration constant (C) used in the UCT formula."""
+
+    mcts_hypothesis_sample_size: int = 2
+    """The number of hypotheses to sample during MCTS."""
 
+    runner_max_loop: int = mcts_max_iterations*mcts_hypothesis_sample_size +1
+    """The maximum number of runner loops: mcts_max_iterations * mcts_hypothesis_sample_size + 1, evaluated once from the default values above."""
 
+    multiprocessing_mcts_simulation: bool = True
+    """Enable multiprocessing for MCTS simulations."""
+    mcts_multiprocessing_batch_size: int  = 2
+    """The batch size for multiprocessing in MCTS simulations.
+    Recommended to keep it equal to mcts_hypothesis_sample_size."""
+    mcts_n_processes: int = 2
+    """The number of processes to use for multiprocessing in MCTS simulations."""
 DS_RD_SETTING = DataScienceBasePropSetting()
diff --git a/rdagent/components/coder/CoSTEER/evolvable_subjects.py b/rdagent/components/coder/CoSTEER/evolvable_subjects.py
@@ -1,6 +1,7 @@
 from rdagent.core.evolving_framework import EvolvableSubjects
 from rdagent.core.experiment import Experiment, FBWorkspace, Task
 from rdagent.log import rdagent_logger as logger
+from typing import Literal,Any
 
 
 class EvolvingItem(Experiment, EvolvableSubjects):
@@ -24,6 +25,9 @@ def __init__(
         else:
             self.sub_gt_implementations = sub_gt_implementations
 
+        self.MCTS_NODE_LIST: list[Any] = []  # To store the MCTS nodes for this experiment, should be generated inside MCTS runner
+        self.FEEDBACK = None  # To store the best MCTS node feedback
+
     @classmethod
     def from_experiment(cls, exp: Experiment) -> "EvolvingItem":
         ei = cls(sub_tasks=exp.sub_tasks)

diff --git a/rdagent/components/coder/CoSTEER/knowledge_management.py b/rdagent/components/coder/CoSTEER/knowledge_management.py
@@ -285,7 +285,8 @@ def generate_knowledge(
                     target_task = implementations.sub_tasks[task_index]
                     target_task_information = target_task.get_task_information()
                     implementation = implementations.sub_workspace_list[task_index]
-                    single_feedback: CoSTEERSingleFeedback = feedback[task_index]
+                    single_feedback: CoSTEERSingleFeedback = feedback[task_index]
+
                     if implementation is None or single_feedback is None:
                         continue
                     single_knowledge = CoSTEERKnowledge(

diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py
@@ -11,7 +11,8 @@
 from rdagent.core.evaluation import EvaluableObj, Evaluator, Feedback
 from rdagent.core.evolving_framework import EvolvableSubjects, EvolvingStrategy, EvoStep
 from rdagent.log import rdagent_logger as logger
-
+from rdagent.app.data_science.conf import DS_RD_SETTING
+from rdagent.components.coder.CoSTEER.evaluators import CoSTEERMultiEvaluator
 ASpecificEvaluator = TypeVar("ASpecificEvaluator", bound=Evaluator)
 ASpecificEvolvableSubjects = TypeVar("ASpecificEvolvableSubjects", bound=EvolvableSubjects)
 
@@ -92,10 +93,21 @@ def multistep_evolve(
 
                 # 4. Evaluation
                 if self.with_feedback:
-                    es.feedback = (
-                        eva if isinstance(eva, Feedback) else eva.evaluate(evo, queried_knowledge=queried_knowledge)
-                    )
-                    logger.log_object(es.feedback, tag="evolving feedback")
+                    if DS_RD_SETTING.enable_runner_mcts:
+                        if "Runner" in str(self.evolving_strategy.__class__):
+                            es.feedback = (
+                            eva if isinstance(eva, Feedback) else evo.FEEDBACK
+                        )
+                        else:
+                            es.feedback = (
+                            eva if isinstance(eva, Feedback) else eva.evaluate(evo, queried_knowledge=queried_knowledge)
+                        )
+                        logger.log_object(es.feedback, tag="evolving feedback")
+                    else:
+                        es.feedback = (
+                            eva if isinstance(eva, Feedback) else eva.evaluate(evo, queried_knowledge=queried_knowledge)
+                        )
+                        logger.log_object(es.feedback, tag="evolving feedback")
 
                 # 5. update trace
                 self.evolving_trace.append(es)

diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py
@@ -20,7 +20,7 @@
 
 from .base import Storage
 from .storage import FileStorage
-from .utils import get_caller_info
+from .utils import LogColors, get_caller_info
 
 
 class RDAgentLog(SingletonBaseClass):
@@ -127,10 +127,15 @@ def _log(self, level: str, msg: str, *, tag: str = "", raw: bool = False) -> Non
             logger.add(sys.stderr)
 
     def info(self, msg: str, *, tag: str = "", raw: bool = False) -> None:
+        # Use default color for info messages
         self._log("info", msg, tag=tag, raw=raw)
 
     def warning(self, msg: str, *, tag: str = "", raw: bool = False) -> None:
-        self._log("warning", msg, tag=tag, raw=raw)
+        # Add yellow color for warning messages
+        colored_msg = f"{LogColors.YELLOW}{msg}{LogColors.END}"
+        self._log("warning", colored_msg, tag=tag, raw=True)
 
     def error(self, msg: str, *, tag: str = "", raw: bool = False) -> None:
-        self._log("error", msg, tag=tag, raw=raw)
+        # Add red color for error messages
+        colored_msg = f"{LogColors.RED}{msg}{LogColors.END}"
+        self._log("error", colored_msg, tag=tag, raw=True)
diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml
@@ -31,9 +31,13 @@ exp_feedback:
       - If overfitting is detected, provide a detailed analysis explaining how and why it occurs, referencing scenario description, code implementation, and validation scores to support your findings.
     - If such discrepancies or risks are found:
       - Clearly document these issues in `Reasoning`, referencing both scenario description and code implementation—not just validation scores.
-      - Set `"Evaluation Aligned With Task": "no"` and `"Replace Best Result": "no"`.
-      - Begin your `reasoning` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
-    - If evaluation alignment passes, set `"Evaluation Aligned With Task": "yes"`, and then proceed to Step 3.
+      - Severity-based handling:
+        - Severe risk — likely to invert or invalidate the performance trend between validation and test (e.g., strong overfitting, label leakage, test distribution shift):
+          - Set `"Evaluation Aligned With Task": "no"` and `"Replace Best Result": "no"`.
+          - Begin your `reasoning` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
+        - Mild/moderate risk — may cause slightly optimistic or biased validation scores but is unlikely to change the relative performance trend (e.g., scaling or PCA fit on full training data that’s also applied consistently to test):
+          - Set `"Evaluation Aligned With Task": "yes"` but note the potential bias in `Reasoning`.
+          - Proceed to Step 3 for result comparison.
 
     Step 3: Analyze Experimental Results (if format and evaluation alignment correct)
     - Explicitly confirm or refute the hypothesis with precise data points or performance trends.

diff --git a/rdagent/scenarios/data_science/dev/runner/__init__.py b/rdagent/scenarios/data_science/dev/runner/__init__.py
@@ -161,7 +161,7 @@ def __init__(
             es=es,
             evolving_version=2,
             scen=scen,
-            max_loop=DS_RD_SETTING.runner_max_loop,
+            max_loop=1,
             **kwargs,
         )