2 changes: 1 addition & 1 deletion rdagent/scenarios/data_science/proposal/exp_gen/base.py
@@ -21,7 +21,7 @@ def __init__(
concise_knowledge: str | None = None,
problem_name: str | None = None,
problem_desc: str | None = None,
problem_label: Literal["SCENARIO_PROBLEM", "FEEDBACK_PROBLEM"] = "FEEDBACK_PROBLEM",
problem_label: Literal["SCENARIO_PROBLEM", "FEEDBACK_PROBLEM", "PERSISTENT_PROBLEM"] = "FEEDBACK_PROBLEM",
appendix: str | None = None,
) -> None:
super().__init__(
49 changes: 49 additions & 0 deletions rdagent/scenarios/data_science/proposal/exp_gen/package_info.py
@@ -108,6 +108,55 @@ def get_python_packages():
print(pkg)


def get_persistent_problem_guidelines():
"""Generate guidelines for PERSISTENT_PROBLEM scenarios - focusing on model architecture"""
guidelines = [
"## Model Architecture Selection Guidelines",
"",
"Focus on **model architecture** - choosing the right model type and structure for your specific problem.",
"",
"### **History-Aware Architecture Selection**",
"",
"1. **Learn from Experiment History**",
" - **Check what's been tried**: Review previous experiments to understand current baseline status",
" - **Identify gaps**: What architectures haven't been properly tested yet?",
" - **Build on success**: If baseline exists and works, focus on targeted improvements",
"",
"2. **Context-Driven Strategy**",
" - **No baseline yet**: Start with reliable methods (XGBoost, RandomForest) to establish foundation",
" - **Baseline established**: Explore modern alternatives (LightGBM, CatBoost) for potential gains",
" - **Modern methods tested**: Consider advanced techniques (ensembles, custom) if justified by results",
"",
"3. ** Single Focus Per Hypothesis**",
" - **One goal at a time**: Each hypothesis should focus on either establishing baseline OR testing innovation, not both",
" - **Avoid feature creep**: Don't try to implement multiple improvements in one hypothesis",
" - **Clear hypothesis scope**: Define exactly what this hypothesis is testing before proposing",
" - **Iterative approach**: Build incrementally - baseline first, then one innovation at a time",
"",
"4. **Timely Fallback Principle**",
" - **Monitor performance closely**: If advanced methods show no clear improvement, retreat quickly",
" - **Don't chase complexity**: Advanced doesn't always mean better - simple often wins",
" - **Fallback triggers**: Performance drop, training instability, or unclear benefits = immediate retreat",
" - **Preserve what works**: Always maintain access to your best-performing solution",
"",
"5. **Computational Constraints**",
" - Training time limitations: Choose models that converge quickly",
" - Inference requirements: Balance accuracy with prediction speed",
" - Memory constraints: Consider model size and batch processing needs",
"",
"6. **Model Selection Trade-offs (When Stuck)**",
" - **⚠️ Use with caution**: Only pivot when multiple attempts with same paradigm clearly fail",
" - **Complexity vs Simplicity**: If complex models keep failing, pivot to simple tree methods (XGBoost/RandomForest)",
" - **Domain paradigms**: NLP stuck with transformers? Try rule-based. CV stuck with CNN? Try traditional features",
" - **Speed vs Accuracy**: Trade some accuracy for faster, more reliable models that actually complete training",
" - **Proven vs Novel**: When innovation fails repeatedly, choose proven methods over cutting-edge approaches",
"",
"### 💡 **Key Reminder**",
"**One hypothesis, one goal**: Each hypothesis should test exactly one architectural change - either establish baseline OR test one specific innovation. Keep scope focused for clear results.",
]
return "\n".join(guidelines)


if __name__ == "__main__":
# Check if we should print available packages prompt
if len(sys.argv) > 1 and sys.argv[1] == "--packages-prompt":
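> A minimal sketch of how the new helper is meant to feed the `additional_guidelines` slot added to `prompts_v2.yaml` below. The template string and the direct `jinja2.Template` call are illustrative stand-ins; the project actually renders through its own `T(...).r(...)` wrapper, as shown in the `proposal.py` diff further down.

```python
from jinja2 import Template

from rdagent.scenarios.data_science.proposal.exp_gen.package_info import (
    get_persistent_problem_guidelines,
)

# Stripped-down stand-in for the `hypothesis_gen.system` prompt: only the newly
# added conditional block is reproduced here.
system_template = Template(
    "{% if additional_guidelines %}{{ additional_guidelines }}{% endif %}"
)

rendered = system_template.render(
    additional_guidelines=get_persistent_problem_guidelines()
)
print(rendered)  # prints the "Model Architecture Selection Guidelines" markdown block
```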
19 changes: 17 additions & 2 deletions rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -193,6 +193,10 @@ hypothesis_gen:
- **Risk-Reward Balance (Score: 1-10):** Considering the potential for significant improvement (reward) versus the probability of failure, negative side-effects, or excessive resource consumption (risk), how optimal is this balance? A high score indicates a favorable balance.
- **Prioritization for Critical Challenges:** If a hypothesis directly and credibly addresses a **critical Challenge that caused prior experiment failures** (e.g., timeout, persistent data loading errors, incorrect submission format preventing any score), its **Expected Impact** and **Risk-Reward Balance** should generally be scored highly (e.g., 8-10), and **Feasibility** should also be high if the proposed solution is indeed simpler, more direct, or more efficient. This ensures such critical hypotheses are prioritized.

{% if additional_guidelines %}
{{ additional_guidelines }}
{% endif %}

{% if inject_diverse %}
# Focus on Diversity!!
Diversity is very critical in the analysis of scenario problems. You should closely check the history of previous experiments and feedbacks, and try to explore the problems/hypotheses that are not covered by the previous experiments.
@@ -303,13 +307,16 @@ hypothesis_rewrite:
## Task
Transform each **original hypothesis and its critique** into a **single, specific, testable technical hypothesis** that can be implemented immediately.

**You have the authority to delete hypotheses that you judge to be completely infeasible or unsuitable, but ensure at least one hypothesis remains in your output.**

## Core Principles
1. **Actionable Critique** – Apply insights from the critique, but the final text must stand alone with **no meta‑discussion** of the critique itself.
2. **Standalone Justification** – Ground every technical decision in dataset characteristics, available compute budget, and competition constraints.
3. **Decisive Specificity** – Remove all ambiguity; propose one clear action.
4. **Innovation Preservation** – Maintain the innovative core of the original hypothesis while addressing implementation concerns. Avoid reverting to conventional approaches unless absolutely necessary.
5. **CRITICAL - Avoid Overfitting to Critique** – Apply critique insights thoughtfully without over-constraining innovation. Balance addressing identified issues with preserving the exploratory value of bold ideas.
{% if enable_scale_check %}6. The user is currently working on a continuous exploration of the task. It's typical to first try at a small scale and, at some point, scale up the solution.
6. **Hypothesis Deletion Authority** – You have the authority to delete hypotheses that you judge to be completely infeasible or unsuitable. Use your judgment, but ensure at least one hypothesis remains.
{% if enable_scale_check %}7. The user is currently working on a continuous exploration of the task. It's typical to first try at a small scale and, at some point, scale up the solution.
The user will tell you how much time they have spent on the task so far and all the former trials. You should consider whether to scale up the solution based on the current situation. You should put this conclusion in each hypothesis's appendix section.
Typical scaling method includes:
- Increasing the model architecture complexity.
@@ -443,7 +450,15 @@ task_gen:
8. **Submission File (`submission.csv`)**: Generate `submission.csv` in the **exact format** required (column names, order, data types), as detailed in the '====== Submission Format ======' section of the Competition Scenario Description (DO NOT read the sample_submission.csv file directly in the code). This is a critical step.
9. **Preferred Packages Notes**:
{% include "scenarios.data_science.share:guidelines.package_selection" %}

10. **Adaptive Architecture Selection Strategy**:
- **History assessment**: Review what architectures/approaches have been tried in previous experiments and their outcomes.
- **Context-driven decisions**: Base architecture choice on experiment maturity rather than fixed preferences.
- **IF no working solution exists**: Focus on establishing reliable baseline first.
- **IF baseline established**: Consider exploring alternative architectures for potential improvements.
- **IF alternatives tested**: Evaluate advanced techniques only if previous approaches show measurable benefits.
- **Avoid repetition**: Don't retry failed approaches unless addressing specific implementation issues identified in feedback.
- **Performance-guided progression**: Let actual results from previous experiments guide complexity level choice.

## Package Declaration
At the end of your design, **you MUST** provide a key `packages` in the final JSON output.
It should be an **array of PyPI package names** (strings) that you expect to `import` in the forthcoming implementation.
39 changes: 29 additions & 10 deletions rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -23,6 +23,9 @@
DSDraftExpGen,  # TODO: DSDraftExpGen should be moved to router in the future
)
from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSIdea
from rdagent.scenarios.data_science.proposal.exp_gen.package_info import (
get_persistent_problem_guidelines,
)
from rdagent.scenarios.data_science.proposal.exp_gen.planner import (
DSExperimentPlan,
RD_Agent_TIMER_wrapper,
@@ -577,6 +580,13 @@ def identify_problem(
for problem_name in fb_problems:
fb_problems[problem_name]["label"] = "FEEDBACK_PROBLEM"
all_problems[problem_name] = fb_problems[problem_name]

# Add a persistent model problem. A persistent problem is independent of the current scenario and feedback; it is one that is likely to recur throughout the exploration.
all_problems["Potential Model Architecture Optimization"] = {
"problem": "The current model architecture may not be optimal for the specific characteristics of this dataset and task; the chosen model type might not be the best fit for the data structure, size, and problem complexity.",
"reason": "Model architecture selection is fundamental to performance: different model types have varying strengths for different data types and problem characteristics, and choosing the right architecture is often more impactful than hyperparameter tuning.",
"label": "PERSISTENT_PROBLEM",
}
return all_problems

@wait_retry(retry_n=5)
@@ -608,6 +618,7 @@ def hypothesis_gen(
problem_formatted_str += f"\n{packages_prompt}\n"

sys_prompt = T(".prompts_v2:hypothesis_gen.system").r(
additional_guidelines=get_persistent_problem_guidelines(),
hypothesis_output_format=(
T(".prompts_v2:output_format.hypothesis").r(pipeline=pipeline, enable_idea_pool=enable_idea_pool)
if not self.supports_response_schema
@@ -791,19 +802,25 @@ def hypothesis_rewrite(

improved_hypotheses_dict = json.loads(response)

# Validate that we have rewritten hypotheses for all original hypotheses
# Validate rewritten hypotheses (now allows deletion of hypotheses)
expected_problems = set(hypothesis_dict.keys())
available_problems = set( # The code snippet provided is a comment in Python. It appears to be
# a placeholder for a function or variable named
# `improved_hypotheses_dict`. The actual implementation of this
# function or variable is not provided in the code snippet.
improved_hypotheses_dict.keys()
)
available_problems = set(improved_hypotheses_dict.keys())

if not expected_problems.issubset(available_problems):
missing_problems = expected_problems - available_problems
# Check if all available problems are valid (subset of expected)
if not available_problems.issubset(expected_problems):
unexpected_problems = available_problems - expected_problems
# Raise exception to trigger retry mechanism
raise ValueError(f"Rewrite response missing expected problems. Missing: {missing_problems}")
raise ValueError(f"Rewrite response contains unexpected problems. Unexpected: {unexpected_problems}")

# Check if at least one hypothesis remains
if len(available_problems) == 0:
# Raise exception to trigger retry mechanism
raise ValueError("Rewrite response deleted all hypotheses. At least one hypothesis must remain.")

# Log deleted hypotheses if any
deleted_problems = expected_problems - available_problems
if deleted_problems:
logger.info(f"Deleted {len(deleted_problems)} hypotheses during rewrite: {deleted_problems}")

# Note: We don't preserve 'inspired' field from original hypotheses
# because after critique and rewrite, the hypothesis may have changed significantly
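> A hypothetical mini-check of the new validation contract introduced above, reduced to plain sets: the rewrite may drop hypotheses (available keys must be a subset of the expected ones) but must keep at least one and must not invent new problem keys. The names `expected` and `validate` are illustrative, not part of the diff.

```python
expected = {"p1", "p2", "p3"}

def validate(available: set[str]) -> set[str]:
    # Reject keys that were never proposed.
    if not available.issubset(expected):
        raise ValueError(f"unexpected problems: {available - expected}")
    # Reject an empty rewrite: at least one hypothesis must remain.
    if not available:
        raise ValueError("all hypotheses deleted")
    return expected - available  # the deleted problems, for logging

print(validate({"p1", "p3"}))   # {'p2'} -- deletion is allowed
# validate({"p1", "p4"})        # would raise: 'p4' was never proposed
# validate(set())               # would raise: at least one hypothesis must remain
```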
@@ -869,6 +886,8 @@ def select_hypothesis(
index_to_pick_pool_list.extend([j] * self.scen_prob_multiplier)
elif problem_dict[problem_name]["label"] == "FEEDBACK_PROBLEM":
index_to_pick_pool_list.extend([j] * (3 - self.scen_prob_multiplier))
elif problem_dict[problem_name]["label"] == "PERSISTENT_PROBLEM":
index_to_pick_pool_list.extend([j] * 2)
else:
index_to_pick_pool_list.extend([j] * 1)
logger.info(f"index_to_pick_pool_list: {index_to_pick_pool_list}")
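> For reference, a self-contained sketch of what the duplicated indices in `index_to_pick_pool_list` amount to, assuming `scen_prob_multiplier` is 2 and the picker draws uniformly from the pool. The drawing step itself sits outside this hunk, so `random.choice` here is only an assumption about how the pool is consumed.

```python
import random

SCEN_PROB_MULTIPLIER = 2  # assumed value of self.scen_prob_multiplier

def build_pick_pool(problem_dict: dict[str, dict]) -> list[int]:
    pool: list[int] = []
    for j, name in enumerate(problem_dict):
        label = problem_dict[name]["label"]
        if label == "SCENARIO_PROBLEM":
            pool.extend([j] * SCEN_PROB_MULTIPLIER)
        elif label == "FEEDBACK_PROBLEM":
            pool.extend([j] * (3 - SCEN_PROB_MULTIPLIER))
        elif label == "PERSISTENT_PROBLEM":
            pool.extend([j] * 2)  # the new fixed weight added by this diff
        else:
            pool.extend([j] * 1)
    return pool

problems = {
    "scenario issue": {"label": "SCENARIO_PROBLEM"},
    "feedback issue": {"label": "FEEDBACK_PROBLEM"},
    "Potential Model Architecture Optimization": {"label": "PERSISTENT_PROBLEM"},
}
pool = build_pick_pool(problems)   # [0, 0, 1, 2, 2]
picked = random.choice(pool)       # persistent problem drawn with probability 2/5 here
```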