feat: Feature selection better #340

Open · wants to merge 20 commits into base: main
1 change: 1 addition & 0 deletions .gitignore
@@ -9,6 +9,7 @@ release-notes.md
__pycache__/
*.py[cod]
*$py.class
bug-testing/

# C extensions
*.so
2 changes: 1 addition & 1 deletion rdagent/components/coder/model_coder/CoSTEER/__init__.py
@@ -70,7 +70,7 @@ def develop(self, exp: ModelExperiment) -> ModelExperiment:
self.rag = ModelRAGStrategy(model_knowledge_base)

# init intermediate items
model_experiment = ModelEvolvingItem(sub_tasks=exp.sub_tasks)
model_experiment = ModelEvolvingItem(sub_tasks=exp.sub_tasks, from_based_exp=exp.based_experiments)

self.evolve_agent = ModelRAGEvoAgent(
max_loop=self.max_loop,
@@ -5,6 +5,8 @@
)
from rdagent.core.evolving_framework import EvolvableSubjects
from rdagent.log import rdagent_logger as logger
from collections.abc import Sequence
from rdagent.core.experiment import ASpecificWSForExperiment


class ModelEvolvingItem(ModelExperiment, EvolvableSubjects):
@@ -15,6 +17,7 @@ class ModelEvolvingItem(ModelExperiment, EvolvableSubjects):
def __init__(
self,
sub_tasks: list[ModelTask],
from_based_exp: Sequence[ASpecificWSForExperiment] = [],
sub_gt_implementations: list[ModelFBWorkspace] = None,
):
ModelExperiment.__init__(self, sub_tasks=sub_tasks)
@@ -27,3 +30,4 @@ def __init__(
)
else:
self.sub_gt_implementations = sub_gt_implementations
self.based_exp = from_based_exp
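
A rough sketch of how the new from_based_exp argument is meant to travel (only the class and attribute names mirror the diff; the stand-in below is simplified and everything not shown in the hunk is an assumption):

from collections.abc import Sequence

class EvolvingItemSketch:
    """Stand-in for ModelEvolvingItem: carries sub_tasks plus the prior experiments."""

    def __init__(self, sub_tasks: list, from_based_exp: Sequence = ()) -> None:
        self.sub_tasks = sub_tasks
        self.based_exp = list(from_based_exp)   # mirrors self.based_exp = from_based_exp

# The caller in CoSTEER/__init__.py would pass exp.based_experiments here.
item = EvolvingItemSketch(sub_tasks=["model task"], from_based_exp=["previous SOTA experiment"])
print(item.based_exp)  # ['previous SOTA experiment']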
38 changes: 25 additions & 13 deletions rdagent/components/coder/model_coder/CoSTEER/evolving_strategy.py
@@ -22,6 +22,9 @@
from rdagent.core.utils import multiprocessing_wrapper
from rdagent.oai.llm_utils import APIBackend

from collections.abc import Sequence
from rdagent.core.experiment import ASpecificWSForExperiment

coder_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")


@@ -30,23 +33,31 @@ def implement_one_model(
self,
target_task: ModelTask,
queried_knowledge: ModelQueriedKnowledge = None,
exp: ModelExperiment = None, # Add this parameter
based_experiments: Sequence[ASpecificWSForExperiment] = [],
) -> str:
model_information_str = target_task.get_task_information()
model_type = target_task.model_type

# Get the current code from the experiment using build_from_SOTA
current_code = ""
if exp is not None:
self.build_from_SOTA(exp)
model_file_mapping = {
"XGBoost": "model_xgb.py",
"RandomForest": "model_rf.py",
"LightGBM": "model_lgb.py",
"NN": "model_nn.py",
}
if model_type in model_file_mapping:
current_code = exp.experiment_workspace.code_dict.get(model_file_mapping[model_type], "")
data_desc = None

# model_file_mapping = {
# "XGBoost": "model_xgb.py",
# "RandomForest": "model_rf.py",
# "LightGBM": "model_lgb.py",
# "NN": "model_nn.py",
# }

# for exp in based_exp:
# if model_type in model_file_mapping:
# current_code = exp.experiment_workspace.code_dict.get(model_file_mapping[model_type], "")
# data_desc = exp.experiment_workspace.data_description

# if current_code:
# break # Use the first non-empty code found

if len(based_experiments) > 0:
current_code = based_experiments[-1].experiment_workspace.code_dict

if queried_knowledge is not None and model_information_str in queried_knowledge.success_task_to_knowledge_dict:
return queried_knowledge.success_task_to_knowledge_dict[model_information_str].implementation
@@ -90,6 +101,7 @@ def implement_one_model(
model_type=model_type, # Add model type to the prompt
queried_similar_successful_knowledge=queried_similar_successful_knowledge_to_render,
queried_former_failed_knowledge=queried_former_failed_knowledge_to_render,
data_desc=data_desc,
)
.strip("\n")
)
@@ -140,7 +152,7 @@ def evolve(

result = multiprocessing_wrapper(
[
(self.implement_one_model, (evo.sub_tasks[target_index], queried_knowledge))
(self.implement_one_model, (evo.sub_tasks[target_index], queried_knowledge, evo.based_experiments)) # Pass the based experiments to each task
for target_index in to_be_finished_task_index
],
n=RD_AGENT_SETTINGS.multi_proc_n,
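
To make the control flow above easier to follow, here is a self-contained sketch of the "reuse code from the latest based experiment" logic this hunk introduces. Workspace and Experiment are stand-ins for the project's workspace and experiment classes; only the attribute names (experiment_workspace, code_dict) and the fallback behaviour mirror the diff.

from __future__ import annotations

from dataclasses import dataclass, field

@dataclass
class Workspace:
    code_dict: dict[str, str] = field(default_factory=dict)  # file name -> source code

@dataclass
class Experiment:
    experiment_workspace: Workspace = field(default_factory=Workspace)

def current_code_from(based_experiments: list[Experiment]) -> dict[str, str] | str:
    # Mirrors the diff: fall back to "" when there is no prior experiment,
    # otherwise hand the whole code_dict of the most recent one to the prompt.
    if len(based_experiments) > 0:
        return based_experiments[-1].experiment_workspace.code_dict
    return ""

sota = Experiment(Workspace({"model_xgb.py": "def fit(...): ..."}))
print(repr(current_code_from([])))   # '' -> the coder writes from scratch
print(current_code_from([sota]))     # {'model_xgb.py': 'def fit(...): ...'}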
13 changes: 10 additions & 3 deletions rdagent/components/coder/model_coder/prompts.yaml
@@ -41,8 +41,11 @@ extract_model_formulation_system: |-

evolving_strategy_model_coder:
system: |-
User is trying to implement some pytorch models in the following scenario:
User is trying to implement some machine learning models (pytorch or otherwise - see specifications) in the following scenario:
{{ scenario }}

Very important: The actions you are responsible for are mainly writing (tuning) the model or selecting features. Note that they are essentially the same task. If feature selection is involved, only adjust the select() section of the existing models. DO NOT WRITE A SEPARATE FEATURE SELECTION CLASS.

Your code is expected to align with the scenario in any form, which means the user needs to get the model's prediction based on the input data.

To help you write the correct code, the user might provide multiple information that helps you write the correct code:
@@ -56,15 +59,13 @@ evolving_strategy_model_coder:
--------------Current code in the workspace:--------------- You need to tune the model based on this! If it is not None, do not write from scratch.
{{ current_code }}
{% endif %}

{% if queried_former_failed_knowledge|length != 0 %}
--------------Your former latest attempt:---------------
=====Code to the former implementation=====
{{ queried_former_failed_knowledge[-1].implementation.code }}
=====Feedback to the former implementation=====
{{ queried_former_failed_knowledge[-1].feedback }}
{% endif %}

Please respond with the code in the following JSON format. Here is an example structure for the JSON output:
{
"code": "The Python code as a string."
@@ -84,6 +85,11 @@ evolving_strategy_model_coder:
{% endfor %}
{% endif %}

{% if data_desc is not none %}
--------------Data & Feature Descriptions (Use this for feature selection):---------------
{{ data_desc }}
{% endif %}

{% if queried_former_failed_knowledge|length != 0 %}
--------------Former failed code:---------------
{% for former_failed_knowledge in queried_former_failed_knowledge %}
@@ -94,6 +100,7 @@
{% endfor %}
{% endif %}


evaluator_code_feedback:
system: |-
User is trying to implement some models in the following scenario:
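
The data_desc block added above is only rendered when a description is available. Here is a tiny Jinja2 sketch of that behaviour (the real project renders these prompts through its own Prompts helper; the template text below is trimmed to the added block, and the sample description string is made up):

from jinja2 import Template

snippet = Template(
    "{% if data_desc is not none %}"
    "--------------Data & Feature Descriptions (Use this for feature selection):---------------\n"
    "{{ data_desc }}"
    "{% endif %}"
)
print(snippet.render(data_desc="train.parquet: 300 numeric features f_0 ... f_299, plus a binary target"))
print(repr(snippet.render(data_desc=None)))  # '' -> the section disappears when no description is passed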
7 changes: 4 additions & 3 deletions rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -123,12 +123,13 @@ kg_feature_interface: |-
7. You are participating in a Kaggle competition and need data engineering ideas that are small, efficient, and quick to execute. Your suggestions should avoid unnecessary complexity or excessive processing time. Focus on delivering concise, impactful transformations or preprocessing steps that improve model performance with minimal resource usage. Please suggest clear, targeted approaches that can be implemented and tested rapidly.

kg_model_interface: |-
The action might be model tuning or feature selection. However, for both of them, the code structure is the same.
Your code should contain several parts:
1. The import part: import the necessary libraries.
2. A select() function that handles feature selection for both training and prediction phases.
The function should take the following arguments:
- X: The features as a pandas DataFrame.
The function should return the selected features as a pandas DataFrame.
The function should return the selected features as a pandas DataFrame. (You will usually receive a description of data & existing features)
3. A function called fit() that trains the model and returns the trained model. If feature selection is applied, it should be done within this function.
The function should take the following arguments:
- X_train: The training features as a pandas DataFrame.
@@ -152,7 +153,7 @@ kg_model_interface: |-
from xgboost import DMatrix


def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic when the action is feature selection!


def fit(
@@ -186,7 +187,7 @@ kg_model_interface: |-
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame: ... # Implement feature selection logic
def select(X: pd.DataFrame) -> pd.DataFrame: ... # Focus on implementing feature selection logic for the feature selection process


def fit(
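
For illustration, a minimal model file that follows the interface sketched above, with feature selection confined to select(). Everything beyond what the excerpt spells out (the exact fit()/predict() arguments and return types, and the placeholder selection rule) is an assumption for the example, not the project's required implementation.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

def select(X: pd.DataFrame) -> pd.DataFrame:
    # Feature selection lives here only: as a placeholder, keep numeric columns
    # and drop an identifier-like column if present.
    numeric = X.select_dtypes(include="number")
    return numeric.drop(columns=["id"], errors="ignore")

def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    # Train on the selected features and return the fitted model.
    model = RandomForestClassifier(n_estimators=200, random_state=32)
    model.fit(select(X_train), y_train)
    return model

def predict(model, X: pd.DataFrame) -> pd.Series:
    # Apply the same selection at prediction time so train and predict columns match.
    return pd.Series(model.predict(select(X)), index=X.index)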
7 changes: 7 additions & 0 deletions rdagent/scenarios/kaggle/experiment/scenario.py
@@ -34,6 +34,13 @@ def __init__(self, competition: str) -> None:
self.submission_specifications = None
self.model_output_channel = None
self._analysis_competition_description()

# Move these assignments after _analysis_competition_description
self._output_format = self.output_format
self._interface = self.interface
self._simulator = self.simulator
self._background = self.background

self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
self.if_using_feature_selection = KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection

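
A self-contained sketch of the ordering issue this hunk addresses (class and attribute names are simplified stand-ins; only the idea of caching the values after _analysis_competition_description has filled them mirrors the diff):

class ScenarioSketch:
    def __init__(self) -> None:
        self.output_format = None
        self._analysis_competition_description()   # populates output_format and friends
        # Caching before the call above would snapshot None; after it, the real value.
        self._output_format = self.output_format

    def _analysis_competition_description(self) -> None:
        # Stand-in for the analysis step that fills in the scenario fields.
        self.output_format = "submission.csv with columns: id, prediction"

print(ScenarioSketch()._output_format)  # "submission.csv with columns: id, prediction"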
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/prompts.yaml
@@ -39,7 +39,7 @@ hypothesis_and_feedback: |-
hypothesis_output_format: |-
The output should follow JSON format. The schema is as follows:
{
"action": "If "hypothesis_specification" provides the action you need to take, please follow "hypothesis_specification" to choose the action. Otherwise, based on previous experimental results, suggest the action you believe is most appropriate at the moment. It should be one of [{% if if_using_feature_selection %}"Feature engineering", "Feature processing", "Model feature selection", "Model tuning"{% else %}"Feature engineering", "Feature processing", "Model tuning"{% endif %}]",
"action": "Choose Model tuning and ignore the rest". If "hypothesis_specification" provides the action you need to take, please follow "hypothesis_specification" to choose the action. Otherwise, based on previous experimental results, suggest the action you believe is most appropriate at the moment. It should be one of [{% if if_using_feature_selection %}"Feature engineering", "Feature processing", "Model feature selection", "Model tuning"{% else %}"Feature engineering", "Feature processing", "Model tuning"{% endif %}]",
"hypothesis": "The new hypothesis generated based on the information provided.",
"reason": "The reason why you generate this hypothesis. It should be comprehensive and logical. It should cover the other keys below and extend them.",
"concise_reason": "Two-line summary. First line focuses on a concise justification for the change. Second line generalizes a knowledge statement.",