44 changes: 34 additions & 10 deletions sagemaker-train/src/sagemaker/train/common_utils/finetune_utils.py
@@ -198,10 +198,19 @@ def _create_mlflow_app(sagemaker_session) -> Optional[MlflowApp]:
if new_app.status in ["Created", "Updated"]:
return new_app
elif new_app.status in ["Failed", "Stopped"]:
raise RuntimeError(f"MLflow app creation failed with status: {new_app.status}")
# Get detailed error from MLflow app
error_msg = f"MLflow app creation failed with status: {new_app.status}"
if hasattr(new_app, 'failure_reason') and new_app.failure_reason:
error_msg += f". Reason: {new_app.failure_reason}"
raise RuntimeError(error_msg)
time.sleep(poll_interval)

raise RuntimeError(f"MLflow app creation timed out after {max_wait_time} seconds")
# Timeout case - get current status and any error details
new_app.refresh()
error_msg = f"MLflow app creation failed. Current status: {new_app.status}"
if hasattr(new_app, 'failure_reason') and new_app.failure_reason:
error_msg += f". Reason: {new_app.failure_reason}"
raise RuntimeError(error_msg)
Collaborator


Do we need to raise an error in the timeout case, or should we just surface an error message? If the MLflow app is a required parameter for the CTJ API, then it's fine to throw an error.
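For reference, a minimal sketch of the log-and-return-None alternative raised here; the helper name and signature are hypothetical, it simply mirrors the status/failure_reason handling in this hunk, and which option is right depends on whether the CTJ API requires the MLflow app.

import logging
from typing import Optional

logger = logging.getLogger(__name__)

def _finish_mlflow_wait_without_raising(new_app, max_wait_time: int) -> Optional[object]:
    # Hypothetical helper: instead of raising on timeout, log the terminal
    # status (and failure_reason, if the app object exposes one) and return
    # None so the caller can decide whether an MLflow app is mandatory.
    if new_app.status in ("Created", "Updated"):
        return new_app
    reason = getattr(new_app, "failure_reason", None)
    logger.error(
        "MLflow app creation did not complete within %s seconds (status: %s, reason: %s)",
        max_wait_time, new_app.status, reason or "unknown",
    )
    return None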


except Exception as e:
logger.error("Failed to create MLflow app: %s", e)
@@ -693,7 +702,7 @@ def _validate_eula_for_gated_model(model, accept_eula, is_gated_model):


def _validate_s3_path_exists(s3_path: str, sagemaker_session):
"""Validate if S3 path exists and is accessible."""
"""Validate S3 path and create bucket/prefix if they don't exist."""
if not s3_path.startswith("s3://"):
raise ValueError(f"Invalid S3 path format: {s3_path}")

@@ -705,19 +714,34 @@ def _validate_s3_path_exists(s3_path: str, sagemaker_session):
s3_client = sagemaker_session.boto_session.client('s3')

try:
# Check if bucket exists and is accessible
s3_client.head_bucket(Bucket=bucket_name)
# Check if bucket exists, create if it doesn't
try:
s3_client.head_bucket(Bucket=bucket_name)
except Exception as e:
if "NoSuchBucket" in str(e) or "Not Found" in str(e):
# Create bucket
region = sagemaker_session.boto_region_name
if region == 'us-east-1':
Collaborator

@jam-jee (Dec 10, 2025)


Why is this region-specific check here?

Member


This is expected for S3.
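For context, a minimal boto3 sketch of the S3 behavior this branch accounts for (bucket names are illustrative): CreateBucket only accepts a LocationConstraint for regions other than us-east-1, so the configuration block has to be omitted there.

import boto3

# us-east-1 is S3's default region: create_bucket must be called without a
# CreateBucketConfiguration there; passing LocationConstraint="us-east-1"
# is rejected by S3.
s3_use1 = boto3.client("s3", region_name="us-east-1")
s3_use1.create_bucket(Bucket="example-bucket-use1")

# In any other region, the LocationConstraint tells S3 where to create the
# bucket; omitting it against a regional endpoint typically fails with an
# IllegalLocationConstraintException.
s3_euw1 = boto3.client("s3", region_name="eu-west-1")
s3_euw1.create_bucket(
    Bucket="example-bucket-euw1",
    CreateBucketConfiguration={"LocationConstraint": "eu-west-1"},
)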

s3_client.create_bucket(Bucket=bucket_name)
else:
s3_client.create_bucket(
Bucket=bucket_name,
CreateBucketConfiguration={'LocationConstraint': region}
)
else:
raise

# If prefix is provided, check if it exists
# If prefix is provided, check if it exists, create if it doesn't
if prefix:
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix, MaxKeys=1)
if 'Contents' not in response:
raise ValueError(f"S3 prefix '{prefix}' does not exist in bucket '{bucket_name}'")
# Create the prefix by putting an empty object
if not prefix.endswith('/'):
prefix += '/'
s3_client.put_object(Bucket=bucket_name, Key=prefix, Body=b'')

except Exception as e:
if "NoSuchBucket" in str(e):
raise ValueError(f"S3 bucket '{bucket_name}' does not exist or is not accessible")
raise ValueError(f"Failed to validate S3 path '{s3_path}': {str(e)}")
raise ValueError(f"Failed to validate/create S3 path '{s3_path}': {str(e)}")


def _validate_hyperparameter_values(hyperparameters: dict):
16 changes: 9 additions & 7 deletions sagemaker-train/src/sagemaker/train/constants.py
@@ -42,13 +42,15 @@

HUB_NAME = "SageMakerPublicHub"

# Allowed reward model IDs for RLAIF trainer
_ALLOWED_REWARD_MODEL_IDS = [
"openai.gpt-oss-120b-1:0",
"openai.gpt-oss-20b-1:0",
"qwen.qwen3-32b-v1:0",
"qwen.qwen3-coder-30b-a3b-v1:0"
]
# Allowed reward model IDs for RLAIF trainer with region restrictions
_ALLOWED_REWARD_MODEL_IDS = {
"openai.gpt-oss-120b-1:0": ["us-west-2", "us-east-1", "ap-northeast-1", "eu-west-1"],
"openai.gpt-oss-20b-1:0": ["us-west-2", "us-east-1", "ap-northeast-1", "eu-west-1"],
"qwen.qwen3-32b-v1:0": ["us-west-2", "us-east-1", "ap-northeast-1", "eu-west-1"],
"qwen.qwen3-coder-30b-a3b-v1:0": ["us-west-2", "us-east-1", "ap-northeast-1", "eu-west-1"],
"qwen.qwen3-coder-480b-a35b-v1:0": ["us-west-2", "ap-northeast-1"],
"qwen.qwen3-235b-a22b-2507-v1:0": ["us-west-2", "ap-northeast-1"]
}

# Allowed evaluator models for LLM as Judge evaluator with region restrictions
_ALLOWED_EVALUATOR_MODELS = {
18 changes: 9 additions & 9 deletions sagemaker-train/src/sagemaker/train/dpo_trainer.py
@@ -41,7 +41,7 @@ class DPOTrainer(BaseTrainer):
trainer = DPOTrainer(
model="meta-llama/Llama-2-7b-hf",
training_type=TrainingType.LORA,
model_package_group_name="my-model-group",
model_package_group="my-model-group",
training_dataset="s3://bucket/preference_data.jsonl"
)

@@ -50,7 +50,7 @@ class DPOTrainer(BaseTrainer):
# Complete workflow: create -> wait -> get model package ARN
trainer = DPOTrainer(
model="meta-llama/Llama-2-7b-hf",
model_package_group_name="my-dpo-models"
model_package_group="my-dpo-models"
)

# Create training job (non-blocking)
@@ -75,7 +75,7 @@ class DPOTrainer(BaseTrainer):
training_type (Union[TrainingType, str]):
The fine-tuning approach. Valid values are TrainingType.LORA (default),
TrainingType.FULL.
model_package_group_name (Optional[Union[str, ModelPackageGroup]]):
model_package_group (Optional[Union[str, ModelPackageGroup]]):
The model package group for storing the fine-tuned model. Can be a group name,
ARN, or ModelPackageGroup object. Required when model is not a ModelPackage.
mlflow_resource_arn (Optional[str]):
@@ -86,9 +86,9 @@
mlflow_run_name (Optional[str]):
The MLflow run name for this training job.
training_dataset (Optional[Union[str, DataSet]]):
The training dataset with preference pairs. Can be an S3 URI, dataset ARN, or DataSet object.
The training dataset with preference pairs. Can be a dataset ARN or DataSet object.
validation_dataset (Optional[Union[str, DataSet]]):
The validation dataset. Can be an S3 URI, dataset ARN, or DataSet object.
The validation dataset. Can be a dataset ARN or DataSet object.
s3_output_path (Optional[str]):
The S3 path for training job outputs.
If not specified, defaults to s3://sagemaker-<region>-<account>/output.
@@ -101,7 +101,7 @@ def __init__(
self,
model: Union[str, ModelPackage],
training_type: Union[TrainingType, str] = TrainingType.LORA,
model_package_group_name: Optional[Union[str, ModelPackageGroup]] = None,
model_package_group: Optional[Union[str, ModelPackageGroup]] = None,
mlflow_resource_arn: Optional[str] = None,
mlflow_experiment_name: Optional[str] = None,
mlflow_run_name: Optional[str] = None,
@@ -118,8 +118,8 @@ def __init__(
self.model, self._model_name = _resolve_model_and_name(model, self.sagemaker_session)
self.training_type = training_type

self.model_package_group_name = _validate_and_resolve_model_package_group(model,
model_package_group_name)
self.model_package_group = _validate_and_resolve_model_package_group(model,
model_package_group)
self.mlflow_resource_arn = mlflow_resource_arn
self.mlflow_experiment_name = mlflow_experiment_name
self.mlflow_run_name = mlflow_run_name
@@ -232,7 +232,7 @@ def train(self,
_validate_hyperparameter_values(final_hyperparameters)

model_package_config = _create_model_package_config(
model_package_group_name=self.model_package_group_name,
model_package_group_name=self.model_package_group,
model=self.model,
sagemaker_session=sagemaker_session
)
@@ -23,7 +23,14 @@ class LLMAsJudgeEvaluator(BaseEvaluator):

This evaluator uses foundation models to evaluate LLM responses
based on various quality and responsible AI metrics.


This feature is powered by Amazon Bedrock Evaluations. Your use of this feature is subject to the pricing of
Amazon Bedrock Evaluations, the Service Terms applicable to Amazon Bedrock, and the terms that apply to your
usage of third-party models. Amazon Bedrock Evaluations may securely transmit data across AWS Regions within your
geography for processing. For more information, see the Amazon Bedrock Evaluations documentation.

Documentation: https://docs.aws.amazon.com/bedrock/latest/userguide/evaluation-judge.html

Attributes:
evaluator_model (str): AWS Bedrock foundation model identifier to use as the judge.
Required. For supported models, see:
32 changes: 22 additions & 10 deletions sagemaker-train/src/sagemaker/train/rlaif_trainer.py
@@ -44,7 +44,7 @@ class RLAIFTrainer(BaseTrainer):
trainer = RLAIFTrainer(
model="meta-llama/Llama-2-7b-hf",
training_type=TrainingType.LORA,
model_package_group_name="my-model-group",
model_package_group="my-model-group",
reward_model_id="reward-model-id",
reward_prompt="Rate the helpfulness of this response on a scale of 1-10",
training_dataset="s3://bucket/rlaif_data.jsonl"
@@ -55,7 +55,7 @@ class RLAIFTrainer(BaseTrainer):
# Complete workflow: create -> wait -> get model package ARN
trainer = RLAIFTrainer(
model="meta-llama/Llama-2-7b-hf",
model_package_group_name="my-rlaif-models",
model_package_group="my-rlaif-models",
reward_model_id="reward-model-id",
reward_prompt="Rate the helpfulness of this response on a scale of 1-10"
)
@@ -82,7 +82,7 @@ class RLAIFTrainer(BaseTrainer):
training_type (Union[TrainingType, str]):
The fine-tuning approach. Valid values are TrainingType.LORA (default),
TrainingType.FULL.
model_package_group_name (Optional[Union[str, ModelPackageGroup]]):
model_package_group (Optional[Union[str, ModelPackageGroup]]):
The model package group for storing the fine-tuned model. Can be a group name,
ARN, or ModelPackageGroup object. Required when model is not a ModelPackage.
reward_model_id (str):
@@ -100,9 +100,9 @@ class RLAIFTrainer(BaseTrainer):
mlflow_run_name (Optional[str]):
The MLflow run name for this training job.
training_dataset (Optional[Union[str, DataSet]]):
The training dataset. Can be an S3 URI, dataset ARN, or DataSet object.
The training dataset. Can be a dataset ARN or DataSet object.
validation_dataset (Optional[Union[str, DataSet]]):
The validation dataset. Can be an S3 URI, dataset ARN, or DataSet object.
The validation dataset. Can be a dataset ARN or DataSet object.
s3_output_path (Optional[str]):
The S3 path for training job outputs.
If not specified, defaults to s3://sagemaker-<region>-<account>/output.
@@ -116,7 +116,7 @@ def __init__(
self,
model: Union[str, ModelPackage],
training_type: Union[TrainingType, str] = TrainingType.LORA,
model_package_group_name: Optional[Union[str, ModelPackageGroup]] = None,
model_package_group: Optional[Union[str, ModelPackageGroup]] = None,
reward_model_id: str = None,
reward_prompt: Union[str, Evaluator] = None,
mlflow_resource_arn: Optional[Union[str, MlflowTrackingServer]] = None,
@@ -138,8 +138,8 @@ def __init__(
self.model, self._model_name = _resolve_model_and_name(model, self.sagemaker_session)

self.training_type = training_type
self.model_package_group_name = _validate_and_resolve_model_package_group(model,
model_package_group_name)
self.model_package_group = _validate_and_resolve_model_package_group(model,
model_package_group)
self.reward_model_id = self._validate_reward_model_id(reward_model_id)
self.reward_prompt = reward_prompt
self.mlflow_resource_arn = mlflow_resource_arn
@@ -173,8 +173,20 @@ def _validate_reward_model_id(self, reward_model_id):
if reward_model_id not in _ALLOWED_REWARD_MODEL_IDS:
raise ValueError(
f"Invalid reward_model_id '{reward_model_id}'. "
f"Available models are: {_ALLOWED_REWARD_MODEL_IDS}"
f"Available models are: {list(_ALLOWED_REWARD_MODEL_IDS.keys())}"
)

# Check region compatibility
session = self.sagemaker_session if hasattr(self, 'sagemaker_session') and self.sagemaker_session else TrainDefaults.get_sagemaker_session()
current_region = session.boto_region_name
allowed_regions = _ALLOWED_REWARD_MODEL_IDS[reward_model_id]

if current_region not in allowed_regions:
raise ValueError(
f"Reward model '{reward_model_id}' is not available in region '{current_region}'. "
f"Available regions for this model: {allowed_regions}"
)

return reward_model_id


@@ -239,7 +251,7 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None, validati
_validate_hyperparameter_values(final_hyperparameters)

model_package_config = _create_model_package_config(
model_package_group_name=self.model_package_group_name,
model_package_group_name=self.model_package_group,
model=self.model,
sagemaker_session=sagemaker_session
)
18 changes: 9 additions & 9 deletions sagemaker-train/src/sagemaker/train/rlvr_trainer.py
@@ -42,7 +42,7 @@ class RLVRTrainer(BaseTrainer):
trainer = RLVRTrainer(
model="meta-llama/Llama-2-7b-hf",
training_type=TrainingType.LORA,
model_package_group_name="my-model-group",
model_package_group="my-model-group",
custom_reward_function="arn:aws:sagemaker:us-east-1:123456789012:hub-content/SageMakerPublicHub/JsonDoc/my-evaluator/1.0",
training_dataset="s3://bucket/rlvr_data.jsonl"
)
@@ -52,7 +52,7 @@ class RLVRTrainer(BaseTrainer):
# Complete workflow: create -> wait -> get model package ARN
trainer = RLVRTrainer(
model="meta-llama/Llama-2-7b-hf",
model_package_group_name="my-rlvr-models",
model_package_group="my-rlvr-models",
custom_reward_function="arn:aws:sagemaker:us-east-1:123456789012:hub-content/SageMakerPublicHub/JsonDoc/my-evaluator/1.0"
)

@@ -78,7 +78,7 @@ class RLVRTrainer(BaseTrainer):
training_type (Union[TrainingType, str]):
The fine-tuning approach. Valid values are TrainingType.LORA (default),
TrainingType.FULL.
model_package_group_name (Optional[Union[str, ModelPackageGroup]]):
model_package_group (Optional[Union[str, ModelPackageGroup]]):
The model package group for storing the fine-tuned model. Can be a group name,
ARN, or ModelPackageGroup object. Required when model is not a ModelPackage.
custom_reward_function (Optional[Union[str, Evaluator]]):
@@ -92,9 +92,9 @@ class RLVRTrainer(BaseTrainer):
mlflow_run_name (Optional[str]):
The MLflow run name for this training job.
training_dataset (Optional[Union[str, DataSet]]):
The training dataset. Can be an S3 URI, dataset ARN, or DataSet object.
The training dataset. Can be a dataset ARN or DataSet object.
validation_dataset (Optional[Union[str, DataSet]]):
The validation dataset. Can be an S3 URI, dataset ARN, or DataSet object.
The validation dataset. Can be a dataset ARN or DataSet object.
s3_output_path (Optional[str]):
The S3 path for training job outputs.
If not specified, defaults to s3://sagemaker-<region>-<account>/output.
@@ -108,7 +108,7 @@ def __init__(
self,
model: Union[str, ModelPackage],
training_type: Union[TrainingType, str] = TrainingType.LORA,
model_package_group_name: Optional[Union[str, ModelPackageGroup]] = None,
model_package_group: Optional[Union[str, ModelPackageGroup]] = None,
custom_reward_function: Optional[Union[str, Evaluator]] = None,
mlflow_resource_arn: Optional[Union[str, MlflowTrackingServer]] = None,
mlflow_experiment_name: Optional[str] = None,
@@ -129,8 +129,8 @@ def __init__(
self.model, self._model_name = _resolve_model_and_name(model, self.sagemaker_session)

self.training_type = training_type
self.model_package_group_name = _validate_and_resolve_model_package_group(model,
model_package_group_name)
self.model_package_group = _validate_and_resolve_model_package_group(model,
model_package_group)
self.custom_reward_function = custom_reward_function
self.mlflow_resource_arn = mlflow_resource_arn
self.mlflow_experiment_name = mlflow_experiment_name
@@ -239,7 +239,7 @@ def train(self, training_dataset: Optional[Union[str, DataSet]] = None,
_validate_hyperparameter_values(final_hyperparameters)

model_package_config = _create_model_package_config(
model_package_group_name=self.model_package_group_name,
model_package_group_name=self.model_package_group,
model=self.model,
sagemaker_session=sagemaker_session
)