diff --git a/draft/code_v0.py b/draft/code_v0.py
new file mode 100644
index 000000000..d17ef9911
--- /dev/null
+++ b/draft/code_v0.py
@@ -0,0 +1,319 @@
+import pandas as pd
+sub = pd.read_csv("/home/shared/RD-Agent/online/cafa-6-protein-function-prediction/sample_submission.tsv", sep="\t", on_bad_lines='skip', header=None)
+sub.columns = ["Protein ID", "Gene Ontology (GO) term ID", "Predicted probability that the GO term applies to the protein"]
+sub.head(5)
+
+MAIN_DIR = "/home/shared/RD-Agent/online/cafa-6-protein-function-prediction"
+
+# UTILITIES
+import numpy as np
+from tqdm import tqdm
+import time
+
+# TORCH MODULES FOR METRICS COMPUTATION :
+import torch
+from torch.utils.data import Dataset
+from torch import nn
+from torch.utils.data import random_split
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torchmetrics.classification import MultilabelF1Score
+from torchmetrics.classification import MultilabelAccuracy
+
+import pytorch_lightning as pl
+from pytorch_lightning import Trainer
+from pytorch_lightning.loggers import WandbLogger
+
+# get embeddings code
+#!pip install -q fair-esm
+
+# import pathlib
+# import torch
+
+# from esm import FastaBatchedDataset, pretrained
+# def extract_embeddings(model_name, fasta_file, output_dir, tokens_per_batch=4096, seq_length=1022, repr_layers=[33]):
+
+#     model, alphabet = pretrained.load_model_and_alphabet(model_name)
+#     model.eval()
+
+#     if torch.cuda.is_available():
+#         model = model.cuda()
+
+#     dataset = FastaBatchedDataset.from_file(fasta_file)
+#     batches = dataset.get_batch_indices(tokens_per_batch, extra_toks_per_seq=1)
+
+#     data_loader = torch.utils.data.DataLoader(
+#         dataset,
+#         collate_fn=alphabet.get_batch_converter(seq_length),
+#         batch_sampler=batches
+#     )
+
+#     output_dir.mkdir(parents=True, exist_ok=True)
+
+#     with torch.no_grad():
+#         for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+
+#             print(f'Processing batch {batch_idx + 1} of {len(batches)}')
+
+#             if torch.cuda.is_available():
+#                 toks = toks.to(device="cuda", non_blocking=True)
+
+#             out = model(toks, repr_layers=repr_layers, return_contacts=False)
+
+#             logits = out["logits"].to(device="cpu")
+#             representations = {layer: t.to(device="cpu") for layer, t in out["representations"].items()}
+
+#             for i, label in enumerate(labels):
+#                 entry_id = label.split()[0]
+
+#                 filename = output_dir / f"{entry_id}.pt"
+#                 truncate_len = min(seq_length, len(strs[i]))
+
+#                 result = {"entry_id": entry_id}
+#                 result["mean_representations"] = {
+#                     layer: t[i, 1 : truncate_len + 1].mean(0).clone()
+#                     for layer, t in representations.items()
+#                 }
+
+#                 torch.save(result, filename)
+
+# model_name = 'esm2_t33_650M_UR50D'
+# fasta_file = pathlib.Path('/kaggle/input/cafa-5-fasta-files/train_sequences.fasta')
+# output_dir = pathlib.Path('train_embeddings')
+
+# extract_embeddings(model_name, fasta_file, output_dir)
+
+
+class config:
+    train_sequences_path = MAIN_DIR + "/Train/train_sequences.fasta"
+    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
+    test_sequences_path = MAIN_DIR + "/Test/testsuperset.fasta"
+
+    num_labels = 500
+    n_epochs = 8
+    batch_size = 128
+    lr = 0.01
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# Directories for the different embedding vectors :
+embeds_map = {
+    "T5" : "t5embeds",
+    "ProtBERT" : "protbert-embeddings-for-cafa5",
+    "EMS2" : "cafa-5-ems-2-embeddings-numpy"
+}
+
+# Length of the different embedding vectors :
+embeds_dim = {
+    "T5" : 1024,
+    "ProtBERT" : 1024,
+    "EMS2" : 1280
+}
+
+
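+# Dataset serving precomputed per-protein embeddings (T5 / ProtBERT / ESM2) and,
+# for training, the multi-hot target vector over the top `num_labels` GO terms.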
+class ProteinSequenceDataset(Dataset):
+
+    def __init__(self, datatype, embeddings_source):
+        super().__init__()
+        self.datatype = datatype
+
+        if embeddings_source in ["ProtBERT", "EMS2"]:
+            embeds = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_embeddings.npy")
+            ids = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_ids.npy")
+
+        if embeddings_source == "T5":
+            embeds = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_embeds.npy")
+            ids = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_ids.npy")
+
+        embeds_list = []
+        for l in range(embeds.shape[0]):
+            embeds_list.append(embeds[l,:])
+        self.df = pd.DataFrame(data={"EntryID": ids, "embed" : embeds_list})
+
+        if datatype=="train":
+            np_labels = np.load(
+                "/kaggle/input/train-targets-top"+str(config.num_labels)+ \
+                "/train_targets_top"+str(config.num_labels)+".npy")
+            df_labels = pd.DataFrame(self.df['EntryID'])
+            df_labels['labels_vect']=[row for row in np_labels]
+            self.df = self.df.merge(df_labels, on="EntryID")
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, index):
+        embed = torch.tensor(self.df.iloc[index]["embed"], dtype=torch.float32)
+        if self.datatype=="train":
+            targets = torch.tensor(self.df.iloc[index]["labels_vect"], dtype=torch.float32)
+            return embed, targets
+        if self.datatype=="test":
+            id = self.df.iloc[index]["EntryID"]
+            return embed, id
+
+
+class MultiLayerPerceptron(torch.nn.Module):
+
+    def __init__(self, input_dim, num_classes):
+        super(MultiLayerPerceptron, self).__init__()
+
+        self.linear1 = torch.nn.Linear(input_dim, 864)
+        self.activation1 = torch.nn.ReLU()
+        self.linear2 = torch.nn.Linear(864, 712)
+        self.activation2 = torch.nn.ReLU()
+        self.linear3 = torch.nn.Linear(712, num_classes)
+
+
+    def forward(self, x):
+        x = self.linear1(x)
+        x = self.activation1(x)
+        x = self.linear2(x)
+        x = self.activation2(x)
+        x = self.linear3(x)
+        return x
+
+class CNN1D(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super(CNN1D, self).__init__()
+        # (batch_size, channels, embed_size)
+        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
+        # (batch_size, 3, embed_size)
+        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
+        # (batch_size, 3, embed_size/2)
+        self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
+        # (batch_size, 8, embed_size/2)
+        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
+        # (batch_size, 8, embed_size/4)
+        self.fc1 = nn.Linear(in_features=int(8 * input_dim/4), out_features=864)
+        self.fc2 = nn.Linear(in_features=864, out_features=num_classes)
+
+    def forward(self, x):
+        x = x.reshape(x.shape[0], 1, x.shape[1])
+        x = self.pool1(torch.tanh(self.conv1(x)))
+        x = self.pool2(torch.tanh(self.conv2(x)))
+        x = torch.flatten(x, 1)
+        x = torch.tanh(self.fc1(x))
+        x = self.fc2(x)
+        return x
+
+
+def train_model(embeddings_source, model_type="linear", train_size=0.9):
+
+    train_dataset = ProteinSequenceDataset(datatype="train", embeddings_source=embeddings_source)
+
+    train_set, val_set = random_split(train_dataset, lengths=[int(len(train_dataset)*train_size), len(train_dataset)-int(len(train_dataset)*train_size)])
+    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
+    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=config.batch_size, shuffle=True)
+
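+    # Both heads map a fixed-length embedding to config.num_labels logits:
+    # "linear" selects the MLP, "convolutional" the 1D CNN.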
+    if model_type == "linear":
+        model = MultiLayerPerceptron(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)
+    if model_type == "convolutional":
+        model = CNN1D(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
+    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
+    CrossEntropy = torch.nn.CrossEntropyLoss()
+    f1_score = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
+    n_epochs = config.n_epochs
+
+    print("BEGIN TRAINING...")
+    train_loss_history = []
+    val_loss_history = []
+
+    train_f1score_history = []
+    val_f1score_history = []
+    for epoch in range(n_epochs):
+        print("EPOCH ", epoch+1)
+        ## TRAIN PHASE :
+        losses = []
+        scores = []
+        for embed, targets in tqdm(train_dataloader):
+            embed, targets = embed.to(config.device), targets.to(config.device)
+            optimizer.zero_grad()
+            preds = model(embed)
+            loss = CrossEntropy(preds, targets)
+            score = f1_score(preds, targets)
+            losses.append(loss.item())
+            scores.append(score.item())
+            loss.backward()
+            optimizer.step()
+        avg_loss = np.mean(losses)
+        avg_score = np.mean(scores)
+        print("Running Average TRAIN Loss : ", avg_loss)
+        print("Running Average TRAIN F1-Score : ", avg_score)
+        train_loss_history.append(avg_loss)
+        train_f1score_history.append(avg_score)
+
+        ## VALIDATION PHASE :
+        losses = []
+        scores = []
+        for embed, targets in val_dataloader:
+            embed, targets = embed.to(config.device), targets.to(config.device)
+            preds = model(embed)
+            loss = CrossEntropy(preds, targets)
+            score = f1_score(preds, targets)
+            losses.append(loss.item())
+            scores.append(score.item())
+        avg_loss = np.mean(losses)
+        avg_score = np.mean(scores)
+        print("Running Average VAL Loss : ", avg_loss)
+        print("Running Average VAL F1-Score : ", avg_score)
+        val_loss_history.append(avg_loss)
+        val_f1score_history.append(avg_score)
+
+        scheduler.step(avg_loss)
+        print("\n")
+
+    print("TRAINING FINISHED")
+    print("FINAL TRAINING SCORE : ", train_f1score_history[-1])
+    print("FINAL VALIDATION SCORE : ", val_f1score_history[-1])
+
+    losses_history = {"train" : train_loss_history, "val" : val_loss_history}
+    scores_history = {"train" : train_f1score_history, "val" : val_f1score_history}
+
+    return model, losses_history, scores_history
+
+ems2_model, ems2_losses, ems2_scores = train_model(embeddings_source="EMS2", model_type="convolutional")
+
+def predict(embeddings_source):
+
+    test_dataset = ProteinSequenceDataset(datatype="test", embeddings_source=embeddings_source)
+    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
+
+    if embeddings_source == "T5":
+        model = t5_model
+    if embeddings_source == "ProtBERT":
+        model = protbert_model
+    if embeddings_source == "EMS2":
+        model = ems2_model
+
+    model.eval()
+
+    labels = pd.read_csv(config.train_labels_path, sep="\t")
+    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
+    labels_names = top_terms[:config.num_labels].index.values
+    print("GENERATE PREDICTION FOR TEST SET...")
+
+    ids_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
+    go_terms_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
+    confs_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=np.float32)
+
+    for i, (embed, id) in tqdm(enumerate(test_dataloader)):
+        embed = embed.to(config.device)
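+        # Build the submission in long format: config.num_labels consecutive
+        # (protein, GO term, confidence) rows per test protein.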
+        confs_[i*config.num_labels:(i+1)*config.num_labels] = torch.sigmoid(model(embed)).squeeze().detach().cpu().numpy()
+        ids_[i*config.num_labels:(i+1)*config.num_labels] = id[0]
+        go_terms_[i*config.num_labels:(i+1)*config.num_labels] = labels_names
+
+    submission_df = pd.DataFrame(data={"Id" : ids_, "GO term" : go_terms_, "Confidence" : confs_})
+    print("PREDICTIONS DONE")
+    return submission_df
+
+submission_df = predict("EMS2")
+
+# This submission was obtained by training models on BlastP, Sprof, QuickGO and DeepGOZero offline
+submission2 = pd.read_csv('/kaggle/input/blast-quick-sprof-zero-pred/submission.tsv',
+                          sep='\t', header=None, names=['Id2', 'GO term2', 'Confidence2'])
+
+subs = submission2.merge(submission_df, left_on=['Id2', 'GO term2'],
+                         right_on=['Id', 'GO term'], how='outer')
+
+subs.drop(['Id', 'GO term'], axis=1, inplace=True)
+subs['confidence_combined'] = subs.apply(lambda row: row['Confidence2'] if not np.isnan(row['Confidence2']) else row['Confidence'], axis=1)
+
+subs[['Id2', 'GO term2', 'confidence_combined']].to_csv('submission.tsv', sep='\t', header=False, index=False)
\ No newline at end of file
diff --git a/draft/code_v1.py b/draft/code_v1.py
new file mode 100644
index 000000000..ecd795be0
--- /dev/null
+++ b/draft/code_v1.py
@@ -0,0 +1,246 @@
+import os
+import time
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+import torch
+from torch import nn
+from torch.utils.data import Dataset, DataLoader, random_split
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torchmetrics.classification import MultilabelF1Score
+
+# -----------------------------------------------------------
+# CONFIGURATION
+# -----------------------------------------------------------
+class Config:
+    MAIN_DIR = "/home/shared/RD-Agent/online/cafa-6-protein-function-prediction"
+    train_sequences_path = os.path.join(MAIN_DIR, "Train/train_sequences.fasta")
+    train_labels_path = os.path.join(MAIN_DIR, "Train/train_terms.tsv")
+    test_sequences_path = os.path.join(MAIN_DIR, "Test/testsuperset.fasta")
+
+    embeddings_dir = os.path.expanduser("~/cafa6_embeddings")
+    os.makedirs(embeddings_dir, exist_ok=True)
+
+    model_name = "esm2_t33_650M_UR50D"
+    num_labels = 500
+    n_epochs = 8
+    batch_size = 128
+    lr = 0.01
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+config = Config()
+
+
+# -----------------------------------------------------------
+# STEP 1: EXTRACT EMBEDDINGS USING FAIR-ESM
+# -----------------------------------------------------------
+from esm import FastaBatchedDataset, pretrained
+
+def extract_embeddings(model_name, fasta_file, output_dir, repr_layers=[33], tokens_per_batch=4096, seq_length=1022):
+    model, alphabet = pretrained.load_model_and_alphabet(model_name)
+    model.eval()
+    if torch.cuda.is_available():
+        model = model.cuda()
+
+    dataset = FastaBatchedDataset.from_file(fasta_file)
+    batches = dataset.get_batch_indices(tokens_per_batch, extra_toks_per_seq=1)
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        collate_fn=alphabet.get_batch_converter(seq_length),
+        batch_sampler=batches,
+    )
+
+    os.makedirs(output_dir, exist_ok=True)
+    print(f"Extracting embeddings to {output_dir}")
+
+    all_embeds, all_ids = [], []
+
+    with torch.no_grad():
+        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+            print(f"Processing batch {batch_idx + 1}/{len(batches)}")
+            if torch.cuda.is_available():
+                toks = toks.to(device="cuda", non_blocking=True)
+            out = model(toks, repr_layers=repr_layers, return_contacts=False)
+            representations = {layer: t.to("cpu") for layer, t in out["representations"].items()}
+
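+            # Mean-pool the layer-33 token representations over positions
+            # 1..truncate_len (skipping the BOS token) to get one fixed-length
+            # vector per protein.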
+            for i, label in enumerate(labels):
+                entry_id = label.split()[0]
+                truncate_len = min(seq_length, len(strs[i]))
+                mean_embed = representations[repr_layers[0]][i, 1 : truncate_len + 1].mean(0).clone()
+                all_embeds.append(mean_embed.numpy())
+                all_ids.append(entry_id)
+
+    np.save(os.path.join(output_dir, "embeddings.npy"), np.stack(all_embeds))
+    np.save(os.path.join(output_dir, "ids.npy"), np.array(all_ids))
+    print(f"Saved {len(all_ids)} embeddings to {output_dir}")
+
+
+# -----------------------------------------------------------
+# STEP 2: DATASET DEFINITION
+# -----------------------------------------------------------
+class ProteinSequenceDataset(Dataset):
+    def __init__(self, datatype, embedding_dir):
+        super().__init__()
+        self.datatype = datatype
+        embeds = np.load(os.path.join(embedding_dir, "embeddings.npy"))
+        ids = np.load(os.path.join(embedding_dir, "ids.npy"))
+
+        self.df = pd.DataFrame({"EntryID": ids, "embed": list(embeds)})
+
+        if datatype == "train":
+            np_labels = np.load(
+                os.path.join(embedding_dir, f"train_targets_top{config.num_labels}.npy")
+            )
+            df_labels = pd.DataFrame({"EntryID": ids, "labels_vect": list(np_labels)})
+            self.df = self.df.merge(df_labels, on="EntryID")
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, index):
+        embed = torch.tensor(self.df.iloc[index]["embed"], dtype=torch.float32)
+        if self.datatype == "train":
+            targets = torch.tensor(self.df.iloc[index]["labels_vect"], dtype=torch.float32)
+            return embed, targets
+        else:
+            entry_id = self.df.iloc[index]["EntryID"]
+            return embed, entry_id
+
+
+# -----------------------------------------------------------
+# STEP 3: MODELS
+# -----------------------------------------------------------
+class MultiLayerPerceptron(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super().__init__()
+        self.linear1 = nn.Linear(input_dim, 864)
+        self.linear2 = nn.Linear(864, 712)
+        self.linear3 = nn.Linear(712, num_classes)
+        self.activation = nn.ReLU()
+
+    def forward(self, x):
+        x = self.activation(self.linear1(x))
+        x = self.activation(self.linear2(x))
+        x = self.linear3(x)
+        return x
+
+
+class CNN1D(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super().__init__()
+        self.conv1 = nn.Conv1d(1, 3, kernel_size=3, padding=1)
+        self.pool1 = nn.MaxPool1d(2)
+        self.conv2 = nn.Conv1d(3, 8, kernel_size=3, padding=1)
+        self.pool2 = nn.MaxPool1d(2)
+        self.fc1 = nn.Linear(int(8 * input_dim / 4), 864)
+        self.fc2 = nn.Linear(864, num_classes)
+
+    def forward(self, x):
+        x = x.unsqueeze(1)  # (batch, 1, embed_dim)
+        x = self.pool1(torch.tanh(self.conv1(x)))
+        x = self.pool2(torch.tanh(self.conv2(x)))
+        x = torch.flatten(x, 1)
+        x = torch.tanh(self.fc1(x))
+        x = self.fc2(x)
+        return x
+
+
+# -----------------------------------------------------------
+# STEP 4: TRAINING FUNCTION
+# -----------------------------------------------------------
+def train_model(embedding_dir, model_type="mlp", train_size=0.9):
+    train_dataset = ProteinSequenceDataset("train", embedding_dir)
+    train_len = int(len(train_dataset) * train_size)
+    train_set, val_set = random_split(train_dataset, [train_len, len(train_dataset) - train_len])
+    train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
+    val_loader = DataLoader(val_set, batch_size=config.batch_size, shuffle=False)
+
+    input_dim = train_dataset.df.iloc[0]["embed"].shape[0]
+    model = (
+        MultiLayerPerceptron(input_dim, config.num_labels)
+        if model_type == "mlp"
+        else CNN1D(input_dim, config.num_labels)
+    ).to(config.device)
+
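+    # BCEWithLogitsLoss treats each GO term as an independent binary target,
+    # matching the multilabel setup (v0 used CrossEntropyLoss, which assumes a
+    # single true class per sample).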
+    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
+    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
+    criterion = nn.BCEWithLogitsLoss()
+    f1 = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
+
+    print("BEGIN TRAINING...")
+    for epoch in range(config.n_epochs):
+        model.train()
+        train_losses, train_scores = [], []
+        for embed, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.n_epochs}"):
+            embed, targets = embed.to(config.device), targets.to(config.device)
+            optimizer.zero_grad()
+            preds = model(embed)
+            loss = criterion(preds, targets)
+            score = f1(torch.sigmoid(preds), targets)
+            loss.backward()
+            optimizer.step()
+            train_losses.append(loss.item())
+            train_scores.append(score.item())
+
+        model.eval()
+        val_losses, val_scores = [], []
+        with torch.no_grad():
+            for embed, targets in val_loader:
+                embed, targets = embed.to(config.device), targets.to(config.device)
+                preds = model(embed)
+                loss = criterion(preds, targets)
+                score = f1(torch.sigmoid(preds), targets)
+                val_losses.append(loss.item())
+                val_scores.append(score.item())
+
+        scheduler.step(np.mean(val_losses))
+        print(f"Epoch {epoch+1}: Train F1={np.mean(train_scores):.4f}, Val F1={np.mean(val_scores):.4f}")
+
+    print("TRAINING FINISHED ✅")
+    return model
+
+
+# -----------------------------------------------------------
+# STEP 5: PREDICTION
+# -----------------------------------------------------------
+def predict(model, embedding_dir):
+    test_dataset = ProteinSequenceDataset("test", embedding_dir)
+    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
+
+    labels = pd.read_csv(config.train_labels_path, sep="\t")
+    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
+    labels_names = top_terms[:config.num_labels].index.values
+
+    ids_, go_terms_, confs_ = [], [], []
+    model.eval()
+    with torch.no_grad():
+        for embed, entry_id in tqdm(test_loader, desc="Predicting"):
+            embed = embed.to(config.device)
+            preds = torch.sigmoid(model(embed)).squeeze().cpu().numpy()
+            ids_.extend([entry_id[0]] * config.num_labels)
+            go_terms_.extend(labels_names)
+            confs_.extend(preds.tolist())
+
+    submission_df = pd.DataFrame({"Id": ids_, "GO term": go_terms_, "Confidence": confs_})
+    submission_df.to_csv("submission.tsv", sep="\t", index=False, header=False)
+    print("✅ Submission saved to submission.tsv")
+    return submission_df
+
+
+# -----------------------------------------------------------
+# EXECUTION PIPELINE
+# -----------------------------------------------------------
+if __name__ == "__main__":
+    # 1️⃣ Extract embeddings (can be skipped if already generated)
+    extract_embeddings(
+        model_name=config.model_name,
+        fasta_file=config.train_sequences_path,
+        output_dir=config.embeddings_dir,
+    )
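+    # NOTE: predict() below reads embeddings from the same directory, so the
+    # test FASTA (config.test_sequences_path) must be embedded as well before
+    # the prediction step can score unseen proteins.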
+
+    # 2️⃣ Train model
+    model = train_model(embedding_dir=config.embeddings_dir, model_type="mlp")
+
+    # 3️⃣ Predict on test set
+    submission_df = predict(model, config.embeddings_dir)
diff --git a/draft/playground-series-s5e7.py b/draft/playground-series-s5e7.py
new file mode 100644
index 000000000..9b7a04109
--- /dev/null
+++ b/draft/playground-series-s5e7.py
@@ -0,0 +1,71 @@
+# 1. Imports
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import StratifiedKFold
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
+
+# 2. Load data
+train = pd.read_csv("/tmp/kaggle/playground-series-s5e7/train.csv")
+test = pd.read_csv("/tmp/kaggle/playground-series-s5e7/test.csv")
+submission = pd.read_csv("/tmp/kaggle/playground-series-s5e7/sample_submission.csv")
+
+# 3. Encode target
+le = LabelEncoder()
+train["Personality_encoded"] = le.fit_transform(train["Personality"])
+
+# 4. Prepare features
+X = train.drop(columns=["id", "Personality", "Personality_encoded"])
+y = train["Personality_encoded"]
+X_test = test.drop(columns=["id"])
+
+# 5. Encode categorical columns
+combined = pd.concat([X, X_test], axis=0)
+cat_cols = combined.select_dtypes(include="object").columns.tolist()
+encoder = OrdinalEncoder()
+combined[cat_cols] = encoder.fit_transform(combined[cat_cols])
+
+X = combined.iloc[: len(X)].reset_index(drop=True)
+X_test = combined.iloc[len(X) :].reset_index(drop=True)
+
+# 6. Setup XGBoost
+params = {
+    "objective": "binary:logistic",
+    "eval_metric": "logloss",
+    "max_depth": 4,
+    "eta": 0.1,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "random_state": 42,
+}
+
+# 7. Stratified K-Fold Cross-Validation
+skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+oof_preds = np.zeros(len(X))
+test_preds = np.zeros(len(X_test))
+
+for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
+    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
+    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
+
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dval = xgb.DMatrix(X_val, label=y_val)
+    dtest = xgb.DMatrix(X_test)
+
+    model = xgb.train(
+        params, dtrain, num_boost_round=100, evals=[(dval, "valid")], early_stopping_rounds=10, verbose_eval=False
+    )
+
+    oof_preds[val_idx] = model.predict(dval) > 0.5
+    test_preds += model.predict(dtest) / skf.n_splits
+
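+# oof_preds holds hard 0/1 labels (thresholded at 0.5), so accuracy_score can
+# compare them to y directly; test_preds averages the fold probabilities.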
+# 8. Evaluate
+cv_acc = accuracy_score(y, oof_preds)
+print(f"Cross-Validation Accuracy: {cv_acc:.4f}")
+
+# 9. Create submission
+final_preds = (test_preds > 0.5).astype(int)
+submission["Personality"] = le.inverse_transform(final_preds)
+submission.to_csv("submission.csv", index=False)
+submission.head()
diff --git a/job_log.txt b/job_log.txt
new file mode 100644
index 000000000..57aeb7387
--- /dev/null
+++ b/job_log.txt
@@ -0,0 +1 @@
+dotenv run -- python rdagent/app/data_science/loop.py --competition cafa-6-protein-function-prediction --timeout 12h
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 14f107d5d..8e7bcf236 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -35,13 +35,13 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
 
     ## Coding Related
     coding_fail_reanalyze_threshold: int = 3
-    debug_recommend_timeout: int = 600
+    debug_recommend_timeout: int = 600*3
     """The recommend time limit for running on debugging data"""
-    debug_timeout: int = 600
+    debug_timeout: int = 600*3
    """The timeout limit for running on debugging data"""
-    full_recommend_timeout: int = 3600
+    full_recommend_timeout: int = 3600*3
    """The recommend time limit for running on full data"""
-    full_timeout: int = 3600
+    full_timeout: int = 3600*3
    """The timeout limit for running on full data"""
 
     #### model dump
@@ -151,6 +151,10 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     enable_draft_before_first_sota: bool = False
     enable_planner: bool = False
 
+    #### enable draft code replacement for the first loop
+    enable_draft_code_replacement: bool = True
+    draft_code_path: str = "/data/userdata/v-lijingyuan/RD-Agent-draft-RAG/RD-Agent/draft/code_v1.py"
+
     model_architecture_suggestion_time_percent: float = 0.75
     allow_longer_timeout: bool = False
     coder_enable_llm_decide_longer_timeout: bool = False
diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py
index 560e71e38..8cb2d91c2 100644
--- a/rdagent/app/data_science/loop.py
+++ b/rdagent/app/data_science/loop.py
@@ -1,10 +1,10 @@
 import asyncio
+from collections.abc import Coroutine
 from pathlib import Path
-from typing import Optional
+from typing import Annotated, Optional
 
 import fire
 import typer
-from typing_extensions import Annotated
 
 from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.core.utils import import_class
@@ -12,6 +12,18 @@
 from rdagent.scenarios.data_science.loop import DataScienceRDLoop
 
 
+async def run_and_submit_sota(loop_task: Coroutine, competition: str) -> None:
+    """Run the loop coroutine task, and submit the SOTA experiment submission file to Kaggle at the end."""
+    from rdagent.scenarios.kaggle.submission import submit_current_sota
+
+    try:
+        # wait for the loop to end
+        await loop_task
+    finally:
+        # we do not care about exceptions here; just make sure we still submit
+        submit_current_sota(competition=competition)
+
+
 def main(
     path: Optional[str] = None,
     checkout: Annotated[bool, typer.Option("--checkout/--no-checkout", "-c/-C")] = True,
@@ -73,7 +85,12 @@ def main(
     if exp_gen_cls is not None:
         kaggle_loop.exp_gen = import_class(exp_gen_cls)(kaggle_loop.exp_gen.scen)
 
-    asyncio.run(kaggle_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout))
+    if DS_RD_SETTING.auto_submit:
+        asyncio.run(
+            run_and_submit_sota(kaggle_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout), competition)
+        )
+    else:
+        asyncio.run(kaggle_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout))
 
 
 if __name__ == "__main__":
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 0cac34e6a..a0bf9259d 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -2,6 +2,7 @@
 import math
 from datetime import timedelta
 from enum import Enum
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -1250,10 +1251,20 @@ def task_gen(
         # Persist for later stages
         task.package_info = get_packages(pkgs)
 
-    exp = DSExperiment(
-        pending_tasks_list=[[task]], hypothesis=hypotheses[0], hypothesis_candidates=hypotheses_candidates
-    )
-    if sota_exp is not None:
+    exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=hypotheses[0])
+
+    # Draft code replacement for the first loop
+    if (
+        DS_RD_SETTING.enable_draft_code_replacement and DS_RD_SETTING.draft_code_path and sota_exp is None
+    ):  # First loop (no SOTA experiment exists)
+
+        draft_path = Path(DS_RD_SETTING.draft_code_path)
+        if draft_path.exists():
+            logger.info(f"Loading draft code from: {draft_path}")
+            exp.experiment_workspace.inject_files(**{"main.py": draft_path.read_text()})
+        else:
+            logger.warning(f"Draft code file not found: {draft_path}")
+    elif sota_exp is not None:
         exp.experiment_workspace.inject_code_from_file_dict(sota_exp.experiment_workspace)
 
     # 3) create the workflow update task
diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
index e1962f9c0..5ccf1a5c9 100644
--- a/rdagent/scenarios/kaggle/kaggle_crawler.py
+++ b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -109,6 +109,9 @@ def kaggle_description_css_selectors() -> tuple[str, str]:
 
 def download_data(competition: str, settings: ExtendedBaseSettings, enable_create_debug_data: bool = True) -> None:
     local_path = settings.local_data_path
+    if (Path(local_path) / competition).exists():
+        logger.info(f"{competition} already exists, skipping download.")
+        return
     if settings.if_using_mle_data:
         zipfile_path = f"{local_path}/zip_files"
         zip_competition_path = Path(zipfile_path) / competition
diff --git a/rdagent/scenarios/kaggle/submission.py b/rdagent/scenarios/kaggle/submission.py
new file mode 100644
index 000000000..cc090998b
--- /dev/null
+++ b/rdagent/scenarios/kaggle/submission.py
@@ -0,0 +1,672 @@
+import json
+import os
+import tempfile
+import time
+from collections.abc import Generator
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from pathlib import Path
+
+import nbformat
+import requests
+from kaggle.api.kaggle_api_extended import (
+    ApiCreateCodeSubmissionResponse,
+    ApiSaveKernelResponse,
+    KaggleApi,
+    SubmissionSortBy,
+    SubmissionStatus,
+)
+from kagglesdk.competitions.types.competition_api_service import (
+    ApiCompetition,
+    ApiSubmission,
+)
+from kagglesdk.kernels.types.kernels_enums import KernelWorkerStatus
+from pydantic_settings import SettingsConfigDict
+
+from rdagent.core.conf import ExtendedBaseSettings
+from rdagent.log import rdagent_logger as logger
+
+# error 403 means we have not joined the competition when retrieving information, like submissions
+ERROR_CODE_NOT_JOIN_COMPETITION = 403
+# error code when we try to get submissions and there are none
+ERROR_CODE_NO_SUBMISSION = 400
+
+
+class KaggleSubmissionSetting(ExtendedBaseSettings):
+    model_config = SettingsConfigDict(env_prefix="SUBMIT_", protected_namespaces=())
+
+    # whether to force submitting code even if it is not a code competition
+    force_submit_code: bool = False
+
+    # submission file name to submit
+    submission_file: str = "submission.csv"
+
+    # whether to enable the notebook GPU
+    enable_gpu: bool = False
+
+    # whether to enable notebook internet access
+    enable_internet: bool = False
+
+    # whether to enable the notebook TPU
+    enable_tpu: bool = False
+
+    # whether to make the kernel private
+    is_private: bool = True
+
+    # prefix of the notebook name
+    kernel_prefix: str = "rd-submission"
+
+    # timeout when tracking submission status
+    status_timeout: int = 120
+
+
+KG_SUBMISSION_SETTING = KaggleSubmissionSetting()
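+# All of the settings above can be overridden through SUBMIT_-prefixed
+# environment variables (e.g. SUBMIT_SUBMISSION_FILE), per the env_prefix in
+# model_config.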
+
+
+def get_completed_submissions(api: KaggleApi, competition: str) -> list[ApiSubmission]:
+    """Get the completed submissions of today.
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to get submissions for
+
+    Return:
+        list[ApiSubmission]: completed submissions of today
+    """
+    # only completed submissions consume today's quota
+    submissions: list[ApiSubmission] = []
+
+    today = datetime.now(timezone.utc).date()
+
+    try:
+        submissions_resp = api.competition_submissions(
+            competition=competition,
+            # the api sorts the result by date by default, so the latest submission is on the last page
+            page_token=-1,
+        )
+
+        if submissions_resp is not None:
+            submissions = [
+                submission
+                for submission in submissions_resp
+                if submission is not None
+                and submission.date.date() == today
+                and submission.status == SubmissionStatus.COMPLETE
+            ]
+    except requests.exceptions.HTTPError as e:
+        # 403 if we have not joined the competition
+        if e.response.status_code == ERROR_CODE_NOT_JOIN_COMPETITION:
+            logger.error(f"You have not joined the competition '{competition}'.")
+        elif e.response.status_code == ERROR_CODE_NO_SUBMISSION:
+            # if a competition has no submissions at all, this call will cause a 400.
+            # we consider this a correct state, meaning 0 completed submissions
+            logger.info("no submission now.")
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Fail to get submissions with error: {e}")
+
+    return submissions
+
+
+def get_competition_detail(api: KaggleApi, competition: str) -> ApiCompetition | None:
+    """Get the competition detail information.
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to get the detail for
+
+    Return:
+        ApiCompetition: competition detail information
+    """
+    # the kaggle sdk has no function to get the detail of a specified competition, so we use the list function.
+    # the list function only returns in-progress competitions, and we use the search parameter to filter by
+    # competition name; the default page_size is 20, so most of the time the target competition is on the first page
+    max_pages = 10
+    try:
+        page = 1
+        while page <= max_pages and (search_result := api.competitions_list(search=competition, page=page)) is not None:
+            for comp in search_result:
+                if comp is not None and comp.ref.rsplit("/", 1)[-1] == competition:
+                    return comp
+
+            page += 1
+
+            time.sleep(1)
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Fail to get competition list, with exception: {e}")
+
+    return None
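+# The daily quota is max_daily_submissions minus today's COMPLETE submissions;
+# pending or failed submissions do not count against it.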
+
+
+def get_submission_remaining(api: KaggleApi, competition: str) -> int:
+    """Get the number of submissions remaining today.
+
+    NOTE: this function tries not to raise; if you have not joined the competition,
+    or any other exception occurs, it returns 0
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to get the remaining submission count for
+
+    Return:
+        int: number of submissions remaining
+    """
+    # kaggle uses utc time for counting
+    completed_submissions = get_completed_submissions(api, competition)
+
+    competition_detail = get_competition_detail(api, competition)
+
+    if competition_detail is None:
+        logger.error("Fail to get the competition, make sure it is in progress now.")
+
+        return 0
+
+    return competition_detail.max_daily_submissions - len(completed_submissions)
+
+
+def wait_for_submission_complete(api: KaggleApi, competition: str) -> None | tuple[str, str]:
+    """Wait for the latest submission to complete (failed or succeeded).
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to wait on
+
+    Return:
+        tuple[str, str]: public score, private score
+    """
+    timeout = KG_SUBMISSION_SETTING.status_timeout
+
+    # the submission request completed without error; keep checking the latest submission state until it completes or we time out
+    start = datetime.now()  # noqa: DTZ005
+    while (datetime.now() - start).seconds <= timeout:  # noqa: DTZ005
+        # NOTE: we sleep before checking the latest submission state, as the previous
+        # steps may have just finished calling the kaggle api; if we do not sleep here,
+        # kaggle will raise a 400 - bad request error.
+        # wait for 5 seconds to get the latest result
+        time.sleep(5)
+
+        # the api can sort the submissions by date (new -> old), so we use the first item on the first page
+        resp = api.competition_submissions(
+            competition=competition,
+            sort=SubmissionSortBy.SUBMISSION_SORT_BY_DATE,
+            page_token=0,
+        )
+
+        logger.info(f"Current submissions: {resp}")
+
+        if resp is None or resp[0] is None or resp[0].status == SubmissionStatus.PENDING:
+            continue
+
+        submission = resp[0]
+
+        if submission.status == SubmissionStatus.ERROR:
+            logger.error(f"The submission validation failed, with message: {submission.error_description}")
+        else:
+            logger.info(
+                f"Current submission state [@{submission.date}] is: {submission.status}, public score: {submission.public_score}",
+            )
+
+        return submission.public_score, submission.private_score
+
+    return None
+
+
+def submit_local_file(
+    api: KaggleApi, competition: str, file: str | Path, *, msg: str = "Message"
+) -> None | tuple[str, str]:
+    """Submit a local file to the competition.
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to submit to
+        file (str): local file path to submit
+        msg (str): message of the current submission
+    """
+    try:
+        resp = api.competition_submit(file_name=str(file), message=msg, competition=competition)
+
+        # if the upload failed, a string message is returned; an exception is raised if there is an http response code >= 400
+        if type(resp) is str:
+            logger.error(f"Fail to get submissions with error: {resp}")
+        else:
+            return wait_for_submission_complete(api=api, competition=competition)
+    except Exception as e:  # noqa: BLE001
+        if type(e) is requests.exceptions.HTTPError and e.response.status_code == ERROR_CODE_NOT_JOIN_COMPETITION:
+            logger.error(f"You have not joined the competition '{competition}'.")
+        else:
+            logger.error(f"Fail to submit with error: {e}")
+
+    return None
+
+
+def submit_code(
+    api: KaggleApi,
+    competition: str,
+    kernel_id: str,
+    kernel_version: int,
+    *,
+    msg: str = "Message",
+) -> None | tuple[str, str]:
+    """Submit a specified version of a kernel to the competition (without downloading the outputs).
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to submit to
+        kernel_id (str): kernel id to submit
+        kernel_version (int): kernel version to submit;
+            each time we call the kaggle sdk to upload a notebook, it returns a new version
+        msg (str): message of the current submission
+    """
+    submission_file = KG_SUBMISSION_SETTING.submission_file
+
+    try:
+        resp: ApiCreateCodeSubmissionResponse = api.competition_submit_code(
+            competition=competition,
+            file_name=submission_file,
+            message=msg,
+            kernel=kernel_id,
+            kernel_version=kernel_version,
+        )
+
+        logger.info(f"Submission response: {resp}")
+
+        return wait_for_submission_complete(api=api, competition=competition)
+    except Exception as e:  # noqa: BLE001
+        if type(e) is requests.exceptions.HTTPError and e.response.status_code == ERROR_CODE_NOT_JOIN_COMPETITION:
+            logger.error(f"You have not joined the competition '{competition}'.")
+        else:
+            logger.error(f"Fail to get submissions with error: {e}")
+
+    return None
+
+
+def generate_kernel_metadata(user: str, competition: str) -> dict:
+    """Generate the kaggle kernel metadata needed to upload a notebook to kaggle.
+
+    Args:
+        user (str): kaggle username
+        competition (str): which competition this notebook will reference
+
+    Returns:
+        dict: kaggle kernel metadata
+    """
+    kernel_name = f"{KG_SUBMISSION_SETTING.kernel_prefix}-{competition}"
+
+    return {
+        "id": f"{user}/{kernel_name}",
+        "title": kernel_name,
+        "code_file": "submission.ipynb",  # we hard-code the name, same as in the prepare_notebook function
+        "language": "python",
+        "kernel_type": "notebook",
+        "is_private": KG_SUBMISSION_SETTING.is_private,
+        "enable_gpu": KG_SUBMISSION_SETTING.enable_gpu,
+        "enable_tpu": KG_SUBMISSION_SETTING.enable_tpu,
+        "enable_internet": KG_SUBMISSION_SETTING.enable_internet,
+        "dataset_sources": [],
+        "competition_sources": [competition],
+        "kernel_sources": [],
+        "model_sources": [],  # TODO: we can use this field if we want to upload our model
+    }
+
+
+# this is the minimal notebook template from the kaggle notebook page.
+# NOTE: sometimes using nbformat to generate a notebook from scratch may cause a kernel
+# error while running in Kaggle, so we use this template from the Kaggle notebook page to avoid the error.
+kaggle_notebook_template = """
+{
+    "metadata": {
+        "kernelspec": {
+            "language": "python",
+            "display_name": "Python 3",
+            "name": "python3"
+        },
+        "language_info": {
+            "pygments_lexer": "ipython3",
+            "nbconvert_exporter": "python",
+            "version": "3.6.4",
+            "file_extension": ".py",
+            "codemirror_mode": {
+                "name": "ipython",
+                "version": 3
+            },
+            "name": "python",
+            "mimetype": "text/x-python"
+        }
+    },
+    "nbformat_minor": 4,
+    "nbformat": 4,
+    "cells": [
+    ]
+}
+"""
+
+
+def prepare_notebook(user: str, competition: str, workspace: Path, output_path: Path) -> None:
+    """Prepare a notebook from the workspace for uploading to kaggle.
+
+    NOTE: the kaggle sdk needs a folder with kernel-metadata.json and xxx.ipynb to upload
+
+    Args:
+        user (str): kaggle username used to construct the notebook url
+        competition (str): which competition this notebook will reference
+        workspace (Path): where the code comes from
+        output_path (Path): notebook output path
+    """
+    # NOTE: this function only supports adding main.py to the notebook.
+    # TODO: support other project structures
+
+    # generate metadata
+    metadata = generate_kernel_metadata(user=user, competition=competition)
+
+    logger.info(f"Generated metadata: {metadata}")
+
+    metadata_file = output_path / "kernel-metadata.json"
+
+    # write the metadata
+    with metadata_file.open("wt", encoding="utf-8") as fp:
+        json.dump(metadata, fp)
+
+    # create a kaggle notebook from the template without converting
+    notebook: nbformat.NotebookNode = nbformat.reads(kaggle_notebook_template, as_version=nbformat.NO_CONVERT)
+
+    # read the main.py
+    main_file = workspace / "main.py"
+
+    if not main_file.exists():
+        raise FileNotFoundError
+
+    with main_file.open("rt", encoding="utf-8") as fp:
+        main_code = fp.read()
+
+    # patch the main code to make it possible to run in kaggle
+    # 1. change the input dir to the kaggle input
+    main_code = main_code.replace("./workspace_input", f"/kaggle/input/{competition}")
+
+    # 2. update any argparse code to support the kaggle notebook starting parameters
+    # we cannot just use replace here, as we need to insert the correct indent
+    for code_line in main_code.split("\n"):
+        if code_line.strip() == "parser = argparse.ArgumentParser()":
+            # get the indent length
+            indent_n = code_line.index("parser")
+            # the kaggle notebook passes additional arguments; we add them here to avoid errors
+            main_code = main_code.replace(
+                "parser = argparse.ArgumentParser()",
+                (
+                    f"parser = argparse.ArgumentParser()\n"
+                    f"{' ' * indent_n}parser.add_argument('-f', required=False)\n"
+                    f"{' ' * indent_n}parser.add_argument('--HistoryManager.hist_file', required=False)\n"
+                ),
+            )
+
+            break
+
+    notebook.cells.append(nbformat.v4.new_code_cell(main_code))
+
+    # an additional cell used to print the traceback if there is any error
+    notebook.cells.append(nbformat.v4.new_code_cell("%tb"))
+
+    # NOTE: in kaggle, the default working directory is /kaggle/working,
+    # so if our result files are written into the current folder, we can use it to submit
+    # without changing the code
+
+    nbformat.write(notebook, os.path.join(output_path, "submission.ipynb"))  # noqa: PTH118
+
+
+def upload_notebook(api: KaggleApi, kernel_id: str, folder: Path) -> int | None:
+    """Upload a notebook to kaggle.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api
+        kernel_id (str): id of the kernel, used to check status
+        folder (Path): the folder to upload that contains kernel-metadata.json and submission.ipynb
+
+    Return:
+        int: the version number of the uploaded kernel
+    """
+    # try to push the notebook to kaggle
+    try:
+        upload_resp: ApiSaveKernelResponse = api.kernels_push(folder=folder)
+
+        # check upload status
+        if upload_resp.error:
+            logger.error(f"Upload notebook failed: {upload_resp.error}. response object: {upload_resp}")
+
+            return None
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Upload notebook failed: {e}")
+
+        return None
+
+    # if the upload succeeds, the notebook starts running immediately.
+    # wait until the run is done
+    try:
+        while (kernel_status_resp := api.kernels_status(kernel_id)) is not None:
+            logger.info(f"Kernel status: {kernel_status_resp.status}")
+
+            if kernel_status_resp.status == KernelWorkerStatus.COMPLETE:
+                break
+
+            if kernel_status_resp.status in [
+                KernelWorkerStatus.ERROR,
+                KernelWorkerStatus.CANCEL_ACKNOWLEDGED,
+                KernelWorkerStatus.CANCEL_REQUESTED,
+            ]:
+                logger.error(f"Kernel error: {kernel_status_resp.status}")
+
+                return None
+
+            # sleep a while longer to avoid the rate limit, as the notebook takes more time to complete
+            time.sleep(30)
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Check notebook status failed: {e}")
+
+        return None
+    finally:
+        logger.info(f"Refer to this link for details: {upload_resp.url}")
+
+    return upload_resp.version_number
+
+
+def download_notebook_output(api: KaggleApi, kernel_id: str, output_folder: Path) -> bool:
+    """Download the output files of a notebook.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        kernel_id (str): kernel id to download
+        output_folder (Path): output folder to save the output files
+
+    Return:
+        bool: whether the download succeeded
+    """
+    try:
+        api.kernels_output(kernel=kernel_id, path=output_folder, force=True)
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Download notebook output failed: {e}")
+
+        return False
+
+    return True
+
+
+@contextmanager
+def create_kaggle_notebook(
+    api: KaggleApi,
+    competition: str,
+    workspace: Path,
+) -> Generator[tuple[Path, str, int], None, None]:
+    """Create a kaggle notebook from the workspace, and run it.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        competition (str): competition name
+        workspace (Path): workspace path to get code from
+
+    Return:
+        tuple[Path, str, int]: local folder of the notebook, the kernel id, and the kernel version
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        kernel_path = Path(tmp_dir)
+
+        prepare_notebook(
+            user=api.config_values["username"],
+            competition=competition,
+            workspace=workspace,
+            output_path=kernel_path,
+        )
+
+        # read the metadata to get the kernel id
+        metadata_json_file = kernel_path / "kernel-metadata.json"
+
+        with metadata_json_file.open("rt", encoding="utf-8") as fp:
+            kernel_metadata = json.load(fp)
+
+        kernel_id = kernel_metadata["id"]
+
+        kernel_version = upload_notebook(api=api, kernel_id=kernel_id, folder=kernel_path)
+
+        if kernel_version is None:
+            # returning before the yield would make @contextmanager raise a bare
+            # RuntimeError, so fail explicitly instead
+            raise RuntimeError("Upload notebook failed")
+
+        yield (kernel_path, kernel_id, kernel_version)
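+# There are three submission paths below: submit_local_file uploads an existing
+# submission file directly; submit_notebook_output runs a notebook on Kaggle,
+# downloads its output, and submits the resulting file; submit_notebook_online
+# submits the kernel output in place, as required by code competitions.
+# submit_from_workspace picks between them based on the competition type and settings.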
+
+
+def submit_notebook_output(api: KaggleApi, competition: str, workspace: Path, msg: str) -> None | tuple[str, str]:
+    """Submit code from the workspace, download the output and then submit the output to the competition.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        competition (str): competition name
+        workspace (Path): workspace path containing the code to run
+        msg (str): message to submit
+    """
+    with create_kaggle_notebook(api=api, competition=competition, workspace=workspace) as (kernel_path, kernel_id, _):
+        if download_notebook_output(api=api, kernel_id=kernel_id, output_folder=kernel_path):
+            # check that the submission file exists
+            submission_file_path = kernel_path / KG_SUBMISSION_SETTING.submission_file
+
+            if not submission_file_path.exists():
+                logger.error(f"Submission file {submission_file_path} not found in kernel output")
+            else:
+                return submit_local_file(api=api, competition=competition, file=submission_file_path, msg=msg)
+
+    return None
+
+
+def submit_notebook_online(api: KaggleApi, competition: str, workspace: Path, msg: str) -> None | tuple[str, str]:
+    """Submit the output of a notebook on kaggle to the competition.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        competition (str): competition name
+        workspace (Path): workspace path containing the code to run
+        msg (str): message to submit
+    """
+    with create_kaggle_notebook(api=api, competition=competition, workspace=workspace) as (
+        _,
+        kernel_id,
+        kernel_version,
+    ):
+        return submit_code(
+            api=api,
+            competition=competition,
+            kernel_id=kernel_id,
+            kernel_version=kernel_version,
+            msg=msg,
+        )
+
+
+def submit_from_workspace(competition: str, workspace: Path | str, *, msg: str = "Message") -> None | tuple[str, str]:
+    """Submit the result of the competition to kaggle from the specified workspace.
+
+    Args:
+        competition (str): the competition name
+        workspace (Path | str): workspace containing the code and/or the submission file
+        msg (str, optional): message of the submission
+    """
+    api = KaggleApi()
+
+    api.authenticate()
+
+    competition_info = get_competition_detail(api=api, competition=competition)
+
+    if competition_info is None:
+        logger.warning(f"Cannot find competition: {competition}")
+
+        return None
+
+    submission_file = KG_SUBMISSION_SETTING.submission_file
+
+    if type(workspace) is str:
+        workspace = Path(workspace)
+
+    result: None | tuple[str, str] = None
+
+    # if the competition is not a code competition, and we are not forced to use a notebook, submit the local file
+    if not KG_SUBMISSION_SETTING.force_submit_code and not competition_info.is_kernels_submissions_only:
+        logger.info(f"Submitting {submission_file} to {competition}")
+
+        file = workspace / submission_file  # type: ignore
+
+        result = submit_local_file(api=api, competition=competition, file=file, msg=msg)
+    elif not competition_info.is_kernels_submissions_only:
+        logger.info(f"Submitting via notebook for {competition}")
+
+        result = submit_notebook_output(api=api, competition=competition, workspace=workspace, msg=msg)  # type: ignore
+    else:
+        logger.info(f"Submitting notebook for {competition}")
+
+        result = submit_notebook_online(api=api, competition=competition, workspace=workspace, msg=msg)  # type: ignore
+
+    # show the remaining submission count
+    # NOTE: if the competition is finished, the number will be 0
+    remaining_num = get_submission_remaining(api=api, competition=competition)
+
+    logger.info(f"Remaining submit number: {remaining_num}")
+
+    return result
+
+
+def submit_current_sota(competition: str) -> None | tuple[str, str]:
+    """Submit the SOTA result of the competition from the current trace path.
+
+    Args:
+        competition (str): which competition to submit
+    """
+    # we have to import these here to avoid a circular import issue
+    from rdagent.log.conf import LOG_SETTINGS  # noqa: PLC0415
+    from rdagent.log.ui.utils import get_sota_exp_stat  # noqa: PLC0415
+
+    # check the trace_path
+    sota, sota_loop_id, _, _ = get_sota_exp_stat(log_path=Path(LOG_SETTINGS.trace_path))
+
+    logger.info(f"sota loop id: {sota_loop_id}")
+
+    if sota is None:
+        logger.warning("Cannot find sota experiment, skip submitting.")
+
+        return None
+
+    if sota.experiment_workspace is None:
+        logger.warning("Fail to get sota output, workspace is None.")
+
+        return None
+
+    workspace = sota.experiment_workspace.workspace_path
+
+    logger.info(f"Current sota workspace: {workspace}")
+
+    # do the submit
+    return submit_from_workspace(
+        competition=competition,
+        workspace=workspace,
+        msg=f"SOTA at {datetime.now()}, loop: {sota_loop_id}, file: {KG_SUBMISSION_SETTING.submission_file}",  # noqa: DTZ005
+    )
+
+
+if __name__ == "__main__":
+    import fire
+
+    fire.Fire({"sota": submit_current_sota, "workspace": submit_from_workspace})
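+
+
+# Example CLI usage via fire (assuming kaggle credentials are configured, e.g. in ~/.kaggle/kaggle.json):
+#   python rdagent/scenarios/kaggle/submission.py sota --competition=<competition>
+#   python rdagent/scenarios/kaggle/submission.py workspace --competition=<competition> --workspace=<path>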