diff --git a/draft/code_v0.py b/draft/code_v0.py
new file mode 100644
index 000000000..d17ef9911
--- /dev/null
+++ b/draft/code_v0.py
@@ -0,0 +1,319 @@
+import pandas as pd
+sub = pd.read_csv("/home/shared/RD-Agent/online/cafa-6-protein-function-prediction/sample_submission.tsv", sep="\t", on_bad_lines='skip', header=None)
+sub.columns = ["Protein ID", "Gene Ontology (GO) term ID", "Predicted probability that the GO term applies to the protein"]
+sub.head(5)
+
+MAIN_DIR = "/home/shared/RD-Agent/online/cafa-6-protein-function-prediction"
+
+# UTILITIES
+import numpy as np
+from tqdm import tqdm
+import time
+
+# TORCH MODULES FOR METRICS COMPUTATION :
+import torch
+from torch.utils.data import Dataset
+from torch import nn
+from torch.utils.data import random_split
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torchmetrics.classification import MultilabelF1Score
+from torchmetrics.classification import MultilabelAccuracy
+
+import pytorch_lightning as pl
+from pytorch_lightning import Trainer
+from pytorch_lightning.loggers import WandbLogger
+
+# get embeddings code
+#!pip install -q fair-esm
+
+# import pathlib
+# import torch
+
+# from esm import FastaBatchedDataset, pretrained
+# def extract_embeddings(model_name, fasta_file, output_dir, tokens_per_batch=4096, seq_length=1022, repr_layers=[33]):
+
+#     model, alphabet = pretrained.load_model_and_alphabet(model_name)
+#     model.eval()
+
+#     if torch.cuda.is_available():
+#         model = model.cuda()
+
+#     dataset = FastaBatchedDataset.from_file(fasta_file)
+#     batches = dataset.get_batch_indices(tokens_per_batch, extra_toks_per_seq=1)
+
+#     data_loader = torch.utils.data.DataLoader(
+#         dataset,
+#         collate_fn=alphabet.get_batch_converter(seq_length),
+#         batch_sampler=batches
+#     )
+
+#     output_dir.mkdir(parents=True, exist_ok=True)
+
+#     with torch.no_grad():
+#         for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+
+#             print(f'Processing batch {batch_idx + 1} of {len(batches)}')
+
+#             if torch.cuda.is_available():
+#                 toks = toks.to(device="cuda", non_blocking=True)
+
+#             out = model(toks, repr_layers=repr_layers, return_contacts=False)
+
+#             logits = out["logits"].to(device="cpu")
+#             representations = {layer: t.to(device="cpu") for layer, t in out["representations"].items()}
+
+#             for i, label in enumerate(labels):
+#                 entry_id = label.split()[0]
+
+#                 filename = output_dir / f"{entry_id}.pt"
+#                 truncate_len = min(seq_length, len(strs[i]))
+
+#                 result = {"entry_id": entry_id}
+#                 result["mean_representations"] = {
+#                     layer: t[i, 1 : truncate_len + 1].mean(0).clone()
+#                     for layer, t in representations.items()
+#                 }
+
+#                 torch.save(result, filename)
+
+# model_name = 'esm2_t33_650M_UR50D'
+# fasta_file = pathlib.Path('/kaggle/input/cafa-5-fasta-files/train_sequences.fasta')
+# output_dir = pathlib.Path('train_embeddings')
+
+# extract_embeddings(model_name, fasta_file, output_dir)
+
+
+class config:
+    train_sequences_path = MAIN_DIR + "/Train/train_sequences.fasta"
+    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
+    test_sequences_path = MAIN_DIR + "/Test/testsuperset.fasta"
+
+    num_labels = 500
+    n_epochs = 8
+    batch_size = 128
+    lr = 0.01
+
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+# Directories for the different embedding vectors :
+embeds_map = {
+    "T5" : "t5embeds",
+    "ProtBERT" : "protbert-embeddings-for-cafa5",
+    "EMS2" : "cafa-5-ems-2-embeddings-numpy"
+}
+
+# Length of the different embedding vectors :
+embeds_dim = {
+    "T5" : 1024,
+    "ProtBERT" : 1024,
+    "EMS2" : 1280
+}
+
+
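+# Dataset serving precomputed per-protein embeddings (T5 / ProtBERT / ESM2) and,
+# for training, the multi-hot target vector over the top `num_labels` GO terms.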
+class ProteinSequenceDataset(Dataset):
+
+    def __init__(self, datatype, embeddings_source):
+        super().__init__()
+        self.datatype = datatype
+
+        if embeddings_source in ["ProtBERT", "EMS2"]:
+            embeds = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_embeddings.npy")
+            ids = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_ids.npy")
+
+        if embeddings_source == "T5":
+            embeds = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_embeds.npy")
+            ids = np.load("/kaggle/input/"+embeds_map[embeddings_source]+"/"+datatype+"_ids.npy")
+
+        embeds_list = []
+        for l in range(embeds.shape[0]):
+            embeds_list.append(embeds[l,:])
+        self.df = pd.DataFrame(data={"EntryID": ids, "embed" : embeds_list})
+
+        if datatype=="train":
+            np_labels = np.load(
+                "/kaggle/input/train-targets-top"+str(config.num_labels)+ \
+                "/train_targets_top"+str(config.num_labels)+".npy")
+            df_labels = pd.DataFrame(self.df['EntryID'])
+            df_labels['labels_vect']=[row for row in np_labels]
+            self.df = self.df.merge(df_labels, on="EntryID")
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, index):
+        embed = torch.tensor(self.df.iloc[index]["embed"], dtype=torch.float32)
+        if self.datatype=="train":
+            targets = torch.tensor(self.df.iloc[index]["labels_vect"], dtype=torch.float32)
+            return embed, targets
+        if self.datatype=="test":
+            id = self.df.iloc[index]["EntryID"]
+            return embed, id
+
+
+class MultiLayerPerceptron(torch.nn.Module):
+
+    def __init__(self, input_dim, num_classes):
+        super(MultiLayerPerceptron, self).__init__()
+
+        self.linear1 = torch.nn.Linear(input_dim, 864)
+        self.activation1 = torch.nn.ReLU()
+        self.linear2 = torch.nn.Linear(864, 712)
+        self.activation2 = torch.nn.ReLU()
+        self.linear3 = torch.nn.Linear(712, num_classes)
+
+
+    def forward(self, x):
+        x = self.linear1(x)
+        x = self.activation1(x)
+        x = self.linear2(x)
+        x = self.activation2(x)
+        x = self.linear3(x)
+        return x
+
+class CNN1D(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super(CNN1D, self).__init__()
+        # (batch_size, channels, embed_size)
+        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
+        # (batch_size, 3, embed_size)
+        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
+        # (batch_size, 3, embed_size/2)
+        self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
+        # (batch_size, 8, embed_size/2)
+        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
+        # (batch_size, 8, embed_size/4)
+        self.fc1 = nn.Linear(in_features=int(8 * input_dim/4), out_features=864)
+        self.fc2 = nn.Linear(in_features=864, out_features=num_classes)
+
+    def forward(self, x):
+        x = x.reshape(x.shape[0], 1, x.shape[1])
+        x = self.pool1(torch.tanh(self.conv1(x)))
+        x = self.pool2(torch.tanh(self.conv2(x)))
+        x = torch.flatten(x, 1)
+        x = torch.tanh(self.fc1(x))
+        x = self.fc2(x)
+        return x
+
+
+def train_model(embeddings_source, model_type="linear", train_size=0.9):
+
+    train_dataset = ProteinSequenceDataset(datatype="train", embeddings_source=embeddings_source)
+
+    train_set, val_set = random_split(train_dataset, lengths=[int(len(train_dataset)*train_size), len(train_dataset)-int(len(train_dataset)*train_size)])
+    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
+    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=config.batch_size, shuffle=True)
+
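+    # Both heads map a fixed-length embedding to config.num_labels logits:
+    # "linear" selects the MLP, "convolutional" the 1D CNN.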
+    if model_type == "linear":
+        model = MultiLayerPerceptron(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)
+    if model_type == "convolutional":
+        model = CNN1D(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)
+
+    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
+    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
+    CrossEntropy = torch.nn.CrossEntropyLoss()
+    f1_score = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
+    n_epochs = config.n_epochs
+
+    print("BEGIN TRAINING...")
+    train_loss_history = []
+    val_loss_history = []
+
+    train_f1score_history = []
+    val_f1score_history = []
+    for epoch in range(n_epochs):
+        print("EPOCH ", epoch+1)
+        ## TRAIN PHASE :
+        losses = []
+        scores = []
+        for embed, targets in tqdm(train_dataloader):
+            embed, targets = embed.to(config.device), targets.to(config.device)
+            optimizer.zero_grad()
+            preds = model(embed)
+            loss = CrossEntropy(preds, targets)
+            score = f1_score(preds, targets)
+            losses.append(loss.item())
+            scores.append(score.item())
+            loss.backward()
+            optimizer.step()
+        avg_loss = np.mean(losses)
+        avg_score = np.mean(scores)
+        print("Running Average TRAIN Loss : ", avg_loss)
+        print("Running Average TRAIN F1-Score : ", avg_score)
+        train_loss_history.append(avg_loss)
+        train_f1score_history.append(avg_score)
+
+        ## VALIDATION PHASE :
+        losses = []
+        scores = []
+        for embed, targets in val_dataloader:
+            embed, targets = embed.to(config.device), targets.to(config.device)
+            preds = model(embed)
+            loss = CrossEntropy(preds, targets)
+            score = f1_score(preds, targets)
+            losses.append(loss.item())
+            scores.append(score.item())
+        avg_loss = np.mean(losses)
+        avg_score = np.mean(scores)
+        print("Running Average VAL Loss : ", avg_loss)
+        print("Running Average VAL F1-Score : ", avg_score)
+        val_loss_history.append(avg_loss)
+        val_f1score_history.append(avg_score)
+
+        scheduler.step(avg_loss)
+        print("\n")
+
+    print("TRAINING FINISHED")
+    print("FINAL TRAINING SCORE : ", train_f1score_history[-1])
+    print("FINAL VALIDATION SCORE : ", val_f1score_history[-1])
+
+    losses_history = {"train" : train_loss_history, "val" : val_loss_history}
+    scores_history = {"train" : train_f1score_history, "val" : val_f1score_history}
+
+    return model, losses_history, scores_history
+
+ems2_model, ems2_losses, ems2_scores = train_model(embeddings_source="EMS2", model_type="convolutional")
+
+def predict(embeddings_source):
+
+    test_dataset = ProteinSequenceDataset(datatype="test", embeddings_source=embeddings_source)
+    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)
+
+    if embeddings_source == "T5":
+        model = t5_model
+    if embeddings_source == "ProtBERT":
+        model = protbert_model
+    if embeddings_source == "EMS2":
+        model = ems2_model
+
+    model.eval()
+
+    labels = pd.read_csv(config.train_labels_path, sep="\t")
+    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
+    labels_names = top_terms[:config.num_labels].index.values
+    print("GENERATE PREDICTION FOR TEST SET...")
+
+    ids_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
+    go_terms_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=object)
+    confs_ = np.empty(shape=(len(test_dataloader)*config.num_labels,), dtype=np.float32)
+
+    for i, (embed, id) in tqdm(enumerate(test_dataloader)):
+        embed = embed.to(config.device)
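+        # Build the submission in long format: config.num_labels consecutive
+        # (protein, GO term, confidence) rows per test protein.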
+        confs_[i*config.num_labels:(i+1)*config.num_labels] = torch.sigmoid(model(embed)).squeeze().detach().cpu().numpy()
+        ids_[i*config.num_labels:(i+1)*config.num_labels] = id[0]
+        go_terms_[i*config.num_labels:(i+1)*config.num_labels] = labels_names
+
+    submission_df = pd.DataFrame(data={"Id" : ids_, "GO term" : go_terms_, "Confidence" : confs_})
+    print("PREDICTIONS DONE")
+    return submission_df
+
+submission_df = predict("EMS2")
+
+# This submission was obtained by training models on BlastP, Sprof, QuickGO and DeepGOZero offline
+submission2 = pd.read_csv('/kaggle/input/blast-quick-sprof-zero-pred/submission.tsv',
+                          sep='\t', header=None, names=['Id2', 'GO term2', 'Confidence2'])
+
+subs = submission2.merge(submission_df, left_on=['Id2', 'GO term2'],
+                         right_on=['Id', 'GO term'], how='outer')
+
+subs.drop(['Id', 'GO term'], axis=1, inplace=True)
+subs['confidence_combined'] = subs.apply(lambda row: row['Confidence2'] if not np.isnan(row['Confidence2']) else row['Confidence'], axis=1)
+
+subs[['Id2', 'GO term2', 'confidence_combined']].to_csv('submission.tsv', sep='\t', header=False, index=False)
\ No newline at end of file
diff --git a/draft/code_v1.py b/draft/code_v1.py
new file mode 100644
index 000000000..ecd795be0
--- /dev/null
+++ b/draft/code_v1.py
@@ -0,0 +1,246 @@
+import os
+import time
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+import torch
+from torch import nn
+from torch.utils.data import Dataset, DataLoader, random_split
+from torch.optim.lr_scheduler import ReduceLROnPlateau
+from torchmetrics.classification import MultilabelF1Score
+
+# -----------------------------------------------------------
+# CONFIGURATION
+# -----------------------------------------------------------
+class Config:
+    MAIN_DIR = "/home/shared/RD-Agent/online/cafa-6-protein-function-prediction"
+    train_sequences_path = os.path.join(MAIN_DIR, "Train/train_sequences.fasta")
+    train_labels_path = os.path.join(MAIN_DIR, "Train/train_terms.tsv")
+    test_sequences_path = os.path.join(MAIN_DIR, "Test/testsuperset.fasta")
+
+    embeddings_dir = os.path.expanduser("~/cafa6_embeddings")
+    os.makedirs(embeddings_dir, exist_ok=True)
+
+    model_name = "esm2_t33_650M_UR50D"
+    num_labels = 500
+    n_epochs = 8
+    batch_size = 128
+    lr = 0.01
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+config = Config()
+
+
+# -----------------------------------------------------------
+# STEP 1: EXTRACT EMBEDDINGS USING FAIR-ESM
+# -----------------------------------------------------------
+from esm import FastaBatchedDataset, pretrained
+
+def extract_embeddings(model_name, fasta_file, output_dir, repr_layers=[33], tokens_per_batch=4096, seq_length=1022):
+    model, alphabet = pretrained.load_model_and_alphabet(model_name)
+    model.eval()
+    if torch.cuda.is_available():
+        model = model.cuda()
+
+    dataset = FastaBatchedDataset.from_file(fasta_file)
+    batches = dataset.get_batch_indices(tokens_per_batch, extra_toks_per_seq=1)
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        collate_fn=alphabet.get_batch_converter(seq_length),
+        batch_sampler=batches,
+    )
+
+    os.makedirs(output_dir, exist_ok=True)
+    print(f"Extracting embeddings to {output_dir}")
+
+    all_embeds, all_ids = [], []
+
+    with torch.no_grad():
+        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
+            print(f"Processing batch {batch_idx + 1}/{len(batches)}")
+            if torch.cuda.is_available():
+                toks = toks.to(device="cuda", non_blocking=True)
+            out = model(toks, repr_layers=repr_layers, return_contacts=False)
+            representations = {layer: t.to("cpu") for layer, t in out["representations"].items()}
+
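+            # Mean-pool the layer-33 token representations over positions
+            # 1..truncate_len (skipping the BOS token) to get one fixed-length
+            # vector per protein.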
+            for i, label in enumerate(labels):
+                entry_id = label.split()[0]
+                truncate_len = min(seq_length, len(strs[i]))
+                mean_embed = representations[repr_layers[0]][i, 1 : truncate_len + 1].mean(0).clone()
+                all_embeds.append(mean_embed.numpy())
+                all_ids.append(entry_id)
+
+    np.save(os.path.join(output_dir, "embeddings.npy"), np.stack(all_embeds))
+    np.save(os.path.join(output_dir, "ids.npy"), np.array(all_ids))
+    print(f"Saved {len(all_ids)} embeddings to {output_dir}")
+
+
+# -----------------------------------------------------------
+# STEP 2: DATASET DEFINITION
+# -----------------------------------------------------------
+class ProteinSequenceDataset(Dataset):
+    def __init__(self, datatype, embedding_dir):
+        super().__init__()
+        self.datatype = datatype
+        embeds = np.load(os.path.join(embedding_dir, "embeddings.npy"))
+        ids = np.load(os.path.join(embedding_dir, "ids.npy"))
+
+        self.df = pd.DataFrame({"EntryID": ids, "embed": list(embeds)})
+
+        if datatype == "train":
+            np_labels = np.load(
+                os.path.join(embedding_dir, f"train_targets_top{config.num_labels}.npy")
+            )
+            df_labels = pd.DataFrame({"EntryID": ids, "labels_vect": list(np_labels)})
+            self.df = self.df.merge(df_labels, on="EntryID")
+
+    def __len__(self):
+        return len(self.df)
+
+    def __getitem__(self, index):
+        embed = torch.tensor(self.df.iloc[index]["embed"], dtype=torch.float32)
+        if self.datatype == "train":
+            targets = torch.tensor(self.df.iloc[index]["labels_vect"], dtype=torch.float32)
+            return embed, targets
+        else:
+            entry_id = self.df.iloc[index]["EntryID"]
+            return embed, entry_id
+
+
+# -----------------------------------------------------------
+# STEP 3: MODELS
+# -----------------------------------------------------------
+class MultiLayerPerceptron(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super().__init__()
+        self.linear1 = nn.Linear(input_dim, 864)
+        self.linear2 = nn.Linear(864, 712)
+        self.linear3 = nn.Linear(712, num_classes)
+        self.activation = nn.ReLU()
+
+    def forward(self, x):
+        x = self.activation(self.linear1(x))
+        x = self.activation(self.linear2(x))
+        x = self.linear3(x)
+        return x
+
+
+class CNN1D(nn.Module):
+    def __init__(self, input_dim, num_classes):
+        super().__init__()
+        self.conv1 = nn.Conv1d(1, 3, kernel_size=3, padding=1)
+        self.pool1 = nn.MaxPool1d(2)
+        self.conv2 = nn.Conv1d(3, 8, kernel_size=3, padding=1)
+        self.pool2 = nn.MaxPool1d(2)
+        self.fc1 = nn.Linear(int(8 * input_dim / 4), 864)
+        self.fc2 = nn.Linear(864, num_classes)
+
+    def forward(self, x):
+        x = x.unsqueeze(1)  # (batch, 1, embed_dim)
+        x = self.pool1(torch.tanh(self.conv1(x)))
+        x = self.pool2(torch.tanh(self.conv2(x)))
+        x = torch.flatten(x, 1)
+        x = torch.tanh(self.fc1(x))
+        x = self.fc2(x)
+        return x
+
+
+# -----------------------------------------------------------
+# STEP 4: TRAINING FUNCTION
+# -----------------------------------------------------------
+def train_model(embedding_dir, model_type="mlp", train_size=0.9):
+    train_dataset = ProteinSequenceDataset("train", embedding_dir)
+    train_len = int(len(train_dataset) * train_size)
+    train_set, val_set = random_split(train_dataset, [train_len, len(train_dataset) - train_len])
+    train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
+    val_loader = DataLoader(val_set, batch_size=config.batch_size, shuffle=False)
+
+    input_dim = train_dataset.df.iloc[0]["embed"].shape[0]
+    model = (
+        MultiLayerPerceptron(input_dim, config.num_labels)
+        if model_type == "mlp"
+        else CNN1D(input_dim, config.num_labels)
+    ).to(config.device)
+
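+    # BCEWithLogitsLoss treats each GO term as an independent binary target,
+    # matching the multilabel setup (v0 used CrossEntropyLoss, which assumes a
+    # single true class per sample).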
+    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
+    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
+    criterion = nn.BCEWithLogitsLoss()
+    f1 = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
+
+    print("BEGIN TRAINING...")
+    for epoch in range(config.n_epochs):
+        model.train()
+        train_losses, train_scores = [], []
+        for embed, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.n_epochs}"):
+            embed, targets = embed.to(config.device), targets.to(config.device)
+            optimizer.zero_grad()
+            preds = model(embed)
+            loss = criterion(preds, targets)
+            score = f1(torch.sigmoid(preds), targets)
+            loss.backward()
+            optimizer.step()
+            train_losses.append(loss.item())
+            train_scores.append(score.item())
+
+        model.eval()
+        val_losses, val_scores = [], []
+        with torch.no_grad():
+            for embed, targets in val_loader:
+                embed, targets = embed.to(config.device), targets.to(config.device)
+                preds = model(embed)
+                loss = criterion(preds, targets)
+                score = f1(torch.sigmoid(preds), targets)
+                val_losses.append(loss.item())
+                val_scores.append(score.item())
+
+        scheduler.step(np.mean(val_losses))
+        print(f"Epoch {epoch+1}: Train F1={np.mean(train_scores):.4f}, Val F1={np.mean(val_scores):.4f}")
+
+    print("TRAINING FINISHED ✅")
+    return model
+
+
+# -----------------------------------------------------------
+# STEP 5: PREDICTION
+# -----------------------------------------------------------
+def predict(model, embedding_dir):
+    test_dataset = ProteinSequenceDataset("test", embedding_dir)
+    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)
+
+    labels = pd.read_csv(config.train_labels_path, sep="\t")
+    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
+    labels_names = top_terms[:config.num_labels].index.values
+
+    ids_, go_terms_, confs_ = [], [], []
+    model.eval()
+    with torch.no_grad():
+        for embed, entry_id in tqdm(test_loader, desc="Predicting"):
+            embed = embed.to(config.device)
+            preds = torch.sigmoid(model(embed)).squeeze().cpu().numpy()
+            ids_.extend([entry_id[0]] * config.num_labels)
+            go_terms_.extend(labels_names)
+            confs_.extend(preds.tolist())
+
+    submission_df = pd.DataFrame({"Id": ids_, "GO term": go_terms_, "Confidence": confs_})
+    submission_df.to_csv("submission.tsv", sep="\t", index=False, header=False)
+    print("✅ Submission saved to submission.tsv")
+    return submission_df
+
+
+# -----------------------------------------------------------
+# EXECUTION PIPELINE
+# -----------------------------------------------------------
+if __name__ == "__main__":
+    # 1️⃣ Extract embeddings (can be skipped if already generated)
+    extract_embeddings(
+        model_name=config.model_name,
+        fasta_file=config.train_sequences_path,
+        output_dir=config.embeddings_dir,
+    )
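+    # NOTE: predict() below reads embeddings from the same directory, so the
+    # test FASTA (config.test_sequences_path) must be embedded as well before
+    # the prediction step can score unseen proteins.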
+
+    # 2️⃣ Train model
+    model = train_model(embedding_dir=config.embeddings_dir, model_type="mlp")
+
+    # 3️⃣ Predict on test set
+    submission_df = predict(model, config.embeddings_dir)
diff --git a/draft/playground-series-s5e7.py b/draft/playground-series-s5e7.py
new file mode 100644
index 000000000..9b7a04109
--- /dev/null
+++ b/draft/playground-series-s5e7.py
@@ -0,0 +1,71 @@
+# 1. Imports
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import StratifiedKFold
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
+
+# 2. Load data
+train = pd.read_csv("/tmp/kaggle/playground-series-s5e7/train.csv")
+test = pd.read_csv("/tmp/kaggle/playground-series-s5e7/test.csv")
+submission = pd.read_csv("/tmp/kaggle/playground-series-s5e7/sample_submission.csv")
+
+# 3. Encode target
+le = LabelEncoder()
+train["Personality_encoded"] = le.fit_transform(train["Personality"])
+
+# 4. Prepare features
+X = train.drop(columns=["id", "Personality", "Personality_encoded"])
+y = train["Personality_encoded"]
+X_test = test.drop(columns=["id"])
+
+# 5. Encode categorical columns
+combined = pd.concat([X, X_test], axis=0)
+cat_cols = combined.select_dtypes(include="object").columns.tolist()
+encoder = OrdinalEncoder()
+combined[cat_cols] = encoder.fit_transform(combined[cat_cols])
+
+X = combined.iloc[: len(X)].reset_index(drop=True)
+X_test = combined.iloc[len(X) :].reset_index(drop=True)
+
+# 6. Setup XGBoost
+params = {
+    "objective": "binary:logistic",
+    "eval_metric": "logloss",
+    "max_depth": 4,
+    "eta": 0.1,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "random_state": 42,
+}
+
+# 7. Stratified K-Fold Cross-Validation
+skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+oof_preds = np.zeros(len(X))
+test_preds = np.zeros(len(X_test))
+
+for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
+    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
+    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
+
+    dtrain = xgb.DMatrix(X_train, label=y_train)
+    dval = xgb.DMatrix(X_val, label=y_val)
+    dtest = xgb.DMatrix(X_test)
+
+    model = xgb.train(
+        params, dtrain, num_boost_round=100, evals=[(dval, "valid")], early_stopping_rounds=10, verbose_eval=False
+    )
+
+    oof_preds[val_idx] = model.predict(dval) > 0.5
+    test_preds += model.predict(dtest) / skf.n_splits
+
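+# oof_preds holds hard 0/1 labels (thresholded at 0.5), so accuracy_score can
+# compare them to y directly; test_preds averages the fold probabilities.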
+# 8. Evaluate
+cv_acc = accuracy_score(y, oof_preds)
+print(f"Cross-Validation Accuracy: {cv_acc:.4f}")
+
+# 9. Create submission
+final_preds = (test_preds > 0.5).astype(int)
+submission["Personality"] = le.inverse_transform(final_preds)
+submission.to_csv("submission.csv", index=False)
+submission.head()
diff --git a/job_log.txt b/job_log.txt
new file mode 100644
index 000000000..57aeb7387
--- /dev/null
+++ b/job_log.txt
@@ -0,0 +1 @@
+dotenv run -- python rdagent/app/data_science/loop.py --competition cafa-6-protein-function-prediction --timeout 12h
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
index 14f107d5d..8e7bcf236 100644
--- a/rdagent/app/data_science/conf.py
+++ b/rdagent/app/data_science/conf.py
@@ -35,13 +35,13 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
 
     ## Coding Related
     coding_fail_reanalyze_threshold: int = 3
-    debug_recommend_timeout: int = 600
+    debug_recommend_timeout: int = 600*3
     """The recommend time limit for running on debugging data"""
-    debug_timeout: int = 600
+    debug_timeout: int = 600*3
    """The timeout limit for running on debugging data"""
-    full_recommend_timeout: int = 3600
+    full_recommend_timeout: int = 3600*3
    """The recommend time limit for running on full data"""
-    full_timeout: int = 3600
+    full_timeout: int = 3600*3
    """The timeout limit for running on full data"""
 
     #### model dump
@@ -151,6 +151,10 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     enable_draft_before_first_sota: bool = False
     enable_planner: bool = False
 
+    #### enable draft code replacement for the first loop
+    enable_draft_code_replacement: bool = True
+    draft_code_path: str = "/data/userdata/v-lijingyuan/RD-Agent-draft-RAG/RD-Agent/draft/code_v1.py"
+
     model_architecture_suggestion_time_percent: float = 0.75
     allow_longer_timeout: bool = False
     coder_enable_llm_decide_longer_timeout: bool = False
diff --git a/rdagent/app/data_science/loop.py b/rdagent/app/data_science/loop.py
index 560e71e38..8cb2d91c2 100644
--- a/rdagent/app/data_science/loop.py
+++ b/rdagent/app/data_science/loop.py
@@ -1,10 +1,10 @@
 import asyncio
+from collections.abc import Coroutine
 from pathlib import Path
-from typing import Optional
+from typing import Annotated, Optional
 
 import fire
 import typer
-from typing_extensions import Annotated
 
 from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.core.utils import import_class
@@ -12,6 +12,18 @@
 from rdagent.scenarios.data_science.loop import DataScienceRDLoop
 
 
+async def run_and_submit_sota(loop_task: Coroutine, competition: str) -> None:
+    """Run the loop coroutine task, and submit the SOTA experiment submission file to Kaggle at the end."""
+    from rdagent.scenarios.kaggle.submission import submit_current_sota
+
+    try:
+        # wait for the loop to end
+        await loop_task
+    finally:
+        # we do not care about exceptions here; just make sure we still submit
+        submit_current_sota(competition=competition)
+
+
 def main(
     path: Optional[str] = None,
     checkout: Annotated[bool, typer.Option("--checkout/--no-checkout", "-c/-C")] = True,
@@ -73,7 +85,12 @@ def main(
     if exp_gen_cls is not None:
         kaggle_loop.exp_gen = import_class(exp_gen_cls)(kaggle_loop.exp_gen.scen)
 
-    asyncio.run(kaggle_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout))
+    if DS_RD_SETTING.auto_submit:
+        asyncio.run(
+            run_and_submit_sota(kaggle_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout), competition)
+        )
+    else:
+        asyncio.run(kaggle_loop.run(step_n=step_n, loop_n=loop_n, all_duration=timeout))
 
 
 if __name__ == "__main__":
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
index 0cac34e6a..a0bf9259d 100644
--- a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
+++ b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -2,6 +2,7 @@
 import math
 from datetime import timedelta
 from enum import Enum
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -1250,10 +1251,20 @@ def task_gen(
         # Persist for later stages
         task.package_info = get_packages(pkgs)
 
-    exp = DSExperiment(
-        pending_tasks_list=[[task]], hypothesis=hypotheses[0], hypothesis_candidates=hypotheses_candidates
-    )
-    if sota_exp is not None:
+    exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=hypotheses[0])
+
+    # Draft code replacement for the first loop
+    if (
+        DS_RD_SETTING.enable_draft_code_replacement and DS_RD_SETTING.draft_code_path and sota_exp is None
+    ):  # First loop (no SOTA experiment exists)
+
+        draft_path = Path(DS_RD_SETTING.draft_code_path)
+        if draft_path.exists():
+            logger.info(f"Loading draft code from: {draft_path}")
+            exp.experiment_workspace.inject_files(**{"main.py": draft_path.read_text()})
+        else:
+            logger.warning(f"Draft code file not found: {draft_path}")
+    elif sota_exp is not None:
         exp.experiment_workspace.inject_code_from_file_dict(sota_exp.experiment_workspace)
 
     # 3) create the workflow update task
diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
index e1962f9c0..5ccf1a5c9 100644
--- a/rdagent/scenarios/kaggle/kaggle_crawler.py
+++ b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -109,6 +109,9 @@ def kaggle_description_css_selectors() -> tuple[str, str]:
 
 def download_data(competition: str, settings: ExtendedBaseSettings, enable_create_debug_data: bool = True) -> None:
     local_path = settings.local_data_path
+    if (Path(local_path) / competition).exists():
+        logger.info(f"{competition} already exists, skipping download.")
+        return
     if settings.if_using_mle_data:
         zipfile_path = f"{local_path}/zip_files"
         zip_competition_path = Path(zipfile_path) / competition
diff --git a/rdagent/scenarios/kaggle/submission.py b/rdagent/scenarios/kaggle/submission.py
new file mode 100644
index 000000000..cc090998b
--- /dev/null
+++ b/rdagent/scenarios/kaggle/submission.py
@@ -0,0 +1,672 @@
+import json
+import os
+import tempfile
+import time
+from collections.abc import Generator
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from pathlib import Path
+
+import nbformat
+import requests
+from kaggle.api.kaggle_api_extended import (
+    ApiCreateCodeSubmissionResponse,
+    ApiSaveKernelResponse,
+    KaggleApi,
+    SubmissionSortBy,
+    SubmissionStatus,
+)
+from kagglesdk.competitions.types.competition_api_service import (
+    ApiCompetition,
+    ApiSubmission,
+)
+from kagglesdk.kernels.types.kernels_enums import KernelWorkerStatus
+from pydantic_settings import SettingsConfigDict
+
+from rdagent.core.conf import ExtendedBaseSettings
+from rdagent.log import rdagent_logger as logger
+
+# error 403 means we have not joined the competition when retrieving information, like submissions
+ERROR_CODE_NOT_JOIN_COMPETITION = 403
+# error code when we try to get submissions and there are none
+ERROR_CODE_NO_SUBMISSION = 400
+
+
+class KaggleSubmissionSetting(ExtendedBaseSettings):
+    model_config = SettingsConfigDict(env_prefix="SUBMIT_", protected_namespaces=())
+
+    # whether to force submitting code even if it is not a code competition
+    force_submit_code: bool = False
+
+    # submission file name to submit
+    submission_file: str = "submission.csv"
+
+    # whether to enable the notebook GPU
+    enable_gpu: bool = False
+
+    # whether to enable notebook internet access
+    enable_internet: bool = False
+
+    # whether to enable the notebook TPU
+    enable_tpu: bool = False
+
+    # whether to make the kernel private
+    is_private: bool = True
+
+    # prefix of the notebook name
+    kernel_prefix: str = "rd-submission"
+
+    # timeout when tracking submission status
+    status_timeout: int = 120
+
+
+KG_SUBMISSION_SETTING = KaggleSubmissionSetting()
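+# All of the settings above can be overridden through SUBMIT_-prefixed
+# environment variables (e.g. SUBMIT_SUBMISSION_FILE), per the env_prefix in
+# model_config.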
+
+
+def get_completed_submissions(api: KaggleApi, competition: str) -> list[ApiSubmission]:
+    """Get the completed submissions of today.
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to get submissions for
+
+    Return:
+        list[ApiSubmission]: completed submissions of today
+    """
+    # only completed submissions consume today's quota
+    submissions: list[ApiSubmission] = []
+
+    today = datetime.now(timezone.utc).date()
+
+    try:
+        submissions_resp = api.competition_submissions(
+            competition=competition,
+            # the api sorts the result by date by default, so the latest submission is on the last page
+            page_token=-1,
+        )
+
+        if submissions_resp is not None:
+            submissions = [
+                submission
+                for submission in submissions_resp
+                if submission is not None
+                and submission.date.date() == today
+                and submission.status == SubmissionStatus.COMPLETE
+            ]
+    except requests.exceptions.HTTPError as e:
+        # 403 if we have not joined the competition
+        if e.response.status_code == ERROR_CODE_NOT_JOIN_COMPETITION:
+            logger.error(f"You have not joined the competition '{competition}'.")
+        elif e.response.status_code == ERROR_CODE_NO_SUBMISSION:
+            # if a competition has no submissions at all, this call will cause a 400.
+            # we consider this a correct state, meaning 0 completed submissions
+            logger.info("no submission now.")
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Fail to get submissions with error: {e}")
+
+    return submissions
+
+
+def get_competition_detail(api: KaggleApi, competition: str) -> ApiCompetition | None:
+    """Get the competition detail information.
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to get the detail for
+
+    Return:
+        ApiCompetition: competition detail information
+    """
+    # the kaggle sdk has no function to get the detail of a specified competition, so we use the list function.
+    # the list function only returns in-progress competitions, and we use the search parameter to filter by
+    # competition name; the default page_size is 20, so most of the time the target competition is on the first page
+    max_pages = 10
+    try:
+        page = 1
+        while page <= max_pages and (search_result := api.competitions_list(search=competition, page=page)) is not None:
+            for comp in search_result:
+                if comp is not None and comp.ref.rsplit("/", 1)[-1] == competition:
+                    return comp
+
+            page += 1
+
+            time.sleep(1)
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Fail to get competition list, with exception: {e}")
+
+    return None
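+# The daily quota is max_daily_submissions minus today's COMPLETE submissions;
+# pending or failed submissions do not count against it.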
+
+
+def get_submission_remaining(api: KaggleApi, competition: str) -> int:
+    """Get the number of submissions remaining today.
+
+    NOTE: this function tries not to raise; if you have not joined the competition,
+    or any other exception occurs, it returns 0
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to get the remaining submission count for
+
+    Return:
+        int: number of submissions remaining
+    """
+    # kaggle uses utc time for counting
+    completed_submissions = get_completed_submissions(api, competition)
+
+    competition_detail = get_competition_detail(api, competition)
+
+    if competition_detail is None:
+        logger.error("Fail to get the competition, make sure it is in progress now.")
+
+        return 0
+
+    return competition_detail.max_daily_submissions - len(completed_submissions)
+
+
+def wait_for_submission_complete(api: KaggleApi, competition: str) -> None | tuple[str, str]:
+    """Wait for the latest submission to complete (failed or succeeded).
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to wait on
+
+    Return:
+        tuple[str, str]: public score, private score
+    """
+    timeout = KG_SUBMISSION_SETTING.status_timeout
+
+    # the submission request completed without error; keep checking the latest submission state until it completes or we time out
+    start = datetime.now()  # noqa: DTZ005
+    while (datetime.now() - start).seconds <= timeout:  # noqa: DTZ005
+        # NOTE: we sleep before checking the latest submission state, as the previous
+        # steps may have just finished calling the kaggle api; if we do not sleep here,
+        # kaggle will raise a 400 - bad request error.
+        # wait for 5 seconds to get the latest result
+        time.sleep(5)
+
+        # the api can sort the submissions by date (new -> old), so we use the first item on the first page
+        resp = api.competition_submissions(
+            competition=competition,
+            sort=SubmissionSortBy.SUBMISSION_SORT_BY_DATE,
+            page_token=0,
+        )
+
+        logger.info(f"Current submissions: {resp}")
+
+        if resp is None or resp[0] is None or resp[0].status == SubmissionStatus.PENDING:
+            continue
+
+        submission = resp[0]
+
+        if submission.status == SubmissionStatus.ERROR:
+            logger.error(f"The submission validation failed, with message: {submission.error_description}")
+        else:
+            logger.info(
+                f"Current submission state [@{submission.date}] is: {submission.status}, public score: {submission.public_score}",
+            )
+
+        return submission.public_score, submission.private_score
+
+    return None
+
+
+def submit_local_file(
+    api: KaggleApi, competition: str, file: str | Path, *, msg: str = "Message"
+) -> None | tuple[str, str]:
+    """Submit a local file to the competition.
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to submit to
+        file (str): local file path to submit
+        msg (str): message of the current submission
+    """
+    try:
+        resp = api.competition_submit(file_name=str(file), message=msg, competition=competition)
+
+        # if the upload failed, a string message is returned; an exception is raised if there is an http response code >= 400
+        if type(resp) is str:
+            logger.error(f"Fail to get submissions with error: {resp}")
+        else:
+            return wait_for_submission_complete(api=api, competition=competition)
+    except Exception as e:  # noqa: BLE001
+        if type(e) is requests.exceptions.HTTPError and e.response.status_code == ERROR_CODE_NOT_JOIN_COMPETITION:
+            logger.error(f"You have not joined the competition '{competition}'.")
+        else:
+            logger.error(f"Fail to submit with error: {e}")
+
+    return None
+
+
+def submit_code(
+    api: KaggleApi,
+    competition: str,
+    kernel_id: str,
+    kernel_version: int,
+    *,
+    msg: str = "Message",
+) -> None | tuple[str, str]:
+    """Submit a specified version of a kernel to the competition (without downloading the outputs).
+
+    Args:
+        api (KaggleApi): kaggle api object
+        competition (str): competition name to submit to
+        kernel_id (str): kernel id to submit
+        kernel_version (int): kernel version to submit;
+            each time we call the kaggle sdk to upload a notebook, it returns a new version
+        msg (str): message of the current submission
+    """
+    submission_file = KG_SUBMISSION_SETTING.submission_file
+
+    try:
+        resp: ApiCreateCodeSubmissionResponse = api.competition_submit_code(
+            competition=competition,
+            file_name=submission_file,
+            message=msg,
+            kernel=kernel_id,
+            kernel_version=kernel_version,
+        )
+
+        logger.info(f"Submission response: {resp}")
+
+        return wait_for_submission_complete(api=api, competition=competition)
+    except Exception as e:  # noqa: BLE001
+        if type(e) is requests.exceptions.HTTPError and e.response.status_code == ERROR_CODE_NOT_JOIN_COMPETITION:
+            logger.error(f"You have not joined the competition '{competition}'.")
+        else:
+            logger.error(f"Fail to get submissions with error: {e}")
+
+    return None
+
+
+def generate_kernel_metadata(user: str, competition: str) -> dict:
+    """Generate the kaggle kernel metadata needed to upload a notebook to kaggle.
+
+    Args:
+        user (str): kaggle username
+        competition (str): which competition this notebook will reference
+
+    Returns:
+        dict: kaggle kernel metadata
+    """
+    kernel_name = f"{KG_SUBMISSION_SETTING.kernel_prefix}-{competition}"
+
+    return {
+        "id": f"{user}/{kernel_name}",
+        "title": kernel_name,
+        "code_file": "submission.ipynb",  # we hard-code the name, same as in the prepare_notebook function
+        "language": "python",
+        "kernel_type": "notebook",
+        "is_private": KG_SUBMISSION_SETTING.is_private,
+        "enable_gpu": KG_SUBMISSION_SETTING.enable_gpu,
+        "enable_tpu": KG_SUBMISSION_SETTING.enable_tpu,
+        "enable_internet": KG_SUBMISSION_SETTING.enable_internet,
+        "dataset_sources": [],
+        "competition_sources": [competition],
+        "kernel_sources": [],
+        "model_sources": [],  # TODO: we can use this field if we want to upload our model
+    }
+
+
+# this is the minimal notebook template from the kaggle notebook page.
+# NOTE: sometimes using nbformat to generate a notebook from scratch may cause a kernel
+# error while running in Kaggle, so we use this template from the Kaggle notebook page to avoid the error.
+kaggle_notebook_template = """
+{
+    "metadata": {
+        "kernelspec": {
+            "language": "python",
+            "display_name": "Python 3",
+            "name": "python3"
+        },
+        "language_info": {
+            "pygments_lexer": "ipython3",
+            "nbconvert_exporter": "python",
+            "version": "3.6.4",
+            "file_extension": ".py",
+            "codemirror_mode": {
+                "name": "ipython",
+                "version": 3
+            },
+            "name": "python",
+            "mimetype": "text/x-python"
+        }
+    },
+    "nbformat_minor": 4,
+    "nbformat": 4,
+    "cells": [
+    ]
+}
+"""
+
+
+def prepare_notebook(user: str, competition: str, workspace: Path, output_path: Path) -> None:
+    """Prepare a notebook from the workspace for uploading to kaggle.
+
+    NOTE: the kaggle sdk needs a folder with kernel-metadata.json and xxx.ipynb to upload
+
+    Args:
+        user (str): kaggle username used to construct the notebook url
+        competition (str): which competition this notebook will reference
+        workspace (Path): where the code comes from
+        output_path (Path): notebook output path
+    """
+    # NOTE: this function only supports adding main.py to the notebook.
+    # TODO: support other project structures
+
+    # generate metadata
+    metadata = generate_kernel_metadata(user=user, competition=competition)
+
+    logger.info(f"Generated metadata: {metadata}")
+
+    metadata_file = output_path / "kernel-metadata.json"
+
+    # write the metadata
+    with metadata_file.open("wt", encoding="utf-8") as fp:
+        json.dump(metadata, fp)
+
+    # create a kaggle notebook from the template without converting
+    notebook: nbformat.NotebookNode = nbformat.reads(kaggle_notebook_template, as_version=nbformat.NO_CONVERT)
+
+    # read the main.py
+    main_file = workspace / "main.py"
+
+    if not main_file.exists():
+        raise FileNotFoundError
+
+    with main_file.open("rt", encoding="utf-8") as fp:
+        main_code = fp.read()
+
+    # patch the main code to make it possible to run in kaggle
+    # 1. change the input dir to the kaggle input
+    main_code = main_code.replace("./workspace_input", f"/kaggle/input/{competition}")
+
+    # 2. update any argparse code to support the kaggle notebook starting parameters
+    # we cannot just use replace here, as we need to insert the correct indent
+    for code_line in main_code.split("\n"):
+        if code_line.strip() == "parser = argparse.ArgumentParser()":
+            # get the indent length
+            indent_n = code_line.index("parser")
+            # the kaggle notebook passes additional arguments; we add them here to avoid errors
+            main_code = main_code.replace(
+                "parser = argparse.ArgumentParser()",
+                (
+                    f"parser = argparse.ArgumentParser()\n"
+                    f"{' ' * indent_n}parser.add_argument('-f', required=False)\n"
+                    f"{' ' * indent_n}parser.add_argument('--HistoryManager.hist_file', required=False)\n"
+                ),
+            )
+
+            break
+
+    notebook.cells.append(nbformat.v4.new_code_cell(main_code))
+
+    # an additional cell used to print the traceback if there is any error
+    notebook.cells.append(nbformat.v4.new_code_cell("%tb"))
+
+    # NOTE: in kaggle, the default working directory is /kaggle/working,
+    # so if our result files are written into the current folder, we can use it to submit
+    # without changing the code
+
+    nbformat.write(notebook, os.path.join(output_path, "submission.ipynb"))  # noqa: PTH118
+
+
+def upload_notebook(api: KaggleApi, kernel_id: str, folder: Path) -> int | None:
+    """Upload a notebook to kaggle.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api
+        kernel_id (str): id of the kernel, used to check status
+        folder (Path): the folder to upload that contains kernel-metadata.json and submission.ipynb
+
+    Return:
+        int: the version number of the uploaded kernel
+    """
+    # try to push the notebook to kaggle
+    try:
+        upload_resp: ApiSaveKernelResponse = api.kernels_push(folder=folder)
+
+        # check upload status
+        if upload_resp.error:
+            logger.error(f"Upload notebook failed: {upload_resp.error}. response object: {upload_resp}")
+
+            return None
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Upload notebook failed: {e}")
+
+        return None
+
+    # if the upload succeeds, the notebook starts running immediately.
+    # wait until the run is done
+    try:
+        while (kernel_status_resp := api.kernels_status(kernel_id)) is not None:
+            logger.info(f"Kernel status: {kernel_status_resp.status}")
+
+            if kernel_status_resp.status == KernelWorkerStatus.COMPLETE:
+                break
+
+            if kernel_status_resp.status in [
+                KernelWorkerStatus.ERROR,
+                KernelWorkerStatus.CANCEL_ACKNOWLEDGED,
+                KernelWorkerStatus.CANCEL_REQUESTED,
+            ]:
+                logger.error(f"Kernel error: {kernel_status_resp.status}")
+
+                return None
+
+            # sleep a while longer to avoid the rate limit, as the notebook takes more time to complete
+            time.sleep(30)
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Check notebook status failed: {e}")
+
+        return None
+    finally:
+        logger.info(f"Refer to this link for details: {upload_resp.url}")
+
+    return upload_resp.version_number
+
+
+def download_notebook_output(api: KaggleApi, kernel_id: str, output_folder: Path) -> bool:
+    """Download the output files of a notebook.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        kernel_id (str): kernel id to download
+        output_folder (Path): output folder to save the output files
+
+    Return:
+        bool: whether the download succeeded
+    """
+    try:
+        api.kernels_output(kernel=kernel_id, path=output_folder, force=True)
+    except Exception as e:  # noqa: BLE001
+        logger.error(f"Download notebook output failed: {e}")
+
+        return False
+
+    return True
+
+
+@contextmanager
+def create_kaggle_notebook(
+    api: KaggleApi,
+    competition: str,
+    workspace: Path,
+) -> Generator[tuple[Path, str, int], None, None]:
+    """Create a kaggle notebook from the workspace, and run it.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        competition (str): competition name
+        workspace (Path): workspace path to get code from
+
+    Return:
+        tuple[Path, str, int]: local folder of the notebook, the kernel id, and the kernel version
+    """
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        kernel_path = Path(tmp_dir)
+
+        prepare_notebook(
+            user=api.config_values["username"],
+            competition=competition,
+            workspace=workspace,
+            output_path=kernel_path,
+        )
+
+        # read the metadata to get the kernel id
+        metadata_json_file = kernel_path / "kernel-metadata.json"
+
+        with metadata_json_file.open("rt", encoding="utf-8") as fp:
+            kernel_metadata = json.load(fp)
+
+        kernel_id = kernel_metadata["id"]
+
+        kernel_version = upload_notebook(api=api, kernel_id=kernel_id, folder=kernel_path)
+
+        if kernel_version is None:
+            # returning before the yield would make @contextmanager raise a bare
+            # RuntimeError, so fail explicitly instead
+            raise RuntimeError("Upload notebook failed")
+
+        yield (kernel_path, kernel_id, kernel_version)
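+# There are three submission paths below: submit_local_file uploads an existing
+# submission file directly; submit_notebook_output runs a notebook on Kaggle,
+# downloads its output, and submits the resulting file; submit_notebook_online
+# submits the kernel output in place, as required by code competitions.
+# submit_from_workspace picks between them based on the competition type and settings.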
+
+
+def submit_notebook_output(api: KaggleApi, competition: str, workspace: Path, msg: str) -> None | tuple[str, str]:
+    """Submit code from the workspace, download the output and then submit the output to the competition.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        competition (str): competition name
+        workspace (Path): workspace path containing the code to run
+        msg (str): message to submit
+    """
+    with create_kaggle_notebook(api=api, competition=competition, workspace=workspace) as (kernel_path, kernel_id, _):
+        if download_notebook_output(api=api, kernel_id=kernel_id, output_folder=kernel_path):
+            # check that the submission file exists
+            submission_file_path = kernel_path / KG_SUBMISSION_SETTING.submission_file
+
+            if not submission_file_path.exists():
+                logger.error(f"Submission file {submission_file_path} not found in kernel output")
+            else:
+                return submit_local_file(api=api, competition=competition, file=submission_file_path, msg=msg)
+
+    return None
+
+
+def submit_notebook_online(api: KaggleApi, competition: str, workspace: Path, msg: str) -> None | tuple[str, str]:
+    """Submit the output of a notebook on kaggle to the competition.
+
+    Args:
+        api (kaggle.api.KaggleApi): kaggle api object
+        competition (str): competition name
+        workspace (Path): workspace path containing the code to run
+        msg (str): message to submit
+    """
+    with create_kaggle_notebook(api=api, competition=competition, workspace=workspace) as (
+        _,
+        kernel_id,
+        kernel_version,
+    ):
+        return submit_code(
+            api=api,
+            competition=competition,
+            kernel_id=kernel_id,
+            kernel_version=kernel_version,
+            msg=msg,
+        )
+
+
+def submit_from_workspace(competition: str, workspace: Path | str, *, msg: str = "Message") -> None | tuple[str, str]:
+    """Submit the result of the competition to kaggle from the specified workspace.
+
+    Args:
+        competition (str): the competition name
+        workspace (Path | str): workspace containing the code and/or the submission file
+        msg (str, optional): message of the submission
+    """
+    api = KaggleApi()
+
+    api.authenticate()
+
+    competition_info = get_competition_detail(api=api, competition=competition)
+
+    if competition_info is None:
+        logger.warning(f"Cannot find competition: {competition}")
+
+        return None
+
+    submission_file = KG_SUBMISSION_SETTING.submission_file
+
+    if type(workspace) is str:
+        workspace = Path(workspace)
+
+    result: None | tuple[str, str] = None
+
+    # if the competition is not a code competition, and we are not forced to use a notebook, submit the local file
+    if not KG_SUBMISSION_SETTING.force_submit_code and not competition_info.is_kernels_submissions_only:
+        logger.info(f"Submitting {submission_file} to {competition}")
+
+        file = workspace / submission_file  # type: ignore
+
+        result = submit_local_file(api=api, competition=competition, file=file, msg=msg)
+    elif not competition_info.is_kernels_submissions_only:
+        logger.info(f"Submitting via notebook for {competition}")
+
+        result = submit_notebook_output(api=api, competition=competition, workspace=workspace, msg=msg)  # type: ignore
+    else:
+        logger.info(f"Submitting notebook for {competition}")
+
+        result = submit_notebook_online(api=api, competition=competition, workspace=workspace, msg=msg)  # type: ignore
+
+    # show the remaining submission count
+    # NOTE: if the competition is finished, the number will be 0
+    remaining_num = get_submission_remaining(api=api, competition=competition)
+
+    logger.info(f"Remaining submit number: {remaining_num}")
+
+    return result
+
+
+def submit_current_sota(competition: str) -> None | tuple[str, str]:
+    """Submit the SOTA result of the competition from the current trace path.
+
+    Args:
+        competition (str): which competition to submit
+    """
+    # we have to import these here to avoid a circular import issue
+    from rdagent.log.conf import LOG_SETTINGS  # noqa: PLC0415
+    from rdagent.log.ui.utils import get_sota_exp_stat  # noqa: PLC0415
+
+    # check the trace_path
+    sota, sota_loop_id, _, _ = get_sota_exp_stat(log_path=Path(LOG_SETTINGS.trace_path))
+
+    logger.info(f"sota loop id: {sota_loop_id}")
+
+    if sota is None:
+        logger.warning("Cannot find sota experiment, skip submitting.")
+
+        return None
+
+    if sota.experiment_workspace is None:
+        logger.warning("Fail to get sota output, workspace is None.")
+
+        return None
+
+    workspace = sota.experiment_workspace.workspace_path
+
+    logger.info(f"Current sota workspace: {workspace}")
+
+    # do the submit
+    return submit_from_workspace(
+        competition=competition,
+        workspace=workspace,
+        msg=f"SOTA at {datetime.now()}, loop: {sota_loop_id}, file: {KG_SUBMISSION_SETTING.submission_file}",  # noqa: DTZ005
+    )
+
+
+if __name__ == "__main__":
+    import fire
+
+    fire.Fire({"sota": submit_current_sota, "workspace": submit_from_workspace})
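+
+
+# Example CLI usage via fire (assuming kaggle credentials are configured, e.g. in ~/.kaggle/kaggle.json):
+#   python rdagent/scenarios/kaggle/submission.py sota --competition=<competition>
+#   python rdagent/scenarios/kaggle/submission.py workspace --competition=<competition> --workspace=<path>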