Commits (23)
ef9238b
add script to submit and get remaining submit number
chaosddp Sep 1, 2025
69eba92
add function to auto submit csv file after experiment completed
chaosddp Sep 2, 2025
2ec6a14
add an argument to disable remaining number check
chaosddp Sep 3, 2025
04b404e
simplify the impl, handle 400 error
chaosddp Sep 4, 2025
7602d60
add public score for completed submission
chaosddp Sep 7, 2025
5544909
comment update
chaosddp Sep 7, 2025
6c67d61
add file path in message to make it easy to debug
chaosddp Sep 9, 2025
3dbbc70
try to submit if there is any exception
chaosddp Sep 10, 2025
35ec727
support both code and non-code competition submission now
chaosddp Sep 12, 2025
70a90da
add settings for submission
chaosddp Sep 12, 2025
2950fe1
update submit function, with sota loop id
chaosddp Sep 12, 2025
5586dad
refactoring the auto submit function, move to submit related function…
chaosddp Sep 12, 2025
98c84b4
refine code to reduce code lines
chaosddp Sep 12, 2025
7da697d
fix parameter issue, make the get_completed_submission only return to…
chaosddp Sep 12, 2025
1b7c1ad
remove EDA from notebook, as our final code will print it
chaosddp Sep 15, 2025
c97d06d
update comments and function names
chaosddp Sep 15, 2025
7237d88
sleep before get submission status, to avoid rate limit
chaosddp Sep 15, 2025
62053ff
return public and private score after submit
chaosddp Sep 16, 2025
f613e20
Merge branch 'main' of github.com:microsoft/RD-Agent into qzli/draft_…
Jensen246 Sep 16, 2025
e3077b9
feat: add draft_replace
Jensen246 Sep 16, 2025
2ea87e4
Merge remote-tracking branch 'origin/main' into qzli/draft_real
jingyuanlm Oct 17, 2025
13f5781
draft_v1
jingyuanlm Oct 22, 2025
f875d6d
skip download
qew21 Oct 23, 2025
319 changes: 319 additions & 0 deletions draft/code_v0.py
@@ -0,0 +1,319 @@
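# Draft baseline for the CAFA protein function prediction competition:
# load precomputed protein-language-model embeddings, train a small MLP/CNN
# classifier over the top-500 most frequent GO terms, predict per-protein
# term confidences, and merge with an offline ensemble submission.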
import pandas as pd

# Preview the sample submission to confirm the expected three-column format.
sub = pd.read_csv(
    "/home/shared/RD-Agent/online/cafa-6-protein-function-prediction/sample_submission.tsv",
    sep="\t", on_bad_lines='skip', header=None)
sub.columns = ["Protein ID", "GO term ID", "Predicted probability that the GO term applies to the protein"]
sub.head(5)

MAIN_DIR = "/home/shared/RD-Agent/online/cafa-6-protein-function-prediction"

# UTILITIES
import numpy as np
from tqdm import tqdm
import time

# Torch modules for data handling, training, and metrics computation:
import torch
from torch.utils.data import Dataset
from torch import nn
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score
from torchmetrics.classification import MultilabelAccuracy

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger

# get embeddings code
#!pip install -q fair-esm

# import pathlib
# import torch

# from esm import FastaBatchedDataset, pretrained

# def extract_embeddings(model_name, fasta_file, output_dir,
#                        tokens_per_batch=4096, seq_length=1022, repr_layers=[33]):
#     model, alphabet = pretrained.load_model_and_alphabet(model_name)
#     model.eval()

#     if torch.cuda.is_available():
#         model = model.cuda()

#     dataset = FastaBatchedDataset.from_file(fasta_file)
#     batches = dataset.get_batch_indices(tokens_per_batch, extra_toks_per_seq=1)

#     data_loader = torch.utils.data.DataLoader(
#         dataset,
#         collate_fn=alphabet.get_batch_converter(seq_length),
#         batch_sampler=batches
#     )

#     output_dir.mkdir(parents=True, exist_ok=True)

#     with torch.no_grad():
#         for batch_idx, (labels, strs, toks) in enumerate(data_loader):

#             print(f'Processing batch {batch_idx + 1} of {len(batches)}')

#             if torch.cuda.is_available():
#                 toks = toks.to(device="cuda", non_blocking=True)

#             out = model(toks, repr_layers=repr_layers, return_contacts=False)

#             logits = out["logits"].to(device="cpu")
#             representations = {layer: t.to(device="cpu") for layer, t in out["representations"].items()}

#             for i, label in enumerate(labels):
#                 entry_id = label.split()[0]

#                 filename = output_dir / f"{entry_id}.pt"
#                 truncate_len = min(seq_length, len(strs[i]))

#                 result = {"entry_id": entry_id}
#                 result["mean_representations"] = {
#                     layer: t[i, 1 : truncate_len + 1].mean(0).clone()
#                     for layer, t in representations.items()
#                 }

#                 torch.save(result, filename)

# model_name = 'esm2_t33_650M_UR50D'
# fasta_file = pathlib.Path('/kaggle/input/cafa-5-fasta-files/train_sequences.fasta')
# output_dir = pathlib.Path('train_embeddings')

# extract_embeddings(model_name, fasta_file, output_dir)
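
# The commented-out block above is the offline ESM-2 embedding-extraction step;
# the .npy arrays loaded below (mean-pooled per-protein embeddings plus matching
# EntryID lists) are presumably its precomputed output, stored as Kaggle datasets.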


class config:
    train_sequences_path = MAIN_DIR + "/Train/train_sequences.fasta"
    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
    test_sequences_path = MAIN_DIR + "/Test/testsuperset.fasta"

    num_labels = 500
    n_epochs = 8
    batch_size = 128
    lr = 0.01

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Directories for the different embedding vectors :
embeds_map = {
    "T5": "t5embeds",
    "ProtBERT": "protbert-embeddings-for-cafa5",
    "EMS2": "cafa-5-ems-2-embeddings-numpy"
}

# Length of the different embedding vectors :
embeds_dim = {
    "T5": 1024,
    "ProtBERT": 1024,
    "EMS2": 1280
}
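
# "EMS2" presumably refers to ESM-2; the dimensions above match the hidden
# sizes of ProtT5 (1024), ProtBERT (1024), and ESM-2 650M (1280).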


class ProteinSequenceDataset(Dataset):

    def __init__(self, datatype, embeddings_source):
        super().__init__()
        self.datatype = datatype

        if embeddings_source in ["ProtBERT", "EMS2"]:
            embeds = np.load("/kaggle/input/" + embeds_map[embeddings_source] + "/" + datatype + "_embeddings.npy")
            ids = np.load("/kaggle/input/" + embeds_map[embeddings_source] + "/" + datatype + "_ids.npy")

        if embeddings_source == "T5":
            embeds = np.load("/kaggle/input/" + embeds_map[embeddings_source] + "/" + datatype + "_embeds.npy")
            ids = np.load("/kaggle/input/" + embeds_map[embeddings_source] + "/" + datatype + "_ids.npy")

        # One row per protein: its EntryID and its embedding vector.
        embeds_list = [embeds[l, :] for l in range(embeds.shape[0])]
        self.df = pd.DataFrame(data={"EntryID": ids, "embed": embeds_list})

        if datatype == "train":
            # Multi-hot target vectors over the top num_labels GO terms.
            np_labels = np.load(
                "/kaggle/input/train-targets-top" + str(config.num_labels) +
                "/train_targets_top" + str(config.num_labels) + ".npy")
            df_labels = pd.DataFrame(self.df['EntryID'])
            df_labels['labels_vect'] = [row for row in np_labels]
            self.df = self.df.merge(df_labels, on="EntryID")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        embed = torch.tensor(self.df.iloc[index]["embed"], dtype=torch.float32)
        if self.datatype == "train":
            targets = torch.tensor(self.df.iloc[index]["labels_vect"], dtype=torch.float32)
            return embed, targets
        if self.datatype == "test":
            id = self.df.iloc[index]["EntryID"]
            return embed, id


class MultiLayerPerceptron(torch.nn.Module):

    def __init__(self, input_dim, num_classes):
        super(MultiLayerPerceptron, self).__init__()

        self.linear1 = torch.nn.Linear(input_dim, 864)
        self.activation1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(864, 712)
        self.activation2 = torch.nn.ReLU()
        self.linear3 = torch.nn.Linear(712, num_classes)

    def forward(self, x):
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.linear2(x)
        x = self.activation2(x)
        x = self.linear3(x)
        return x

class CNN1D(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(CNN1D, self).__init__()
        # Input: (batch_size, 1, embed_size)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
        # (batch_size, 3, embed_size)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        # (batch_size, 3, embed_size/2)
        self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
        # (batch_size, 8, embed_size/2)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        # (batch_size, 8, embed_size/4)
        self.fc1 = nn.Linear(in_features=int(8 * input_dim / 4), out_features=864)
        self.fc2 = nn.Linear(in_features=864, out_features=num_classes)

    def forward(self, x):
        # Add a channel dimension: (batch_size, embed_size) -> (batch_size, 1, embed_size)
        x = x.reshape(x.shape[0], 1, x.shape[1])
        x = self.pool1(torch.tanh(self.conv1(x)))
        x = self.pool2(torch.tanh(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = torch.tanh(self.fc1(x))
        x = self.fc2(x)
        return x
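
# A minimal shape sanity check (illustrative addition, not part of the original
# draft): a batch of 4 EMS2-sized embeddings should map to (4, num_labels)
# logits on CPU.
_logits = CNN1D(input_dim=embeds_dim["EMS2"], num_classes=config.num_labels)(torch.randn(4, embeds_dim["EMS2"]))
assert _logits.shape == (4, config.num_labels)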


def train_model(embeddings_source, model_type="linear", train_size=0.9):

    train_dataset = ProteinSequenceDataset(datatype="train", embeddings_source=embeddings_source)

    train_set, val_set = random_split(
        train_dataset,
        lengths=[int(len(train_dataset) * train_size), len(train_dataset) - int(len(train_dataset) * train_size)])
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=config.batch_size, shuffle=True)

    if model_type == "linear":
        model = MultiLayerPerceptron(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)
    if model_type == "convolutional":
        model = CNN1D(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)

    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
    # Note: targets are multi-hot vectors, so CrossEntropyLoss treats them as
    # soft-label targets (PyTorch >= 1.10); BCEWithLogitsLoss would be the more
    # conventional choice for multilabel classification.
    CrossEntropy = torch.nn.CrossEntropyLoss()
    f1_score = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
    n_epochs = config.n_epochs

    print("BEGIN TRAINING...")
    train_loss_history = []
    val_loss_history = []

    train_f1score_history = []
    val_f1score_history = []
    for epoch in range(n_epochs):
        print("EPOCH ", epoch + 1)
        ## TRAIN PHASE :
        model.train()
        losses = []
        scores = []
        for embed, targets in tqdm(train_dataloader):
            embed, targets = embed.to(config.device), targets.to(config.device)
            optimizer.zero_grad()
            preds = model(embed)
            loss = CrossEntropy(preds, targets)
            score = f1_score(preds, targets)
            losses.append(loss.item())
            scores.append(score.item())
            loss.backward()
            optimizer.step()
        avg_loss = np.mean(losses)
        avg_score = np.mean(scores)
        print("Running Average TRAIN Loss : ", avg_loss)
        print("Running Average TRAIN F1-Score : ", avg_score)
        train_loss_history.append(avg_loss)
        train_f1score_history.append(avg_score)

        ## VALIDATION PHASE :
        model.eval()
        losses = []
        scores = []
        with torch.no_grad():
            for embed, targets in val_dataloader:
                embed, targets = embed.to(config.device), targets.to(config.device)
                preds = model(embed)
                loss = CrossEntropy(preds, targets)
                score = f1_score(preds, targets)
                losses.append(loss.item())
                scores.append(score.item())
        avg_loss = np.mean(losses)
        avg_score = np.mean(scores)
        print("Running Average VAL Loss : ", avg_loss)
        print("Running Average VAL F1-Score : ", avg_score)
        val_loss_history.append(avg_loss)
        val_f1score_history.append(avg_score)

        # Reduce the learning rate when the validation loss plateaus.
        scheduler.step(avg_loss)
        print("\n")

    print("TRAINING FINISHED")
    print("FINAL TRAINING SCORE : ", train_f1score_history[-1])
    print("FINAL VALIDATION SCORE : ", val_f1score_history[-1])

    losses_history = {"train": train_loss_history, "val": val_loss_history}
    scores_history = {"train": train_f1score_history, "val": val_f1score_history}

    return model, losses_history, scores_history

ems2_model, ems2_losses, ems2_scores = train_model(embeddings_source="EMS2", model_type="convolutional")

def predict(embeddings_source):

    test_dataset = ProteinSequenceDataset(datatype="test", embeddings_source=embeddings_source)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

    # Only the EMS2 model is trained in this script; t5_model and protbert_model
    # would need to be trained above before predicting with those sources.
    if embeddings_source == "T5":
        model = t5_model
    if embeddings_source == "ProtBERT":
        model = protbert_model
    if embeddings_source == "EMS2":
        model = ems2_model

    model.eval()

    # Recover the names of the top num_labels most frequent GO terms, matching
    # the order used to build the training target vectors.
    labels = pd.read_csv(config.train_labels_path, sep="\t")
    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = top_terms[:config.num_labels].index.values
    print("GENERATE PREDICTION FOR TEST SET...")

    # Preallocate one (protein, GO term, confidence) row per protein-term pair.
    ids_ = np.empty(shape=(len(test_dataloader) * config.num_labels,), dtype=object)
    go_terms_ = np.empty(shape=(len(test_dataloader) * config.num_labels,), dtype=object)
    confs_ = np.empty(shape=(len(test_dataloader) * config.num_labels,), dtype=np.float32)

    with torch.no_grad():
        for i, (embed, id) in tqdm(enumerate(test_dataloader)):
            embed = embed.to(config.device)
            confs_[i * config.num_labels:(i + 1) * config.num_labels] = torch.sigmoid(model(embed)).squeeze().cpu().numpy()
            ids_[i * config.num_labels:(i + 1) * config.num_labels] = id[0]
            go_terms_[i * config.num_labels:(i + 1) * config.num_labels] = labels_names

    submission_df = pd.DataFrame(data={"Id": ids_, "GO term": go_terms_, "Confidence": confs_})
    print("PREDICTIONS DONE")
    return submission_df

submission_df = predict("EMS2")

# This submission was obtained by training models on BlastP, Sprof, QuickGO and DeepGOZero offline.
submission2 = pd.read_csv('/kaggle/input/blast-quick-sprof-zero-pred/submission.tsv',
                          sep='\t', header=None, names=['Id2', 'GO term2', 'Confidence2'])

# Outer-merge so every (protein, GO term) pair from either source is kept.
subs = submission2.merge(submission_df, left_on=['Id2', 'GO term2'],
                         right_on=['Id', 'GO term'], how='outer')

# Pairs present only in submission_df have NaN in Id2/GO term2 after the outer
# merge; fill them from the model's columns before dropping those columns.
subs['Id2'] = subs['Id2'].fillna(subs['Id'])
subs['GO term2'] = subs['GO term2'].fillna(subs['GO term'])
subs.drop(['Id', 'GO term'], axis=1, inplace=True)
# Prefer the offline ensemble's confidence; fall back to the model's prediction.
subs['confidence_combined'] = subs.apply(lambda row: row['Confidence2'] if not np.isnan(row['Confidence2']) else row['Confidence'], axis=1)

subs[['Id2', 'GO term2', 'confidence_combined']].to_csv('submission.tsv', sep='\t', header=False, index=False)
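
# Note (illustrative, not in the original draft): the row-wise lambda above is
# equivalent to the vectorized form below, which is much faster on large frames.
# subs['confidence_combined'] = subs['Confidence2'].combine_first(subs['Confidence'])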