Commit 82ab072

committed: update
1 parent 74d2a10 commit 82ab072

11 files changed, +391 -165 lines changed

src/Build_dict.py

Lines changed: 17 additions & 1 deletion
@@ -310,11 +310,25 @@ def hpo_word_map(hpo_obo, outpath):
     json.dump(hpoid_word, fout,indent=2)
     fout.close()
 
+def alt_hpo(hpo_obo,outpath):
+    fout=open(outpath+'alt_hpoid.json','w',encoding='utf-8')
+    alt_hpoid={}
+    for hpoid in hpo_obo.keys():
+
+        if hpo_obo[hpoid]['is_obsolete']=='':
+            alt_hpoid[hpoid]=hpoid
+            for ele in hpo_obo[hpoid]['alt_id']:
+                if ele not in alt_hpoid.keys():
+                    alt_hpoid[ele]=hpoid
+                else:
+                    pass
+                    # print('alt_id:',ele,'old:',alt_hpoid[ele])
+    json.dump(alt_hpoid, fout ,indent=2)
 
 if __name__=="__main__":
 
     parser = argparse.ArgumentParser(description='build ontology dictionary, python Build_dict.py -i infile -o outpath -r rootnode')
-    parser.add_argument('--input', '-i', help="input the ontology .obo file",default='../ontology/hp.obo') # hp.obo CTD_diseases.obo chebi.obo
+    parser.add_argument('--input', '-i', help="input the ontology .obo file",default='../ontology/hp_202204.obo') # hp.obo CTD_diseases.obo chebi.obo
     parser.add_argument('--output', '-o', help="the output path of dictionary",default='../dict/')
     parser.add_argument('--rootnode','-r',help="input the root node of the ontology",nargs='+', default=['HP:0000118']) # HP:0000118 MESH:C CHEBI:24431
     args = parser.parse_args()
@@ -329,5 +343,7 @@ def hpo_word_map(hpo_obo, outpath):
 
     hpo_word_map(hpo_obo, args.output)
 
+    alt_hpo(hpo_obo,args.output)
+
     print('building dictionary done........')
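Note: the new alt_hpo() writes dict/alt_hpoid.json, which maps every non-obsolete primary HPO ID to itself and each of its alt_id entries (IDs merged into that term) to the primary ID. A minimal consumer sketch, not part of this commit (normalize_hpoid is a hypothetical helper; the example assumes the dictionary was built under ../dict/):

    import json

    # load the alternate-ID map written by alt_hpo()
    with open('../dict/alt_hpoid.json', encoding='utf-8') as f:
        alt_hpoid = json.load(f)

    def normalize_hpoid(hpoid):
        # map an alternate ID to its current primary ID;
        # fall back to the input if the ID is unknown
        return alt_hpoid.get(hpoid, hpoid)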

src/Build_distant_corpus.py

Lines changed: 2 additions & 2 deletions
@@ -160,8 +160,8 @@ def combine_pos_neg(outpath):
 
     parser = argparse.ArgumentParser(description='build distant training corpus, python Build_distant_corpus.py -d dictpath -f fileneg -n number_of_neg -o outpath')
     parser.add_argument('--dict', '-d', help="the input path of the ontology dictionary",default='../dict/')
-    parser.add_argument('--fileneg', '-f', help="the text file used to generate the negatives",default='../mutation_disease.txt')
-    parser.add_argument('--negnum', '-n', help="the number of negatives",type=int, default=10000)
+    parser.add_argument('--fileneg', '-f', help="the text file used to generate the negatives",default='../data/mutation_disease.txt')
+    parser.add_argument('--negnum', '-n', help="the number of negatives",type=int, default=50000)
     parser.add_argument('--output', '-o', help="the output folder of the distantly-supervised training dataset",default='../data/distant_train_data/')
     args = parser.parse_args()
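Note: the updated defaults read the negatives text from ../data/mutation_disease.txt and sample 50000 negatives instead of 10000. An invocation matching the new defaults, following the script's own usage string:

    python Build_distant_corpus.py -d ../dict/ -f ../data/mutation_disease.txt -n 50000 -o ../data/distant_train_data/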

src/HPO_evaluation.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 12 11:33:22 2020
+
+@author: luol2
+"""
+import argparse
+from nn_model import bioTag_CNN,bioTag_BERT
+from dic_ner import dic_ont
+from evaluate import GSCplus_corpus,JAX_corpus
+from tagging_text import bioTag
+import os
+import time
+import json
+import tensorflow as tf
+
+'''
+config = tf.ConfigProto()
+config.gpu_options.allow_growth = True
+session = tf.Session(config=config)
+'''
+def run_gsc_test(files,biotag_dic,nn_model):
+
+    fin_test=open(files['testfile'],'r',encoding='utf-8')
+    all_test=fin_test.read().strip().split('\n\n')
+    fin_test.close()
+    test_out=open(files['outfile'],'w',encoding='utf-8')
+    #i=0
+    for doc_test in all_test:
+        #i+=1
+        #print(i)
+        lines=doc_test.split('\n')
+        pmid = lines[0]
+        test_result=bioTag(lines[1],biotag_dic,nn_model,onlyLongest=False,abbrRecog=False,Threshold=0.95)
+        test_out.write(pmid+'\n'+lines[1]+'\n')
+        for ele in test_result:
+            test_out.write(ele[0]+'\t'+ele[1]+'\t'+lines[1][int(ele[0]):int(ele[1])]+'\t'+ele[2]+'\t'+ele[3]+'\n')
+        test_out.write('\n')
+    test_out.close()
+    GSCplus_corpus(files['outfile'],files['testfile'],subtree=True)
+
+def run_jax_test(files,biotag_dic,nn_model):
+    inpath=files['testfile']
+    test_out=open(files['outfile'],'w',encoding='utf-8')
+    i=0
+    preds_result={}
+    for file in os.listdir(inpath):
+        i+=1
+        print(i)
+        pmid=file[:-4]
+        temp_result=[]
+        fin=open(inpath+file,'r',encoding='utf-8')
+        intext=fin.read().rstrip()
+        fin.close()
+        test_result=bioTag(intext,biotag_dic,nn_model,onlyLongest=False,abbrRecog=True,Threshold=0.95)
+        for ele in test_result:
+            if ele not in temp_result:
+                temp_result.append(ele)
+        preds_result[pmid]=temp_result
+    json.dump(preds_result, test_out ,indent=2)
+    test_out.close()
+    JAX_corpus(files['outfile'], files['goldfile'])
+
+
+if __name__=="__main__":
+
+    parser = argparse.ArgumentParser(description='evaluate PhenoTagger on the HPO corpora, python HPO_evaluation.py -m modeltype -c corpus -o outfile')
+    parser.add_argument('--modeltype', '-m', help="the model type (cnn or biobert or bioformer?)",default='biobert')
+    parser.add_argument('--corpus', '-c', help="HPO corpus (gsc or jax?)",default='jax')
+    parser.add_argument('--output', '-o', help="the output prediction file",default='../results/gsc_bioformer_new1.tsv')
+
+    args = parser.parse_args()
+    model_type=args.modeltype
+    test_set=args.corpus
+
+    ontfiles={'dic_file':'../dict/noabb_lemma.dic',
+              'word_hpo_file':'../dict/word_id_map.json',
+              'hpo_word_file':'../dict/id_word_map.json'}
+    biotag_dic=dic_ont(ontfiles)
+
+    if model_type=='cnn':
+        vocabfiles={'w2vfile':'../models_v1.1/bio_embedding_intrinsic.d200',
+                    'charfile':'../dict/char.vocab',
+                    'labelfile':'../dict/lable.vocab',
+                    'posfile':'../dict/pos.vocab'}
+        modelfile='../models_v1.1/cnn_hpo_v1.1.h5'
+        nn_model=bioTag_CNN(vocabfiles)
+        nn_model.load_model(modelfile)
+    elif model_type=='biobert':
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/biobert_v11_pubmed/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/biobert_v11_pubmed/model.ckpt-1000000',
+                    'vocab_path':'../models_v1.1/biobert_v11_pubmed/vocab.txt'}
+        modelfile='../models_v1.1/biobert_hpo_v1.1.h5'
+        nn_model=bioTag_BERT(vocabfiles)
+        nn_model.load_model(modelfile)
+    else:
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/bioformer-cased-v1.0/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
+                    'vocab_path':'../models_v1.1/bioformer-cased-v1.0/vocab.txt'}
+        modelfile='../models_v1.1/bioformer_hpo_v1.1.h5'
+        nn_model=bioTag_BERT(vocabfiles)
+        nn_model.load_model(modelfile)
+
+    if test_set=='gsc':
+        files={'testfile':'../data/corpus/GSC/GSCplus_test_gold.tsv',
+               'outfile':'../results/gsc_test_bioformer_p5n5.tsv'}
+        files['outfile']=args.output
+        start_time=time.time()
+        run_gsc_test(files,biotag_dic,nn_model)
+        print('gsc done:',time.time()-start_time)
+    else:
+        files={'testfile':'../data/corpus/JAX/txt/',
+               'goldfile':'../data/corpus/JAX/JAX_gold.json',
+               'outfile':'../results/jax_test_bert_p5n5.json'}
+        start_time=time.time()
+        files['outfile']=args.output
+        run_jax_test(files,biotag_dic,nn_model)
+        print('jax done:',time.time()-start_time)
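Note: the new script loads one of three taggers (cnn, biobert, or Bioformer for any other -m value), tags the chosen corpus with bioTag, writes predictions to --output, and scores them with GSCplus_corpus or JAX_corpus. A sample run, assuming the models_v1.1 checkpoints and the GSC corpus are in place (the output filename is illustrative):

    python HPO_evaluation.py -m bioformer -c gsc -o ../results/gsc_bioformer.tsv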

src/PhenoTagger_tagging.py

Lines changed: 19 additions & 10 deletions
@@ -40,6 +40,8 @@ def PubTator_Converter(infile,outfile,biotag_dic,nn_model,para_set):
         fout.write(pmid+"|a|"+abstract+"\n")
     else: # annotation
         intext=title+' '+abstract
+        #print('..........',pmid)
+        #print(intext)
         tag_result=bioTag(intext,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
         for ele in tag_result:
             start = ele[0]
@@ -59,9 +61,10 @@ def BioC_Converter(infile,outfile,biotag_dic,nn_model,para_set):
     with open(outfile,'w', encoding='utf8') as fout:
         collection = bioc.load(fin)
         for document in collection.documents:
+            mention_num=0
             for passage in document.passages:
-                tag_result=bioTag(passage.text,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
-                mention_num=0
+                passage_offset=passage.offset
+                tag_result=bioTag(passage.text,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
                 for ele in tag_result:
                     bioc_note = bioc.BioCAnnotation()
                     bioc_note.id = str(mention_num)
@@ -71,7 +74,7 @@ def BioC_Converter(infile,outfile,biotag_dic,nn_model,para_set):
                     bioc_note.infons['score'] = ele[3]
                     start = int(ele[0])
                     last = int(ele[1])
-                    loc = bioc.BioCLocation(offset=str(start), length= str(last-start))
+                    loc = bioc.BioCLocation(offset=str(passage_offset+start), length= str(last-start))
                     bioc_note.locations.append(loc)
                     bioc_note.text = passage.text[start:last]
                     passage.annotations.append(bioc_note)
@@ -84,17 +87,23 @@ def phenotagger_tag(infolder,para_set,outfolder):
                 'hpo_word_file':'../dict/id_word_map.json'}
 
     if para_set['model_type']=='cnn':
-        vocabfiles={'w2vfile':'../models/bio_embedding_intrinsic.d200',
+        vocabfiles={'w2vfile':'../models_v1.1/bio_embedding_intrinsic.d200',
                     'charfile':'../dict/char.vocab',
                     'labelfile':'../dict/lable.vocab',
                     'posfile':'../dict/pos.vocab'}
-        modelfile='../models/cnn_hpo.h5'
+        modelfile='../models_v1.1/cnn_hpo_v1.1.h5'
+    elif para_set['model_type']=='bioformer':
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/bioformer-cased-v1.0/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
+                    'vocab_path':'../models_v1.1/bioformer-cased-v1.0/vocab.txt'}
+        modelfile='../models_v1.1/bioformer_hpo_v1.1.h5'
     else:
         vocabfiles={'labelfile':'../dict/lable.vocab',
-                    'config_path':'../models/biobert_v11_pubmed/bert_config.json',
-                    'checkpoint_path':'../models/biobert_v11_pubmed/model.ckpt-1000000',
-                    'vocab_path':'../models/biobert_v11_pubmed/vocab.txt'}
-        modelfile='../models/biobert_hpo.h5'
+                    'config_path':'../models_v1.1/biobert_v11_pubmed/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/biobert_v11_pubmed/model.ckpt-1000000',
+                    'vocab_path':'../models_v1.1/biobert_v11_pubmed/vocab.txt'}
+        modelfile='../models_v1.1/biobert_hpo_v1.1.h5'
 
     # loading dict and model
 
@@ -153,7 +162,7 @@ def phenotagger_tag(infolder,para_set,outfolder):
         os.makedirs(args.outfolder)
 
     para_set={
-        'model_type':'biobert', # cnn or biobert
+        'model_type':'bioformer', # cnn, bioformer, or biobert
        'onlyLongest':False, # False: return overlap concepts, True only longest
        'abbrRecog':True, # False: don't identify abbr, True: identify abbr
        'ML_Threshold':0.95, # the Threshold of deep learning model
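Note: the BioC_Converter changes fix two issues. First, bioTag returns spans local to passage.text while BioC locations count from the start of the document, so the passage offset is now added. Second, mention_num is reset once per document rather than once per passage, keeping annotation IDs unique across a document's passages. A minimal sketch of the offset arithmetic, assuming the bioc package used by the script (values are illustrative):

    import bioc

    passage = bioc.BioCPassage()
    passage.offset = 120                  # passage starts at character 120 of the document
    passage.text = 'Patients showed short stature and seizures.'

    start, last = 16, 29                  # tagger span, local to passage.text
    mention = passage.text[start:last]    # 'short stature'
    loc = bioc.BioCLocation(offset=str(passage.offset + start),  # 136, document-level
                            length=str(last - start))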

src/PhenoTagger_training.py

Lines changed: 23 additions & 7 deletions
@@ -43,7 +43,7 @@ def CNN_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
     trainfile=trainfiles['trainfile']
     train_set,train_label = ml_intext(trainfile)
 
-    train_x, train_y = cnn_model.rep.represent_instances_all_feas(train_set,train_label,word_max_len=cnn_model.hyper['sen_max'],char_max_len=cnn_model.hyper['word_max'])
+    train_x, train_y = cnn_model.rep.represent_instances_all_feas(train_set,train_label,word_max_len=cnn_model.hyper['sen_max'],char_max_len=cnn_model.hyper['word_max'],training=True)
     input_train = []
 
     if cnn_model.fea_dict['word'] == 1:
@@ -103,7 +103,7 @@ def BERT_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
 
     train_set,train_label = ml_intext(trainfile)
 
-    train_x,train_y=bert_model.rep.load_data(train_set,train_label,word_max_len=bert_model.maxlen)
+    train_x,train_y=bert_model.rep.load_data(train_set,train_label,word_max_len=bert_model.maxlen,training=True)
 
     bert_model.model.compile(optimizer=Adam(1e-5),loss='categorical_crossentropy',metrics=['categorical_accuracy'])
 
@@ -145,15 +145,15 @@ def BERT_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
     parser = argparse.ArgumentParser(description='train PhenoTagger, python PhenoTagger_training.py -t trainfile -d devfile -m modeltype -o outpath')
     parser.add_argument('--trainfile', '-t', help="the training file",default='../data/distant_train_data/distant_train.conll')
     parser.add_argument('--devfile', '-d', help="the development set file",default='none')
-    parser.add_argument('--modeltype', '-m', help="deep learning model (cnn or biobert?)",default='biobert')
+    parser.add_argument('--modeltype', '-m', help="deep learning model (cnn, bioformer or biobert?)",default='bioformer')
     parser.add_argument('--output', '-o', help="the model output folder",default='../newmodels/')
     args = parser.parse_args()
 
     if not os.path.exists(args.output):
         os.makedirs(args.output)
 
     if args.modeltype=='cnn':
-        vocabfiles={'w2vfile':'../models/bio_embedding_intrinsic.d200',
+        vocabfiles={'w2vfile':'../models_v1.1/bio_embedding_intrinsic.d200',
                     'charfile':'../dict/char.vocab',
                     'labelfile':'../dict/lable.vocab',
                     'posfile':'../dict/pos.vocab'}
@@ -167,12 +167,28 @@ def BERT_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
         modelfile=args.output+'cnn.h5'
         CNN_training(trainfiles,vocabfiles,modelfile)
 
+    elif args.modeltype=='bioformer':
+
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/bioformer-cased-v1.0/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
+                    'vocab_path':'../models_v1.1/bioformer-cased-v1.0/vocab.txt'}
+
+        trainfiles={'trainfile':' ',
+                    'devfile':' ',
+                    'devout':' '}
+        trainfiles['trainfile']=args.trainfile
+        trainfiles['devfile']=args.devfile
+        trainfiles['devout']=args.output+'biobert_dev_temp.tsv'
+        modelfile=args.output+'bioformer.h5'
+        BERT_training(trainfiles,vocabfiles,modelfile)
     else:
 
         vocabfiles={'labelfile':'../dict/lable.vocab',
-                    'config_path':'../models/biobert_v11_pubmed/bert_config.json',
-                    'checkpoint_path':'../models/biobert_v11_pubmed/model.ckpt-1000000',
-                    'vocab_path':'../models/biobert_v11_pubmed/vocab.txt'}
+                    'config_path':'../models_v1.1/biobert_v11_pubmed/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/biobert_v11_pubmed/model.ckpt-1000000',
+                    'vocab_path':'../models_v1.1/biobert_v11_pubmed/vocab.txt'}
 
 
         trainfiles={'trainfile':' ',
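Note: the new bioformer branch reuses BERT_training() with the Bioformer config and checkpoint, and both trainers now pass training=True when building input representations. Training a Bioformer model on the distant corpus, following the script's usage string (paths mirror the argparse defaults):

    python PhenoTagger_training.py -t ../data/distant_train_data/distant_train.conll -m bioformer -o ../newmodels/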
