Commit 82ab072

committed: update
1 parent 74d2a10 commit 82ab072

11 files changed, +391 -165 lines changed

src/Build_dict.py

Lines changed: 17 additions & 1 deletion
@@ -310,11 +310,25 @@ def hpo_word_map(hpo_obo, outpath):
     json.dump(hpoid_word, fout,indent=2)
     fout.close()
 
+def alt_hpo(hpo_obo,outpath):
+    fout=open(outpath+'alt_hpoid.json','w',encoding='utf-8')
+    alt_hpoid={}
+    for hpoid in hpo_obo.keys():
+
+        if hpo_obo[hpoid]['is_obsolete']=='':
+            alt_hpoid[hpoid]=hpoid
+            for ele in hpo_obo[hpoid]['alt_id']:
+                if ele not in alt_hpoid.keys():
+                    alt_hpoid[ele]=hpoid
+                else:
+                    pass
+                    # print('alt_id:',ele,'old:',alt_hpoid[ele])
+    json.dump(alt_hpoid, fout ,indent=2)
 
 if __name__=="__main__":
 
     parser = argparse.ArgumentParser(description='build ontology dictionary, python Build_dict.py -i infile -o outpath -r rootnode')
-    parser.add_argument('--input', '-i', help="input the ontology .obo file",default='../ontology/hp.obo') # hp.obo CTD_diseases.obo chebi.obo
+    parser.add_argument('--input', '-i', help="input the ontology .obo file",default='../ontology/hp_202204.obo') # hp.obo CTD_diseases.obo chebi.obo
     parser.add_argument('--output', '-o', help="the output path of dictionary",default='../dict/')
     parser.add_argument('--rootnode','-r',help="input the root node of the ontology",nargs='+', default=['HP:0000118']) # HP:0000118 MESH:C CHEBI:24431
     args = parser.parse_args()
@@ -329,5 +343,7 @@ def hpo_word_map(hpo_obo, outpath):
 
     hpo_word_map(hpo_obo, args.output)
 
+    alt_hpo(hpo_obo,args.output)
+
     print('building dictionary done........')
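Note: the new alt_hpo() writes dict/alt_hpoid.json, which maps every non-obsolete primary HPO ID to itself and each of its alt_id entries (IDs merged into that term) to the primary ID. A minimal consumer sketch, not part of this commit (normalize_hpoid is a hypothetical helper; the example assumes the dictionary was built under ../dict/):

    import json

    # load the alternate-ID map written by alt_hpo()
    with open('../dict/alt_hpoid.json', encoding='utf-8') as f:
        alt_hpoid = json.load(f)

    def normalize_hpoid(hpoid):
        # map an alternate ID to its current primary ID;
        # fall back to the input if the ID is unknown
        return alt_hpoid.get(hpoid, hpoid)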

src/Build_distant_corpus.py

Lines changed: 2 additions & 2 deletions
@@ -160,8 +160,8 @@ def combine_pos_neg(outpath):
 
     parser = argparse.ArgumentParser(description='build distant training corpus, python Build_distant_corpus.py -d dictpath -f fileneg -n number_of_neg -o outpath')
     parser.add_argument('--dict', '-d', help="the input path of the ontology dictionary",default='../dict/')
-    parser.add_argument('--fileneg', '-f', help="the text file used to generate the negatives",default='../mutation_disease.txt')
-    parser.add_argument('--negnum', '-n', help="the number of negatives",type=int, default=10000)
+    parser.add_argument('--fileneg', '-f', help="the text file used to generate the negatives",default='../data/mutation_disease.txt')
+    parser.add_argument('--negnum', '-n', help="the number of negatives",type=int, default=50000)
     parser.add_argument('--output', '-o', help="the output folder of the distantly-supervised training dataset",default='../data/distant_train_data/')
     args = parser.parse_args()
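Note: the updated defaults read the negatives text from ../data/mutation_disease.txt and sample 50000 negatives instead of 10000. An invocation matching the new defaults, following the script's own usage string:

    python Build_distant_corpus.py -d ../dict/ -f ../data/mutation_disease.txt -n 50000 -o ../data/distant_train_data/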

src/HPO_evaluation.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 12 11:33:22 2020
+
+@author: luol2
+"""
+import argparse
+from nn_model import bioTag_CNN,bioTag_BERT
+from dic_ner import dic_ont
+from evaluate import GSCplus_corpus,JAX_corpus
+from tagging_text import bioTag
+import os
+import time
+import json
+import tensorflow as tf
+
+'''
+config = tf.ConfigProto()
+config.gpu_options.allow_growth = True
+session = tf.Session(config=config)
+'''
+def run_gsc_test(files,biotag_dic,nn_model):
+
+    fin_test=open(files['testfile'],'r',encoding='utf-8')
+    all_test=fin_test.read().strip().split('\n\n')
+    fin_test.close()
+    test_out=open(files['outfile'],'w',encoding='utf-8')
+    #i=0
+    for doc_test in all_test:
+        #i+=1
+        #print(i)
+        lines=doc_test.split('\n')
+        pmid = lines[0]
+        test_result=bioTag(lines[1],biotag_dic,nn_model,onlyLongest=False,abbrRecog=False,Threshold=0.95)
+        test_out.write(pmid+'\n'+lines[1]+'\n')
+        for ele in test_result:
+            test_out.write(ele[0]+'\t'+ele[1]+'\t'+lines[1][int(ele[0]):int(ele[1])]+'\t'+ele[2]+'\t'+ele[3]+'\n')
+        test_out.write('\n')
+    test_out.close()
+    GSCplus_corpus(files['outfile'],files['testfile'],subtree=True)
+
+def run_jax_test(files,biotag_dic,nn_model):
+    inpath=files['testfile']
+    test_out=open(files['outfile'],'w',encoding='utf-8')
+    i=0
+    preds_result={}
+    for file in os.listdir(inpath):
+        i+=1
+        print(i)
+        pmid=file[:-4]
+        temp_result=[]
+        fin=open(inpath+file,'r',encoding='utf-8')
+        intext=fin.read().rstrip()
+        fin.close()
+        test_result=bioTag(intext,biotag_dic,nn_model,onlyLongest=False,abbrRecog=True,Threshold=0.95)
+        for ele in test_result:
+            if ele not in temp_result:
+                temp_result.append(ele)
+        preds_result[pmid]=temp_result
+    json.dump(preds_result, test_out ,indent=2)
+    test_out.close()
+    JAX_corpus(files['outfile'], files['goldfile'])
+
+
+if __name__=="__main__":
+
+    parser = argparse.ArgumentParser(description='evaluate PhenoTagger on the HPO corpora, python HPO_evaluation.py -m modeltype -c corpus -o outfile')
+    parser.add_argument('--modeltype', '-m', help="the model type (cnn or biobert or bioformer?)",default='biobert')
+    parser.add_argument('--corpus', '-c', help="HPO corpus (gsc or jax?)",default='jax')
+    parser.add_argument('--output', '-o', help="the output prediction file",default='../results/gsc_bioformer_new1.tsv')
+
+    args = parser.parse_args()
+    model_type=args.modeltype
+    test_set=args.corpus
+
+    ontfiles={'dic_file':'../dict/noabb_lemma.dic',
+              'word_hpo_file':'../dict/word_id_map.json',
+              'hpo_word_file':'../dict/id_word_map.json'}
+    biotag_dic=dic_ont(ontfiles)
+
+    if model_type=='cnn':
+        vocabfiles={'w2vfile':'../models_v1.1/bio_embedding_intrinsic.d200',
+                    'charfile':'../dict/char.vocab',
+                    'labelfile':'../dict/lable.vocab',
+                    'posfile':'../dict/pos.vocab'}
+        modelfile='../models_v1.1/cnn_hpo_v1.1.h5'
+        nn_model=bioTag_CNN(vocabfiles)
+        nn_model.load_model(modelfile)
+    elif model_type=='biobert':
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/biobert_v11_pubmed/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/biobert_v11_pubmed/model.ckpt-1000000',
+                    'vocab_path':'../models_v1.1/biobert_v11_pubmed/vocab.txt'}
+        modelfile='../models_v1.1/biobert_hpo_v1.1.h5'
+        nn_model=bioTag_BERT(vocabfiles)
+        nn_model.load_model(modelfile)
+    else:
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/bioformer-cased-v1.0/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
+                    'vocab_path':'../models_v1.1/bioformer-cased-v1.0/vocab.txt'}
+        modelfile='../models_v1.1/bioformer_hpo_v1.1.h5'
+        nn_model=bioTag_BERT(vocabfiles)
+        nn_model.load_model(modelfile)
+
+    if test_set=='gsc':
+        files={'testfile':'../data/corpus/GSC/GSCplus_test_gold.tsv',
+               'outfile':'../results/gsc_test_bioformer_p5n5.tsv'}
+        files['outfile']=args.output
+        start_time=time.time()
+        run_gsc_test(files,biotag_dic,nn_model)
+        print('gsc done:',time.time()-start_time)
+    else:
+        files={'testfile':'../data/corpus/JAX/txt/',
+               'goldfile':'../data/corpus/JAX/JAX_gold.json',
+               'outfile':'../results/jax_test_bert_p5n5.json'}
+        start_time=time.time()
+        files['outfile']=args.output
+        run_jax_test(files,biotag_dic,nn_model)
+        print('jax done:',time.time()-start_time)
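Note: the new script loads one of three taggers (cnn, biobert, or Bioformer for any other -m value), tags the chosen corpus with bioTag, writes predictions to --output, and scores them with GSCplus_corpus or JAX_corpus. A sample run, assuming the models_v1.1 checkpoints and the GSC corpus are in place (the output filename is illustrative):

    python HPO_evaluation.py -m bioformer -c gsc -o ../results/gsc_bioformer.tsv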

src/PhenoTagger_tagging.py

Lines changed: 19 additions & 10 deletions
@@ -40,6 +40,8 @@ def PubTator_Converter(infile,outfile,biotag_dic,nn_model,para_set):
         fout.write(pmid+"|a|"+abstract+"\n")
     else: # annotation
         intext=title+' '+abstract
+        #print('..........',pmid)
+        #print(intext)
         tag_result=bioTag(intext,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
         for ele in tag_result:
             start = ele[0]
@@ -59,9 +61,10 @@ def BioC_Converter(infile,outfile,biotag_dic,nn_model,para_set):
     with open(outfile,'w', encoding='utf8') as fout:
         collection = bioc.load(fin)
         for document in collection.documents:
+            mention_num=0
             for passage in document.passages:
-                tag_result=bioTag(passage.text,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
-                mention_num=0
+                passage_offset=passage.offset
+                tag_result=bioTag(passage.text,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
                 for ele in tag_result:
                     bioc_note = bioc.BioCAnnotation()
                     bioc_note.id = str(mention_num)
@@ -71,7 +74,7 @@ def BioC_Converter(infile,outfile,biotag_dic,nn_model,para_set):
                     bioc_note.infons['score'] = ele[3]
                     start = int(ele[0])
                     last = int(ele[1])
-                    loc = bioc.BioCLocation(offset=str(start), length= str(last-start))
+                    loc = bioc.BioCLocation(offset=str(passage_offset+start), length= str(last-start))
                     bioc_note.locations.append(loc)
                     bioc_note.text = passage.text[start:last]
                     passage.annotations.append(bioc_note)
@@ -84,17 +87,23 @@ def phenotagger_tag(infolder,para_set,outfolder):
                 'hpo_word_file':'../dict/id_word_map.json'}
 
     if para_set['model_type']=='cnn':
-        vocabfiles={'w2vfile':'../models/bio_embedding_intrinsic.d200',
+        vocabfiles={'w2vfile':'../models_v1.1/bio_embedding_intrinsic.d200',
                     'charfile':'../dict/char.vocab',
                     'labelfile':'../dict/lable.vocab',
                     'posfile':'../dict/pos.vocab'}
-        modelfile='../models/cnn_hpo.h5'
+        modelfile='../models_v1.1/cnn_hpo_v1.1.h5'
+    elif para_set['model_type']=='bioformer':
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/bioformer-cased-v1.0/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
+                    'vocab_path':'../models_v1.1/bioformer-cased-v1.0/vocab.txt'}
+        modelfile='../models_v1.1/bioformer_hpo_v1.1.h5'
     else:
         vocabfiles={'labelfile':'../dict/lable.vocab',
-                    'config_path':'../models/biobert_v11_pubmed/bert_config.json',
-                    'checkpoint_path':'../models/biobert_v11_pubmed/model.ckpt-1000000',
-                    'vocab_path':'../models/biobert_v11_pubmed/vocab.txt'}
-        modelfile='../models/biobert_hpo.h5'
+                    'config_path':'../models_v1.1/biobert_v11_pubmed/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/biobert_v11_pubmed/model.ckpt-1000000',
+                    'vocab_path':'../models_v1.1/biobert_v11_pubmed/vocab.txt'}
+        modelfile='../models_v1.1/biobert_hpo_v1.1.h5'
 
     # loading dict and model
 
@@ -153,7 +162,7 @@ def phenotagger_tag(infolder,para_set,outfolder):
         os.makedirs(args.outfolder)
 
     para_set={
-        'model_type':'biobert', # cnn or biobert
+        'model_type':'bioformer', # cnn, bioformer, or biobert
        'onlyLongest':False, # False: return overlap concepts, True only longest
        'abbrRecog':True, # False: don't identify abbr, True: identify abbr
        'ML_Threshold':0.95, # the Threshold of deep learning model
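Note: the BioC_Converter changes fix two issues. First, bioTag returns spans local to passage.text while BioC locations count from the start of the document, so the passage offset is now added. Second, mention_num is reset once per document rather than once per passage, keeping annotation IDs unique across a document's passages. A minimal sketch of the offset arithmetic, assuming the bioc package used by the script (values are illustrative):

    import bioc

    passage = bioc.BioCPassage()
    passage.offset = 120                  # passage starts at character 120 of the document
    passage.text = 'Patients showed short stature and seizures.'

    start, last = 16, 29                  # tagger span, local to passage.text
    mention = passage.text[start:last]    # 'short stature'
    loc = bioc.BioCLocation(offset=str(passage.offset + start),  # 136, document-level
                            length=str(last - start))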

src/PhenoTagger_training.py

Lines changed: 23 additions & 7 deletions
@@ -43,7 +43,7 @@ def CNN_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
     trainfile=trainfiles['trainfile']
     train_set,train_label = ml_intext(trainfile)
 
-    train_x, train_y = cnn_model.rep.represent_instances_all_feas(train_set,train_label,word_max_len=cnn_model.hyper['sen_max'],char_max_len=cnn_model.hyper['word_max'])
+    train_x, train_y = cnn_model.rep.represent_instances_all_feas(train_set,train_label,word_max_len=cnn_model.hyper['sen_max'],char_max_len=cnn_model.hyper['word_max'],training=True)
     input_train = []
 
     if cnn_model.fea_dict['word'] == 1:
@@ -103,7 +103,7 @@ def BERT_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
 
     train_set,train_label = ml_intext(trainfile)
 
-    train_x,train_y=bert_model.rep.load_data(train_set,train_label,word_max_len=bert_model.maxlen)
+    train_x,train_y=bert_model.rep.load_data(train_set,train_label,word_max_len=bert_model.maxlen,training=True)
 
     bert_model.model.compile(optimizer=Adam(1e-5),loss='categorical_crossentropy',metrics=['categorical_accuracy'])
 
@@ -145,15 +145,15 @@ def BERT_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
     parser = argparse.ArgumentParser(description='train PhenoTagger, python PhenoTagger_training.py -t trainfile -d devfile -m modeltype -o outpath')
     parser.add_argument('--trainfile', '-t', help="the training file",default='../data/distant_train_data/distant_train.conll')
     parser.add_argument('--devfile', '-d', help="the development set file",default='none')
-    parser.add_argument('--modeltype', '-m', help="deep learning model (cnn or biobert?)",default='biobert')
+    parser.add_argument('--modeltype', '-m', help="deep learning model (cnn, bioformer or biobert?)",default='bioformer')
     parser.add_argument('--output', '-o', help="the model output folder",default='../newmodels/')
     args = parser.parse_args()
 
     if not os.path.exists(args.output):
         os.makedirs(args.output)
 
     if args.modeltype=='cnn':
-        vocabfiles={'w2vfile':'../models/bio_embedding_intrinsic.d200',
+        vocabfiles={'w2vfile':'../models_v1.1/bio_embedding_intrinsic.d200',
                     'charfile':'../dict/char.vocab',
                     'labelfile':'../dict/lable.vocab',
                     'posfile':'../dict/pos.vocab'}
@@ -167,12 +167,28 @@ def BERT_training(trainfiles,vocabfiles,modelfile,EPOCH=50):
         modelfile=args.output+'cnn.h5'
         CNN_training(trainfiles,vocabfiles,modelfile)
 
+    elif args.modeltype=='bioformer':
+
+        vocabfiles={'labelfile':'../dict/lable.vocab',
+                    'config_path':'../models_v1.1/bioformer-cased-v1.0/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
+                    'vocab_path':'../models_v1.1/bioformer-cased-v1.0/vocab.txt'}
+
+        trainfiles={'trainfile':' ',
+                    'devfile':' ',
+                    'devout':' '}
+        trainfiles['trainfile']=args.trainfile
+        trainfiles['devfile']=args.devfile
+        trainfiles['devout']=args.output+'biobert_dev_temp.tsv'
+        modelfile=args.output+'bioformer.h5'
+        BERT_training(trainfiles,vocabfiles,modelfile)
     else:
 
         vocabfiles={'labelfile':'../dict/lable.vocab',
-                    'config_path':'../models/biobert_v11_pubmed/bert_config.json',
-                    'checkpoint_path':'../models/biobert_v11_pubmed/model.ckpt-1000000',
-                    'vocab_path':'../models/biobert_v11_pubmed/vocab.txt'}
+                    'config_path':'../models_v1.1/biobert_v11_pubmed/bert_config.json',
+                    'checkpoint_path':'../models_v1.1/biobert_v11_pubmed/model.ckpt-1000000',
+                    'vocab_path':'../models_v1.1/biobert_v11_pubmed/vocab.txt'}
 
 
         trainfiles={'trainfile':' ',
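Note: the new bioformer branch reuses BERT_training() with the Bioformer config and checkpoint, and both trainers now pass training=True when building input representations. Training a Bioformer model on the distant corpus, following the script's usage string (paths mirror the argparse defaults):

    python PhenoTagger_training.py -t ../data/distant_train_data/distant_train.conll -m bioformer -o ../newmodels/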
