美文网首页NLP
NER实体识别

NER实体识别

作者: 潇萧之炎 | 来源:发表于2019-05-16 08:50 被阅读73次

    main

    #-*-encoding=utf8-*-
    
    from flask import jsonify  # 轻量级flask部署用的
    from flask import Flask
    from flask import request
    import json
    import platform
    import codecs
    import logging
    import itertools
    from collections import OrderedDict  # 字数、词频的统计
    import os  # 目录文件增删改查
    import sys  # 打断点,获取路径
    from gevent import monkey  # 优化flask,部署线程优化
    monkey.patch_all()
    from gevent import wsgi
    import tensorflow as tf
    import numpy as np
    from model import Model
    from loader import load_sentences, update_tag_scheme
    from loader import char_mapping, tag_mapping   #char_mapping, tag_mapping字符映射,tag映射,都是预处理
    from loader import augment_with_pretrained, prepare_dataset #预训练,做准备
    from utils import get_logger,load_config,create_model #通过create_model加载 Model里面的模型
    from utils import make_path
    
    from data_utils import load_word2vec, create_input, input_from_line, BatchManager
    
    # Make modules that live in the working directory importable.
    currentPath = os.getcwd()
    sys.path.append(currentPath)

    # Pick the pickle implementation by interpreter major version.
    # sys.version_info is compared numerically; comparing the sys.version
    # string against '3' is lexicographic and only works by accident.
    if sys.version_info[0] >= 3:
        pyversion = 'three'
        import pickle
    else:
        pyversion = 'two'
        import cPickle, pickle

    # Working directory with a trailing path separator, used below to build
    # the default data-file paths.
    root_path = os.getcwd() + os.sep
    flags = tf.app.flags  # flag registry; every parameter defined below is stored in it
    # Each definition is (flag name, default value, help text).
    flags.DEFINE_boolean("clean",       True,      "clean train folder")  # wipe the previous training output
    flags.DEFINE_boolean("train",       True,      "Whether train the model")  # whether to train the model
    # configurations for the model
    flags.DEFINE_integer("seg_dim",     20,         "Embedding size for segmentation, 0 if not used")  # width of the extra segmentation embedding
    # Author's notes on the segmentation feature:
    # the labels y carry both an entity type and a position, so the input x is
    # given a position feature as well (the BIOES scheme itself only labels y).
    # Each character therefore carries two pieces of information:
    #   1. its own 100-dimensional character vector, 2. a 20-dimensional position vector.
    # Example ",急性呼吸道感染" -> 0 1 2 2 2 2 2 3:
    #   the comma is 0, the first character of a word is 1, the last is 3, everything in between is 2.
    # The position id is embedded into 20 dimensions and concatenated with the
    # 100-dimensional character vector, so one input position is 100 + 20 = 120 wide.
    flags.DEFINE_integer("char_dim",    100,        "Embedding size for characters")  # character embedding width
    flags.DEFINE_integer("lstm_dim",    100,        "Num of hidden units in LSTM, or num of filters in IDCNN")  # hidden size
    flags.DEFINE_string("tag_schema",   "iobes",    "tagging schema iobes or iob")  # position scheme used for the labels
    
    # configurations for training
    flags.DEFINE_float("clip",          5,          "Gradient clip")  # gradient clipping threshold
    flags.DEFINE_float("dropout",       0.5,        "Dropout rate")
    # NOTE(review): batch_size is declared as a float flag although it is a count — confirm consumers cast it.
    flags.DEFINE_float("batch_size",    20,         "batch size")
    flags.DEFINE_float("lr",            0.001,      "Initial learning rate")
    flags.DEFINE_string("optimizer",    "adam",     "Optimizer for training")  # checked against adam / sgd / adagrad below
    flags.DEFINE_boolean("pre_emb",     True,       "Wither use pre-trained embedding")  # use pretrained character vectors
    flags.DEFINE_boolean("zeros",       True,      "Wither replace digits with zero")  # per the help text: replace digits with zero
    flags.DEFINE_boolean("lower",       False,       "Wither lower case")  # lower-casing is not needed for this data set
    
    flags.DEFINE_integer("max_epoch",   100,        "maximum training epochs")  # author's note: 5000-10000 suggested
    flags.DEFINE_integer("steps_check", 100,        "steps per checkpoint")  # author's note: report the loss every 100 batches
    flags.DEFINE_string("ckpt_path",    "ckpt",      "Path to save model")  # directory for model checkpoints
    flags.DEFINE_string("summary_path", "summary",      "Path to store summaries")  # directory for visualisation summaries
    flags.DEFINE_string("log_file",     "train.log",    "File for log")  # log file name
    flags.DEFINE_string("map_file",     "maps.pkl",     "file for maps")  # pickled forward/reverse character and tag dictionaries
    flags.DEFINE_string("vocab_file",   "vocab.json",   "File for vocab")  # author's note: the raw corpus vocabulary
    flags.DEFINE_string("config_file",  "config_file",  "File for config")  # saved model configuration
    flags.DEFINE_string("script",       "conlleval",    "evaluation script")
    flags.DEFINE_string("result_path",  "result",       "Path for results")
    flags.DEFINE_string("emb_file",     os.path.join(root_path+"data", "vec.txt"),  "Path for pre_trained embedding")
    flags.DEFINE_string("train_file",   os.path.join(root_path+"data", "example.train"),  "Path for train data")  # training set
    flags.DEFINE_string("dev_file",     os.path.join(root_path+"data", "example.dev"),    "Path for dev data")  # development / validation set
    flags.DEFINE_string("test_file",    os.path.join(root_path+"data", "example.test"),   "Path for test data")  # test set
    # Author's note: deep-learning data is usually split three ways; while training,
    # the validation set is used to check that the loss keeps decreasing and accuracy stays high.
    # In the author's project the files live under D:\Python\NERuselocal\NERuselocal\data.
    
    flags.DEFINE_string("model_type", "idcnn", "Model type, can be idcnn or bilstm")
    #flags.DEFINE_string("model_type", "bilstm", "Model type, can be idcnn or bilstm")
    
    FLAGS = tf.app.flags.FLAGS  # the parsed flag values are read through this object
    # Sanity checks on the flag values; a failed assert aborts the script.
    assert FLAGS.clip < 5.1, "gradient clip should't be too much"
    assert 0 <= FLAGS.dropout < 1, "dropout rate between 0 and 1"
    assert FLAGS.lr > 0, "learning rate must larger than zero"
    assert FLAGS.optimizer in ["adam", "sgd", "adagrad"]
    
    
    # config for the model 无调用 
    def config_model(char_to_id, tag_to_id):
        """Collect the model configuration from FLAGS and the vocabulary sizes.

        :param char_to_id: character-to-id mapping; only its length is used.
        :param tag_to_id: tag-to-id mapping; only its length is used.
        :return: OrderedDict of settings, in a fixed key order.
        """
        return OrderedDict([
            ("model_type", FLAGS.model_type),
            ("num_chars", len(char_to_id)),
            ("char_dim", FLAGS.char_dim),
            ("num_tags", len(tag_to_id)),
            ("seg_dim", FLAGS.seg_dim),
            ("lstm_dim", FLAGS.lstm_dim),
            ("batch_size", FLAGS.batch_size),
            ("emb_file", FLAGS.emb_file),
            ("clip", FLAGS.clip),
            # The flag is a drop rate; the config stores the keep probability.
            ("dropout_keep", 1.0 - FLAGS.dropout),
            ("optimizer", FLAGS.optimizer),
            ("lr", FLAGS.lr),
            ("tag_schema", FLAGS.tag_schema),
            ("pre_emb", FLAGS.pre_emb),
            ("zeros", FLAGS.zeros),
            ("lower", FLAGS.lower),
        ])
    
    # Module-level start-up for the web service: load the dictionaries, build or
    # load the model configuration, then restore the model into a long-lived session.

    # Load the forward/reverse dictionaries for characters and tags.
    # pickle.load takes no `protocol` argument (the protocol is detected from the
    # file), so the same call works on Python 2 and 3.
    # NOTE(review): unpickling executes code from the file; only load a trusted maps file.
    with open(FLAGS.map_file, "rb") as f:
        char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # make path for store log and model if not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        # Reuse the configuration saved by an earlier run.
        config = load_config(FLAGS.config_file)
    else:
        # save_config is not among this script's top-level imports, so bring it
        # into scope here; without it this branch raises NameError.
        from utils import save_config
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    app = Flask(__name__)
    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    tf_config = tf.ConfigProto()
    # The session stays open for the lifetime of the process; the request
    # handlers below use it together with `model`.
    sess = tf.Session(config=tf_config)
    # Build the model once the session exists; id_to_char is the reverse
    # character dictionary.
    model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
    
    '''
    def get_text_input():
        #http://127.0.0.1:5002/?inputStr="最开心"
        text=request.args.get('inputStr')
        if len(text.strip())>0:     
            aa=model.evaluate_line(sess, input_from_line(text, char_to_id), id_to_tag)
            return jsonify(aa)   
    '''
    @app.route('/', methods=['POST','GET'])
    def get_text_input():
        """Tag the entities in the ``inputStr`` request parameter and return them as JSON.

        Example: http://127.0.0.1:5002/?inputStr="神经病"

        The parameter is read from the query string or, for POST requests, from
        the form body. A missing or empty parameter yields a 400 JSON error
        rather than returning None, which Flask rejects with a server error.
        """
        text = request.values.get('inputStr')
        if not text:
            return jsonify({"error": "missing or empty 'inputStr' parameter"}), 400

        aa = model.evaluate_line(sess, input_from_line(text, char_to_id), id_to_tag)
        print(aa)
        return jsonify(aa)
    @app.route('/text',methods=['POST','GET'])
    def text():
        """Tag the entities in the ``inputStr`` request parameter and return them as JSON.

        Example: http://127.0.0.1:5002/text?inputStr="神经病"

        The parameter is read from the query string or, for POST requests, from
        the form body. A missing or empty parameter yields a 400 JSON error
        rather than returning None, which Flask rejects with a server error.
        """
        # Named input_str so the local does not shadow this view function's name.
        input_str = request.values.get('inputStr')
        if not input_str:
            return jsonify({"error": "missing or empty 'inputStr' parameter"}), 400

        aa = model.evaluate_line(sess, input_from_line(input_str, char_to_id), id_to_tag)
        print(aa)
        return jsonify(aa)
    
       
    if __name__ == "__main__":
        # Disable ASCII escaping in JSON responses, then serve on localhost:5002.
        app.config.update(JSON_AS_ASCII=False)
        app.run(port=5002, host='127.0.0.1')
     
    
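    # tag_to_id.txt and id_to_tag.txt are the forward and reverse tag dictionaries.
    # vec.txt holds the pretrained vectors; every entry is 100-dimensional, even for a comma.
    # Unlike the chatbot project, where each character was converted to a single number, here each character is converted to a vector.
    # main contains the testing (serving) code; main2 contains the training process.
    # Statements outside any function run from top to bottom; when execution reaches a def, the function is only defined and its body is skipped.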
    

    main2

    # encoding=utf8
    
    import codecs
    import pickle
    import itertools
    from collections import OrderedDict
    import os
    import tensorflow as tf
    import numpy as np
    from model import Model
    from loader import load_sentences, update_tag_scheme
    from loader import char_mapping, tag_mapping
    from loader import augment_with_pretrained, prepare_dataset
    from utils import get_logger, make_path, clean, create_model, save_model
    from utils import print_config, save_config, load_config, test_ner
    from data_utils import load_word2vec, create_input, input_from_line, BatchManager
    # Working directory with a trailing separator; used for the default data paths below.
    root_path=os.getcwd()+os.sep
    flags = tf.app.flags  # flag registry; each definition is (name, default, help text)
    flags.DEFINE_boolean("clean",       True,      "clean train folder")  # wipe the previous training output
    flags.DEFINE_boolean("train",       False,      "Whether train the model")  # whether to train the model
    # configurations for the model
    flags.DEFINE_integer("seg_dim",     20,         "Embedding size for segmentation, 0 if not used")  # width of the extra segmentation embedding
    flags.DEFINE_integer("char_dim",    100,        "Embedding size for characters")  # character embedding width
    flags.DEFINE_integer("lstm_dim",    100,        "Num of hidden units in LSTM, or num of filters in IDCNN")
    flags.DEFINE_string("tag_schema",   "iobes",    "tagging schema iobes or iob")  # position scheme used for the labels
    
    # configurations for training
    flags.DEFINE_float("clip",          5,          "Gradient clip")  # gradient clipping threshold
    flags.DEFINE_float("dropout",       0.5,        "Dropout rate")
    # NOTE(review): batch_size is declared as a float flag although it is a count — confirm consumers cast it.
    flags.DEFINE_float("batch_size",    20,         "batch size")
    flags.DEFINE_float("lr",            0.001,      "Initial learning rate")
    flags.DEFINE_string("optimizer",    "adam",     "Optimizer for training")
    flags.DEFINE_boolean("pre_emb",     True,       "Wither use pre-trained embedding")
    flags.DEFINE_boolean("zeros",       True,      "Wither replace digits with zero")
    flags.DEFINE_boolean("lower",       False,       "Wither lower case")
    
    flags.DEFINE_integer("max_epoch",   100,        "maximum training epochs")  # author's note: 5000-10000 suggested
    flags.DEFINE_integer("steps_check", 100,        "steps per checkpoint")
    flags.DEFINE_string("ckpt_path",    "ckpt",      "Path to save model")  # directory for model checkpoints
    flags.DEFINE_string("summary_path", "summary",      "Path to store summaries")  # directory for visualisation summaries
    flags.DEFINE_string("log_file",     "train.log",    "File for log")  # log file name
    flags.DEFINE_string("map_file",     "maps.pkl",     "file for maps")  # pickled character/tag dictionaries
    flags.DEFINE_string("vocab_file",   "vocab.json",   "File for vocab")
    flags.DEFINE_string("config_file",  "config_file",  "File for config")
    flags.DEFINE_string("script",       "conlleval",    "evaluation script")
    flags.DEFINE_string("result_path",  "result",       "Path for results")
    flags.DEFINE_string("emb_file",     os.path.join(root_path+"data", "vec.txt"),  "Path for pre_trained embedding")
    flags.DEFINE_string("train_file",   os.path.join(root_path+"data", "example.train"),  "Path for train data")
    flags.DEFINE_string("dev_file",     os.path.join(root_path+"data", "example.dev"),    "Path for dev data")  # validation set, used to check that the loss keeps decreasing
    flags.DEFINE_string("test_file",    os.path.join(root_path+"data", "example.test"),   "Path for test data")
    
    flags.DEFINE_string("model_type", "idcnn", "Model type, can be idcnn or bilstm")
    #flags.DEFINE_string("model_type", "bilstm", "Model type, can be idcnn or bilstm")
    
    FLAGS = tf.app.flags.FLAGS  # the parsed flag values are read through this object
    # Sanity checks on the flag values; a failed assert aborts the script.
    assert FLAGS.clip < 5.1, "gradient clip should't be too much"
    assert 0 <= FLAGS.dropout < 1, "dropout rate between 0 and 1"
    assert FLAGS.lr > 0, "learning rate must larger than zero"
    assert FLAGS.optimizer in ["adam", "sgd", "adagrad"]
    
    
    # config for the model
    def config_model(char_to_id, tag_to_id):
        """Build the ordered model configuration from FLAGS and the dictionary sizes.

        :param char_to_id: character-to-id mapping; only its length is used.
        :param tag_to_id: tag-to-id mapping; only its length is used.
        :return: OrderedDict of settings, in a fixed key order.
        """
        settings = OrderedDict()
        settings["model_type"] = FLAGS.model_type
        settings["num_chars"] = len(char_to_id)
        settings["char_dim"] = FLAGS.char_dim
        settings["num_tags"] = len(tag_to_id)
        # Flags copied through unchanged under their own names.
        for flag_name in ("seg_dim", "lstm_dim", "batch_size", "emb_file", "clip"):
            settings[flag_name] = getattr(FLAGS, flag_name)
        # The flag is a drop rate; the config stores the keep probability.
        settings["dropout_keep"] = 1.0 - FLAGS.dropout
        for flag_name in ("optimizer", "lr", "tag_schema", "pre_emb", "zeros", "lower"):
            settings[flag_name] = getattr(FLAGS, flag_name)
        return settings
    
    
    def evaluate(sess, model, name, data, id_to_tag, logger):
        """Score the model on one data split and update its stored best F1.

        :param sess: session passed to ``model.evaluate``.
        :param model: model exposing ``evaluate``, ``best_dev_f1`` and ``best_test_f1``.
        :param name: split name; only "dev" and "test" update a best score.
        :param data: data handed to ``model.evaluate``.
        :param id_to_tag: id-to-tag mapping handed to ``model.evaluate``.
        :param logger: logger receiving the evaluation report lines.
        :return: whether the new F1 beats the stored best for the split;
            None for any other split name.
        """
        logger.info("evaluate:{}".format(name))
        predictions = model.evaluate(sess, data, id_to_tag)
        report = test_ner(predictions, FLAGS.result_path)
        for report_line in report:
            logger.info(report_line)
        # The F1 value is the last whitespace-separated token of the second report line.
        f1 = float(report[1].strip().split()[-1])

        if name == "dev":
            best_var = model.best_dev_f1
            message = "new best dev f1 score:{:>.3f}"
        elif name == "test":
            best_var = model.best_test_f1
            message = "new best test f1 score:{:>.3f}"
        else:
            return None

        # .eval() is called without a session argument, so it presumably relies
        # on the caller's default session — verify against the call site.
        previous_best = best_var.eval()
        improved = f1 > previous_best
        if improved:
            tf.assign(best_var, f1).eval()
            logger.info(message.format(f1))
        return improved
    
    
    def train():
        """Train the NER model using the data files and settings held in FLAGS.

        Steps: load the train/dev/test sentences, convert their tags to the
        configured scheme, create or load the character/tag dictionaries, index
        the data sets, build or load the model configuration, then run
        ``FLAGS.max_epoch`` epochs. The mean loss is logged every
        ``FLAGS.steps_check`` steps. A checkpoint is saved every 7th epoch and
        once more after the final epoch, so the last weights are always kept.
        """
        # Each sentence is a list of [char, tag] pairs, e.g. [['双', 'O'], ['击', 'O'], ...].
        train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
        dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
        test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

        # Use selected tagging scheme (IOB / IOBES)
        update_tag_scheme(train_sentences, FLAGS.tag_schema)
        update_tag_scheme(test_sentences, FLAGS.tag_schema)
        update_tag_scheme(dev_sentences, FLAGS.tag_schema)

        # create maps if not exist
        if not os.path.isfile(FLAGS.map_file):
            # create dictionary for word
            if FLAGS.pre_emb:
                dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
                # The flattened characters of the test set are passed along with
                # the pretrained embedding file to extend the training dictionary.
                dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                    dico_chars_train.copy(),
                    FLAGS.emb_file,
                    list(itertools.chain.from_iterable(
                        [[w[0] for w in s] for s in test_sentences])
                    )
                )
            else:
                _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

            # Create a dictionary and a mapping for tags
            _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
            with open(FLAGS.map_file, "wb") as f:
                pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
        else:
            with open(FLAGS.map_file, "rb") as f:
                char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

        # prepare data, get a collection of list containing index
        train_data = prepare_dataset(
            train_sentences, char_to_id, tag_to_id, FLAGS.lower
        )
        dev_data = prepare_dataset(
            dev_sentences, char_to_id, tag_to_id, FLAGS.lower
        )
        test_data = prepare_dataset(
            test_sentences, char_to_id, tag_to_id, FLAGS.lower
        )
        # Report the real dev-set size (previously a literal 0 was printed).
        print("%i / %i / %i sentences in train / dev / test." % (
            len(train_data), len(dev_data), len(test_data)))

        # batch_size is declared as a float flag; a batch size is a count, so cast it.
        train_manager = BatchManager(train_data, int(FLAGS.batch_size))
        # The dev/test managers are only needed by the evaluation calls, which
        # are currently disabled in the loop below.
        dev_manager = BatchManager(dev_data, 100)
        test_manager = BatchManager(test_data, 100)

        # make path for store log and model if not exist
        make_path(FLAGS)
        if os.path.isfile(FLAGS.config_file):
            config = load_config(FLAGS.config_file)
        else:
            config = config_model(char_to_id, tag_to_id)
            save_config(config, FLAGS.config_file)

        log_path = os.path.join("log", FLAGS.log_file)
        logger = get_logger(log_path)
        print_config(config, logger)

        # limit GPU memory
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        steps_per_epoch = train_manager.len_data
        max_epoch = FLAGS.max_epoch  # default 100, the value that used to be hard-coded
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
            logger.info("start training")
            loss = []
            with tf.device("/gpu:0"):
                for i in range(max_epoch):
                    for batch in train_manager.iter_batch(shuffle=True):
                        step, batch_loss = model.run_step(sess, True, batch)
                        loss.append(batch_loss)
                        if step % FLAGS.steps_check == 0:
                            iteration = step // steps_per_epoch + 1
                            logger.info("iteration:{} step:{}/{}, "
                                        "NER loss:{:>9.6f}".format(
                                iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss)))
                            # Restart the running mean for the next reporting window.
                            loss = []

                    # NOTE: the per-epoch dev evaluation is disabled; checkpoints
                    # are written on a fixed schedule instead.
                    if i % 7 == 0:
                        save_model(sess, model, FLAGS.ckpt_path, logger)

                # Save the final weights unless the last epoch was already saved above.
                if max_epoch > 0 and (max_epoch - 1) % 7 != 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
    
    
    def evaluate_line():
        """Interactively tag sentences typed on standard input.

        Loads the saved configuration, the dictionaries and the model, then
        prompts for sentences and prints the model's result for each one.
        Blank input is skipped. End-of-input (EOF) or Ctrl-C ends the loop
        cleanly instead of crashing with a traceback.
        """
        config = load_config(FLAGS.config_file)
        logger = get_logger(FLAGS.log_file)
        # limit GPU memory
        tf_config = tf.ConfigProto()
        tf_config.gpu_options.allow_growth = True
        # NOTE(review): unpickling executes code from the file; only load a trusted maps file.
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        with tf.Session(config=tf_config) as sess:
            model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
            while True:
                try:
                    line = input("请输入测试句子:")
                except (EOFError, KeyboardInterrupt):
                    # The user closed stdin or pressed Ctrl-C: leave the loop
                    # so the session is closed by the `with` block.
                    break
                if not line.strip():
                    # Nothing to tag; ask again.
                    continue
                result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag)
                print(result)
    
    
    def main(_):
        """Entry point handed to ``tf.app.run``; the positional argument is ignored.

        Training is hard-wired on: the switch below is a constant, so
        ``FLAGS.train`` is not consulted and the interactive ``evaluate_line``
        path is only reached by editing the constant.
        """
        run_training = 1
        if not run_training:
            evaluate_line()
            return
        # Optionally remove earlier training output before starting.
        if FLAGS.clean:
            clean(FLAGS)
        train()
    
    
    if __name__ == "__main__":
        # Hand control to TensorFlow's app runner, which then calls main.
        tf.app.run(main=main)
    

    model

    # encoding = utf8
    import numpy as np
    import tensorflow as tf
    from tensorflow.contrib.crf import crf_log_likelihood  #条件随机场
    from tensorflow.contrib.crf import viterbi_decode # 隐马尔科夫链
    from tensorflow.contrib.layers.python.layers import initializers #初始化神经网络
    
    from utils import result_to_json #保存参数
    from data_utils import create_input, iobes_iob,iob_iobes
    
    # 双向lstm或IdCNN模型,找到x,y. y是双标签,x是文字word2vec映射成的词向量。
    # 如何拟合x.y:拟合之前第一步提取x的特征,用BiLstm或idCNN对x做特征提取,+分类器(crf条件随机场)
    # BiLstm or idCNN + crf
    # idCNN与cnn的区别是,idCNN的卷积核是扁的:找一句话之间的关系可以用扁的,
    # 好处:可以有效地抗噪音:完形填空时,扁的卷积核它只会扫当前这句话,不会把上下文卷进来,抗的是上下文的躁
    # CNN和RNN本质上没有太大差别,都是把局部的相关性体现出来,CNN体现在空间上,RNN体现在时间时序上
    
    # crf:条件随机场。跟rnn很类似,提供了一个分类结果,当然它也可以做特征提取。它的分类需要算一个联合概率
    # 第一步,找到x,y
    # 第二步,对x做特征提取、特征工程(之前所有的resnet等都是为特征工程服务的),对y做one_hot向量(或二分类)
    # 第三步,去拟合,分类
    
    # crf_log_likelihood(#likelihood似然,一般加似然的就是损失函数
    class Model(object):
        def __init__(self, config):
            """Build the whole graph: inputs, embeddings, encoder, loss and train op.

            :param config: mapping providing "lr", "char_dim", "lstm_dim",
                "seg_dim", "num_tags", "num_chars", "model_type", "optimizer"
                and "clip".
            :raises KeyError: if ``config["model_type"]`` is neither "bilstm" nor
                "idcnn", or if ``config["optimizer"]`` is not supported.
            """
            self.config = config

            self.lr = config["lr"]
            self.char_dim = config["char_dim"]  # character embedding width
            self.lstm_dim = config["lstm_dim"]  # LSTM hidden units / IDCNN filter count
            self.seg_dim = config["seg_dim"]  # segmentation embedding width

            self.num_tags = config["num_tags"]  # number of tag labels
            self.num_chars = config["num_chars"]  # character dictionary size
            # Segmentation ids take four values (0..3); per the author's notes:
            # 0 = not part of a multi-character word, 1 = first, 2 = middle, 3 = last character.
            self.num_segs = 4

            # Non-trainable bookkeeping variables.
            self.global_step = tf.Variable(0, trainable=False)
            self.best_dev_f1 = tf.Variable(0.0, trainable=False)
            self.best_test_f1 = tf.Variable(0.0, trainable=False)
            # Xavier weight initializer shared by the layers below.
            self.initializer = initializers.xavier_initializer()

            # add placeholders for the model
            # char_inputs / seg_inputs / targets are 2-D integer tensors; the
            # first axis is read as the batch and the last as the time steps below.
            self.char_inputs = tf.placeholder(dtype=tf.int32,
                                              shape=[None, None],
                                              name="ChatInputs")
            self.seg_inputs = tf.placeholder(dtype=tf.int32,
                                             shape=[None, None],
                                             name="SegInputs")

            self.targets = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="Targets")
            # dropout keep prob
            self.dropout = tf.placeholder(dtype=tf.float32,
                                          name="Dropout")

            # 1 where the character id is non-zero, 0 where it is zero; the row
            # sum is used as the sequence length (presumably id 0 is padding — confirm).
            used = tf.sign(tf.abs(self.char_inputs))
            length = tf.reduce_sum(used, reduction_indices=1)
            self.lengths = tf.cast(length, tf.int32)
            self.batch_size = tf.shape(self.char_inputs)[0]
            self.num_steps = tf.shape(self.char_inputs)[-1]

            # Add model type by crownpku bilstm or idcnn
            self.model_type = config['model_type']
            # parameters for idcnn
            # One dilated-convolution layer per entry; the dilation rate is the
            # spacing between kernel taps
            # (effective kernel size = dilation * (kernel size - 1) + 1).
            self.layers = [
                {
                    'dilation': 1
                },
                {
                    'dilation': 1
                },
                {
                    'dilation': 2
                },
            ]
            self.filter_width = 3  # kernels are 1 x 3: height 1, width 3
            self.num_filter = self.lstm_dim  # output channels of every convolution
            # character vector width + segmentation vector width
            self.embedding_dim = self.char_dim + self.seg_dim
            self.repeat_times = 4  # the stack in self.layers is applied this many times
            self.cnn_output_width = 0  # filled in by IDCNN_layer

            # embeddings for chinese character and segmentation representation
            embedding = self.embedding_layer(self.char_inputs, self.seg_inputs, config)

            if self.model_type == 'bilstm':
                # apply dropout before feed to lstm layer
                model_inputs = tf.nn.dropout(embedding, self.dropout)

                # bi-directional lstm layer
                model_outputs = self.biLSTM_layer(model_inputs, self.lstm_dim, self.lengths)

                # logits for tags
                self.logits = self.project_layer_bilstm(model_outputs)

            elif self.model_type == 'idcnn':
                # apply dropout before feed to idcnn layer
                model_inputs = tf.nn.dropout(embedding, self.dropout)

                # idcnn layer: feature extraction with dilated convolutions
                model_outputs = self.IDCNN_layer(model_inputs)

                # logits for tags
                self.logits = self.project_layer_idcnn(model_outputs)

            else:
                raise KeyError

            # loss of the model
            self.loss = self.loss_layer(self.logits, self.lengths)

            with tf.variable_scope("optimizer"):
                optimizer = self.config["optimizer"]
                if optimizer == "sgd":
                    self.opt = tf.train.GradientDescentOptimizer(self.lr)
                elif optimizer == "adam":
                    self.opt = tf.train.AdamOptimizer(self.lr)
                elif optimizer in ("adagrad", "adgrad"):
                    # "adagrad" is the correct spelling; the historical
                    # misspelling "adgrad" stays accepted for existing configs.
                    self.opt = tf.train.AdagradOptimizer(self.lr)
                else:
                    raise KeyError("unsupported optimizer: %s" % optimizer)

                # apply grad clip to avoid gradient explosion
                grads_vars = self.opt.compute_gradients(self.loss)
                capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v]
                                     for g, v in grads_vars]
                self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)

            # saver of the model
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
    
        # 用来做embedding。seg_inputs是20维的。
        def embedding_layer(self, char_inputs, seg_inputs, config, name=None):
            """
            Look up character (and optionally segmentation) embeddings and concatenate them.

            :param char_inputs: integer character ids used as row indices into the character table
            :param seg_inputs: integer segmentation ids used as row indices into the segmentation table
            :param config: mapping; a truthy ``config["seg_dim"]`` enables the segmentation embedding
            :param name: optional variable-scope name, "char_embedding" when omitted
            :return: the looked-up vectors concatenated along the last axis
            """
            # Author's example: the sentence 高血糖 和 高血压 with
            #   char ids 高:3 血:22 糖:23 和:24 高:3 血:22 压:25 -> [3,22,23,24,3,22,25]
            #   seg ids  高血糖=[1,2,3] 和=[0] 高血压=[1,2,3]    -> [1,2,3,0,1,2,3]
            scope_name = name if name else "char_embedding"
            pieces = []
            with tf.variable_scope(scope_name), tf.device('/cpu:0'):
                # Character table: one row of char_dim values per dictionary entry.
                self.char_lookup = tf.get_variable(
                    name="char_embedding",
                    shape=[self.num_chars, self.char_dim],
                    initializer=self.initializer)
                char_vectors = tf.nn.embedding_lookup(self.char_lookup, char_inputs)
                pieces.append(char_vectors)

                if config["seg_dim"]:
                    # Segmentation table: num_segs rows of seg_dim values.
                    with tf.variable_scope("seg_embedding"), tf.device('/cpu:0'):
                        self.seg_lookup = tf.get_variable(
                            name="seg_embedding",
                            shape=[self.num_segs, self.seg_dim],
                            initializer=self.initializer)
                        seg_vectors = tf.nn.embedding_lookup(self.seg_lookup, seg_inputs)
                        pieces.append(seg_vectors)

                # Join the available pieces on the feature axis.
                combined = tf.concat(pieces, axis=-1)
            return combined
    
        def biLSTM_layer(self, model_inputs, lstm_dim, lengths, name=None):
            """
            Run a bidirectional LSTM over the inputs and concatenate both directions.

            :param model_inputs: [batch_size, num_steps, emb_size]
            :param lstm_dim: number of hidden units of each directional cell
            :param lengths: sequence lengths passed to the dynamic RNN
            :param name: optional variable-scope name, "char_BiLSTM" when omitted
            :return: [batch_size, num_steps, 2*lstm_dim]
            """
            # `rnn` is not among this module's imports, so bring it into scope
            # here; without it the bilstm path fails with NameError.
            from tensorflow.contrib import rnn

            with tf.variable_scope("char_BiLSTM" if not name else name):
                lstm_cell = {}
                for direction in ["forward", "backward"]:
                    # A separate scope per direction keeps the two cells' variables apart.
                    with tf.variable_scope(direction):
                        lstm_cell[direction] = rnn.CoupledInputForgetGateLSTMCell(
                            lstm_dim,
                            use_peepholes=True,
                            initializer=self.initializer,
                            state_is_tuple=True)
                outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
                    lstm_cell["forward"],
                    lstm_cell["backward"],
                    model_inputs,
                    dtype=tf.float32,
                    sequence_length=lengths)
            # outputs is a (forward, backward) pair; join them on the feature axis.
            return tf.concat(outputs, axis=2)
        
        #IDCNN layer 
        def IDCNN_layer(self, model_inputs, name=None):
            """
            Extract features with an iterated dilated CNN.

            An initial 1 x filter_width convolution is followed by the stack in
            ``self.layers``, applied ``self.repeat_times`` times with shared
            weights. The output of the last layer of each repetition is kept,
            and the kept outputs are concatenated on the channel axis.

            :param model_inputs: [batch_size, num_steps, emb_size]
            :param name: optional variable-scope name, "idcnn" when omitted
            :return: 2-D tensor [batch_size * num_steps, cnn_output_width], where
                cnn_output_width = repeat_times * num_filter
            """
            # conv2d needs a 4-D input: add a height axis of size 1 so the shape
            # becomes [batch, 1, num_steps, emb_size].
            model_inputs = tf.expand_dims(model_inputs, 1)
            with tf.variable_scope("idcnn" if not name else name):
                # filter = [filter_height, filter_width, in_channels, out_channels]
                filter_weights = tf.get_variable(
                    "idcnn_filter",
                    shape=[1, self.filter_width, self.embedding_dim,
                           self.num_filter],
                    initializer=self.initializer)
                layerInput = tf.nn.conv2d(model_inputs,
                                          filter_weights,
                                          strides=[1, 1, 1, 1],
                                          padding="SAME",
                                          name="init_layer",use_cudnn_on_gpu=True)
                finalOutFromLayers = []
                totalWidthForLastDim = 0
                last_index = len(self.layers) - 1
                for j in range(self.repeat_times):
                    for i, layer in enumerate(self.layers):
                        dilation = layer['dilation']
                        # The first repetition creates the variables; later
                        # repetitions reuse them, so the weights are shared.
                        with tf.variable_scope("atrous-conv-layer-%d" % i,
                                               reuse=(j > 0)):
                            w = tf.get_variable(
                                "filterW",
                                shape=[1, self.filter_width, self.num_filter,
                                       self.num_filter],
                                initializer=tf.contrib.layers.xavier_initializer())
                            b = tf.get_variable("filterB", shape=[self.num_filter])
                            # Dilated convolution; a rate of 1 is an ordinary convolution.
                            conv = tf.nn.atrous_conv2d(layerInput,
                                                       w,
                                                       rate=dilation,
                                                       padding="SAME")
                            conv = tf.nn.bias_add(conv, b)
                            conv = tf.nn.relu(conv)
                            if i == last_index:
                                finalOutFromLayers.append(conv)
                                totalWidthForLastDim += self.num_filter
                            layerInput = conv

                finalOut = tf.concat(axis=3, values=finalOutFromLayers)
                # Use the fed keep probability. The previous code compared the
                # placeholder itself to 1.0, which is never true, so a fixed
                # keep probability of 0.5 was applied at inference time as well.
                finalOut = tf.nn.dropout(finalOut, self.dropout)

                # Drop the height axis added above, then flatten batch and steps.
                finalOut = tf.squeeze(finalOut, [1])
                finalOut = tf.reshape(finalOut, [-1, totalWidthForLastDim])
                self.cnn_output_width = totalWidthForLastDim
                return finalOut
    
        def project_layer_bilstm(self, lstm_outputs, name=None):
            """
            Project BiLSTM outputs to per-step tag scores through one tanh layer.

            :param lstm_outputs: tensor that is flattened to [-1, 2 * lstm_dim]
                (original note: [batch_size, num_steps, emb_size])
            :param name: optional variable-scope name, "project" when falsy
            :return: [batch_size, num_steps, num_tags]
            """
            scope_name = name if name else "project"
            with tf.variable_scope(scope_name):
                # Hidden layer: 2 * lstm_dim -> lstm_dim, tanh activation.
                with tf.variable_scope("hidden"):
                    hidden_w = tf.get_variable(
                        "W",
                        shape=[self.lstm_dim * 2, self.lstm_dim],
                        dtype=tf.float32,
                        initializer=self.initializer)
                    hidden_b = tf.get_variable(
                        "b",
                        shape=[self.lstm_dim],
                        dtype=tf.float32,
                        initializer=tf.zeros_initializer())
                    # Collapse batch and time so a single matmul covers every step.
                    flat_outputs = tf.reshape(
                        lstm_outputs, shape=[-1, self.lstm_dim * 2])
                    pre_activation = tf.nn.xw_plus_b(
                        flat_outputs, hidden_w, hidden_b)
                    hidden = tf.tanh(pre_activation)

                # Output layer: lstm_dim -> num_tags (score of each tag).
                with tf.variable_scope("logits"):
                    logits_w = tf.get_variable(
                        "W",
                        shape=[self.lstm_dim, self.num_tags],
                        dtype=tf.float32,
                        initializer=self.initializer)
                    logits_b = tf.get_variable(
                        "b",
                        shape=[self.num_tags],
                        dtype=tf.float32,
                        initializer=tf.zeros_initializer())
                    tag_scores = tf.nn.xw_plus_b(hidden, logits_w, logits_b)

                # Restore the [batch, step, tag] layout.
                return tf.reshape(
                    tag_scores, [-1, self.num_steps, self.num_tags])
        
        #Project layer for idcnn by crownpku
        #Delete the hidden layer, and change bias initializer
        def project_layer_idcnn(self, idcnn_outputs, name=None):
            """
            Project IDCNN features straight to per-step tag scores (no hidden layer).

            :param idcnn_outputs: 2-D tensor whose last dimension is
                self.cnn_output_width (it is multiplied by a
                [cnn_output_width, num_tags] weight matrix)
            :param name: optional variable-scope name, "project" when falsy
            :return: [batch_size, num_steps, num_tags]
            """
            scope_name = name if name else "project"
            with tf.variable_scope(scope_name):

                # Single linear layer producing the score of every tag.
                with tf.variable_scope("logits"):
                    logits_w = tf.get_variable(
                        "W",
                        shape=[self.cnn_output_width, self.num_tags],
                        dtype=tf.float32,
                        initializer=self.initializer)

                    # Bias starts at the small constant 0.001 rather than zero.
                    bias_init = tf.constant(0.001, shape=[self.num_tags])
                    logits_b = tf.get_variable("b", initializer=bias_init)

                    tag_scores = tf.nn.xw_plus_b(
                        idcnn_outputs, logits_w, logits_b)

                # Restore the [batch, step, tag] layout.
                return tf.reshape(
                    tag_scores, [-1, self.num_steps, self.num_tags])
    
        def loss_layer(self, project_logits, lengths, name=None):
            """
            Build the CRF loss (mean negative log-likelihood).

            One extra "start" tag (index num_tags) and one extra leading time step
            are added, so the tensors handed to the CRF are
            logits [batch_size, num_steps + 1, num_tags + 1] and
            targets [batch_size, num_steps + 1].

            Side effect: self.trans is created as the "transitions" variable and
            then rebound to the transition parameters returned by
            crf_log_likelihood.

            :param project_logits: tag scores from the projection layer; has to be
                [batch_size, num_steps, num_tags] to be concatenated with the padding
            :param lengths: sequence lengths; 1 is added to account for the start step
            :param name: optional variable-scope name, "crf_loss" when falsy
            :return: scalar loss
            """
            scope_name = name if name else "crf_loss"
            with tf.variable_scope(scope_name):
                small = -1000.0

                # Start step: every real tag gets a very low score and the extra
                # start tag gets 0 -> [batch_size, 1, num_tags + 1].
                start_real_tags = small * tf.ones(
                    shape=[self.batch_size, 1, self.num_tags])
                start_extra_tag = tf.zeros(shape=[self.batch_size, 1, 1])
                start_logits = tf.concat(
                    [start_real_tags, start_extra_tag], axis=-1)

                # Column for the start tag on the real steps, very low score
                # -> [batch_size, num_steps, 1].
                pad_logits = tf.cast(
                    small * tf.ones([self.batch_size, self.num_steps, 1]),
                    tf.float32)

                # [batch_size, num_steps, num_tags + 1]
                padded_logits = tf.concat([project_logits, pad_logits], axis=-1)
                # [batch_size, num_steps + 1, num_tags + 1]
                padded_logits = tf.concat([start_logits, padded_logits], axis=1)

                # Prepend the start-tag index to every gold sequence
                # -> [batch_size, 1 + num_steps].
                start_targets = tf.cast(
                    self.num_tags * tf.ones([self.batch_size, 1]), tf.int32)
                padded_targets = tf.concat(
                    [start_targets, self.targets], axis=-1)

                # Transition scores between tags, including the extra start tag.
                self.trans = tf.get_variable(
                    "transitions",
                    shape=[self.num_tags + 1, self.num_tags + 1],
                    initializer=self.initializer)

                # crf_log_likelihood arguments:
                #   inputs            - unary scores, [batch, max_seq_len, num_tags]
                #   tag_indices       - gold tag ids, [batch, max_seq_len]
                #   sequence_lengths  - [batch] real length of each sequence
                #   transition_params - [num_tags, num_tags] transition matrix
                # Here num_tags and max_seq_len are each one larger because of
                # the added start tag / start step.
                log_likelihood, self.trans = crf_log_likelihood(
                    inputs=padded_logits,
                    tag_indices=padded_targets,
                    transition_params=self.trans,
                    sequence_lengths=lengths + 1)
                return tf.reduce_mean(-log_likelihood)
    
        def create_feed_dict(self, is_train, batch):
            """
            Build the feed dict for one batch.

            :param is_train: flag, True for a training batch
            :param batch: 4-item sequence (ignored, chars, segs, tags)
            :return: dict keyed by the model's input placeholders; dropout is 1.0
                for evaluation, and for training it is config["dropout_keep"]
                with the targets fed as well
            """
            _, chars, segs, tags = batch

            feed = {}
            feed[self.char_inputs] = np.asarray(chars)
            feed[self.seg_inputs] = np.asarray(segs)

            if not is_train:
                # Evaluation: no targets, dropout disabled.
                feed[self.dropout] = 1.0
                return feed

            tag_array = np.asarray(tags)
            feed[self.dropout] = self.config["dropout_keep"]
            feed[self.targets] = tag_array
            return feed
    
        def run_step(self, sess, is_train, batch):
            """
            Run one batch through the graph.

            :param sess: session used to run the batch
            :param is_train: flag, True for a training batch
            :param batch: batch data, passed on to create_feed_dict
            :return: (global_step, loss) when training,
                otherwise (lengths, logits)
            """
            feed_dict = self.create_feed_dict(is_train, batch)

            if not is_train:
                # Inference: fetch the sequence lengths and the tag scores.
                seq_lengths, scores = sess.run(
                    [self.lengths, self.logits], feed_dict)
                return seq_lengths, scores

            # Training: train_op is run for its side effect, its value is dropped.
            step, batch_loss, _ = sess.run(
                [self.global_step, self.loss, self.train_op],
                feed_dict)
            return step, batch_loss
    
        def decode(self, logits, lengths, matrix):
            """
            Find the best tag path of every sequence with the Viterbi algorithm.

            :param logits: [batch_size, num_steps, num_tags] float32, logits
            :param lengths: [batch_size] int32, real length of each sequence
            :param matrix: transition matrix used for inference
            :return: list with one decoded path per sequence, start step removed
            """
            small = -1000.0
            # Start step: real tags get a very low score, the extra start tag gets 0.
            start_row = np.asarray([[small] * self.num_tags + [0]])

            decoded = []
            for seq_scores, seq_len in zip(logits, lengths):
                # Drop the padded steps beyond the real length.
                trimmed = seq_scores[:seq_len]
                # Add the start-tag column (very low score) to every real step,
                # then put the start step in front.
                start_tag_col = small * np.ones([seq_len, 1])
                with_start_tag = np.concatenate([trimmed, start_tag_col], axis=1)
                full_scores = np.concatenate([start_row, with_start_tag], axis=0)
                # Best-scoring tag sequence given unary scores and transitions.
                best_path, _ = viterbi_decode(full_scores, matrix)

                # Element 0 belongs to the artificial start step.
                decoded.append(best_path[1:])
            return decoded
    
        def evaluate(self, sess, data_manager, id_to_tag):
            """
            Decode every batch of a data set and pair predictions with gold tags.

            :param sess: session to run the model
            :param data_manager: object whose iter_batch() yields batches; item 0
                of a batch holds the strings and the last item the gold tag ids
            :param id_to_tag: index to tag name
            :return: list with one entry per sentence, each a list of
                "char gold pred" strings
            """
            results = []
            # Evaluate the transition matrix in the session we were given instead
            # of relying on an implicit default session (same as evaluate_line).
            trans = self.trans.eval(session=sess)
            for batch in data_manager.iter_batch():
                strings = batch[0]
                tags = batch[-1]
                lengths, scores = self.run_step(sess, False, batch)
                batch_paths = self.decode(scores, lengths, trans)
                for i, string in enumerate(strings):
                    length = lengths[i]
                    chars = string[:length]
                    # iobes_iob is defined outside this view; presumably it
                    # converts IOBES tags to IOB for scoring.
                    gold_tags = iobes_iob(
                        [id_to_tag[int(x)] for x in tags[i][:length]])
                    pred_tags = iobes_iob(
                        [id_to_tag[int(x)] for x in batch_paths[i][:length]])
                    results.append([
                        " ".join([char, gold, pred])
                        for char, gold, pred in zip(chars, gold_tags, pred_tags)
                    ])
            return results
    
        def evaluate_line(self, sess, inputs, id_to_tag):
            """
            Tag a single input and return the result built by result_to_json.

            :param sess: session to run the model
            :param inputs: one batch in the layout run_step expects;
                inputs[0][0] is handed to result_to_json together with the tags
            :param id_to_tag: index to tag name
            :return: whatever result_to_json returns for the first sequence
            """
            # Transition matrix of the CRF layer.
            transitions = self.trans.eval(session=sess)
            # Scores are laid out as [sentences, characters, labels].
            seq_lengths, scores = self.run_step(sess, False, inputs)
            # Viterbi: best tag path given the scores and the transitions.
            paths = self.decode(scores, seq_lengths, transitions)
            first_path = paths[0]
            tag_names = [id_to_tag[tag_id] for tag_id in first_path]
            return result_to_json(inputs[0][0], tag_names)
    
    

    相关文章

      网友评论

        本文标题:NER实体识别

        本文链接:https://www.haomeiwen.com/subject/wtcbaqtx.html