美文网首页
对xml文件中标题和内容节点做命名实体识别

对xml文件中标题和内容节点做命名实体识别

作者: 夜空中最亮的星_6c64 | 来源:发表于2018-10-13 21:52 被阅读0次

    1.具体步骤:分词,词性标注,命名实体识别

    #  -*- coding: utf-8 -*-
    import xml.etree.ElementTree as ET
    import sys
    
    # Python 2 only: re-expose sys.setdefaultencoding (hidden by site.py) and
    # force the default codec to utf-8 so mixing str/unicode Chinese text does
    # not raise UnicodeDecodeError.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # To guarantee this runs, refer to the complete code listing in the appendix.
    
    from pyltp import SentenceSplitter
    from pyltp import Segmentor
    from pyltp import Postagger
    from pyltp import SementicRoleLabeller
    from pyltp import NamedEntityRecognizer
    from pyltp import Parser
    
    class extractEntity:
    
        def __init__(self,persons,locations,institutions):
            #使用set可以避免重复元素
            persons = set()
            locations = set()
            institutions = set()
    
        # 分句,也就是将一片文本分割为独立的句子
        def sentence_splitter(sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。我的微博是MebiuW,转载请注明来自MebiuW!'):
            sents = SentenceSplitter.split(sentence)  # 分句
            # print '\n'.join(sents)
    
        # 分词测试
        def segmentor(self,sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。我的微博是MebiuW,转载请注明来自MebiuW!'):
            segmentor = Segmentor()  # 初始化实例
            segmentor.load('/Users/Zd/Downloads/ltp_data_v3.4.0/cws.model')  # 加载模型
            words = segmentor.segment(sentence)  # 分词
        # #默认可以这样输出
        # print '\t'.join(words)
        # 可以转换成List 输出
            words_list = list(words)
            segmentor.release()  # 释放模型
            return words_list
    
    
        # 测试标注
        def posttagger(self,words):
            postagger = Postagger()  # 初始化实例
            postagger.load('/Users/Zd/Downloads/ltp_data_v3.4.0/pos.model')  # 加载模型
            postags = postagger.postag(words)  # 词性标注
            # for word,tag in zip(words,postags):
                # print word+'/'+tag
            postagger.release()  # 释放模型
            return postags
    
    
        # 命名实体识别
        def ner(self,words, postags):
            recognizer = NamedEntityRecognizer()  # 初始化实例
            recognizer.load('/Users/Zd/Downloads/ltp_data_v3.4.0/ner.model')  # 加载模型
            netags = recognizer.recognize(words, postags)  # 命名实体识别
    
            for word, ntag in zip(words, netags):
                # print word + '/' + ntag
                if ntag == 'S-Nh':
                    self.persons.add(word)
                    # print "人名:"+word
                elif ntag == 'S-Ns':
                    self.locations.add(word)
                    # print "地名:" + word
                elif ntag == 'S-Ni':
                    self.institutions.add(word)
                    # print "机构名:" + word
    
            recognizer.release()  # 释放模型
            return netags
    
    
        # 依存语义分析
        def parse(self,words, postags):
            parser = Parser()  # 初始化实例
            parser.load('/Users/Zd/Downloads/ltp_data_v3.4.0/parser.model')  # 加载模型
            arcs = parser.parse(words, postags)  # 句法分析
            print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
            parser.release()  # 释放模型
            return arcs
    
    
        # 角色标注
        def role_label(self,words, postags, netags, arcs):
            labeller = SementicRoleLabeller()  # 初始化实例
            labeller.load('/Users/Zd/Downloads/ltp_data_v3.4.0/srl')  # 加载模型
            roles = labeller.label(words, postags, netags, arcs)  # 语义角色标注
            for role in roles:
                print role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
            labeller.release()  # 释放模型
    
    def annotate_elements(ee, root, tag):
        """Attach <persons>/<locations>/<institutions> children to every *tag*
        element under *root*, filled with the entities found in the element's
        text (space-prefixed, space-separated — same format as the original).

        The original script had this loop copy-pasted once for 'head' and once
        for 'content'; it is now a single parameterized helper.
        """
        for element in root.iter(tag):
            # Add the three result child elements up front.
            p = ET.SubElement(element, 'persons')
            p.text = ''
            l = ET.SubElement(element, 'locations')
            l.text = ''
            i = ET.SubElement(element, 'institutions')
            i.text = ''

            if element.text is None:
                continue

            # Segment, then POS-tag.
            words = ee.segmentor(element.text.encode('utf-8'))
            postags = ee.posttagger(words)
            # Reset the accumulators so each news item reports only its own
            # entities (ner() adds into these sets).
            ee.persons = set()
            ee.locations = set()
            ee.institutions = set()
            ee.ner(words, postags)

            # Concatenate each entity preceded by a space.
            for s in ee.persons:
                p.text = p.text + ' ' + s
            for s in ee.locations:
                l.text = l.text + ' ' + s
            for s in ee.institutions:
                i.text = i.text + ' ' + s


    tree = ET.parse('/Users/Zd/Desktop/newsSpider/newsSpider/extractEntity/scrawlToXML.xml')
    root = tree.getroot()
    # BUG FIX: pass set *instances*, not the built-in type ``set`` itself.
    ee = extractEntity(set(), set(), set())
    # Annotate both the headline and the body elements.
    annotate_elements(ee, root, 'head')
    annotate_elements(ee, root, 'content')
    # Write the annotated tree to a new xml file.
    tree.write('entity.xml', encoding='utf-8')
    
    # element.text
    # segmentor(element.nodeValue)
    # #测试分句子
    # print('******************测试将会顺序执行:**********************')
    
    # sentence_splitter()
    # print('###############以上为分句子测试###############')
    
    # 测试分词
    # words = segmentor('我家在昆明,我现在在北京上学。中秋节你是否会想到李白?还有,微博是MebiuW')
    # print('###############以上为分词测试###############')
    
    # 测试标注
    # tags = posttagger(words)
    # print('###############以上为词性标注测试###############')
    
    # 命名实体识别
    #netags = ner(words, tags)
    #print('###############以上为命名实体识别测试###############')
    
    # 依存句法识别
    # arcs = parse(words,tags)
    # print('###############以上为依存句法测试###############')
    
    # #角色标注
    # roles = role_label(words,tags,netags,arcs)
    # print('###############以上为角色标注测试###############')
    
    # 测试分句子
    # sentence_splitter()
    # 测试分词
    # words = segmentor('我家在昆明,我现在在北京上学。中秋节你是否会想到李白?')
    # 测试标注
    # tags = posttagger(words)
    # 命名实体识别
    # ner(words,tags)
    
    

    2.code参考:

    https://blog.csdn.net/MebiuW/article/details/52496920
    

    3.ltp的API:

    https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id4
    

    4.xml文档中节点CRUD操作的参考:

    https://blog.csdn.net/lihao21/article/details/72891932
    

    相关文章

      网友评论

          本文标题:对xml文件中标题和内容节点做命名实体识别

          本文链接:https://www.haomeiwen.com/subject/epsaaftx.html