美文网首页
对xml文件中标题和内容节点做命名实体识别

对xml文件中标题和内容节点做命名实体识别

作者: 夜空中最亮的星_6c64 | 来源:发表于2018-10-13 21:52 被阅读0次

1.具体步骤:分词,词性标注,命名实体识别

#  -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import sys

reload(sys)
sys.setdefaultencoding('utf8')
# 要保证正常运行,请参照最后的完整代码附录

from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import SementicRoleLabeller
from pyltp import NamedEntityRecognizer
from pyltp import Parser

class extractEntity:
    """Extract named entities (persons, locations, institutions) from
    Chinese text via the pyltp pipeline:

        word segmentation -> POS tagging -> named-entity recognition.

    Recognised entities accumulate in the ``persons`` / ``locations`` /
    ``institutions`` sets; callers reset them between documents (sets
    de-duplicate repeated mentions within one document).
    """

    # Root directory of the LTP v3.4.0 model files (was hard-coded in
    # every method); adjust for other hosts.
    MODEL_DIR = '/Users/Zd/Downloads/ltp_data_v3.4.0'

    def __init__(self, persons=None, locations=None, institutions=None):
        """Bind the three result sets to the instance.

        Bug fix: the original assigned to *local* names, so the instance
        never received these attributes even though ``ner()`` reads them.
        Caller-supplied values are honoured only when they really are
        sets (legacy callers passed the ``set`` type itself by mistake).
        """
        self.persons = persons if isinstance(persons, set) else set()
        self.locations = locations if isinstance(locations, set) else set()
        self.institutions = institutions if isinstance(institutions, set) else set()

    def sentence_splitter(self, sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。我的微博是MebiuW,转载请注明来自MebiuW!'):
        """Split a piece of text into sentences; return them as a list.

        Bug fixes: the original definition lacked ``self`` (so calling it
        on an instance shifted the arguments) and discarded its result.
        """
        return list(SentenceSplitter.split(sentence))

    def segmentor(self, sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档,然后改了下这里得到的。我的微博是MebiuW,转载请注明来自MebiuW!'):
        """Segment *sentence* into words; return a list of words."""
        segmentor = Segmentor()
        segmentor.load(self.MODEL_DIR + '/cws.model')
        # Materialise into a plain list BEFORE releasing the model: the
        # native pyltp vector is owned by the segmentor instance.
        words_list = list(segmentor.segment(sentence))
        segmentor.release()
        return words_list

    def posttagger(self, words):
        """POS-tag *words* (a list of segmented words); return the tags."""
        postagger = Postagger()
        postagger.load(self.MODEL_DIR + '/pos.model')
        # Bug fix: copy out of the native vector before release()
        # (the original returned the vector after freeing the model).
        postags = list(postagger.postag(words))
        postagger.release()
        return postags

    def ner(self, words, postags):
        """Run NER over (words, postags) pairs and collect entities.

        Only single-word entities (``S-`` tags) are collected, matching
        the original behaviour; multi-word B-/I-/E- spans are ignored.
        Side effect: adds to ``self.persons`` / ``self.locations`` /
        ``self.institutions``.  Returns the raw tag sequence.
        """
        recognizer = NamedEntityRecognizer()
        recognizer.load(self.MODEL_DIR + '/ner.model')
        netags = list(recognizer.recognize(words, postags))

        for word, ntag in zip(words, netags):
            if ntag == 'S-Nh':        # person name
                self.persons.add(word)
            elif ntag == 'S-Ns':      # place name
                self.locations.add(word)
            elif ntag == 'S-Ni':      # institution name
                self.institutions.add(word)

        recognizer.release()
        return netags

    def parse(self, words, postags):
        """Dependency-parse the sentence and print head:relation pairs."""
        parser = Parser()
        parser.load(self.MODEL_DIR + '/parser.model')
        arcs = parser.parse(words, postags)
        print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        # NOTE(review): ``arcs`` is a native pyltp vector returned after
        # release(); downstream use may be unsafe — confirm before relying
        # on the return value beyond role_label().
        parser.release()
        return arcs

    def role_label(self, words, postags, netags, arcs):
        """Semantic role labelling; prints one line per predicate."""
        labeller = SementicRoleLabeller()
        labeller.load(self.MODEL_DIR + '/srl')
        roles = labeller.label(words, postags, netags, arcs)
        for role in roles:
            print("%d %s" % (role.index,
                             "".join("%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                                     for arg in role.arguments)))
        labeller.release()

def annotate_elements(root, tag, ee):
    """Append <persons>/<locations>/<institutions> children to every
    <tag> element under *root*, filled with the entities recognised in
    the element's text.

    Extracted from two byte-identical loops (for 'head' and 'content')
    in the original script.  Output format is preserved: each entity is
    prefixed with a single space.
    """
    for element in root.iter(tag):
        p = ET.SubElement(element, 'persons')
        p.text = ''
        l = ET.SubElement(element, 'locations')
        l.text = ''
        i = ET.SubElement(element, 'institutions')
        i.text = ''

        if element.text is None:
            continue  # nothing to analyse for an empty node

        words = ee.segmentor(element.text.encode('utf-8'))
        tags = ee.posttagger(words)
        # Reset per document so entities do not leak across news items.
        ee.persons = set()
        ee.locations = set()
        ee.institutions = set()
        ee.ner(words, tags)
        # Same output as the original accumulation loop: " e1 e2 ..."
        p.text = ''.join(' ' + w for w in ee.persons)
        l.text = ''.join(' ' + w for w in ee.locations)
        i.text = ''.join(' ' + w for w in ee.institutions)


tree = ET.parse('/Users/Zd/Desktop/newsSpider/newsSpider/extractEntity/scrawlToXML.xml')
root = tree.getroot()
# Bug fix: the original passed the ``set`` *type* three times instead of
# set instances.
ee = extractEntity(set(), set(), set())
annotate_elements(root, 'head', ee)
annotate_elements(root, 'content', ee)
# Write the annotated tree to a new xml file.
tree.write('entity.xml', encoding='utf-8')

# element.text
# segmentor(element.nodeValue)
# #测试分句子
# print('******************测试将会顺序执行:**********************')

# sentence_splitter()
# print('###############以上为分句子测试###############')

# 测试分词
# words = segmentor('我家在昆明,我现在在北京上学。中秋节你是否会想到李白?还有,微博是MebiuW')
# print('###############以上为分词测试###############')

# 测试标注
# tags = posttagger(words)
# print('###############以上为词性标注测试###############')

# 命名实体识别
#netags = ner(words, tags)
#print('###############以上为命名实体识别测试###############')

# 依存句法识别
# arcs = parse(words,tags)
# print('###############以上为依存句法测试###############')

# #角色标注
# roles = role_label(words,tags,netags,arcs)
# print('###############以上为角色标注测试###############')

# 测试分句子
# sentence_splitter()
# 测试分词
# words = segmentor('我家在昆明,我现在在北京上学。中秋节你是否会想到李白?')
# 测试标注
# tags = posttagger(words)
# 命名实体识别
# ner(words,tags)

2.code参考:

https://blog.csdn.net/MebiuW/article/details/52496920

3.ltp的API:

https://ltp.readthedocs.io/zh_CN/latest/appendix.html#id4

4.xml文档中节点CRUD操作的参考:

https://blog.csdn.net/lihao21/article/details/72891932

相关文章

网友评论

      本文标题:对xml文件中标题和内容节点做命名实体识别

      本文链接:https://www.haomeiwen.com/subject/epsaaftx.html