美文网首页
wordnet的一些用法

wordnet的一些用法

作者: VanJordan | 来源:发表于2019-05-07 16:14 被阅读0次
import argparse
import os
import random
import re

from nltk.corpus import wordnet as wn

# Command-line options: how many augmented sentences to attempt per input
# sentence and which WordNet relation supplies the replacement words.
parser = argparse.ArgumentParser()
parser.add_argument("--negative_num", type=int, default=3,
                    help="number of augmented sentences to attempt for every input sentence")
# `choices` rejects an unknown level before any output file is opened for
# writing, instead of failing midway through the first input file.
parser.add_argument('--level', type=str, default='down',
                    choices=['down', 'up', 'same', 'random'],
                    help="WordNet relation used to find replacements: 'down' (hyponyms), "
                         "'up' (hypernyms), 'same' (lemmas of the word's own synsets) or "
                         "'random' (one of the three, drawn for every replacement attempt)")
# NOTE(review): the augmentation loop only checks whether this value is
# positive; it never replaces more than one word per augmented sentence.
parser.add_argument('--every', type=int, default=1, help='for every sentence choice how much words to be replaced')
parser.add_argument('--seed', type=int, default=1234, help='random number seed')
args = parser.parse_args()

# Relative-path alternative to the absolute paths below:
# vocab_path = './data/vocab.txt'
# filename_list = ['./data/rt-polarity.neg','rt-polarity.pos']
vocab_path = 'D:\\exchange\\adversarial_text\\src\\data\\vocab.txt'
filename_list = ["D:\\exchange\\adversarial_text\\src\\data\\rt-polarity.neg",
                 "D:\\exchange\\adversarial_text\\src\\data\\rt-polarity.pos"]

# Running totals reported once all files have been processed.
replace_num = 0  # augmented sentences written
all_num = 0      # input sentences read

# Vocabulary lookup: word -> zero-based line number in the vocab file.
word2id = {}

# Seed the module-level RNG so word picks and candidate shuffles are repeatable.
random.seed(args.seed)

def print_log(file):
    """Create a logging callable bound to the log file at path *file*.

    The returned function takes one value, prints it to stdout, and then
    appends its ``str()`` form plus a newline to *file*. The file is
    reopened in append mode (platform default encoding) on every call, so
    nothing stays open between messages.
    """

    def write_log(s):
        # Echo to the console first, then persist the same text.
        print(s)
        with open(file, 'a') as log_handle:
            print(s, file=log_handle)

    return write_log


# Replace the logger factory with the logger it builds; from this point on
# ``print_log(message)`` prints the message and appends it to log.log.
print_log = print_log("D:\\exchange\\adversarial_text\\src\\data\\log.log")

# Load the vocabulary: one entry per line, mapped to its zero-based line
# number. If two lines strip to the same text, the later line number wins.
with open(vocab_path, 'r', encoding='iso8859-1') as vocab_file:
    for idx, word in enumerate(vocab_file):
        word2id[word.strip()] = idx

print_log('word2id:%d' % len(word2id))
def get_candidate(word=None, level='down'):
    """Collect WordNet lemma names related to *word*, in shuffled order.

    Args:
        word: The word passed to ``wn.synsets``.
        level: Which relation to follow from each synset of *word*:
            ``'down'`` uses the synset's hyponyms, ``'up'`` its hypernyms,
            and ``'same'`` the synset itself.

    Returns:
        A list of lemma names gathered from the selected synsets, shuffled
        with the module-level ``random`` generator. No de-duplication is
        performed, so a name may appear more than once.

    Raises:
        ValueError: If *level* is not ``'down'``, ``'up'`` or ``'same'``.
    """
    # Validate before touching WordNet so a bad level fails immediately.
    if level not in ('down', 'up', 'same'):
        raise ValueError('please check the input augment \"level\" ,it must be one of down ,up ,same,or random'
                         )

    candidates = []
    for synset in wn.synsets(word):
        if level == 'down':
            related = synset.hyponyms()
        elif level == 'up':
            related = synset.hypernyms()
        else:
            # 'same': take the lemmas of the word's own synset.
            related = [synset]

        for node in related:
            candidates.extend(node.lemma_names())

    random.shuffle(candidates)
    return candidates


def _augmented_path(filename):
    """Return the output path ``<root>_<negative_num>_<every>_<level><ext>``.

    ``os.path.splitext`` splits only at the final extension, so paths that
    contain other dots (for example ``./data/rt-polarity.neg``) keep their
    directory part intact.
    """
    root, ext = os.path.splitext(filename)
    return '_'.join([root, str(args.negative_num), str(args.every), args.level]) + ext


def _pick_word_index(word_list):
    """Draw a random position in the non-empty *word_list*.

    Makes at most ``len(word_list)`` draws (with replacement) and stops at
    the first drawn word for which WordNet has synsets. If none is found,
    the position of the last draw is returned.
    """
    choice_idx = 0
    for _attempt in range(len(word_list)):
        # Kept as random.choice(range(...)) so the sequence of random draws,
        # and therefore the output for a given --seed, does not change.
        choice_idx = random.choice(range(len(word_list)))
        if wn.synsets(word_list[choice_idx]):
            break
    return choice_idx


def _is_usable(word, word_replace, already_used):
    """Tell whether *word_replace* may stand in for *word*."""
    if word_replace not in word2id:
        return False  # not part of the original training vocabulary
    if len(word_replace) <= 3 or len(word) <= 3:
        return False  # either word is too short
    if word in word_replace or word_replace in word:
        return False  # one contains the other (this also covers equality)
    return word_replace not in already_used  # not already used for this sentence


# For every input file, write each sentence followed by up to
# ``args.negative_num`` copies in which one word has been replaced by a
# WordNet relative that also appears in the vocabulary.
for filename in filename_list:
    with open(filename, 'r', encoding='iso8859-1') as fr, \
            open(_augmented_path(filename), 'w', encoding='iso8859-1') as fw:
        for line in fr:
            all_num += 1
            line = line.strip()
            fw.write(line + '\n')

            # A blank line has no word to replace. A non-positive --every
            # disables augmentation (its sign is the only thing checked).
            if not line or args.every <= 0:
                continue

            used_replacements = set()  # replacements already written for this sentence
            for _ in range(args.negative_num):
                word_list = line.split()
                choice_idx = _pick_word_index(word_list)
                word = word_list[choice_idx]

                level = args.level
                if level == 'random':
                    level = random.choice(['same', 'down', 'up'])

                for word_replace in get_candidate(word, level):
                    if not _is_usable(word, word_replace, used_replacements):
                        continue

                    # The first acceptable candidate wins; write the variant.
                    word_list[choice_idx] = word_replace
                    print(word_replace + '\t' + word)
                    fw.write(' '.join(word_list) + '\n')
                    replace_num += 1
                    used_replacements.add(word_replace)
                    break

# Avoid dividing by zero when the input files contain no lines at all.
ratio = replace_num / all_num if all_num else 0.0
print('all_num is %d, replace_num is %d, ratio is %f' % (all_num, replace_num, ratio))


相关文章

  • wordnet的一些用法

  • WordNet

    WordNet面向语义的英语词典,类似于传统辞典。WordNet的一些操作如下: wordnet的层次结构 在层次...

  • Princeton Algorithms, WordNet

    Princeton Algorithms, Part II, WordNet 普林斯顿大学算法课 WordNet ...

  • WordNet

    wordNet wordNet 是普林斯顿大学开发的英语语料库,可以理解为就是一个词典,在python中的nltk...

  • wordnet介绍

    wordnet是以同义词集合(synset)作为基本建构单位进行组织的,即每个同义词集是网络里的一个结点,每个同义...

  • 如何使用wordnet

    介绍 WordNet是包含语义信息的英语词典。 wordnet根据单词的语义分组,相同语义的单词组合在一起称为sy...

  • DataSet

    ImageNet ImageNet[https://image-net.org/]是根据 WordNet 层次结构...

  • Linux下超好用词典 -- GoldenDict

    install goldendict 是本体,goldendict-wordnet是一个en-en离线词典,按需安...

  • day07

    一些ps用法

  • 词表征 1:WordNet、0-1表征、共现矩阵、SVD

    一、基于知识的表征 参见图1.1,WordNet中包含同义词集(synonym sets)和上位词(hyperny...

网友评论

      本文标题:wordnet的一些用法

      本文链接:https://www.haomeiwen.com/subject/zqqyoqtx.html