6 Learning to Classify Text

Author: shashaslife | Published on 2017-11-28 22:07

    The three questions this chapter sets out to answer:

    1. How can we identify features of language data that are salient for classifying it?

    2. How can we construct models of language that can be used to perform language processing tasks automatically?

    3. What can these models tell us about language?

    import nltk
    from nltk.corpus import names, movie_reviews, brown
    import random

    def gender_features(word):
        return {'last_letter': word[-1]}

    #

    # labled_names=([(name,'male') for name in names.words('male.txt')]+

    #              [(name,'female') for name in names.words('female.txt')])

    #

    # random.shuffle(labled_names)  # shuffle in place; random.shuffle returns None, so its result should not be assigned

    #

    # featuresets=[(gender_features(n),gender)for (n,gender)in labled_names]

    #

    # train_set, test_set=featuresets[500:], featuresets[:500]

    #

    # classifier=nltk.NaiveBayesClassifier.train(train_set)

    #

    # print(classifier.classify(gender_features('Shasha')))

    # print(nltk.classify.accuracy(classifier, test_set))

    # classifier.show_most_informative_features(6)

    # def gender_features2(name):  # a feature set that includes every feature we can think of; more features do not necessarily give better results

    #    features={}

    #    features['first_letter']=name[0].lower()

    #    features['last_letter']=name[-1].lower()

    #    for letter in 'abcdefghijklmnopqrstuvwxyz':

    #        features['count({})'.format(letter)]=name.lower().count(letter)

    #        features['has({})'.format(letter)]=(letter in name.lower())

    #

    #    return features

    #

    # print(gender_features2('John'))

    # Once an initial feature set has been chosen, it can be refined through error analysis, as follows

    # train_names=labled_names[1500:]

    # devtest_names=labled_names[500:1500]

    # test_names=labled_names[:500]

    # train_set=[(gender_features(n),gender) for (n,gender) in train_names]

    # devtest_set=[(gender_features(n),gender) for (n,gender) in devtest_names]

    # test_set=[(gender_features(n),gender) for (n,gender) in test_names]

    # classifier=nltk.NaiveBayesClassifier.train(train_set)

    # print(nltk.classify.accuracy(classifier,devtest_set))

    # Using the dev-test set we can generate a list of the errors the classifier makes when predicting name genders

    # errors=[]

    # for (name,tag) in devtest_names:

    #    guess=classifier.classify(gender_features(name))

    #    if guess != tag:

    #        errors.append((tag,guess,name))

    #

    # for (tag,guess,name)in sorted(errors)[:10]:

    #    print('correct={} guess={} name={}'.format(tag,guess,name))

    # The error analysis shows that two-letter suffixes are also fairly informative, so the earlier feature extractor should be revised

    #

    # def gender_features(word):

    #    return {'suffix1':word[-1:],

    #            'suffix2':word[-2:]}

    # This procedure can be repeated until the best feature set is found; a sketch of one refinement pass is given below.
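    # A minimal sketch of one refinement pass, assuming the train_names/devtest_names split above is
    # uncommented; gender_features_v2 is just an illustrative name so the original extractor is not overwritten.
    # def gender_features_v2(word):
    #    return {'suffix1': word[-1:], 'suffix2': word[-2:]}
    # train_set = [(gender_features_v2(n), gender) for (n, gender) in train_names]
    # devtest_set = [(gender_features_v2(n), gender) for (n, gender) in devtest_names]
    # classifier = nltk.NaiveBayesClassifier.train(train_set)
    # print(nltk.classify.accuracy(classifier, devtest_set))  # compare with the earlier dev-test accuracy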

    # Document classification

    #print(movie_reviews.fileids())

    #print(movie_reviews.categories())

    # documents=[(list(movie_reviews.words(fileid)),category)

    #            for category in movie_reviews.categories()

    #            for fileid in movie_reviews.fileids(category)]

    # random.shuffle(documents)

    #

    # all_words=nltk.FreqDist(w.lower() for w in movie_reviews.words())

    # word_features=list(all_words)[:2000]

    #

    # def document_features(document):

    #    document_words=set(document)

    #    features={}

    #    for word in word_features:

    #        features['contains({})'.format(word)]=(word in document_words)

    #    return features

    #

    #

    # featuresets=[(document_features(d), c)

    #              for (d,c) in documents]

    # train_set,test_set=featuresets[100:],featuresets[:100]

    # classifier=nltk.NaiveBayesClassifier.train(train_set)

    # print(nltk.classify.accuracy(classifier,test_set))

    # classifier.show_most_informative_features(6)

    # Part-of-Speech-Tagging

    # suffix_fdist=nltk.FreqDist()

    # for word in brown.words():

    #    word=word.lower()

    #    suffix_fdist[word[-1:]] += 1

    #    suffix_fdist[word[-2:]] += 1

    #    suffix_fdist[word[-3:]] += 1

    # common_suffix=[suffix for (suffix,count)in suffix_fdist.most_common(100)]

    #

    # def pos_features(word):

    #    features={}

    #    for suffix in common_suffix:

    #        features['endswith({})'.format(suffix)]=word.lower().endswith(suffix)

    #    return features

    #

    # tagged_words=brown.tagged_words(categories='news')

    # featuresets=[(pos_features(n),g) for (n,g) in tagged_words]

    # size=int(len(featuresets)*0.1)

    # train_set,test_set=featuresets[size:],featuresets[:size]

    #

    # classifier=nltk.DecisionTreeClassifier.train(train_set)

    # print(classifier.classify(pos_features('name')))

    # print(nltk.classify.accuracy(classifier,test_set))

    # Exploiting Context: the method below does not generalize well, because it has no access to the POS tags of the preceding words, which limits its practical usefulness

    # def pos_features(sentence,i):

    #    features={'suffix(1)':sentence[i][-1:],

    #              'suffix(2)':sentence[i][-2:],

    #              'suffix(3)':sentence[i][-3:]}

    #    if i==0:

    #        features['prev-word']=''

    #    else:

    #        features['prev-word']=sentence[i-1]

    #    return features

    #

    # tagged_sents=brown.tagged_sents(categories='news')

    # featuresets=[]

    # for tagged_sent in tagged_sents:

    #    untagged_sent = nltk.tag.untag(tagged_sent)

    #    for i, (word,tag) in enumerate(tagged_sent):

    #        featuresets.append((pos_features(untagged_sent,i),tag))

    #

    # size=int(len(featuresets)*0.1)

    # train_set,test_set=featuresets[size:],featuresets[:size]

    # classifier=nltk.NaiveBayesClassifier.train(train_set)

    # print(nltk.classify.accuracy(classifier,test_set))

    # classifier.show_most_informative_features(6)

    # Sequence classification: consecutive (or greedy) sequence classification finds the most likely label for the first input, then uses that answer to help find the best label for the next input.

    # To do this we extend the earlier feature extractor so that it also includes the POS tag of the preceding word

    # def pos_features(sentence,i,history):

    #    features={'suffix(1)':sentence[i][-1:],

    #              'suffix(2)':sentence[i][-2:],

    #              'suffix(3)':sentence[i][-3:]}

    #    if i==0:

    #        features['prev-word']=''

    #        features['prev-tag'] =''

    #    else:

    #        features['prev-word']=sentence[i - 1]

    #        features['prev-tag'] = history[i - 1]

    #    return features

    #

    # class ConsecutivePosTagger(nltk.TaggerI):

    #

    #    def __init__(self,train_sents):

    #        train_set=[]

    #        for tagged_sent in train_sents:

    #            untagged_sent = nltk.tag.untag(tagged_sent)

    #            history=[]

    #            for i, (word,tag) in enumerate(tagged_sent):

    #                featureset=pos_features(untagged_sent,i,history)

    #                train_set.append((featureset,tag))

    #                history.append(tag)

    #        self.classifier= nltk.NaiveBayesClassifier.train(train_set)

    #

    #    def tag(self,sentence):

    #        history=[]

    #        for i, word in enumerate(sentence):

    #            featureset=pos_features(sentence,i,history)

    #            #print(featureset)

    #            tag = self.classifier.classify(featureset)

    #            history.append(tag)

    #

    #        return zip(sentence,history)

    #

    # tagged_sents=brown.tagged_sents(categories='news')

    # size=int(len(tagged_sents)*0.1)

    # train_sents,test_sents=tagged_sents[size:],tagged_sents[:size]

    # tagger=ConsecutivePosTagger(train_sents)

    #print(list(tagger.tag(brown.sents(categories='romance')[1])))

    #print(list(tagger.tag(brown.tagged_sents(categories='news')[1])))

    #print(tagger.evaluate(test_sents))

    # Further examples of supervised classification

    # Sentence segmentation

    # sents=nltk.corpus.treebank_raw.sents()

    # tokens=[]

    # boundaries=set()

    # offset=0

    # for sent in sents:

    #    tokens.extend(sent)

    #    offset+=len(sent)

    #    boundaries.add(offset-1)

    # The next step is to specify the features that might indicate whether a period marks a sentence boundary

    # def punct_features(tokens, i):

    #    return{'next-word-capitalized':tokens[i+1][0].isupper(),

    #            'prev-word':tokens[i-1].lower(),

    #            'punct':tokens[i],

    #            'prev-word-is-one-char':len(tokens[i-1])==1}

    #

    # featuresets=[(punct_features(tokens,i),(i in boundaries))

    #              for i in range(1,len(tokens)-1)

    #              if tokens[i] in '.?!']

    #

    # size=int(len(featuresets)*0.1)

    # train_set,test_set=featuresets[size:],featuresets[:size]

    # classifier=nltk.NaiveBayesClassifier.train(train_set)

    # print(nltk.classify.accuracy(classifier,test_set))

    #

    # Using this sentence segmenter (a usage sketch follows the function below)

    #

    # def segment_sentences(words):

    #    start=0

    #    sents=[]

    #    for i, word in enumerate(words):

    #        if word in '.?!' and classifier.classify(punct_features(words,i))==True:

    #            sents.append(words[start:i+1])

    #            start=i+1

    #    if start < len(words):

    #        sents.append(words[start:])

    #

    #    return sents
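    # A small usage sketch, assuming the segmenter code above is uncommented and the punctuation
    # classifier has been trained; the token list below is made up for illustration.
    # demo_tokens = ['The', 'talk', 'ended', '.', 'Then', 'Dr.', 'Lee', 'answered', 'questions']
    # print(segment_sentences(demo_tokens))
    # # likely output: [['The', 'talk', 'ended', '.'], ['Then', 'Dr.', 'Lee', 'answered', 'questions']]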

    # Identifying dialogue acts: recognizing the dialogue act behind an utterance is the first step toward understanding what it means in a conversation

    posts = nltk.corpus.nps_chat.xml_posts()[:10000]  # use the first 10,000 posts; a tiny slice like [:2] leaves the test split empty

    def dialogue_act_features(post):
        features = {}
        for word in nltk.word_tokenize(post):
            features['contains({})'.format(word.lower())] = True
        return features

    featuresets = [(dialogue_act_features(post.text), post.get('class'))
                   for post in posts]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
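    # A quick check of the trained dialogue act classifier on a made-up utterance (the string below is
    # only an illustration); the prediction should be one of the NPS Chat act labels, e.g. 'whQuestion'.
    print(classifier.classify(dialogue_act_features('what time is the movie tonight ?')))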

    # The recipe is always the same: build a feature extractor, build the feature sets, train a classifier, and check its accuracy
