Three questions this chapter should answer:
1. How can we identify features of language data that are salient for classifying it?
2. How can we construct language models that perform language-processing tasks automatically?
3. What can we learn about language from these models?
import nltk
from nltk.corpus import names, movie_reviews, brown
import random

def gender_features(word):
    return {'last_letter': word[-1]}
#
# labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
#                  [(name, 'female') for name in names.words('female.txt')])
#
# random.shuffle(labeled_names)  # shuffles in place; random.shuffle returns None
#
# featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
#
# train_set, test_set = featuresets[500:], featuresets[:500]  # hold out the first 500 names for testing
#
# classifier=nltk.NaiveBayesClassifier.train(train_set)
#
# print(classifier.classify(gender_features('Shasha')))
# print(nltk.classify.accuracy(classifier, test_set))
# classifier.show_most_informative_features(6)  # prints directly; wrapping it in print() only adds "None"
# def gender_features2(name):  # a feature extractor that grabs every feature it can think of; more features does not necessarily classify better
# features={}
# features['first_letter']=name[0].lower()
# features['last_letter']=name[-1].lower()
# for letter in 'abcdefghijklmnopqrstuvwxyz':
# features['count({})'.format(letter)]=name.lower().count(letter)
# features['has({})'.format(letter)]=(letter in name.lower())
#
# return features
#
# print(gender_features2('John'))
#Once an initial feature set has been chosen, it can be refined through error analysis, as follows:
# train_names = labeled_names[1500:]
# devtest_names = labeled_names[500:1500]
# test_names = labeled_names[:500]
# train_set=[(gender_features(n),gender) for (n,gender) in train_names]
# devtest_set=[(gender_features(n),gender) for (n,gender) in devtest_names]
# test_set=[(gender_features(n),gender) for (n,gender) in test_names]
# classifier=nltk.NaiveBayesClassifier.train(train_set)
# print(nltk.classify.accuracy(classifier,devtest_set))
#Using the dev-test set we can generate a list of the errors the classifier makes when predicting name genders:
# errors=[]
# for (name,tag) in devtest_names:
# guess=classifier.classify(gender_features(name))
# if guess != tag:
# errors.append((tag,guess,name))
#
# for (tag,guess,name)in sorted(errors)[:10]:
# print('correct={} guess={} name={}'.format(tag,guess,name))
#The error analysis shows that two-letter suffixes are also informative, so the earlier feature extractor should be revised:
#
# def gender_features(word):
# return {'suffix1':word[-1:],
# 'suffix2':word[-2:]}
#This error-analysis procedure can be repeated until the best-performing feature set is found.
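# A minimal sketch of one refinement round, assuming the redefined gender_features
# above and the train/dev-test split from earlier: rebuild the feature sets with the
# new extractor, retrain, and compare the new dev-test accuracy with the previous score.
# train_set = [(gender_features(n), gender) for (n, gender) in train_names]
# devtest_set = [(gender_features(n), gender) for (n, gender) in devtest_names]
# classifier = nltk.NaiveBayesClassifier.train(train_set)
# print(nltk.classify.accuracy(classifier, devtest_set))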
#Document classification
#print(movie_reviews.fileids())
#print(movie_reviews.categories())
# documents=[(list(movie_reviews.words(fileid)),category)
# for category in movie_reviews.categories()
# for fileid in movie_reviews.fileids(category)]
# random.shuffle(documents)
#
# all_words=nltk.FreqDist(w.lower() for w in movie_reviews.words())
# # take the 2000 most frequent words; plain list(all_words)[:2000] is not guaranteed to be frequency-sorted
# word_features = [w for (w, count) in all_words.most_common(2000)]
#
# def document_features(document):
# document_words=set(document)
# features={}
# for word in word_features:
# features['contains({})'.format(word)]=(word in document_words)
# return features
#
#
# featuresets=[(document_features(d), c)
# for (d,c) in documents]
# train_set,test_set=featuresets[100:],featuresets[:100]
# classifier=nltk.NaiveBayesClassifier.train(train_set)
# print(nltk.classify.accuracy(classifier,test_set))
# classifier.show_most_informative_features(6)
# Part-of-Speech-Tagging
# suffix_fdist=nltk.FreqDist()
# for word in brown.words():
# word=word.lower()
# suffix_fdist[word[-1:]] += 1
# suffix_fdist[word[-2:]] += 1
# suffix_fdist[word[-3:]] += 1
# common_suffix=[suffix for (suffix,count)in suffix_fdist.most_common(100)]
#
# def pos_features(word):
# features={}
# for suffix in common_suffix:
# features['endswith({})'.format(suffix)]=word.lower().endswith(suffix)
# return features
#
# tagged_words=brown.tagged_words(categories='news')
# featuresets=[(pos_features(n),g) for (n,g) in tagged_words]
# size=int(len(featuresets)*0.1)
# train_set,test_set=featuresets[size:],featuresets[:size]
#
# classifier=nltk.DecisionTreeClassifier.train(train_set)
# print(classifier.classify(pos_features('name')))
# print(nltk.classify.accuracy(classifier,test_set))
# Exploiting context: the extractor below looks at the previous word, but it cannot generalize well because it never sees the previous word's POS tag, which limits its practical usefulness.
# def pos_features(sentence,i):
# features={'suffix(1)':sentence[i][-1:],
# 'suffix(2)':sentence[i][-2:],
# 'suffix(3)':sentence[i][-3:]}
# if i==0:
# features['prev-word']=''
# else:
# features['prev-word']=sentence[i-1]
# return features
#
# tagged_sents=brown.tagged_sents(categories='news')
# featuresets=[]
# for tagged_sent in tagged_sents:
# untagged_sent = nltk.tag.untag(tagged_sent)
# for i, (word,tag) in enumerate(tagged_sent):
# featuresets.append((pos_features(untagged_sent,i),tag))
#
# size=int(len(featuresets)*0.1)
# train_set,test_set=featuresets[size:],featuresets[:size]
# classifier=nltk.NaiveBayesClassifier.train(train_set)
# print(nltk.classify.accuracy(classifier,test_set))
# classifier.show_most_informative_features(6)
#Sequence classification: consecutive (greedy) sequence classification finds the most likely tag for the first input,
# then uses that answer to help choose the best tag for the next input, and so on.
# To do this we extend the feature extractor above to include the POS tag of the preceding word.
# def pos_features(sentence,i,history):
# features={'suffix(1)':sentence[i][-1:],
# 'suffix(2)':sentence[i][-2:],
# 'suffix(3)':sentence[i][-3:]}
# if i==0:
# features['prev-word']=''
# features['prev-tag'] =''
# else:
# features['prev-word']=sentence[i - 1]
# features['prev-tag'] = history[i - 1]
# return features
#
# class ConsecutivePosTagger(nltk.TaggerI):
#
# def __init__(self,train_sents):
# train_set=[]
# for tagged_sent in train_sents:
# untagged_sent = nltk.tag.untag(tagged_sent)
# history=[]
# for i, (word,tag) in enumerate(tagged_sent):
# featureset=pos_features(untagged_sent,i,history)
# train_set.append((featureset,tag))
# history.append(tag)
# self.classifier= nltk.NaiveBayesClassifier.train(train_set)
#
# def tag(self,sentence):
# history=[]
# for i, word in enumerate(sentence):
# featureset=pos_features(sentence,i,history)
# #print(featureset)
# tag = self.classifier.classify(featureset)
# history.append(tag)
#
# return zip(sentence,history)
#
# tagged_sents=brown.tagged_sents(categories='news')
# size=int(len(tagged_sents)*0.1)
# train_sents,test_sents=tagged_sents[size:],tagged_sents[:size]
# tagger=ConsecutivePosTagger(train_sents)
#print(list(tagger.tag(brown.sents(categories='romance')[1])))
#print(list(tagger.tag(nltk.tag.untag(brown.tagged_sents(categories='news')[1]))))  # tag() expects plain words, so untag first
#print(tagger.evaluate(test_sents))
#Further examples of supervised classification
#Sentence segmentation
# sents=nltk.corpus.treebank_raw.sents()
# tokens=[]
# boundaries=set()
# offset=0
# for sent in sents:
# tokens.extend(sent)
# offset+=len(sent)
# boundaries.add(offset-1)
# # The next step is to define the features that indicate whether a punctuation token marks a sentence boundary:
# def punct_features(tokens, i):
# return{'next-word-capitalized':tokens[i+1][0].isupper(),
# 'prev-word':tokens[i-1].lower(),
# 'punct':tokens[i],
# 'prev-word-is-one-char':len(tokens[i-1])==1}
#
# featuresets=[(punct_features(tokens,i),(i in boundaries))
# for i in range(1,len(tokens)-1)
# if tokens[i] in '.?!']
#
# size=int(len(featuresets)*0.1)
# train_set,test_set=featuresets[size:],featuresets[:size]
# classifier=nltk.NaiveBayesClassifier.train(train_set)
# print(nltk.classify.accuracy(classifier,test_set))
#
# # Using this sentence segmenter to split a stream of tokens into sentences:
#
# def segment_sentences(words):
#     start = 0
#     sents = []
#     for i, word in enumerate(words):
#         if word in '.?!' and classifier.classify(punct_features(words, i)) == True:
#             sents.append(words[start:i+1])
#             start = i + 1
#     if start < len(words):
#         sents.append(words[start:])
#     return sents
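# A hypothetical usage sketch (the slice size is an arbitrary choice): apply the
# segmenter to a flat token stream such as the `tokens` list built above. Note that
# punct_features looks one token ahead, so the stream should not end on '.', '?' or '!'.
# for sent in segment_sentences(tokens[:200]):
#     print(' '.join(sent))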
#Identifying dialogue act types: recognizing the dialogue act is a first step toward understanding what a sentence in a conversation means.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]  # with only a handful of posts the 10% test split would be empty

def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
#The recipe is always the same: build a feature extractor, build the feature sets, train a classifier, and evaluate its accuracy.
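# A minimal sketch of that recurring recipe as a reusable helper (a hypothetical helper,
# not from the NLTK book; the 10% test split and Naive Bayes classifier are assumptions):
def train_and_evaluate(feature_extractor, labeled_data, test_fraction=0.1):
    """Build feature sets from (input, label) pairs, train Naive Bayes, and report accuracy."""
    featuresets = [(feature_extractor(x), label) for (x, label) in labeled_data]
    size = int(len(featuresets) * test_fraction)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    print(nltk.classify.accuracy(classifier, test_set))
    return classifier

# Example call, reusing the chat posts loaded above:
# train_and_evaluate(dialogue_act_features,
#                    [(post.text, post.get('class')) for post in posts])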