3.1 Introduction
This chapter shows how to build a computational model for a problem and how to design an algorithm that solves it: given a concrete problem, we first derive a simplified computational model of it. Once the problem has been recast as a computational model, designing an algorithm to solve it becomes much easier.
3.2 Document Comparison
3.2.1 Problem Statement
Design an algorithm that measures the similarity of two electronic documents. The approach developed below represents each document as a vector of word frequencies and takes the angle between the two vectors as the distance between the documents.
3.2.2 Algorithm Design

Listing 3.1 Reading a file
import sys

def read_line(filename):
    try:
        fp = open(filename)
        L = fp.readlines()
    except IOError:
        print("Error opening or reading input file:", filename)
        sys.exit()
    return L

Listing 3.2 Assembling characters into words
def get_words_from_string(line):
    word_list = []
    character_list = []
    for c in line:
        if c.isalnum():
            character_list.append(c)
        elif len(character_list) > 0:
            # a non-alphanumeric character ends the current word
            word = "".join(character_list)
            word = str.lower(word)
            word_list.append(word)
            character_list = []
    if len(character_list) > 0:
        # flush the last word if the line ends in the middle of it
        word = "".join(character_list)
        word = str.lower(word)
        word_list.append(word)
    return word_list
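
For example (a quick check; the sample sentence is made up):

>>> get_words_from_string("Hello, world! Python3 rocks.")
['hello', 'world', 'python3', 'rocks']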

Listing 3.3 Extracting the words of a document
def get_words_from_line(L):
    word_list = []
    for line in L:
        word_in_line = get_words_from_string(line)
        word_list = word_list + word_in_line
    return word_list

Let W be the total number of words in the document and k the number of words per line, so there are W/k lines. The statement word_list = word_list + word_in_line builds a brand-new list on every iteration, copying everything accumulated so far; appending the i-th line therefore costs about i·k element copies, for a total of

    k(1 + 2 + 3 + ··· + W/k) ≈ W²/(2k)

Dropping the constant factors, the running time is O(W²/k).


Listing 3.4 Counting the frequency of each word in the file
def count_frequency(word_list):
    L = []
    for new_word in word_list:
        for entry in L:
            if new_word == entry[0]:
                entry[1] = entry[1] + 1
                break
        else:
            # the else clause runs when the for loop found no match
            L.append([new_word, 1])
    return L
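
A small example of the list-based counter (hypothetical input):

>>> count_frequency(['the', 'cat', 'saw', 'the', 'rat'])
[['the', 2], ['cat', 1], ['saw', 1], ['rat', 1]]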




The distance between two documents D1 and D2 is defined as the angle between their word-frequency vectors:

    θ(D1, D2) = arccos( (D1 · D2) / (|D1| |D2|) )

The next two listings compute the inner product and the angle.
Listing 3.5 Computing the inner product of two vectors
def inner_product(L1, L2):
    sum = 0.0
    for word1, count1 in L1:
        for word2, count2 in L2:
            if word1 == word2:
                sum += count1 * count2
    return sum

Listing 3.6 Computing the angle between two vectors
import math

def vector_angle(L1, L2):
    numerator = inner_product(L1, L2)
    denominator = math.sqrt(inner_product(L1, L1) * inner_product(L2, L2))
    return math.acos(numerator / denominator)
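
As a sanity check (made-up counts): identical documents are at angle 0, and documents sharing no words are at a right angle:

>>> L = [['hello', 2], ['world', 1]]
>>> vector_angle(L, L)
0.0
>>> vector_angle([['hello', 1]], [['world', 1]])
1.5707963267948966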
Algorithm Optimization
To decide what to optimize, we first profile the program and find out where it spends its time.

Listing 3.7 Document-comparison main program
def word_frequencies_from_file(filename):
    L = read_line(filename)
    print(L)
    word_list = get_words_from_line(L)
    print(word_list)
    count_list = count_frequency(word_list)
    print(count_list)
    return count_list

def main():
    filename_1 = "t1.verne.txt"
    filename_2 = "t2.verne.txt"
    word_list_1 = word_frequencies_from_file(filename_1)
    word_list_2 = word_frequencies_from_file(filename_2)
    distance = vector_angle(word_list_1, word_list_2)
    print("The distance between the documents is: %0.6f (radians)" % distance)

if __name__ == "__main__":
    import cProfile
    cProfile.run("main()")

In the profiler's output, the ncalls column gives the number of times each function was called, tottime the total time spent inside the function itself, and percall the time per call. With these figures it is easy to determine which function is the efficiency bottleneck of the whole algorithm.
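
The report is easier to scan when sorted; cProfile.run accepts a sort argument, for example sorting by total time:

cProfile.run("main()", sort="tottime")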

Modified Listing 3.3 (extracting the words of a document)
def get_words_from_line(L):
    word_list = []
    for line in L:
        word_in_line = get_words_from_string(line)
        # word_list = word_list + word_in_line
        word_list.extend(word_in_line)   # extend appends in place instead of copying the whole list
    return word_list
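
The effect of this one-line change can be measured directly. A minimal sketch using the standard timeit module (the chunk sizes are arbitrary):

import timeit

setup = "chunks = [['word'] * 10 for _ in range(2000)]"
concat = '''
word_list = []
for chunk in chunks:
    word_list = word_list + chunk   # copies the whole accumulated list each time
'''
extend = '''
word_list = []
for chunk in chunks:
    word_list.extend(chunk)         # appends in place
'''
print("concat:", timeit.timeit(concat, setup, number=10))
print("extend:", timeit.timeit(extend, setup, number=10))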

Listing 3.8 Sorting the vector's elements as a preprocessing step
def word_frequencies_for_file(filename):
    line_list = read_line(filename)
    word_list = get_words_from_line(line_list)
    freq_mapping = count_frequency(word_list)
    # sort the (word, count) pairs by word so the inner product can merge them
    sorted_freq_mapping = sorted(freq_mapping)
    print("File", filename, ":")
    print(len(line_list), "lines,")
    print(len(word_list), "words,")
    print(len(sorted_freq_mapping), "distinct words")
    return sorted_freq_mapping

Listing 3.9 Optimized inner-product algorithm
def inner_product(L1, L2):
    sum = 0.0
    i = 0
    j = 0
    # for word1, count1 in L1:
    #     for word2, count2 in L2:
    #         if word1 == word2:
    #             sum += count1 * count2
    # return sum
    while i < len(L1) and j < len(L2):
        if L1[i][0] == L2[j][0]:
            # both vectors contain this word
            sum += L1[i][1] * L2[j][1]
            i += 1
            j += 1
        elif L1[i][0] < L2[j][0]:
            # the word is in L1 but not in L2
            i += 1
        else:
            # the word is in L2 but not in L1
            j += 1
    return sum
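
For example, with two already-sorted vectors (made-up counts), only the shared word contributes:

>>> L1 = [['cat', 1], ['the', 2]]
>>> L2 = [['dog', 1], ['the', 3]]
>>> inner_product(L1, L2)
6.0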

Listing 3.10 Counting word frequencies with a dictionary
def count_frequency(word_list):
    # L = []
    # for new_word in word_list:
    #     for entry in L:
    #         if new_word == entry[0]:
    #             entry[1] = entry[1] + 1
    #             break
    #     else:
    #         L.append([new_word, 1])
    # return L
    D = {}
    for new_word in word_list:
        if new_word in D:
            D[new_word] += 1
        else:
            D[new_word] = 1
    return list(D.items())
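
The standard library provides this pattern ready-made: collections.Counter counts hashable items in a single pass, so the dictionary version can equivalently be written as:

from collections import Counter

def count_frequency(word_list):
    return list(Counter(word_list).items())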
3.3 Spelling Correction
3.3.1 Problem Statement
Given a word that may be misspelled, find the most likely word the user actually intended, using word frequencies gathered from a large text corpus.
3.3.2 Algorithm Design


Listing 3.11 Splitting a file into a sequence of words
import re

def words(text):
    return re.findall('[a-z]+', text.lower())
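
For example:

>>> words("The cat saw the rat.")
['the', 'cat', 'saw', 'the', 'rat']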

Listing 3.12 Counting word occurrences
import collections

def train(features):
    # a dictionary whose missing keys default to 1, so unseen
    # words still get a small nonzero count
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

Listing 3.13 Selecting the words that occur in the corpus
NWORDS = train(words("".join(read_line('big.txt'))))

def known(words):
    # return set(w for w in words if w in NWORDS)
    wordintxt = set([])
    for w in words:
        if w in NWORDS:
            wordintxt.add(w)
    return wordintxt

Listing 3.14 All results of applying one edit to the input word
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edist1(word):
    n = len(word)
    return set([word[0:i] + word[i+1:] for i in range(n)] +                        # deletion
               [word[0:i] + word[i+1] + word[i] + word[i+2:] for i in range(n-1)] + # transposition
               [word[0:i] + c + word[i+1:] for i in range(n) for c in alphabet] +   # replacement
               [word[0:i] + c + word[i:] for i in range(n+1) for c in alphabet])    # insertion
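
For a word of length n, Listing 3.14 builds n deletions, n-1 transpositions, 26n replacements and 26(n+1) insertions, i.e. 54n + 25 candidate strings before set() removes the duplicates:

>>> 54 * len("hello") + 25   # candidates generated for a 5-letter word
295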

Listing 3.15 Spelling-correction main routine
def correct(word):
    candidates = known([word]) or known(edist1(word)) or known_edits2(word) or [word]
    return max(candidates, key=lambda w: NWORDS[w])
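
Listing 3.15 calls known_edits2, which is not shown above. A minimal sketch consistent with the names used here (the words at edit distance two that occur in the corpus):

def known_edits2(word):
    return set(e2 for e1 in edist1(word) for e2 in edist1(e1) if e2 in NWORDS)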

3.4 The Stable Matching Problem
3.4.1 Problem Statement
Given n men and n women, where each person ranks all members of the other group in order of preference, find a matching with no unstable pair, i.e. no man and woman who both prefer each other to their assigned partners.

3.4.2 Algorithm Design


Listing 3.16 Stable matching algorithm
import copy

guyprefers = {
    'm1': ['w1', 'w2', 'w3'],
    'm2': ['w1', 'w2', 'w3'],
    'm3': ['w1', 'w3', 'w2']
}
galprefers = {
    'w1': ['m2', 'm1', 'm3'],
    'w2': ['m1', 'm2', 'm3'],
    'w3': ['m3', 'm1', 'm2'],
}
guys = sorted(guyprefers.keys())
gals = sorted(galprefers.keys())

def matchmaker():
    # list of unengaged guys
    guysfree = guys[:]
    # the current pairing, stored as a dictionary
    engaged = {}
    # the guys' preference lists (consumed as proposals are made)
    guyprefers2 = copy.deepcopy(guyprefers)
    # the gals' preference lists
    galprefers2 = copy.deepcopy(galprefers)
    while guysfree:
        guy = guysfree.pop(0)
        # guy's remaining preference list
        guyslist = guyprefers2[guy]
        # the gal he currently likes best
        gal = guyslist.pop(0)
        # is gal already engaged?
        fiance = engaged.get(gal)
        if not fiance:
            # gal is still unengaged: pair guy with gal
            engaged[gal] = guy
            print("%s and %s" % (guy, gal))
        else:
            # gal's preference list over the guys
            galslist = galprefers2[gal]
            if galslist.index(fiance) > galslist.index(guy):
                # gal prefers the new suitor
                engaged[gal] = guy
                print("%s dumped %s for %s" % (gal, fiance, guy))
                if guyprefers2[fiance]:
                    # the ex-fiance goes back to the unengaged list
                    guysfree.append(fiance)
            else:
                # gal prefers her current fiance
                if guyslist:
                    # the suitor will try his next choice
                    guysfree.append(guy)
    return engaged
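
Running the algorithm on the sample preference lists above:

engaged = matchmaker()
print(engaged)   # with this data: {'w1': 'm2', 'w2': 'm1', 'w3': 'm3'}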
3.5 Summary
Seemingly complex problems can often be solved with quite simple designs. The most important step is to transform the problem: describe the concrete problem formally, and then look for a way to solve the formal version.
Implementing an algorithm is also a process of continual optimization, and that optimization covers not only the algorithm itself but also the code that implements it.