第四单元
响应查询
本单元将建立一个可行的搜索引擎,可以抓取并建立一系列网页的索引,可以响应关键词查询。首先是建立网络语料库的索引,其结构将会是:列表的每一项是一个关键词,及该关键词对应的一系列URL,后面可能加一项指出共有多少页面含该关键词。根据这个结构构建索引如下:
index = []
def add_to_index(index,keyword,url):
for e in index:
if e[0] == keyword:
e[1].append(url)
return index
index.append([keyword,[url]])
add_to_index(index,'udacity','http://udacity.com')
add_to_index(index,'computing','http://acm.org')
add_to_index(index,'udacity','http://npr.org')
print index
#>>> [['udacity', ['http://udacity.com', 'http://npr.org']],
#>>> ['computing', ['http://acm.org']]]
根据关键词查找:
index = [['udacity', ['http://udacity.com', 'http://npr.org']],
['computing', ['http://acm.org']]]
def lookup(index,keyword):
for e in index:
if e[0] == keyword:
return e[1]
return []
print lookup(index,'udacity')
#>>> ['http://udacity.com','http://npr.org']
将完整的页面添加进索引:
index = []
def add_to_index(index,keyword,url):
for entry in index:
if entry[0] == keyword and not url in entry[1]: #保证一个关键词的一个URL只记录一次
entry[1].append(url)
return
index.append([keyword,[url]])
def add_page_to_index(index,url,content):
l = content.split() #对页面内容粗略的切分
for e in l:
ok = 0 #标记index中是否已存在该关键词
for x in index:
if e == x:
x[1].append(url)
ok = 1
if not ok:
index.append([e,[url]])
return index
add_page_to_index(index,'fake.text',"This is a test")
print index
#>>> [['This', ['fake.text']], ['is', ['fake.text']], ['a', ['fake.text']],
#>>> ['test',['fake.text']]]
然后在第三单元定义的crawl_web函数中获取到content后加上一句add_page_to_index(index, page, content)即可生成索引,实际上第三单元给出的正是已加上索引的完整版代码。
本单元接下来介绍了因特网的基本原理,首先是页面爬取函数
def get_page(url):
try:
import urllib
return urllib.urlopen(url).read()
except:
return ''
网络的定义:
两点:至少3个实体;即使两个实体不直接相连,仍可以通过第三个实体的传递消息而通信。
之前调用系统自带的split函数切分页面内容出现一些问题,下面实现一个更为细致的切分函数
def split_string(source,splitlist):
output = []
atsplit = True #标记是否到分隔符
for e in source:
if e in splitlist: #splitlist是切分符号表,如标点符号等
atsplit = True
else:
if atsplit: #前一个字符是分隔符,重新加入新字符
output.append(e)
atsplit = False
else: #前一个字符不是分隔符,在最后一个字符串上加字符
output[-1] = output[-1] + e
return output
out = split_string("This is a test-of the,string separation-code!"," ,!-")
print out
#>>> ['This', 'is', 'a', 'test', 'of', 'the', 'string', 'separation', 'code']
out = split_string("After the flood ... all the colors came out.", " .")
print out
#>>> ['After', 'the', 'flood', 'all', 'the', 'colors', 'came', 'out']
out = split_string("First Name,Last Name,Street Address,City,State,Zip Code",",")
print out
#>>>['First Name', 'Last Name', 'Street Address', 'City', 'State', 'Zip Code']
统计点击数,修改索引结构为:
def record_user_click(index,keyword,url):
l = lookup(index, keyword)
if l: #找到该关键词
for e in l:
if e[0] == url:
e[1] += 1
return index
add_to_index(index, keyword, url)
else:
add_to_index(index, keyword, url)
def add_to_index(index, keyword, url):
for entry in index:
if entry[0] == keyword:
for urls in entry[1]: #去重
if urls[0] == url:
return
entry[1].append([url,0])
return
# not found, add new keyword to index
index.append([keyword, [[url,0]]])
统计一段话中的词语个数
def count_words(page):
s = 0
ok = True
for e in page:
if e == ' ':
ok = True
else:
if ok:
s += 1
ok = False
return s
passage =("The number of orderings of the 52 cards in a deck of cards "
"is so great that if every one of the almost 7 billion people alive "
"today dealt one ordering of the cards per second, it would take "
"2.5 * 10**40 times the age of the universe to order the cards in every "
"possible way.")
print count_words(passage)
#>>>56
第五单元
程序怎样运行
首先要评估程序的好坏就要测定它运行时所占用的资源,即算法时间复杂度与空间复杂度。以下的程序以秒为单位测定一个程序运行的时间:
import time
def time_execution(code):
start = time.clock()
result = eval(code)
run_time = time.clock() - start
return result , run_time
def spin_loop(n):
i = 0
while i < n:
i = i + 1
#测试循环1000次加法的时间
time_execution('spin_loop(1000)')
=>
(None, 0.000136000000000025)
其中eval函数的功能是:将字符串str当成有效的表达式来求值并返回计算结果。就可以用来在list,tuple,dict和string之间相互转化:
#字符串转换成列表
>>>a = "[[1,2], [3,4], [5,6], [7,8], [9,0]]"
>>>type(a)
<type 'str'>
>>> b = eval(a)
>>> print b
[[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]]
>>> type(b)
<type 'list'>
#字符串转换成字典
>>> a = "{1: 'a', 2: 'b'}"
>>> type(a)
<type 'str'>
>>> b = eval(a)
>>> print b
{1: 'a', 2: 'b'}
>>> type(b)
<type 'dict'>
#字符串转换成元组
>>> a = "([1,2], [3,4], [5,6], [7,8], (9,0))"
>>> type(a)
<type 'str'>
>>> b = eval(a)
>>> print b
([1, 2], [3, 4], [5, 6], [7, 8], (9, 0))
>>> type(b)
<type 'tuple'>
构建一张大索引表,检测查找关键词的效率:
def add_to_index(index, keyword, url):
for entry in index:
if entry[0] == keyword:
for urls in entry[1]:
if urls[0] == url:
return
entry[1].append([url])
return
# not found, add new keyword to index
index.append([keyword, [url]])
def lookup(index, keyword):
for entry in index:
if entry[0] == keyword:
return entry[1]
return None
#将字符列表p中的元素连成字符串s
def make_string(p):
s = ""
for e in p:
s += e
return s
def make_big_index(size):
index = []
letters = ['a','a','a','a','a','a']
while len(index) < size:
word = make_string(letters)
add_to_index(index, word, 'fake') #每个关键词的URL都是fake
#按字母从尾到头递增加入索引
for i in range(len(letters) - 1, 0, -1):
if letters[i] < 'z':
letters[i] = chr(ord(letters[i]) + 1)
break;
else:
letters[i] = 'a'
return index
#构建索引表,测试查找其中最后一个关键词的时间
index = make_big_index(10000)
print time_execution('lookup(index, "aaaoup")')
=>
(['fake'], 0.0017249999998512067)
然后介绍哈希表的思想,及好、坏哈希表的比较:
#坏哈希表,关键词分布不均
def bad_hash(key, size):
return ord(key[0]) % size
#较好的哈希表实现
def hash_string(keyword,buckets):
s = 0
for e in keyword:
s += ord(e)
return s % buckets
#哈希表测试函数,检测每个桶中关键词的分布情况
def test_hash_func(func, keys, size):
ans = [0] * size
key_used = []
for w in keys:
if w not in key_used:
hv = func(w, size)
ans[hv] += 1
key_used.append(w)
return ans
#测试
def get_page(url):
try:
import urllib
return urllib.urlopen(url).read()
except:
return ''
words = get_page('http://www.gutenberg.org/cache/epub/1661/pg1661.txt').split()
cnt = test_hash_func(bad_hash, words, 12)
print cnt
=>
[730, 1541, 1055, 1752, 1784, 839, 1452, 2074, 1409, 754, 924, 899]
cnt = test_hash_func(hash_string, words, 12)
print cnt
=>
[1368, 1268, 1273, 1279, 1284, 1245, 1207, 1228, 1281, 1232, 1233, 1315]
cnt = test_hash_func(hash_string, words, 100)
print cnt
=>
[136, 127, 117, 137, 129, 149, 116, 126, 111, 128, 142, 131, 151, 129, 150, 124, 157, 144, 151, 150, 137, 105, 151, 144, 141, 153, 141, 185, 144, 154, 154, 163, 192, 159, 163, 190, 153, 177, 162, 175, 172, 166, 179, 164, 186, 167, 173, 144, 174, 167, 154, 164, 177, 179, 163, 171, 187, 162, 160, 181, 166, 161, 136, 154, 169, 156, 150, 147, 154, 164, 126, 173, 156, 165, 146, 151, 150, 145, 148, 152, 148, 148, 161, 140, 188, 150, 150, 121, 167, 123, 142, 132, 136, 132, 126, 141, 152, 135, 152, 162]
建立空的哈希表:
def make_hashtable(nbuckets):
l = []
for i in range( nbuckets):
l.append([])
return l
print make_hashtable(3)
=>
[[], [], []]
#按以下方式实现则存在问题
def make_hashtable_not(nbuckets):
return [[]] * nbuckets
t = make_hashtable_not(3)
t[1].append(['yes',['https://udacity.com']])
print t
=>
[[['yes', ['https://udacity.com']]], [['yes', ['https://udacity.com']]], [['yes', ['https://udacity.com']]]]
这种方式创建的新列表是这个列表的三次复制,但不是三个副本,而是三个指向。外层列表的每一个元素都指向同一个空列表。
接下来要实现哈希表的查找更新等操作,表的结构为:定义hashtable_get_bucket函数,找到所给关键词对应对哈希桶:
def hashtable_get_bucket(htable,keyword):
l = len(htable)
i = hash_string(keyword,l)
return htable[i]
def hash_string(keyword,buckets):
out = 0
for s in keyword:
out = (out + ord(s)) % buckets
return out
def make_hashtable(nbuckets):
table = []
for unused in range(0,nbuckets):
table.append([])
return table
table = [[['Francis', 13], ['Ellis', 11]], [], [['Bill', 17],
['Zoe', 14]], [['Coach', 4]], [['Louis', 29], ['Rochelle', 4], ['Nick', 2]]]
print hashtable_get_bucket(table, "Zoe")
#>>> [['Bill', 17], ['Zoe', 14]]
print hashtable_get_bucket(table, "Brick")
#>>> []
print hashtable_get_bucket(table, "Lilith")
#>>> [['Louis', 29], ['Rochelle', 4], ['Nick', 2]]
定义 hashtable_add(htable,key,value)函数,在哈希表中添加给定关键词及其值。在上一个函数的基础上只需要在对应的桶的最后添加一项:
def hashtable_add(htable,key,value):
# your code here
hashtable_get_bucket(htable,key).append([key,value])
return htable
def hashtable_get_bucket(htable,keyword):
return htable[hash_string(keyword,len(htable))]
def hash_string(keyword,buckets):
out = 0
for s in keyword:
out = (out + ord(s)) % buckets
return out
def make_hashtable(nbuckets):
table = []
for unused in range(0,nbuckets):
table.append([])
return table
table = make_hashtable(5)
hashtable_add(table,'Bill', 17)
hashtable_add(table,'Coach', 4)
hashtable_add(table,'Ellis', 11)
hashtable_add(table,'Francis', 13)
hashtable_add(table,'Louis', 29)
hashtable_add(table,'Nick', 2)
hashtable_add(table,'Rochelle', 4)
hashtable_add(table,'Zoe', 14)
print table
#>>> [[['Ellis', 11], ['Francis', 13]], [], [['Bill', 17], ['Zoe', 14]],
#>>> [['Coach', 4]], [['Louis', 29], ['Nick', 2], ['Rochelle', 4]]]
定义hashtable_lookup(htable,key)函数,查找对应关键词的值。在前几个函数的基础上,首先调用hashtable_lookup(htable,key)判断表中是否有该词,在该关键词不在表中时调用hashtable_add(htable,key,value)插入;在表中时调用hashtable_get_bucket(htable,key)查找该关键词对应的桶,遍历找到后更新:
def hashtable_lookup(htable,key):
l = hashtable_get_bucket(htable,key)
for e in l:
if e[0] == key:
return e[1]
return None
def hashtable_add(htable,key,value):
bucket = hashtable_get_bucket(htable,key)
bucket.append([key,value])
def hashtable_get_bucket(htable,keyword):
return htable[hash_string(keyword,len(htable))]
def hash_string(keyword,buckets):
out = 0
for s in keyword:
out = (out + ord(s)) % buckets
return out
def make_hashtable(nbuckets):
table = []
for unused in range(0,nbuckets):
table.append([])
return table
table = [[['Ellis', 11], ['Francis', 13]], [], [['Bill', 17], ['Zoe', 14]],
[['Coach', 4]], [['Louis', 29], ['Nick', 2], ['Rochelle', 4]]]
print hashtable_lookup(table, 'Francis')
#>>> 13
print hashtable_lookup(table, 'Louis')
#>>> 29
print hashtable_lookup(table, 'Zoe')
#>>> 14
定义hashtable_update(htable,key,value)
函数,更新给定关键词的值:
def hashtable_update(htable,key,value):
# Your code here
x = hashtable_lookup(htable,key)
if not x:
hashtable_add(htable,key,value)
else:
l = hashtable_get_bucket(htable,key)
for e in l:
if e[0] == key:
e[1] = value
return htable
def hashtable_lookup(htable,key):
bucket = hashtable_get_bucket(htable,key)
for entry in bucket:
if entry[0] == key:
return entry[1]
return None
def hashtable_add(htable,key,value):
bucket = hashtable_get_bucket(htable,key)
bucket.append([key,value])
def hashtable_get_bucket(htable,keyword):
return htable[hash_string(keyword,len(htable))]
def hash_string(keyword,buckets):
out = 0
for s in keyword:
out = (out + ord(s)) % buckets
return out
def make_hashtable(nbuckets):
table = []
for unused in range(0,nbuckets):
table.append([])
return table
table = [[['Ellis', 11], ['Francis', 13]], [], [['Bill', 17], ['Zoe', 14]],
[['Coach', 4]], [['Louis', 29], ['Nick', 2], ['Rochelle', 4]]]
hashtable_update(table, 'Bill', 42)
hashtable_update(table, 'Rochelle', 94)
hashtable_update(table, 'Zed', 68)
print table
#>>> [[['Ellis', 11], ['Francis', 13]], [['Zed', 68]], [['Bill', 42],
#>>> ['Zoe', 14]], [['Coach', 4]], [['Louis', 29], ['Nick', 2],
#>>> ['Rochelle', 94]]]
使用Python自带的字典类型来实现哈希表,首先介绍字典类型,字符串、列表、字典三者的比较:
接下来就是用字典改写crawl_web、add_to_index、lookup三个函数。crawl_web的改动就是将index = [] 改为 index = {}。由于crawl_web调用add_page_to_index,add_page_to_index又调用add_to_index,故只需修改add_to_index:
改为:
lookup改为:
def lookup(index, keyword):
if keyword in index:
return index[keyword]
return None
第五单元主要是修改前面的搜索引擎,使之更高效,能迅速反馈查询。计算机科学思想的核心是分析算法复杂度,设计更有效的数据结构。
习题集
定义缓存,比较斐波那契数列在有缓存和无缓存下的计算效率,无缓存:
import time
def time_execution(code):
start = time.clock()
result = eval(code)
run_time = time.clock() - start
return result , run_time
def cached_fibo(n):
if n == 1 or n == 0:
return n
else:
return cached_fibo(n - 1) + cached_fibo(n - 2)
print time_execution('cached_fibo(40)')
#计算第40项,花费约52.5s
=>
(102334155, 52.544790000000006)
有缓存:
import time
def time_execution(code):
start = time.clock()
result = eval(code)
run_time = time.clock() - start
return result , run_time
def cached_execution(cache, proc, proc_input):
# Your code here
if proc_input in cache:
return cache[proc_input]
cache[proc_input] = proc(proc_input)
return cache[proc_input]
# Here is an example showing the desired behavior of cached_execution:
def factorial(n):
print "Running factorial"
result = 1
for i in range(2, n + 1):
result = result * i
return result
cache = {}
def cached_fibo(n):
if n == 1 or n == 0:
return n
else:
return (cached_execution(cache, cached_fibo, n - 1 )
+ cached_execution(cache, cached_fibo, n - 2 ))
print time_execution('cached_execution(cache, cached_fibo,100)')
#计算第100项,花费远小于一秒,可见记忆化能避免大多数重复的计算
=>
(354224848179261915075L, 0.00022900000000447562)
移动字符:
#后移一个
def shift(letter):
return chr( ord('a') + ( ord(letter) + 1- ord('a') )%26)
print shift('a')
#>>> b
print shift('n')
#>>> o
print shift('z')
#>>> a
#后移n个
def shift_n_letters(letter, n):
l = ord(letter) + n
l = (l - ord('a')) % 26
return chr(ord('a') + l)
print shift_n_letters('s', 1)
#>>> t
print shift_n_letters('s', 2)
#>>> u
print shift_n_letters('s', 10)
#>>> c
print shift_n_letters('s', -10)
#>>> i
后移字符串
def shift(c, t):
l = (ord(c) - ord('a') + t + 26)%26
l = l + ord('a')
return chr(l)
def rotate(s, t):
# Your code here
l = len(s)
a = ''
for i in range(l):
if ord(s[i]) >= ord('a') and ord(s[i]) <= ord('z'):
a += shift(s[i],t)
else:
a += s[i]
return a
print rotate ('sarah', 13)
#>>> 'fnenu'
print rotate('fnenu',13)
#>>> 'sarah'
print rotate('dave',5)
#>>>'ifaj'
print rotate('ifaj',-5)
#>>>'dave'
print rotate(("zw pfli tfuv nfibj tfiivtkcp pfl jyflcu "
"sv rscv kf ivru kyzj"),-17)
#>>> if your code works correctly you should be able to read this
网友评论