r = urllib.urlopen('http://www.yunzhongmoke.com/MrJiang_URL.html')
使用正则表达式爬取网页
Python 2 version:
import urllib2
import re
dStr = urllib2.urlopen('https://hk.finance.yahoo.com/q/cp?s=%5EDJI').read()
m = re.findall('(.*?)(.*?).*?(.*?).*?', dStr)
if m:
print m
print '\n'
print len(m)
else:
print 'not match’
Python 3 version:
import urllib.request
import re
# Python 3 script: download the Yahoo Finance page for the ^DJI components
# and print every regex match found in it, followed by the match count.
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen('https://hk.finance.yahoo.com/q/cp?s=%5EDJI') as response:
    dStr = response.read()
# In Python 3 the response's read() returns a bytes object rather than str,
# so decode it (UTF-8 by default) before matching it against a str pattern.
getdStr = dStr.decode()
# NOTE(review): this pattern contains only lazy wildcards and no literal text;
# it looks like the HTML tags that anchored the groups were stripped when the
# snippet was published -- confirm against the page markup and restore them.
m = re.findall('(.*?)(.*?).*?(.*?).*?', getdStr)
if m:
    print(m)
    print('\n')
    print(len(m))
else:
    print('not match')
网友评论