1. HTTP基本原理
URI和URL
URI(统一资源标识符)包括URL(统一资源定位符)和URN(统一资源名称)
HTTP和HTTPS传输协议
HTTP是超文本传输协议;HTTP传输没有加密
HTTPS在HTTP基础上加入SSL(安全套接字层),传输内容通过SSL加密
安全通道保证数据传输安全
能够确认网站真实性
请求方法:get和post
get请求的参数在url里;HTTP协议本身并不限制url长度,常说的1024字节上限只是部分浏览器和服务器的实现限制
post请求数据一般通过表单提交,不会出现在url里,大小没有限制
put作修改用;post作新增用
cookie和session区别
http请求是一个无状态的请求
cookie保存在本地;session保存在服务器
2. urllib库
urllib是Python内置的基于HTTP的高层库,它有以下三个主要功能:
1.request处理客户端的请求
2.response处理服务端的响应
3.parse会解析url
3. 用爬虫爬取猫眼网站信息
findall(匹配规则,字符串)匹配规则中有用括号括起来的内容就返回括号中的匹配内容,否则返回匹配到的内容;
compile(匹配规则,忽略空格)
str1.strip()删除字符串开头和结尾的空白字符
''.join(score)以空串连接score中的所有内容
import json
import os
import re

import requests
# 获取单个网页
def get_page(page):
# 网址
url = 'http://maoyan.com/board/4?offset='+str(page)
# 伪装请求头
headers = {
"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
}
# 请求网页并返回响应
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.content.decode('utf-8')
return None
# 获取所有页面
def get_all_pages():
result = []
for i in range(10):
page = i*10
html = get_page(page)
result_list = parse_page(html)
result.append(result_list)
return result
# 写入图片文件
def save_cover_image(cover_url):
response = requests.get(cover_url)
result = response.content
filename = cover_url.split('/')[-1].split('@')[0]
with open('./images/%s' % filename, 'wb') as f:
f.write(result)
# 解析网页
def parse_page(html):
# 1.片名
# compile(匹配规则,忽略空格)
pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
# findall(匹配规则,字符串)返回括号中的匹配内容
movie_names = re.findall(pattern, html)
# 2.主演
pattern = re.compile('<p class="star">(.*?)</p>', re.S)
actors = re.findall(pattern, html)
# str1.strip()删除字符串首位和末位的空格
actors = [actor.strip() for actor in actors]
print(actors)
# 3.上映时间
pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
releasetime = re.findall(pattern, html)
releasetime = [time.strip() for time in releasetime]
# 4.封面图片
pattern = re.compile('movieId.*?>.*?<img.*?<img.*?data-src="(.*?)" alt.*?', re.S)
covers = re.findall(pattern, html)
# 5.排名
pattern = re.compile('<i class="board-index board-index-.*?">(.*?)</i>', re.S)
ranks = re.findall(pattern, html)
ranks = [rank.strip() for rank in ranks]
# 6.评分
pattern = re.compile('<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>', re.S)
scores = re.findall(pattern, html)
# ''.join(score)以空串连接score中的所有内容
scores = [''.join(score) for score in scores]
# 7.详细信息链接
pattern = re.compile('<div class="movie-item-info">.*?<p class="name"><a href="(.*?)" title=.*?', re.S)
details = re.findall(pattern, html)
details = [detail.strip() for detail in details]
# 组装json
message = [({'movie_name': movie_names[i], 'actor': actors[i], 'releasetime':
releasetime[i], 'covers': covers[i], 'rank': ranks[i], 'score': scores[i],
'detail': details[i]},save_cover_image(covers[i])) for i in range(len(details))]
print(message)
return message
# 保存json文件
def save_json_file(result):
json_str = json.dumps(result, ensure_ascii=False)
with open('movie.json', 'w', encoding='utf-8') as f:
f.write(json_str)
def main():
result = get_all_pages()
print(result)
save_json_file(result)
if __name__ == '__main__':
main()
网友评论