# -*- coding: utf-8 -*-
# @Time : 2019/4/4 22:40
# @Author : zxx
# @File : req_demo.py
import requests
import re
from bs4 import BeautifulSoup
def request_get_baidu():
    """
    Make a GET request to Baidu and print the full page content.
    :return: None
    """
    # Request headers copied from a logged-in browser session. Values must
    # not start with whitespace: requests rejects such header values.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,appl'
                  'ication/xml;q=0.9,image/webp,image/apng,'
                  '*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'BAIDUID=47CD46147CEDF76D63462C3F756C55A'
                  'D:FG=1; ZD_ENTRY=bing; pgv_pvi=9934791680;'
                  ' pgv_si=s4201997312; cflag=13%3A3; BDUSS=k'
                  'FNNTcxdmdsN3lJREVJQXU0dEFjWHZFQUdZSDBjekNLV'
                  'WJOWDVYSFBCRllFY3hjQVFBQUFBJCQAAAAAAAAAAAEAA'
                  'AAIrrlXsaG6ybH9uMlBAAAAAAAAAAAAAAAAAAAAAAAAAA'
                  'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
                  'AAFiEpFxYhKRcZ; BIDUPSID=47CD46147CEDF76D6346'
                  '2C3F756C55AD; PSTM=1554457103; BD_HOME=1; H_P'
                  'S_PSSID=26522_1432_21106_28774_28721_28558_28'
                  '584_28640_26350_28604_28625_22160; BD_UPN=1231'
                  '4753; sug=3; sugstore=0; ORIGIN=0; bdime=0',
        'Host': 'www.baidu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win6'
                      '4; x64) AppleWebKit/537.36 (KHTML, l'
                      'ike Gecko) Chrome/73.0.3683.86 Safar'
                      'i/537.36'
    }
    response = requests.get('http://www.baidu.com', headers=headers)
    status_code = response.status_code
    print('Status code: ' + str(status_code))
    print(response.text)
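# A leaner take on the same request (a sketch, not part of the original
# script): requests fills in Host, Connection and Accept-Encoding on its
# own, and the copied session cookie is not needed for an anonymous GET,
# so a User-Agent is usually the only header required.
def request_get_baidu_minimal():
    """Same GET request against Baidu with only the User-Agent header."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/73.0.3683.86 Safari/537.36'
    }
    response = requests.get('http://www.baidu.com', headers=headers)
    print('Status code: ' + str(response.status_code))
    print(response.text)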
# Crawl the Douban Top 250 movie list
def get_top250_list_page(url: str, headers: dict):
    """
    Fetch the raw HTML of one Douban Top 250 list page.
    :param url: URL of a Douban Top 250 list page
    :param headers: HTTP headers (User-Agent etc.) to send with the request
    :return: the page HTML on success, None otherwise
    """
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        html = response.text
        return html
    else:
        return None
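# A more defensive variant of the fetch above (a sketch, not the original
# flow): bound the wait with a timeout and catch network errors so one bad
# request does not abort the whole crawl. The name and the default timeout
# are illustrative.
def get_page_safe(url: str, headers: dict, timeout: float = 10.0):
    """Like get_top250_list_page, but with a timeout and error handling."""
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
    except requests.exceptions.RequestException as err:
        print('Request failed: {}'.format(err))
    return None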
# Parse one movie list page
def parse_top250_list_page(html: str):
    """
    Parse the HTML of one list page and print each movie's details.
    :param html: page HTML returned by get_top250_list_page
    :return: URL of the next list page
    """
    soup = BeautifulSoup(html, features="html.parser")
    # The pager marks the current page with <span class="thispage">; each
    # page holds 25 movies, so the next page starts at current_page * 25.
    this_page = soup.find('div', class_='paginator').find('span', class_='thispage').get_text()
    next_page_start = str(int(this_page) * 25)
    next_page_url = "https://movie.douban.com/top250?start={}&filter=".format(next_page_start)
    for item in soup.find('ol').find_all('li'):
        try:
            rank = item.find('em').string
            print('Rank: ' + rank)
            href = item.find('div', attrs={'class': 'hd'}).find('a')['href']
            print('Detail page: ' + href)
            name = item.find('span', class_='title').string
            print('Title: ' + name)
            # The <p> under div.bd carries two text lines:
            #   "导演: <director>   主演: <actors>"
            #   "<year> / <country> / <genres>"
            content = item.find('div', attrs={'class': 'bd'}).find('p').get_text()
            info_lines = [line.strip() for line in content.split('\n') if line.strip()]
            print(info_lines[0])  # director / actors line
            year, country, genres = (part.strip() for part in info_lines[1].split('/', 2))
            print(year)
            print(country)
            print(genres)
            print('---------------------------------------------------------------')
        except Exception:
            print('Failed to parse this entry')
    return next_page_url
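# A working version of the regex idea that earlier drafts of this script
# tried: pull director, actors and year straight out of the raw list-page
# HTML in one pass. A hedged sketch, not part of the crawl loop: it assumes
# Douban's markup is "导演: ... 主演: ...<br> <year> / ..." and that every
# entry lists a 主演 field (ones that do not would break the pattern).
def parse_top250_with_regex(html: str):
    """Print (director, actors, year) for each movie via one regex pass."""
    pattern = re.compile(r'导演:\s*(.*?)主演:\s*(.*?)<br>\s*(\d{4})', re.S)
    for director, actors, year in re.findall(pattern, html):
        # Douban pads these fields with &nbsp; entities in the raw HTML.
        director = director.replace('&nbsp;', ' ').strip()
        actors = actors.replace('&nbsp;', ' ').strip()
        print('Director: %s | Starring: %s | Year: %s' % (director, actors, year))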
def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win6'
                      '4; x64) AppleWebKit/537.36 (KHTML, l'
                      'ike Gecko) Chrome/73.0.3683.86 Safar'
                      'i/537.36'
    }
    url = "https://movie.douban.com/top250"
    # The Top 250 spans 10 pages (start=0, 25, ..., 225). Follow the
    # next-page URL returned by the parser so the last page (start=225)
    # is fetched as well.
    for _ in range(10):
        html = get_top250_list_page(url, headers)
        if html is None:
            break
        url = parse_top250_list_page(html)
if __name__ == '__main__':
    main()