import requests
from bs4 import BeautifulSoup
import codecs
# Base URL of the Douban Top 250 listing; next-page hrefs are appended to it.
html = 'https://movie.douban.com/top250'

# HTTP headers sent with every page request.
# NOTE(review): the Cookie value is hard-coded and looks like it was captured
# from a browser session -- presumably stale by now; confirm it is still needed.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36'
    ),
    # Adjacent literals are concatenated into one '; '-separated cookie string.
    'Cookie': (
        'bid=oJDHcRZAfZ0; '
        'll="118282"; '
        '_vwo_uuid_v2=E214A7F1BDCECC6E723664187A86F52E|76a273c7310d9f375d99252440354e87; '
        '_pk_id.100001.4cf6=5b4a90664736e799.1479056723.2.1479061426.1479058940.; '
        '_pk_ses.100001.4cf6=*; '
        '__utma=30149280.993280854.1479056726.1479056726.1479061421.2; '
        '__utmb=30149280.0.10.1479061421; '
        '__utmc=30149280; '
        '__utmz=30149280.1479056726.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); '
        '__utma=223695111.1435542470.1479056726.1479056726.1479061421.2; '
        '__utmb=223695111.0.10.1479061421; '
        '__utmc=223695111; '
        '__utmz=223695111.1479056726.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'
    ),
}
def get_movienames(url):
    """Download one listing page and extract the movie titles from it.

    Args:
        url: Absolute URL of the listing page to fetch.

    Returns:
        A ``(titles, next_url)`` tuple. ``titles`` is a list with the text of
        every ``<span class="title">`` found inside the element whose class is
        ``grid_view`` (empty if that element is missing). ``next_url`` is the
        URL of the following page, or ``False`` when the page has no
        "next" link.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the scraper from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    movie_list = soup.find(class_='grid_view')
    if movie_list is None:
        # Unexpected page layout: report no titles rather than crashing.
        titles = []
    else:
        titles = [
            span.get_text()
            for span in movie_list.find_all('span', {'class': 'title'})
        ]

    # The "next" span may be absent, or present without an <a> inside it.
    next_span = soup.find('span', attrs={'class': 'next'})
    next_link = next_span.find('a') if next_span is not None else None
    if next_link is not None and next_link.get('href'):
        # The href is appended to the base listing URL as-is.
        return titles, html + next_link['href']
    return titles, False
def main():
    """Walk the listing page by page and save all titles to the file ``movies``.

    Starts from the first listing URL and keeps following the next-page URL
    returned by ``get_movienames`` until it is falsy. Each page's titles are
    written one per line, UTF-8 encoded.
    """
    next_url = 'https://movie.douban.com/top250'
    with codecs.open('movies', 'wb', encoding='utf-8') as out_file:
        while next_url:
            titles, next_url = get_movienames(next_url)
            page_text = '\n'.join(titles)
            out_file.write(u'{movies}\n'.format(movies=page_text))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
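# Residue from the web page this script was copied from (not code):
# image attachment "QQ图片20161114023648.png" and the section heading
# "网友评论" ("Reader comments").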