DataWhale Crawler Task1 2019-04-06

Author: lala兔斯基 | Published 2019-04-06 20:02
    # -*- coding: utf-8 -*-
    # @Time    : 2019/4/4 22:40
    # @Author  : zxx
    # @File    : req_demo.py
    
    import requests
    import re  # only used by the commented-out regex alternative below
    from bs4 import BeautifulSoup
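    # requests and bs4 are third-party packages: pip install requests beautifulsoup4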
    
    
    def request_get_baidu():
        """
        Make a GET request to Baidu and print the whole page response.
        :return: None
        """
        # Headers copied wholesale from the Chrome DevTools Network panel.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,appl'
                      'ication/xml;q=0.9,image/webp,image/apng,'
                      '*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'BAIDUID=47CD46147CEDF76D63462C3F756C55A'
                      'D:FG=1; ZD_ENTRY=bing; pgv_pvi=9934791680;'
                      ' pgv_si=s4201997312; cflag=13%3A3; BDUSS=k'
                      'FNNTcxdmdsN3lJREVJQXU0dEFjWHZFQUdZSDBjekNLV'
                      'WJOWDVYSFBCRllFY3hjQVFBQUFBJCQAAAAAAAAAAAEAA'
                      'AAIrrlXsaG6ybH9uMlBAAAAAAAAAAAAAAAAAAAAAAAAAA'
                      'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
                      'AAFiEpFxYhKRcZ; BIDUPSID=47CD46147CEDF76D6346'
                      '2C3F756C55AD; PSTM=1554457103; BD_HOME=1; H_P'
                      'S_PSSID=26522_1432_21106_28774_28721_28558_28'
                      '584_28640_26350_28604_28625_22160; BD_UPN=1231'
                      '4753; sug=3; sugstore=0; ORIGIN=0; bdime=0',
            'Host': 'www.baidu.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win6'
                          '4; x64) AppleWebKit/537.36 (KHTML, l'
                          'ike Gecko) Chrome/73.0.3683.86 Safar'
                          'i/537.36'
        }
        response = requests.get('http://www.baidu.com', headers=headers)
        status_code = response.status_code
        print('Status code: ' + str(status_code))
        print(response.text)
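
    # A minimal sketch of the same request (an addition, not part of the
    # original task): requests decompresses gzip/deflate bodies on its own,
    # and the Cookie above is a copied, session-specific value; assuming
    # Baidu only checks the User-Agent, this is usually enough:
    def request_get_baidu_minimal():
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/73.0.3683.86 Safari/537.36'}
        response = requests.get('http://www.baidu.com', headers=headers)
        print(response.status_code)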
    
    
    # Scrape the Douban Top 250
    def get_top250_list_page(url: str, headers: dict):
        """
        Fetch one list page and return its HTML text.
        :param url: URL of a Douban Top 250 list page
        :param headers: HTTP headers such as User-Agent
        :return: the page HTML on success, None otherwise
        """
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            html = response.text
            # print(html)
            return html
        else:
            return None
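
    # Variant sketch (not from the original task): let requests raise on HTTP
    # errors instead of silently returning None, and add a timeout so a hung
    # connection cannot block the crawl forever.
    def get_page_or_raise(url: str, headers: dict) -> str:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
        return response.text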
    
    
    # Parse a movie list page
    def parse_top250_list_page(html: str):
        """
        Parse the HTML of one list page and print each movie's details.
        :param
            html: the page HTML as a string
        :return:
            url: link to the next list page
        """
        soup = BeautifulSoup(html, features="html.parser")
        # print(soup.prettify())
        # The pager marks the current page with <span class="thispage">;
        # every page holds 25 movies, so the next page starts at page * 25.
        this_page = soup.find('div', class_='paginator').find('span', class_='thispage').get_text()
        next_page_start = str(int(this_page) * 25)
        next_page_url = "https://movie.douban.com/top250?start={}&filter=".format(next_page_start)
        # print(next_page_url)
        for item in soup.find('ol').find_all('li'):
            try:
                rank = item.find('em').string
                print('Rank: ' + rank)
                href = item.find('div', attrs={'class': 'hd'}).find('a')['href']
                print('Detail page: ' + href)
                name = item.find('span', 'title').string
                print('Title: ' + name)
                content = item.find('div', attrs={'class': 'bd'}).find('p').get_text()
                # Douban separates the fields with non-breaking spaces
                # (&nbsp;, i.e. '\xa0'); normalize them to plain spaces and
                # strip the indentation so the splits below behave predictably.
                lines = [ln.strip() for ln in content.replace('\xa0', ' ').splitlines() if ln.strip()]
                director = lines[0].split("   ")[0]
                print(director)
                actor = lines[0].split("   ")[1]
                print(actor)
                year = lines[1].split(" / ")[0]
                print(year)
                country = lines[1].split(" / ")[1]
                print(country)
                movieclass = lines[1].split(" / ")[2]
                print(movieclass)
                print('---------------------------------------------------------------')
                # A regex-based extraction was also tried (kept for reference):
                # messages_pattern = re.compile('导演: (.*?) .*?主演: (.*?)<br/>.*([0-9]{4})/', re.S)
                # messages = re.findall(messages_pattern, content)
                # director, actor, year = messages[0]
            except Exception:
                print('Failed to parse this entry')

        return next_page_url
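
    # Sketch of a structured alternative (illustrative, not part of the
    # original task): collect each movie as a dict instead of printing, so
    # the results could later be written to CSV or JSON.
    def parse_top250_to_dicts(html: str) -> list:
        soup = BeautifulSoup(html, features="html.parser")
        movies = []
        for item in soup.find('ol').find_all('li'):
            movies.append({
                'rank': item.find('em').string,
                'title': item.find('span', 'title').string,
                'url': item.find('div', attrs={'class': 'hd'}).find('a')['href'],
            })
        return movies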
    
    
    def main():
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win6'
                          '4; x64) AppleWebKit/537.36 (KHTML, l'
                          'ike Gecko) Chrome/73.0.3683.86 Safar'
                          'i/537.36'
        }
        url = "https://movie.douban.com/top250"
        # request_get_baidu()
        # Walk all ten list pages (start=0 through start=225): parsing the
        # last page computes start=250 as the next URL, which stops the loop
        # only after the full 250 movies have been printed.
        while url != "https://movie.douban.com/top250?start=250&filter=":
            html = get_top250_list_page(url, headers)
            url = parse_top250_list_page(html)
    
    
    if __name__ == '__main__':
        main()
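
    # Note for real runs (an addition, not part of the original task): adding
    # time.sleep(1) between the two calls inside the while loop in main()
    # keeps the crawl polite and lowers the chance of douban.com throttling
    # the client; import time at the top if you do.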
    
    
