美文网首页
Python·爬取当当网图书信息

Python·爬取当当网图书信息

作者: C_Z_Q_ | 来源:发表于2019-10-28 20:34 被阅读0次

    爬虫实践

    1. 爬取当当网图书信息
    from lxml import html
    import requests
    #安装pandas
    #pip install pandas
    import pandas as pd
    def spider(isbn):
        """Scrape Dangdang search results for a book and save the offers to CSV.

        Downloads the Dangdang search page for ``isbn``, extracts the title,
        purchase link, price and seller of every listed offer, prints the
        offers sorted by price (highest first) and writes them as CSV to the
        file ``当当图书信息`` in the current working directory.

        Args:
            isbn: Search key inserted verbatim into the query string (the
                International Standard Book Number of the book).

        Returns:
            None. Results are printed and, when at least one offer was parsed,
            written to disk.

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
        """
        url = "http://search.dangdang.com/?key={}&act=input".format(isbn)
        print(url)

        # Send a browser-like User-Agent with the request.
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
        # A timeout keeps the call from blocking forever on a stalled server.
        html_data = requests.get(url, headers=headers, timeout=10).text

        # Parse the page so it can be queried with XPath.
        selector = html.fromstring(html_data)
        # One <li> per offer in the search result list.
        ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
        print('有{}家商铺售卖此书'.format(len(ul_list)))

        book_info_list = []
        for li in ul_list:
            titles = li.xpath('a/@title')
            links = li.xpath('a/@href')
            prices = li.xpath('p[@class="price"]/span[@class="search_now_price"]/text()')
            if not links or not prices:
                # An entry without a link or a price cannot be listed or
                # ranked; skip it instead of failing with IndexError.
                continue

            # Drop the currency sign so the remainder parses as a number.
            price = prices[0].replace('¥', '').strip()
            try:
                float(price)
            except ValueError:
                # Not a plain number: it could not be sorted by price below.
                continue

            # The seller name is read from the link's ``title`` attribute
            # (``@title``); a bare ``title`` step would look for a <title>
            # child element instead.
            # NOTE(review): the positional ``p[4]`` depends on the current
            # page layout - confirm against the live markup.
            stores = li.xpath('p[4]/a/@title')
            # No seller link is treated as sold by Dangdang itself.
            store = stores[0] if stores else '当当自营'

            book_info_list.append({
                "title": titles[0] if titles else '',
                "price": price,
                "link": links[0],
                "store": store
            })

        if not book_info_list:
            # Nothing parsed: print and write nothing.
            return

        # Sort, print and save once, after all offers have been collected.
        book_info_list.sort(key=lambda x: float(x["price"]), reverse=True)
        for i in book_info_list:
            print(i)

        # Convert to a DataFrame and store it as CSV.
        df = pd.DataFrame(book_info_list)
        df.to_csv('当当图书信息')
    # Prompt for the book number to look up and run the Dangdang spider with it.
    isbn = input('请输入您要查询的书号')
    spider(isbn)
    
    2. 爬取豆瓣即将上映电影信息
    from lxml import html
    import requests
    import pandas as pd
    def spider(loction):
        """Scrape Douban's "coming soon" movie list for a place and save it as CSV.

        Downloads ``https://movie.douban.com/cinema/later/<loction>/``, extracts
        five text fields for every listed movie, prints them, sorts the movies
        by the number parsed from the ``movie_people`` field (largest first) and
        writes the result as CSV to the file ``豆瓣`` in the current working
        directory.

        Args:
            loction: Place identifier inserted verbatim into the URL path.

        Returns:
            None. Results are printed and written to disk.

        Raises:
            requests.RequestException: If the HTTP request fails or times out.
        """
        url = "https://movie.douban.com/cinema/later/{}/".format(loction)
        print(url)

        # Send a browser-like User-Agent; NOTE(review): the site may reject the
        # default client identification - confirm against the live service.
        headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
        # A timeout keeps the call from blocking forever on a stalled server.
        html_data = requests.get(url, headers=headers, timeout=10).text

        # Parse the page so it can be queried with XPath.
        selector = html.fromstring(html_data)
        ul_list = selector.xpath('//div[@id="showing-soon"]/div')
        print('有{}部电影即将上映'.format(len(ul_list)))

        # (dict key, XPath relative to one movie node), in output column order.
        fields = (
            ("movie_name", 'div/h3/a/text()'),
            ("movie_data", 'div/ul/li[1]/text()'),
            ("movie_type", 'div/ul/li[2]/text()'),
            ("movie_county", 'div/ul/li[3]/text()'),
            ("movie_people", 'div/ul/li[@class="dt last"]/span/text()'),
        )

        movie_info_list = []
        for i in ul_list:
            movie = {}
            for key, path in fields:
                found = i.xpath(path)
                # A missing node yields an empty string instead of IndexError.
                movie[key] = found[0] if found else ''
                print(movie[key])
            movie_info_list.append(movie)

        # Rank by the numeric count; comparing the raw strings would order
        # them lexicographically (e.g. "999..." before "10000...").
        movie_info_list.sort(key=lambda x: _wish_count(x["movie_people"]), reverse=True)
        for i in movie_info_list:
            print(i)

        df = pd.DataFrame(movie_info_list)
        # Store as CSV.
        df.to_csv('豆瓣')


    def _wish_count(text):
        """Return the integer formed by the leading ASCII digits of ``text``.

        Presumably the field looks like ``"<number>人想看"`` - verify against the
        live page. Text that does not start with a digit counts as 0.
        """
        digits = ''
        for ch in text.strip():
            if ch not in '0123456789':
                break
            digits += ch
        return int(digits) if digits else 0
    # Prompt for the place name to query and run the Douban spider with it.
    loction = input("请输入要查询的地名")
    spider(loction)
    

    1)爬取常用的数据结构模型

    #常用模型
    from random import randint
    # Build ten sample shop records, each with a random price from 300 to 500.
    li = [
        {"store": "商家{}".format(idx), 'price': randint(300, 500)}
        for idx in range(10)
    ]
    # Show the records in their original (unsorted) order.
    for shop in li:
        print(shop)
    # Order the shops in place from cheapest to most expensive.
    li.sort(key=lambda shop: shop['price'])
    # Banner separating the unsorted listing from the sorted one.
    for banner in (
        '========================================',
        '==================排序后=================',
        '========================================',
    ):
        print(banner)
    # Show the records again, now sorted by price.
    for shop in li:
        print(shop)
    

    2)图片爬取

    # 图片的爬取
    # 图片的地址
    # @ src ,图片地址:http://b-ssl.duitang.com/uploads/blog/201312/04/20131204184148_hhXUT.jpeg
    # 导入 requests
    import requests
    url = 'http://b-ssl.duitang.com/uploads/blog/201312/04/20131204184148_hhXUT.jpeg'
    # A timeout keeps the script from hanging on an unresponsive server.
    response = requests.get(url, timeout=10)
    print(response.status_code)
    # Difference between response.content and response.text:
    #   response.text    -> str
    #   response.content -> bytes
    img_info = response.content
    print(img_info)
    # Reading a text file, for comparison:
    # with open('index1.html', 'r', encoding='UTF-8') as f:
    #     print(f.read())
    # Writing uses mode 'wb' (write binary); the payload is already bytes,
    # so no encoding step is needed.
    if response.status_code == 200:
        # Only save the body of a successful response; an error page saved
        # as mm.jpg would not be a usable image.
        with open('mm.jpg', 'wb') as f:
            f.write(img_info)
    
    # 爬文本
    # text = '不好意思'
    # with open('xiaoshuo.txt', 'w', encoding='UTF-8') as f:
    #     f.write(text)
    

    3) 批量命名图片

    # 批量命名图片
    import requests
    # 图片地址
    # url = ''
    # f = requests.get(url).content
    # with open('xx.png', 'wb') as f:
    #     f.write(f)
    from random import randint
    import os

    url1 = 'http://5b0988e595225.cdn.sohucs.com/images/20190917/10dd465a62b64513a38b24bd4735da6a.jpeg'
    url2 = 'http://pics1.baidu.com/feed/fd039245d688d43f2b9ef37459037a1f0ef43b26.jpeg?token=790b4a63424ff91158de106833f44ba6&s=1DA4E8155E317A075CAD58D1030010B0'
    # Each record pairs a movie name (used as the file name) with its image URL.
    movie_info_list = [
        {'movie_name':'中国机长', 'img_url': url1},
        {'movie_name': '天气之子', 'img_url': url2}
    ]
    # open() does not create missing directories, so make sure the target
    # folder exists before the first image is written.
    os.makedirs('./images', exist_ok=True)
    # Download every image and store it under the movie's name.
    for movie in movie_info_list:
        img_link = movie['img_url']
        # A timeout keeps one stalled download from blocking the whole batch.
        response = requests.get(img_link, timeout=10)
        if response.status_code == 200:
            with open('./images/{}.jpg'.format(movie['movie_name']), 'wb') as f:
                f.write(response.content)
        else:
            # Report the skipped download instead of ignoring it silently.
            print('{} 下载失败,状态码:{}'.format(movie['movie_name'], response.status_code))
    

    相关文章

      网友评论

          本文标题:Python·爬取当当网图书信息

          本文链接:https://www.haomeiwen.com/subject/dlzbvctx.html