爬取python异步社区图书并写入excel

作者: 肥宅_Sean | 来源:发表于2018-01-29 15:47 被阅读195次

    爬取python异步社区图书
    以“python”为关键词搜索所得第一页的图书

    这次的爬取风格比较靠谱:先将爬取到的异步社区的 HTML 存起来,这样一方面加快了测试速度,另一方面也不会由于爬取得过于频繁而对对方的服务器造成负担。
    所谓“盗亦有道”,大概如是也

    import requests
    from bs4 import BeautifulSoup
    import os, re
    import xlwt
    
    def getToThetxt(url):
        """Download ``url`` and cache its prettified HTML as ``book.txt``.

        The page is fetched with ``requests``, re-serialised with
        ``BeautifulSoup.prettify()`` and written to ``book.txt`` in the current
        working directory, overwriting any existing file.

        Args:
            url: Address of the page to download.

        Raises:
            requests.RequestException: If the request fails, times out, or the
                server answers with an HTTP error status.
        """
        # A timeout keeps the script from hanging forever on a stalled server.
        res = requests.get(url, timeout=30)
        # Fail here rather than caching an error page that every later run
        # would then try to parse.
        res.raise_for_status()
        res.encoding = res.apparent_encoding
        soup = BeautifulSoup(res.text, 'lxml')
        with open(os.path.join(os.getcwd(), 'book.txt'), 'w') as f:
            # NOTE(review): these two characters are presumably dropped because
            # the platform's default file encoding cannot write them - confirm.
            f.write(soup.prettify().replace('\u0142', '').replace('\xa9', ''))
    
    
    def getfromtxt():
        """Return the whole content of ``book.txt`` in the current working directory."""
        cache_file = os.getcwd() + '/book.txt'
        with open(cache_file, 'r') as handle:
            content = handle.read()
        return content
    
    
    def _clean_text(node):
        """Return the text of ``node`` without spaces and newlines ('' if ``node`` is None)."""
        if node is None:
            return ''
        return node.text.replace(' ', '').replace('\n', '')


    def getBookMeg(html):
        """Extract the book list from a search-result page and save it to ``book.xls``.

        For every ``li`` with class ``block-item bookList__item`` inside the
        element with id ``search-result``, the title, author, translator, summary
        and price are collected, stripped of spaces and newlines, and written as
        one row of ``sheet1``. Row 0 holds the column headers. A field whose
        element is missing from the page is written as an empty string.

        Args:
            html: HTML text of the search-result page.

        Raises:
            ValueError: If the page does not contain the expected
                ``search-result`` list structure.
        """
        soup = BeautifulSoup(html, 'lxml')
        search = soup.find(attrs={'id': 'search-result'})
        if search is None or search.div is None or search.div.ul is None:
            raise ValueError("no 'search-result' book list found in the given HTML")
        books = search.div.ul.find_all('li', attrs={'class': 'block-item bookList__item'})

        rows = []
        for book in books:
            divs = book.find_all('div')
            info = divs[1]
            # The title is the second child of the <h3>; its string is None
            # when that child holds more than one node.
            title = info.find('h3').contents[1].string or ''
            # Walk the price chain step by step so that a missing <li>, <em> or
            # <del> yields an empty cell instead of an AttributeError.
            price_item = divs[2].find('li')
            price_em = price_item.find('em') if price_item is not None else None
            price_del = price_em.find('del') if price_em is not None else None
            rows.append((
                title.replace(' ', '').replace('\n', ''),
                _clean_text(info.find(attrs={'class': 'bookList__author'})),
                _clean_text(info.find(attrs={'class': 'bookList__translator'})),
                _clean_text(info.find(attrs={'class': 'bookList__summary'})),
                _clean_text(price_del),
            ))
        # TODO: cover images (the <img> inside divs[0]) are not exported yet.

        # Workbook's first positional argument is the text encoding, not an
        # output path; the file location is decided by save() below.
        work_book = xlwt.Workbook(encoding='utf-8')
        sheet = work_book.add_sheet('sheet1')
        headers = ("书名", "作者", "译者", "大纲", "价格")
        for col, header in enumerate(headers):
            sheet.write(0, col, header)
        for row_index, row in enumerate(rows, start=1):
            for col, value in enumerate(row):
                sheet.write(row_index, col, value)
        work_book.save("book.xls")
    
    
    if __name__ == "__main__":
        url = "http://www.epubit.com.cn/search?q=python&type=book"
        # Build the cache path with os.path.join so the existence check works
        # on every platform; a hard-coded '\\' separator only matches on Windows
        # and would trigger a fresh download on each run elsewhere.
        path = os.path.join(os.getcwd(), 'book.txt')
        # Download only when no cached copy exists, to avoid hitting the server
        # again on every run.
        if not os.path.exists(path):
            getToThetxt(url)
        getBookMeg(getfromtxt())
    
    

    相关文章

      网友评论

        本文标题:爬取python异步社区图书并写入excel

        本文链接:https://www.haomeiwen.com/subject/ernxzxtx.html