爬取python异步社区图书并写入excel

作者: 肥宅_Sean | 来源:发表于2018-01-29 15:47 被阅读195次

爬取python异步社区图书
搜索关键词“Python”所得结果第一页的图书

这次的爬取方式比较靠谱:先将爬取到的异步社区的 HTML 存到本地。这样一方面加快了测试速度,另一方面也不会因为爬取得过于频繁而给对方的服务器造成负担。
所谓“盗亦有道”,大概如是也。

import requests
from bs4 import BeautifulSoup
import os, re
import xlwt

def _cache_path():
    """Return the path of the local HTML cache file (``book.txt`` in the cwd)."""
    return os.path.join(os.getcwd(), 'book.txt')


def getToThetxt(url):
    """Download *url* and cache its prettified HTML in ``book.txt``.

    The page is fetched once and stored locally so that repeated parsing
    runs do not hit the remote server again.

    Args:
        url: Address of the page to download.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    res = requests.get(url, timeout=10)
    # Do not cache an error page as if it were a search result.
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    # Writing as UTF-8 makes every character encodable, so there is no need
    # to strip individual characters that the platform default codec
    # (e.g. GBK on Windows) cannot represent.
    with open(_cache_path(), 'w', encoding='utf-8') as f:
        f.write(soup.prettify())


def getfromtxt():
    """Return the cached HTML stored in ``book.txt`` by ``getToThetxt``.

    Raises:
        FileNotFoundError: If the cache file has not been written yet.
    """
    # Must use the same encoding as the writer above.
    with open(_cache_path(), 'r', encoding='utf-8') as f:
        return f.read()


def _compact(text):
    """Return *text* with every space and newline removed ('' for None/empty)."""
    if not text:
        return ''
    return text.replace(' ', '').replace('\n', '')


def _field_text(parent, css_class):
    """Return the compacted text of the first node under *parent* whose class
    is *css_class*, or '' when no such node exists."""
    node = parent.find(attrs={'class': css_class})
    return _compact(node.text) if node is not None else ''


def getBookMeg(html):
    """Parse the search-result HTML and export the book list to ``book.xls``.

    For every book entry the title, author, translator, summary and the
    struck-through price are extracted and written as one spreadsheet row
    below a header row. Fields that are missing from an entry (for example
    a book without a translator) are written as empty cells instead of
    aborting the whole export.

    Args:
        html: HTML text of the search-result page.

    Raises:
        ValueError: If the expected ``search-result`` list container is not
            present in *html*.
    """
    soup = BeautifulSoup(html, 'lxml')
    search = soup.find(attrs={'id': 'search-result'})
    if search is None or search.div is None or search.div.ul is None:
        raise ValueError("no 'search-result' book list found in the given HTML")

    books = search.div.ul.find_all('li', attrs={'class': 'block-item bookList__item'})
    rows = []
    for book in books:
        divs = book.find_all('div')
        if len(divs) < 3:
            # Entry does not have the image / info / price layout; skip it.
            continue
        info, pricing = divs[1], divs[2]

        # The title is read from the second child of the <h3> element,
        # exactly as before, but tolerating a missing or differently
        # shaped heading.
        heading = info.find('h3')
        title_node = None
        if heading is not None and len(heading.contents) > 1:
            title_node = heading.contents[1]
        title = _compact(title_node.string) if title_node is not None else ''

        # Price: first <li> -> <em> -> <del>; any link of the chain may be absent.
        first_li = pricing.find('li')
        em = first_li.find('em') if first_li is not None else None
        struck = em.find('del') if em is not None else None
        price = _compact(struck.text) if struck is not None else ''

        rows.append((
            title,
            _field_text(info, 'bookList__author'),
            _field_text(info, 'bookList__translator'),
            _field_text(info, 'bookList__summary'),
            price,
        ))

    # Cover images are not exported yet; only the textual data is written.
    # The first positional argument of xlwt.Workbook is the *encoding*, not
    # a directory, so pass a real encoding rather than a filesystem path.
    work_book = xlwt.Workbook(encoding='utf-8')
    sheet = work_book.add_sheet('sheet1')
    for col, header in enumerate(("书名", "作者", "译者", "大纲", "价格")):
        sheet.write(0, col, header)
    for row_index, row in enumerate(rows, start=1):
        for col, value in enumerate(row):
            sheet.write(row_index, col, value)
    work_book.save("book.xls")


if __name__ == "__main__":
    url = "http://www.epubit.com.cn/search?q=python&type=book"
    # Build the cache path portably. The previous hard-coded '\\' separator
    # never matched the file written with '/' on non-Windows systems, so the
    # page was downloaded again on every run.
    path = os.path.join(os.getcwd(), 'book.txt')
    # Only hit the remote server when no cached copy of the page exists.
    if not os.path.exists(path):
        getToThetxt(url)
    getBookMeg(getfromtxt())

相关文章

网友评论

    本文标题:爬取python异步社区图书并写入excel

    本文链接:https://www.haomeiwen.com/subject/ernxzxtx.html