
2019-08-26 A First Web Scraper Example

Author: 王自然_4e0b | Published 2019-08-26 23:42

    # Step 1: request the page data
    import requests
    from bs4 import BeautifulSoup

    # Fetch the page, sending a browser User-Agent header so the request looks like a normal visit
    url = 'https://book.douban.com/latest'
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
    data = requests.get(url, headers=headers)
    print(data.text)
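
Before parsing, it can be worth confirming that the request actually succeeded; a minimal check (not part of the original script) might look like this:

    # Optional sanity checks (added for robustness, not in the original post)
    data.raise_for_status()                  # raise an exception on 4xx/5xx responses
    data.encoding = data.apparent_encoding   # guard against a mis-detected text encoding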

    # Parse the data
    soup = BeautifulSoup(data.text, 'lxml')
    print(soup)
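
Note that the 'lxml' parser requires the third-party lxml package. If it is not installed, Python's built-in parser is a drop-in substitute (slower, but with no extra dependency):

    # Fallback parser, assuming lxml is unavailable
    soup = BeautifulSoup(data.text, 'html.parser')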

    # Inspect the page in the browser's developer tools:
    # the books are laid out in two columns, so extract the left and right <ul> lists by their tags and merge them
    books_left = soup.find('ul', {'class': 'cover-col-4 clearfix'})
    books_left = books_left.find_all('li')
    books_right = soup.find('ul', {'class': 'cover-col-4 pl20 clearfix'})
    books_right = books_right.find_all('li')
    books = list(books_left) + list(books_right)
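
Since both columns share the cover-col-4 class, the same merge could likely be done in a single pass with a CSS selector; a sketch (not from the original post):

    # One-pass alternative, assuming both <ul> elements carry the cover-col-4 class
    books = soup.select('ul.cover-col-4 li')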

    # Apply the same steps to every book block to collect its information
    img_urls = []
    titles = []
    ratings = []
    authors = []
    #details = []

    for book in books:
        # URL of the cover image
        img_url = book.find_all('a')[0].find('img').get('src')
        img_urls.append(img_url)

        # Book title
        title = book.find_all('a')[1].get_text()
        titles.append(title)

        # Rating
        rating = book.find('p', {'class': 'rating'}).get_text()
        rating = rating.replace('\n', '').replace(' ', '')
        ratings.append(rating)

        # Author and publication info
        author = book.find('p', {'class': 'color-gray'}).get_text()
        author = author.replace('\n', '').replace(' ', '')
        authors.append(author)

    print('done')
    print(img_urls)
    print(titles)
    print(ratings)
    print(authors)
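
One fragile spot in the loop: find() returns None when a tag is missing, so any change in the page layout would crash with an AttributeError. A defensive variant of, say, the rating lookup (a hypothetical tweak, not in the original) could fall back to an empty string instead:

    # Hypothetical defensive version of the rating lookup inside the loop
    rating_tag = book.find('p', {'class': 'rating'})
    rating = rating_tag.get_text().replace('\n', '').replace(' ', '') if rating_tag else ''
    ratings.append(rating)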

    # Export the data
    import pandas as pd

    result = pd.DataFrame()
    result['img_urls'] = img_urls
    result['titles'] = titles
    result['ratings'] = ratings
    result['authors'] = authors
    result.to_excel('result.xls', index=False)
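
A caveat on the export: writing legacy .xls files relies on the xlwt package, and newer pandas releases have dropped that writer. If the call fails, targeting .xlsx (via openpyxl) or plain CSV is a safe substitute, for example:

    # Alternatives if the .xls writer is unavailable (the .xlsx route assumes openpyxl is installed)
    result.to_excel('result.xlsx', index=False)
    result.to_csv('result.csv', index=False, encoding='utf-8-sig')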
