美文网首页
Week1. 58Tongcheng Crawler

Week1. 58Tongcheng Crawler

作者: tookerski | 来源:发表于2016-07-01 20:59 被阅读0次
    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    '''
    Created on 2016/6/21
    @author: tookerski
    '''
    from bs4 import BeautifulSoup
    import requests
    import time
    
    # Define the function that builds the request header; the header carries the User-Agent parameter.
    def get_header():
        """Build the HTTP request headers sent with the crawler's requests.

        Returns:
            dict: A new single-entry mapping whose ``User-Agent`` value is a
            desktop Chrome user-agent string.
        """
        user_agent = (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/53.0.2767.5 Safari/537.36'
        )
        return {'User-Agent': user_agent}
    
    # Define the function that builds the paginated list-page links; the default page count is 100.
    def get_page_links(seller=0,page_num=100):
        """Build the list-page URLs to crawl.

        Args:
            seller: Value substituted into the path segment after ``pbdn``.
            page_num: Number of list pages to generate (default 100).

        Returns:
            list: URLs for pages ``pn1`` through ``pn<page_num>``, in order.
            Empty when ``page_num`` is less than 1.
        """
        # Page numbering starts at 1: ``pn1`` is the URL get_item_links treats
        # as the first page. Starting at 0 produced an extra ``pn0`` URL and
        # page_num + 1 entries instead of page_num.
        return [
            'http://bj.58.com/pbdn/{}/pn{}/'.format(seller, page)
            for page in range(1, page_num + 1)
        ]
    
    # Define the function that collects the item links.
    def get_item_links(url,header):
        """Collect the item detail links found on one list page.

        Args:
            url: URL of the list page to fetch.
            header: Headers dict passed to ``requests.get``.

        Returns:
            list: The ``href`` value of every item link on the page that is
            not a Zhuanzhuan (``tr.zzinfo``) item, minus the first three
            links when ``url`` is a first page. An empty list when the page
            contains no ``div.infocon`` element.
        """
        response = requests.get(url, headers=header)
        soup = BeautifulSoup(response.text, 'lxml')

        # A page that lists items contains a div.infocon element. Without it
        # there is nothing to collect; return an empty list (rather than the
        # implicit None) so callers can always iterate over the result.
        if not soup.find('div', 'infocon'):
            return []

        all_tags = soup.select('tr > td.t > a.t')          # every item link
        zz_tags = soup.select('tr.zzinfo > td.t > a.t')    # Zhuanzhuan item links
        tags = [tag for tag in all_tags if tag not in zz_tags]

        # Only the first page (for seller 0 or 1) starts with three promoted
        # links; drop them there.
        first_page_urls = ['http://bj.58.com/pbdn/{}/pn1/'.format(seller) for seller in (0, 1)]
        if url in first_page_urls:
            tags = tags[3:]

        return [tag.get('href') for tag in tags]
    
    def get_item_details(url,header,data=0):
        """Fetch one item page, then print and return its scraped fields.

        Args:
            url: URL of the item detail page.
            header: Headers dict passed to ``requests.get``.
            data: Unused; kept only so existing calls that pass it still work.

        Returns:
            dict: Keys ``view``, ``type``, ``title``, ``price`` and ``area``,
            each holding the text of the first element matching its selector,
            or ``None`` when the page has no such element.
        """
        response = requests.get(url, headers=header)
        soup = BeautifulSoup(response.text, 'lxml')

        def first_text(selector):
            # Text of the first match, or None instead of an IndexError when
            # the element is missing from the page.
            matches = soup.select(selector)
            return matches[0].text if matches else None

        # NOTE(review): the selector spellings ('info_titile', 'palce_li') are
        # kept exactly as originally written; presumably they mirror the
        # site's markup -- confirm before "correcting" them.
        data = {
            'view': first_text('span.look_time'),              # view count
            'type': first_text('span.crb_i > a'),              # category
            'title': first_text('h1.info_titile'),             # title
            'price': first_text('div.price_li > span > i'),    # price (was scraped but dropped)
            'area': first_text('div.palce_li > span > i'),     # area
        }
        print(data)
        return data
    
    # Get the header, then loop over the pages scraping item details, pausing 2 seconds between requests.
    h = get_header()
    for url in get_page_links():
        # Guard against get_item_links yielding None for a page without
        # items: fall back to an empty list so the page is skipped instead of
        # raising TypeError in the for loop.
        for link in get_item_links(url,h) or []:
            get_item_details(link,h)
            time.sleep(2)   # pause between item requests
        time.sleep(2)       # pause between list pages
    

    相关文章

      网友评论

          本文标题:Week1. 58Tongcheng Crawler

          本文链接:https://www.haomeiwen.com/subject/nufqjttx.html