美文网首页
用py爬虫抓取天猫店信息(附源代码)

用py爬虫抓取天猫店信息(附源代码)

作者: 大黑不小白 | 来源:发表于2017-04-08 18:01 被阅读0次

    github地址:https://github.com/A-mermaid-Line-Engineer/python-begin.git
    由于毕业论文要求,用Python做了一个爬虫抓取天猫店铺基本信息,在此共享。

    1.安装Python2.7版本

    访问https://www.python.org/


    在Downloads列表选择window版本直接下载安装。和平时安装程序一样,可以参考百度经验http://jingyan.baidu.com/article/19192ad8399a62e53e5707e0.html

    2.安装第三方库beautifulsoup

    http://cuiqingcai.com/1319.html 这个博客中对beautifulsoup的安装讲的十分明白。
    建议直接用Python自带的pip包安装
    在命令行中输入

    pip install beautifulsoup4
    

    可能还需要安装lxml,同样的

    pip install lxml
    

    3.使用命令行运行程序

    win+r调出“运行”窗口,输入cmd调出亲切的黑底白字
    输入 cd+空格+程序路径获取程序目录
    输入 python+空格+anay.py(主程序名称)开始运行程序
    在弹出的 Please input product:后输入你想抓取的商品品类,例如雪地靴
    等待程序自动运行并生成表格。
    注:抓取前50页大约3000条信息需要一个小时左右。也可以在主程序的page中修改抓取页数。

    附:主程序源代码

    抓取天猫店铺相关信息主程序代码

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # version python27
    '''Analysis module: scrape Tmall search results and shop details.'''
    from get_html import download_html as get
    from bs4 import BeautifulSoup as bs
    import re,sys,urllib
    # Filesystem encoding used throughout to re-encode scraped text for the
    # output file.  NOTE(review): this deliberately shadows the builtin
    # `type`; the functions below rely on this module-level name.
    type = sys.getfilesystemencoding()
    def get_url(key, page=50):
        """Build the list of Tmall search-result URLs for *key*.

        key  -- product keyword typed by the user; surrounding whitespace
                is stripped before URL-quoting
        page -- number of result pages to scrape (default 50)

        Returns a list of *page* URLs (jumpto=1 .. jumpto=page).
        """
        # urllib.quote moved to urllib.parse.quote in Python 3; import it
        # compatibly so this function runs on either interpreter.
        try:
            from urllib import quote            # Python 2
        except ImportError:
            from urllib.parse import quote      # Python 3
        print('get urls...')
        keyword = quote(key.strip())
        base = ("https://list.tmall.com/search_product.htm?type=pc&q=%s"
                "&totalPage=100&sort=s&style=g&from=mallfp..pc_1_suggest"
                "&suggest=0_1&jumpto=%d#J_Filter")
        return [base % (keyword, i) for i in range(1, page + 1)]
    def get_content(url):
        """Scrape one Tmall shop page and return its summary fields.

        url -- shop detail-page URL (as produced by get_link)

        Returns a 5-element list:
        [desc score, service score, logistics score, location, shop age],
        with the string 'None' for any field that could not be extracted.
        """
        html = get(url)
        soup = bs(html, 'lxml')
        res = soup.select(".ks-datalazyload")
        ms = re.compile(r"<em\sclass=\"count\"[\s\S]*?>([\s\S]*?)<\/em>",re.I|re.M)
        ar = re.compile(r"<li\sclass=\"locus\"[\s\S]*?>([\s\S]*?)<\/div>",re.I|re.M)
        age = re.compile(r"<span\sclass=\"tm-shop-age-content\"[\s\S]*?>([\s\S]*?)<\/span>",re.I|re.M)
        # Default result: the original raised NameError on `return s` when
        # the page had no .ks-datalazyload section; fall back to all-'None'.
        s = ['None', 'None', 'None', 'None', 'None']
        for i in res:
            block = str(i)
            try:
                s = ms.findall(block)
            except Exception:
                s = ['None', 'None', 'None']
            try:
                area = ar.findall(block)
                areas = re.sub(r'<[^>]+>', '', area[0].decode('utf-8').encode(type).strip())
                # Collapse the whitespace/newlines left over from the markup.
                for ch in ('\r', '\n', '\t', ' '):
                    areas = areas.replace(ch, '')
            except Exception:
                # Includes IndexError when the locus block is absent.
                areas = 'None'
            try:
                ages = age.findall(block)
                agess = ages[0].decode('utf-8').encode(type).strip()
            except Exception:
                agess = 'None'
            s.append(areas)
            s.append(agess)
        return s
    def get_link(html):
        """Return the absolute product URL found in one result snippet.

        html -- HTML fragment of a single search-result item.
        """
        anchors = bs(html, 'lxml').select('.productTitle a')
        # Result hrefs are protocol-relative (//...), so prefix the scheme.
        return 'https:' + anchors[0].get('href')
    def xls(key, url):
        """Scrape one search-result page and append its rows to <key>.xls.

        key -- product keyword; also names the output file (<key>.xls,
               tab-separated, opened in append mode)
        url -- search-result page URL produced by get_url

        Items whose markup fails to parse are skipped (best effort,
        matching the original behaviour).
        """
        html = get(url)
        soup = bs(html, 'lxml')
        res = soup.select(".product-iWrap")
        p = re.compile(r"<p\sclass=\"productPrice\">([\s\S]*?)<\/p>",re.I|re.M)
        t = re.compile(r"<p\sclass=\"productTitle\">([\s\S]*?)<\/p>",re.I|re.M)
        c = re.compile(r"<p\sclass=\"productStatus\">([\s\S]*?)<\/span>",re.I|re.M)

        def _field(pattern, text):
            # First capture group, tags stripped, re-encoded for output.
            return re.sub(r'<[^>]+>', '', pattern.search(text).group(1)).decode('utf-8').encode(type).strip()

        # Open the output file once per page instead of once per row.
        with open(key + '.xls', 'a') as f:
            for i in res:
                item = str(i)
                try:
                    price = _field(p, item)
                    title = _field(t, item)
                    count = _field(c, item)
                    con = get_content(get_link(item))
                    f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
                            % (title, price, count, con[0], con[1], con[2], con[3], con[4]))
                except Exception:
                    # Best-effort scrape: skip items that don't match.
                    pass
            
    
    # --- script entry point ---
    # Prompt until a non-blank keyword is given.  The original fell back to
    # input(), which in Python 2 eval()s whatever the user types -- unsafe
    # and almost certainly unintended; re-prompt with raw_input instead.
    key = raw_input("Please input product:")
    while key.strip() == '':
        key = raw_input("Please input product:")
    urls = get_url(key)
    # Write the header row once, then append one row per product (see xls()).
    with open(key+'.xls', 'w') as f:
        title = '商品名称\t价格\t销量\t描述\t服务\t物流\t所在地\t开店时长\n'
        f.write(title.decode('utf-8').encode(type))
    for u in urls:
        xls(key, u)
    print('End!')
    

    通用抓取网页代码

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # version python27
    '''Download a web page (gzip handling plus simple 5xx retries).'''
    import urllib2,gzip,StringIO
    def download_html(url, num_retries=2):
        """Download *url* and return the response body as a string.

        url         -- address to fetch
        num_retries -- retries remaining for HTTP 5xx errors (default 2)

        The body is transparently gunzipped when the server answers with
        Content-Encoding: gzip.  Returns None when the download fails.
        """
        print('Download url: %s' % url)
        # Browser-like request headers.  The User-Agent key was originally
        # spelled 'user_agent' (underscore), so the UA string was never
        # actually sent as a User-Agent header.
        header = {'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'accept-encoding':'gzip, deflate, sdch, br',
        'accept-language':'en-US,en;q=0.8',
        'cache-control':'max-age=0',
        'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }
        try:
            req = urllib2.Request(url,headers = header)
            page = urllib2.urlopen(req,timeout=10)
            rpheader = page.info()
            body = page.read()
        except urllib2.URLError as e:
            print('Download Error: %s' % e.reason)
            if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
                # Transient server error: retry with one fewer attempt left.
                return download_html(url, num_retries-1)
            # The original fell through here and crashed on the unbound
            # `rpheader`; report the failure explicitly instead.
            return None
        encoding = rpheader.get("Content-Encoding")
        if encoding == 'gzip':
            return gz_decoding(body).strip()
        return body.strip()
    def gz_decoding(data):
        """Gunzip *data* (raw bytes of a gzip stream) and return the payload.

        io.BytesIO replaces StringIO.StringIO: it is byte-correct for binary
        data on Python 2 and works unchanged on Python 3, and the `with`
        block closes the GzipFile (the original leaked it).
        """
        import io
        with gzip.GzipFile(fileobj=io.BytesIO(data)) as gz:
            return gz.read()
    

    相关文章

      网友评论

          本文标题:用py爬虫抓取天猫店信息(附源代码)

          本文链接:https://www.haomeiwen.com/subject/vwalattx.html