美文网首页
python爬虫

python爬虫

作者: 这个太难了 | 来源:发表于2017-11-30 21:05 被阅读0次
    1、爬取糗事百科

    代码:

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    # time: 2017 / 10 / 23
    import requests
    import bs4
    from bs4 import BeautifulSoup
    
    
    def getHtmlText(url):
        """Download ``url`` and return the response body as text.

        The request uses a 30 second timeout, and HTTP error statuses are
        turned into exceptions via ``raise_for_status()``. The response
        encoding is set from ``apparent_encoding`` before the text is read.

        Returns:
            The page text, or a single space (``" "``) when the request fails.
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only request-level failures (connection, timeout, bad status, bad
            # URL) are treated as "no page"; KeyboardInterrupt and programming
            # errors are no longer swallowed.
            return " "
    
    
    def readJokeText(html,fpath):
        """Print the jokes found in ``html`` and append them to ``fpath``.

        A joke is the text of the first ``<span>`` inside each
        ``<div class="content">`` element. Each joke is printed followed by a
        space, and appended to ``fpath`` (UTF-8) on its own line.

        Args:
            html: HTML text of one listing page.
            fpath: Path of the output file; opened in append mode.
        """
        soup = BeautifulSoup(html, 'html.parser')
        jokes = []
        # find_all() with a tag name yields only Tag objects, so no extra
        # isinstance filtering is needed here.
        for block in soup.find_all('div', attrs={'class': 'content'}):
            span = block.find('span')
            if span is None:
                # A content div without a <span> used to raise AttributeError.
                continue
            # getText() is used instead of .string: .string can be None,
            # whereas getText() always returns a string.
            text = span.getText()
            print(text, end=' ')
            jokes.append(text)

        if jokes:
            # Open the file once per page instead of once per joke; the
            # context manager closes it, so no explicit close() is needed.
            with open(fpath, 'a', encoding='utf-8') as f:
                f.writelines(text + '\n' for text in jokes)
    
    
    def main():
        """Fetch the first two text-joke listing pages and save their jokes.

        Pages are numbered from 1; every page is downloaded with
        ``getHtmlText`` and handed to ``readJokeText`` together with the
        output file path.
        """
        page_count = 2
        out_file = "F://Joke.txt"
        base_url = "https://www.qiushibaike.com/text/page/"
        page_urls = [base_url + str(page_no) for page_no in range(1, page_count + 1)]
        for page_url in page_urls:
            readJokeText(getHtmlText(page_url), out_file)
    main()
    
    2、爬取淘宝手机信息

    代码:

    import requests
    import re
    
    
    def getHTMLText(url):
        """Download ``url`` and return the response body as text.

        The request uses a 30 second timeout, and HTTP error statuses are
        turned into exceptions via ``raise_for_status()``. The response
        encoding is set from ``apparent_encoding`` before the text is read.

        Returns:
            The page text, or an empty string when the request fails.
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Catch request failures only, so Ctrl-C and genuine bugs are not
            # silently turned into an empty page.
            return ""
    
    
    def parsePage(ilt,html):
        """Extract product records from ``html`` and append them to ``ilt``.

        The page text is searched for ``"title":"..."``, ``"price":"<digits>"``
        and ``"month_sales":"<digits>"`` fragments. The n-th match of each
        pattern is combined into one ``[title, price, month_sales]`` list
        (all strings) and appended to ``ilt``.

        Args:
            ilt: List that receives the extracted records (mutated in place).
            html: Page source to scan; an empty value adds nothing.

        NOTE(review): the three patterns are matched independently, so records
        only line up if every product carries all three fields - confirm
        against the real page data.
        """
        # Local import: json is not among the imports of this script.
        import json

        def unquote(raw):
            # Decode JSON string escapes (e.g. \uXXXX) without eval(); fall
            # back to the raw text if it is not a valid JSON string body.
            try:
                return json.loads('"' + raw + '"')
            except ValueError:
                return raw

        if not html:
            return

        # Capture groups replace the old split(':') + eval() approach, which
        # executed scraped text as Python code and broke on any title that
        # contained a colon.
        titles = re.findall(r'"title":"((?:[^"\\\n]|\\.)*)"', html)
        prices = re.findall(r'"price":"(\d*)"', html)
        sales = re.findall(r'"month_sales":"(\d*)"', html)

        # zip() stops at the shortest list, so a missing field can no longer
        # raise IndexError part-way through the page.
        for title, price, sale in zip(titles, prices, sales):
            ilt.append([unquote(title), price, sale])
    
            
    def printGoods(ilt):
        """Print the collected products as a tab-separated table.

        A header row is printed first, then one row per entry of ``ilt`` with
        a running number starting at 1 followed by the entry's first three
        items.
        """
        row_format = "{:4}\t{:20}\t{:20}\t{:20}"
        print(row_format.format("序号", "商品名称", "价格", "付款人数"))
        for position, item in enumerate(ilt, start=1):
            name, price, sales = item[0], item[1], item[2]
            print(row_format.format(position, name, price, sales))
    
    def main():
        """Search Taobao for phones, parse two result pages and print them.

        Each page URL is the search URL plus an ``&s=`` offset of ``48 * i``
        (presumably 48 results per page - confirm against the site). A page
        that fails is skipped so that the remaining pages are still processed.
        """
        depth = 2
        info = []
        goods = "手机"
        start_url = "https://s.taobao.com/search?q=" + goods
        for i in range(depth):
            try:
                url = start_url + "&s=" + str(48 * i)
                html = getHTMLText(url)
                parsePage(info, html)
            except Exception:
                # Best-effort per page, but no longer a bare except: Ctrl-C
                # (KeyboardInterrupt) and SystemExit now stop the crawl
                # instead of being treated as a failed page.
                continue
        printGoods(info)
    main()
    
    3、爬取中国大学排名

    代码:

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    # time: 2017 / 10 / 22
    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    def getHTMLText(url):
        """Fetch the HTML of ``url`` and return it as text.

        The request uses a 30 second timeout, and HTTP error statuses are
        turned into exceptions via ``raise_for_status()``. The response
        encoding is set from ``apparent_encoding`` before the text is read.

        Returns:
            The page text, or an empty string when the request fails.
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Request failures mean "no page"; anything else (Ctrl-C, bugs)
            # is allowed to propagate instead of being hidden.
            return ""
    
    
    def fillUnivList(ulist, html):
        """Collect table rows from ``html`` into ``ulist``.

        For every tag that is a direct child of the first ``<tbody>``, the
        ``.string`` values of its first four ``<td>`` cells are appended to
        ``ulist`` as one four-item list.

        Args:
            ulist: List that receives the rows (mutated in place).
            html: Page source; if it has no ``<tbody>`` nothing is added.
        """
        soup = BeautifulSoup(html, 'html.parser')
        tbody = soup.find('tbody')
        if tbody is None:
            # Happens e.g. when the download failed and html is empty; the
            # old code raised AttributeError on None here.
            return
        for tr in tbody.children:
            # Children also include non-tag nodes (such as text between
            # rows); only real tags are table rows.
            if not isinstance(tr, bs4.element.Tag):
                continue
            tds = tr('td')  # calling a tag is shorthand for find_all()
            if len(tds) < 4:
                # Skip rows that do not have all four expected cells.
                continue
            ulist.append([td.string for td in tds[:4]])
    
    
    
    def printUnivList(ulist, num):
        """Print a header and up to ``num`` rows of ``ulist`` as a table.

        Columns are centred. The three text columns are padded with the
        character ``chr(12288)``, passed to the format string as argument 4,
        instead of an ASCII space.

        Args:
            ulist: Rows of at least four values each.
            num: Maximum number of rows to print. Fewer are printed when
                ``ulist`` is shorter; zero or a negative value prints only
                the header.
        """
        tplt = "{0:^8}\t{1:{4}^10}\t{2:{4}^8}\t{3:{4}^6}"
        fill = chr(12288)
        print(tplt.format("排名", "学校名称", "省市", "总分", fill))
        # Slice instead of range(num): indexing past the end of a short (or
        # empty) list used to raise IndexError. max() keeps a negative num
        # from turning into a from-the-end slice.
        for u in ulist[:max(num, 0)]:
            # A None cell cannot take a format spec; show it as empty.
            cells = ["" if value is None else value for value in u[:4]]
            print(tplt.format(*cells, fill))
    
    
    def main():
        """Download the 2016 university ranking page and print ten rows.

        The page is fetched with ``getHTMLText``, its table rows are collected
        by ``fillUnivList`` and the first ten are shown by ``printUnivList``.
        """
        ranking_url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html"
        universities = []
        fillUnivList(universities, getHTMLText(ranking_url))
        printUnivList(universities, 10)  # print the information of 10 universities
    main()
    
    4、爬取豆瓣top250

    代码:

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    import requests
    import re
    import bs4
    from bs4 import BeautifulSoup
    
    
    def getHTMLText(url):
        """Download ``url`` and return the response body as text.

        The request uses a 30 second timeout, and HTTP error statuses are
        turned into exceptions via ``raise_for_status()``. The response
        encoding is set from ``apparent_encoding`` before the text is read.

        Returns:
            The page text, or an empty string when the request fails.
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Limit the fallback to request errors; a bare except would also
            # hide KeyboardInterrupt and programming mistakes.
            return ""
    
    
    def paserHTML(ulist,html,fpath):
        """Extract movie records from ``html``, add them to ``ulist`` and save them.

        Each record is ``[name, score, director, actors, votes, brief]``.
        The director is read from the first ``<p>`` of every ``<li>`` inside
        ``<ol class="grid_view">``; the other fields come from regular
        expressions applied to the whole page and are combined by position.

        Args:
            ulist: List that receives the records (mutated in place).
            html: Page source of one listing page.
            fpath: Output file; each new record is appended as one line (UTF-8).

        NOTE(review): fields are paired by position across page-wide match
        lists, so an item lacking one field (for example no "inq" quote)
        shifts later records - confirm against real pages. Extra matches
        beyond the shortest list are dropped.
        """
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.find('ol', {'class': 'grid_view'})
        if data is None:
            # No movie list in the page (e.g. the download failed).
            return

        # These searches cover the whole page and do not depend on the
        # current <li>, so they run once instead of once per item.
        name = re.findall(r'<span class="title">(.[^&]*?)</span>', html)
        score = re.findall(r'.*?"v:average">(.*?)</span>', html)
        p = re.findall(r'<span>([\d].*?)</span>', html)
        zlist = re.findall(r'主演:\s(.*?)\s', html)
        brief = re.findall(r'<span class="inq">(.*?)</span>', html)

        directors = []
        for m in data.find_all('li'):
            para = m.find('p')
            info = para.getText() if para is not None else ""
            found = re.search(r'导演:\s(.*?)\s', info)
            # Keep a placeholder when no director is found so the positions
            # stay aligned; previously this aborted the whole page.
            directors.append(found.group(1) if found else "")

        rows = [list(fields) for fields in zip(name, score, directors, zlist, p, brief)]
        ulist.extend(rows)
        if not rows:
            return

        try:
            # Write each new record exactly once; the old code re-wrote the
            # entire accumulated list after every record.
            with open(fpath, 'a', encoding='utf-8') as f:
                f.writelines(str(row) + '\n' for row in rows)
        except OSError as err:
            # Saving is best-effort: the records are already in ulist, so
            # report the problem instead of losing them silently.
            print("could not write {}: {}".format(fpath, err))
    
    
    
    def printDouban(ulist):
        """Print the movie records as a tab-separated table.

        A header row is printed first, then one row per entry of ``ulist``
        with a running number starting at 1 followed by the entry's first six
        items.
        """
        row_format = "{:4}\t{:16}\t{:10}\t{:20}\t{:20}\t{:20}\t{:20}"
        header_cells = ("排名", "电影名称", "评分", "导演", "主演", "评价", "简介")
        print(row_format.format(*header_cells))
        for rank, movie in enumerate(ulist, start=1):
            columns = [movie[k] for k in range(6)]
            print(row_format.format(rank, *columns))
    
    
    
    def main():
        """Download the Douban Top 250 page, parse it and print the records.

        The page is fetched with ``getHTMLText``; ``paserHTML`` fills the
        record list and is given the output file path; ``printDouban`` then
        prints the table.
        """
        top250_url = 'https://movie.douban.com/top250'
        out_file = "F://DouBan.txt"
        movies = []
        page = getHTMLText(top250_url)
        paserHTML(movies, page, out_file)
        printDouban(movies)
    main()
    

    相关文章

      网友评论

          本文标题:python爬虫

          本文链接:https://www.haomeiwen.com/subject/sqybbxtx.html