Case Collection

Author: sszhang | Published 2018-06-11 02:26

    Case 1: Crawling a JD product page

    import requests
    url = 'https://item.jd.com/2967929.html'
    try:
      r = requests.get(url)
      r.raise_for_status()   # raise an exception if the status code is not 200
      r.encoding = r.apparent_encoding
      print(r.text[:1000])
    except:
      print('FINDING ERRORS')
    
    

    Case 2: Crawling an Amazon product page

    Amazon rejects requests that identify themselves with Python's default User-Agent, so the headers must be replaced with a browser-style User-Agent.

    import requests
    url = 'https://www.amazon.cn/gp/product/B01M8L5Z3Y'
    try:
      kv = {'user-agent': 'Mozilla/5.0'}
      r = requests.get(url, headers=kv)
      r.raise_for_status()
      r.encoding = r.apparent_encoding
      print(r.text[:1000])
    except:
      print('FINDING ERRORS')
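
    To confirm that the substituted header was actually sent, you can inspect the prepared request that Requests built. A minimal check, reusing the same url and kv dictionary as above:

    import requests
    url = 'https://www.amazon.cn/gp/product/B01M8L5Z3Y'
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv)
    # r.request is the PreparedRequest object that was sent to the server
    print(r.request.headers['user-agent'])   # should print 'Mozilla/5.0'
    print(r.status_code)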
    

    Case 3: Submitting keyword searches to Baidu and 360

    Baidu's keyword interface:
    http://www.baidu.com/s?wd=keyword
    360's keyword interface:
    http://www.so.com/s?q=keyword

    import requests
    keyword = 'Python'
    try:
      kv = {'wd': keyword}
      r = requests.get('http://www.baidu.com/s', params=kv)
      print(r.request.url)   # the keyword appears as the wd query parameter
      r.raise_for_status()
      r.encoding = r.apparent_encoding
      print(r.text[:1000])
    except:
      print('FINDING ERRORS')
    
    import requests
    keyword = 'Python'
    try:
      kv = {'q': keyword}
      r = requests.get('http://www.so.com/s', params=kv)
      print(r.request.url)
      r.raise_for_status()
      r.encoding = r.apparent_encoding
      print(r.text[:1000])
    except:
      print('FINDING ERRORS')
    

    Case 4: Crawling and saving an image

    import requests
    import os
    url = 'http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg'
    root = 'D://pics//'
    path = root + url.split('/')[-1]
    try:
      if not os.path.exists(root):
        os.mkdir(root)
      if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as f:
          f.write(r.content)   # write the binary response content, i.e. the image
        print('file saved')
      else:
        print('file already exists')
    except:
      print('Failure')
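
    For larger files, writing the whole response body in one call keeps the entire file in memory. A minimal streamed variant, assuming the same url and path as above and an arbitrary 1 MB chunk size:

    import requests
    def download_file(url, path):
      # stream=True defers downloading the body until it is iterated over
      r = requests.get(url, stream=True)
      r.raise_for_status()
      with open(path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):   # 1 MB chunks
          f.write(chunk)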
    

    Case 5: IP address lookup

    import requests
    url = 'http://m.ip138.com/ip.asp?ip='
    try:
      r = requests.get(url+'202.204.80.112')
      r.raise_for_status()
      r.encoding = r.apparent_encoding
      print(r.text[-500:])
    except:
      print('Failed')
    

    Case 6: University rankings

    Input: the URL of the university ranking page
    Output: ranking information printed to the screen (rank, university name, total score)
    Technical approach: Requests, BeautifulSoup
    Step 1: fetch the content of the ranking page
    Step 2: extract the information from the page into a suitable data structure
    Step 3: use the data structure to display and output the results

    import requests
    import bs4
    from bs4 import BeautifulSoup

    def getHTMLText(url):
      try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
      except:
        return ''

    def fillUnivList(ulist, html):
      soup = BeautifulSoup(html, 'html.parser')
      for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
          tds = tr('td')   # shorthand for tr.find_all('td')
          ulist.append([tds[0].string, tds[1].string, tds[2].string])

    def printUnivList(ulist, num):
      print('{:^10}\t{:^6}\t{:^10}'.format('Ranking', 'School Name', 'Marks'))
      for i in range(num):
        u = ulist[i]
        print('{:^10}\t{:^6}\t{:^10}'.format(u[0], u[1], u[2]))

    def main():
      uinfo = []
      url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2016.html'
      html = getHTMLText(url)
      fillUnivList(uinfo, html)
      printUnivList(uinfo, 20)   # print the top 20 entries

    main()
    

    Case 7: Crawling Taobao search results

    Goal: fetch Taobao search result pages and extract the product names and prices
    Key points: Taobao's search interface and how pagination works (each result page holds 44 items, so the s parameter in the URL advances in steps of 44)

    import re
    import requests
    
    def getHTMLText(url):
      try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
      except:
        return 'Error'
    
    def parsePage(ilt, html):
      try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        for i in range(len(plt)):
          price = eval(plt[i].split(':')[1])   # eval strips the surrounding quotes
          title = eval(tlt[i].split(':')[1])
          ilt.append([price, title])
      except:
        print('Errors')
    
    
    def printGoodsList(ilt):
      tplt = '{:4}\t{:8}\t{:16}'
      print(tplt.format('NO', 'PRICE', 'ITEM NAME'))
      count = 0
      for g in ilt:
        count = count + 1
        print(tplt.format(count, g[0], g[1]))
    
    def main():
      goods = 'bags'
      depth = 2                                 # crawl the first two result pages
      start_url = 'https://s.taobao.com/search?q=' + goods
      infoList = []
      for i in range(depth):
        try:
          url = start_url + '&s=' + str(44*i)   # each page starts 44 items further on
          html = getHTMLText(url)
          parsePage(infoList, html)
        except:
          continue
      printGoodsList(infoList)

    main()
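
    To see what parsePage is matching, it helps to run the two regular expressions on a small sample. The fragment below is made up for illustration, but has the same shape as the price/title fields embedded in the search page:

    import re
    sample = '"raw_title":"canvas tote bag","view_price":"59.00"'
    plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', sample)
    tlt = re.findall(r'\"raw_title\"\:\".*?\"', sample)
    print(plt)                          # ['"view_price":"59.00"']
    print(tlt)                          # ['"raw_title":"canvas tote bag"']
    print(eval(plt[0].split(':')[1]))   # 59.00 -- eval strips the quotes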
    

    Case 8: A focused crawler for stock data

    Goal: obtain the names and trading information of all stocks listed on the Shanghai and Shenzhen stock exchanges
    Output: saved to a file
    Steps:
    Step 1: get the stock list from Eastmoney (东方财富网)
    Step 2: for each stock in the list, fetch its details from Baidu Gupiao (百度股票)
    Step 3: save the results to a file

    import requests
    import traceback
    import re
    from bs4 import BeautifulSoup
    import bs4
    
    def getHTMLText(url):
      try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
      except:
        return ''   # getStockInfo checks for an empty string
    def getStockList(lst, stockURL):
      html = getHTMLText(stockURL)
      soup = BeautifulSoup(html, 'html.parser')
      a = soup.find_all('a')
      for i in a:
        try:
          href = i.attrs['href']
          lst.append(re.findall(r'[s][hz]\d{6}', href)[0])   # codes like sh600000 / sz000001
        except:
          continue
     
    def getStockInfo(lst, stockURL, fpath):
      for stock in lst:
        url = stockURL + stock + '.html'
        html = getHTMLText(url)
        try:
          if html == '':
            continue
          infoDict = {}
          soup = BeautifulSoup(html, 'html.parser')
          stockInfo = soup.find('div', attrs={'class': 'stock-bets'})
          name = stockInfo.find_all(attrs={'class': 'bets-name'})[0]
          infoDict.update({'StockName': name.text.split()[0]})
          keyList = stockInfo.find_all('dt')
          valueList = stockInfo.find_all('dd')
          for i in range(len(keyList)):
            key = keyList[i].text
            val = valueList[i].text
            infoDict[key] = val
          with open(fpath, 'a', encoding='utf-8') as f:
            f.write(str(infoDict) + '\n')
        except:
          traceback.print_exc()
          continue
    
    def main():
      stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
      stock_info_url = 'https://gupiao.baidu.com/stock/'
      output_file = 'D://BaiduStockInfo.txt'
      slist = []
      getStockList(slist, stock_list_url)
      getStockInfo(slist, stock_info_url, output_file)

    main()
    

    Case 9: A focused stock data crawler with Scrapy

    import scrapy
    import re
    
    class StockSpider(scrapy.Spider):
      name = 'stocks'
      start_urls = ['http://quote.eastmoney.com/stocklist.html']
    
      def parse(self, response):
        for href in response.css('a::attr(href)').extract():
          try:
            stock = re.findall(r'[s][hz]\d{6}', href)[0]
            url = 'https://gupiao.baidu.com/stock/' + stock + '.html'
            yield scrapy.Request(url, callback=self.parse_stock)
          except:
            continue

      def parse_stock(self, response):
        infoDict = {}
        stockInfo = response.css('.stock-bets')
        name = stockInfo.css('.bets-name').extract()[0]
        keyList = stockInfo.css('dt').extract()
        valueList = stockInfo.css('dd').extract()
        for i in range(len(keyList)):
          key = re.findall(r'>.*</dt>', keyList[i])[0][1:-5]
          try:
            val = re.findall(r'\d+\.?.*</dd>', valueList[i])[0][0:-5]
          except:
            val = '--'
          infoDict[key] = val   # store each <dt>/<dd> pair

        infoDict.update(
          {'stockName': re.findall(r'\s.*\(', name)[0].split()[0] +
                        re.findall(r'\>.*\<', name)[0][1:-1]})
        yield infoDict          # hand the completed item to the pipeline
    
    # pipelines.py
    class BaidustocksPipeline(object):
      def process_item(self, item, spider):
        return item
    
    class BaidustocksInfoPipeline(object):
      def open_spider(self, spider):
        self.f = open('BaiduStockInfo.txt', 'w')

      def close_spider(self, spider):
        self.f.close()

      def process_item(self, item, spider):
        try:
          line = str(dict(item)) + '\n'
          self.f.write(line)
        except:
          pass
        return item
    
    # settings.py: register the custom pipeline
    ITEM_PIPELINES = {
      'BaiduStocks.pipelines.BaidustocksInfoPipeline': 300,
    }
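
    The spider and pipeline above belong inside a Scrapy project; the pipeline path suggests a project named BaiduStocks, which would normally be created with scrapy startproject BaiduStocks and run from the project directory with scrapy crawl stocks (matching the spider's name attribute). As an alternative, a sketch of driving the crawl from a plain Python script, assuming the spider module lives at the hypothetical path BaiduStocks/spiders/stocks.py:

    # run_stocks.py, placed next to scrapy.cfg in the project directory
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings
    from BaiduStocks.spiders.stocks import StockSpider   # hypothetical module path

    process = CrawlerProcess(get_project_settings())     # loads ITEM_PIPELINES from settings.py
    process.crawl(StockSpider)
    process.start()                                      # blocks until the crawl finishes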
