美文网首页
爬取堆糖网

爬取堆糖网

作者: 被强煎的荷包蛋 | 来源:发表于2018-08-01 15:27 被阅读0次
    import requests
    import urllib.parse
    
    #通过url 获取数据
    def get_page(url):
          #requests.get 自带json.loads
          page = requests.get(url)
          page = page.content
          #将 bytes 转换成 字符串
          page.encoding = 'utf-8'
          return page
    
    def page_from_duitang(label):
        """Download duitang search-result pages for keyword *label*.

        Returns a list of decoded page bodies covering result offsets
        0, 100, ..., 3500.

        Bug fixed: the URL template was missing the '?' that separates
        the path from the query string ('by_search/kw=...'), so every
        request hit a nonexistent path instead of the search API.
        """
        pages = []
        url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}&limit=1000'
        # Percent-encode the (typically Chinese) keyword for the URL.
        label = urllib.parse.quote(label)
        for index in range(0, 3600, 100):
            u = url.format(label, index)
            print(u)
            pages.append(get_page(u))
        return pages
    
    def findall_in_page(page, startpart, endpart):
        """Return every substring of *page* between *startpart* and *endpart*.

        Scans left to right; each search resumes after the previous end
        marker, so matches never overlap.

        Bug fixed: when an opening marker had no closing marker, the
        original got end == -1 and appended the bogus slice
        page[start:-1] (the tail minus its last character). Now the
        scan stops without emitting a truncated string.
        """
        all_strings = []
        end = 0
        while True:
            found = page.find(startpart, end)
            if found == -1:
                break
            start = found + len(startpart)
            end = page.find(endpart, start)
            if end == -1:
                # Unterminated match: stop rather than append garbage.
                break
            all_strings.append(page[start:end])
        return all_strings
    
    def pic_urls_from_page(pages):
          pic_urls = []
          for page in pages:
              urls = findall_in_page(page,"path":",""")
              pic_urls.extend(urls) #extend 和 append的不同
          return pic_urls
    
    def download_pics(url, n):
        """Download one image from *url* and save it as pic/<n>.jpg.

        Robustness fix: the original crashed with FileNotFoundError when
        the 'pic' directory did not exist; create it first.
        """
        import os
        os.makedirs('pic', exist_ok=True)
        r = requests.get(url)
        path = 'pic/' + str(n) + '.jpg'
        with open(path, 'wb') as f:
            f.write(r.content)
    
    def main(label):
        """Search duitang for *label* and download every image found.

        Bugs fixed: the original 'def main(label)' was missing its colon
        (SyntaxError), and the loop body mixed indentation levels
        (IndentationError). The manual counter is replaced with
        enumerate(..., 1) so filenames still start at pic/1.jpg.
        """
        pages = page_from_duitang(label)
        pic_urls = pic_urls_from_page(pages)
        for n, url in enumerate(pic_urls, 1):
            print("正在下载第 {} 张图片".format(n))
            download_pics(url, n)
    
                
          
    

    相关文章

      网友评论

          本文标题:爬取堆糖网

          本文链接:https://www.haomeiwen.com/subject/etulvftx.html