Web Crawler: urllib Module, Part 3 -- URLs Containing Chinese

Author: 牛耀 | Published 2018-12-23 14:26
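
The script below searches Baidu for a keyword and saves the HTML source of each results page. Because the keyword may contain Chinese characters, it must be percent-encoded before it can appear in a URL: urllib.parse.quote encodes a single value, unquote reverses it, and urlencode builds a full query string from a dict. fake_useragent is a third-party package (pip install fake-useragent) used here to randomize the request's User-Agent header.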
    import os

    from urllib import parse, request
    from fake_useragent import UserAgent


    def searchSpider(kw, start_page, end_page):
        # quote: percent-encode the Chinese keyword into a URL-safe form
        quote_str = parse.quote(kw)
        print(quote_str)
        # unquote: decode it back, demonstrating the round trip
        unquote_str = parse.unquote(quote_str)
        print(unquote_str)
    
        for page in range(start_page, end_page + 1):
            # urlencode: turn a dict of parameters into a URL-encoded query string
            params = {
                'wd': kw,               # search keyword
                'pn': (page - 1) * 10,  # result offset: Baidu shows 10 results per page
            }
            result = parse.urlencode(params)
            print(result)
    
            # e.g. https://www.baidu.com/s?wd=%E9%A9%AC%E4%BA%91&pn=0
            full_url = 'https://www.baidu.com/s?' + result
            print(full_url)
            html = load_page(full_url)
            filename = 'page_' + str(page) + '_' + kw + '.html'
            save_page_html(html, filename)
    
    def load_page(url):
        # send a randomized User-Agent so the request looks like a browser
        req_header = {
            'User-Agent': user_agent.random
        }
        req = request.Request(url, headers=req_header)
        # urlopen raises an HTTPError for non-2xx responses, so a response
        # object here normally means the request succeeded
        response = request.urlopen(req)
        if response.status == 200:
            print('request succeeded:', response.url)
        return response.read().decode('utf-8')
    
    def save_page_html(html, filename):
        """
        Save the fetched page source to disk.
        :param html: page source
        :param filename: output file name
        """
        # create the output directory if it does not exist yet
        os.makedirs('baidusearch', exist_ok=True)
        with open('baidusearch/' + filename, 'w', encoding='utf-8') as file:
            file.write(html)
    
    if __name__ == '__main__':
        # instantiate a UserAgent object; load_page() reads this module-level name
        user_agent = UserAgent()

        # simulate a search-engine query: fetch the HTML source of each
        # results page for the given keyword and page range
        kw = input('Enter a search keyword: ')
        start_page = int(input('Enter the start page: '))
        end_page = int(input('Enter the end page: '))

        searchSpider(kw, start_page, end_page)
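
For reference, here is a minimal interactive check of the encoding helpers used above. The keyword '马云' is taken from the encoded example URL in the comment inside searchSpider; the outputs shown are what CPython 3 prints (urlencode follows dict insertion order on Python 3.7+):

    from urllib import parse

    print(parse.quote('马云'))                       # %E9%A9%AC%E4%BA%91
    print(parse.unquote('%E9%A9%AC%E4%BA%91'))       # 马云
    print(parse.urlencode({'wd': '马云', 'pn': 0}))  # wd=%E9%A9%AC%E4%BA%91&pn=0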
    
