美文网首页
Python爬取招聘

Python爬取招聘

作者: 开心的小哈 | 来源:发表于2018-12-17 13:57 被阅读0次
    from selenium import webdriver
    import re
    
    
    def geturl(urlname):
        """Search 51job (Beijing) for *urlname* and return the total hit count.

        Drives a selenium Firefox instance to load the search-result page,
        scrapes the page source, and extracts N from the "共N条" counter.

        :param urlname: search keyword to splice into the 51job URL.
        :return: the count as a string, or "失败" when no counter is found.
        """
        url="https://search.51job.com/list/010000,000000,0000,00,9,99,"+urlname+",2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        brower = webdriver.Firefox()
        try:
            brower.get(url)
            pagesoures = brower.page_source  # raw HTML of the result page
        finally:
            # Always release the browser, even when navigation fails
            # (the original leaked the Firefox process on error).
            brower.close()
        # Group (\d+) captures only the digits; without the parentheses
        # findall would return the whole "共N条" match.
        rex = re.compile(r"共(\d+)条", re.IGNORECASE)
        mylist = rex.findall(pagesoures)
        # Bug fix: test for emptiness BEFORE indexing mylist[0] — the
        # original indexed first and could never reach its "失败" branch.
        if not mylist:
            return "失败"
        return mylist[0].strip()  # strip any surrounding whitespace
    
    # Query the posting count for each search term and print it.
    androids = ["android开发", "安卓开发", "Android", "Android实习生", "软件测试"]
    for keyword in androids:
        print(keyword, geturl(keyword))
    

    urllib.request

    import urllib.request
    import urllib.error
    def download(url):
        """Fetch *url* and print the response type, headers and body.

        :param url: address to request.
        :raises urllib.error.URLError: on network failure or timeout.
        """
        # Context manager guarantees the connection is closed even if a
        # read fails — the original never closed the response object.
        with urllib.request.urlopen(url, timeout=5) as response:  # timeout: abort slow requests
            print(type(response))  # <class 'http.client.HTTPResponse'>
            print(response.info())  # response headers / request details
            print(response.read())  # raw body; read(n) would limit to n bytes
    try:
        # download() prints its output and returns None, so wrapping the
        # call in print() (as the original did) only emitted a stray "None".
        download("http://www.google.com")
    except urllib.error.URLError as e:  # bind the exception so we can report it
        print("网络异常", e)
    

    伪装

    import urllib.request
    
    
    def openUrl(url):
        """Fetch *url* with a desktop-Chrome User-Agent and print the HTML.

        Some sites reject the default urllib User-Agent, so we spoof a
        real browser header to get through.

        :param url: address to request.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/51.0.2704.63 Safari/537.36'}  # browser disguise
        req = urllib.request.Request(url, headers=headers)
        # Context manager closes the connection even if read/decode raises
        # (the original leaked the response object).
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")  # fetch and decode
        print(html)
    
    def openUrl2(url):
        """Return the raw (bytes) body of *url*.

        Uses a context manager so the connection is closed after reading —
        the original `urlopen(url).read()` never closed the response.
        """
        with urllib.request.urlopen(url) as response:
            return response.read()
    
    if __name__ == "__main__":
        # Demo both fetch helpers against the same page.
        target = "http://www.bazhuayu.com/download"  # alternative: 'http://www.douban.com/'
        print(openUrl2(target))
        openUrl(target)
    
    20180715201515845.png

    还可以冒充手机浏览器等
    如果网站把我们屏蔽了,可以通过伪装浏览器请求头、或直接调用真实浏览器这 2 种方法解决
    有的网址需要传入中文参数(比如百度搜索),所以在这里我们要对中文进行编码和解码

    import urllib.parse  # bug fix: parse was used below but never imported —
    # it only worked because urllib.request happens to pull it in.
    import urllib.request

    # Some URLs need Chinese query terms (e.g. Baidu search); percent-encode
    # them with urllib.parse.quote before splicing into the URL.
    url = "http://zzk.cnblogs.com/s?w=python" + urllib.parse.quote("爬虫") + "&t=b"

    print(urllib.parse.quote("爬虫"))  # standard percent-encoding
    # Bug fix: unquote lives in urllib.parse, not urllib.request (the
    # original call only worked via an undocumented re-export).
    print(urllib.parse.unquote(urllib.parse.quote("爬虫")))  # decode back to 爬虫
    print(urllib.request.urlopen(url).read().decode("UTF-8"))  # fetch and print page
    

    相关文章

      网友评论

          本文标题:Python爬取招聘

          本文链接:https://www.haomeiwen.com/subject/vsnqkqtx.html