Scraping BOSS Zhipin with requests + Beautiful Soup

Author: MA木易YA | Published 2018-11-05 15:00

The script below searches BOSS Zhipin (zhipin.com) for a job keyword, collects the title, salary, and basic requirements of each listing, follows the link to each detail page for the full job description, and can dump the results to a JSON file.
    import requests
    from bs4 import BeautifulSoup
    import json
    
    def GetHtmlText(url):
        try:
            # send a browser-like User-Agent so the request is less likely to be blocked
            headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3493.3 Safari/537.36"}
            r = requests.get(url, timeout=5, headers=headers)
            r.raise_for_status()  # raise requests.HTTPError if the status code is not 200
            r.encoding = r.apparent_encoding
            return r.text  # the response body as text, i.e. the content of the page at url
        except requests.RequestException:
            return None
    
    def fillJobList(html):
        soup = BeautifulSoup(html, 'html.parser')  # parse the listing page
        test_list = []
        for job in soup.findAll('div', {'class': 'info-primary'}):
            try:
                dic = {}  # a fresh dict per job, so every list entry is independent
                position = job.find('div', {'class': 'job-title'}).text
                pay = job.find('span', {'class': 'red'}).text
                edu = job.find('p').text
                dic['position'] = position
                dic['pay'] = pay
                dic['edu'] = edu
                # follow the link to the detail page and grab the full job description
                detail_url = job.find('a').attrs['href']
                detail_html = GetHtmlText('https://www.zhipin.com' + detail_url)
                detail_soup = BeautifulSoup(detail_html, 'html.parser')
                detail = detail_soup.find('div', {'class': 'text'}).text.strip()
                dic['detail'] = detail
                test_list.append(dic)
            except (AttributeError, TypeError, KeyError):
                continue
        return test_list
    
    def write(jobs):
        # dump the scraped jobs to data.json; ensure_ascii=False keeps non-ASCII text readable
        with open("data.json", "w", encoding='utf-8') as f:
            f.write(json.dumps(jobs, ensure_ascii=False))
    
    def main():
        keywords = input('Job keyword: ')
        pages = int(input('Number of pages: '))
        all_jobs = []
        for i in range(1, pages + 1):
            # the c.../p... segments are city and job-category filter codes in the listing URL
            url = 'https://www.zhipin.com/c101270100-p100109/?query=' + keywords + '&page=' + str(i)
            html = GetHtmlText(url)
            jobs = fillJobList(html)  # parse each page once and reuse the result
            print(jobs)
            print(len(jobs))
            all_jobs.extend(jobs)
        write(all_jobs)  # dump everything collected to data.json in one go


    if __name__ == '__main__':
        main()
    
    •    Other crawler code is available on GitHub for reference.
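
If a spreadsheet-friendly format is wanted in addition to the JSON dump, the list returned by fillJobList can also be written out with the standard csv module. The helper below is a minimal sketch of that idea, not part of the original script; the function name write_csv, the filename jobs.csv, and the utf-8-sig encoding (so Excel displays Chinese text correctly) are my own choices.

    import csv

    def write_csv(jobs, path='jobs.csv'):
        # the field names match the keys set in fillJobList
        fieldnames = ['position', 'pay', 'edu', 'detail']
        with open(path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()     # column header row
            writer.writerows(jobs)   # one row per scraped job

Calling write_csv(all_jobs) at the end of main(), alongside or instead of write(all_jobs), would then produce the CSV file.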
