爬虫2

作者: 冬gua | 来源:发表于2018-03-21 21:46 被阅读0次

    爬虫之 beautifulsoup

    Beautiful Soup 3 目前已经停止开发,推荐现在的项目使用 Beautiful Soup 4

    利用 BeautifulSoup 爬取

    import requests

    from bs4 import BeautifulSoup

    import json

    # Pretend to be a regular desktop browser so the server does not
    # reject the automated request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)      Chrome/63.0.3239.132 Safari/537.36',
    }

    base_url = 'https://hr.tencent.com/position.php'

    keywords = input('输入职位:')
    begin_page = int(input('起始页:'))
    end_page = int(input('结束页:'))

    job_list = []

    for page in range(begin_page, end_page + 1):
        # The site paginates 10 results per page via the 'start' offset.
        params = {
            'keywords': keywords,
            'start': (page - 1) * 10,
        }
        print('%s爬取中...' % page)
        response1 = requests.get(url=base_url, params=params, headers=headers)
        content = response1.content.decode('utf-8')

        # Extract the job rows: result rows alternate between
        # class="even" and class="odd".
        bs = BeautifulSoup(content, 'lxml')
        tr_list = bs.find_all(name='tr', attrs={'class': ['even', 'odd']})
        for tr in tr_list:
            # Query the <td> cells once per row instead of once per column.
            tds = tr.find_all('td')
            job = {
                'job_name': tr.a.text.strip(),
                'job_href': tr.a['href'],
                'job_type': tds[1].text.strip(),
                'job_person': tds[2].text.strip(),
                'job_address': tds[3].text.strip(),
                'job_time': tds[4].text.strip(),
            }
            job_list.append(job)

    # ensure_ascii=False keeps Chinese text readable in the output file
    # instead of \uXXXX escapes.  Use a context manager so the file is
    # flushed and closed even on error (the original passed an open()
    # handle straight to json.dump and never closed it).
    with open('./tencent.json', 'w', encoding='utf-8') as file:
        json.dump(job_list, file, ensure_ascii=False)

    相关文章

      网友评论

          本文标题:爬虫2

          本文链接:https://www.haomeiwen.com/subject/yfinqftx.html