美文网首页
bs4简单使用

bs4简单使用

作者: 徒手說梦话 | 来源:发表于2019-01-27 19:19 被阅读0次
    from bs4 import BeautifulSoup
    
    # Sample markup for the examples in this script: a single job-listings
    # <table class="tablelist"> made of
    #   * one header row (class "h") with the five column headings,
    #   * ten posting rows alternating between class "even" and class "odd",
    #     each holding a title link plus category, headcount, city and date,
    #   * one footer row (class "f") with the total count and pagination links.
    html = """
        <table class="tablelist" cellpadding="0" cellspacing="0">
                    <tbody><tr class="h">
                        <td class="l" width="374">职位名称</td>
                        <td>职位类别</td>
                        <td>人数</td>
                        <td>地点</td>
                        <td>发布时间</td>
                    </tr>
                                    <tr class="even">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47342&amp;keywords=python&amp;tid=0&amp;lid=0">TEG05-高级安全策略工程师(深圳)</a></td>
                        <td>技术类</td>
                        <td>1</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="odd">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47331&amp;keywords=python&amp;tid=0&amp;lid=0">18428-财付通平台组件测试工程师</a></td>
                        <td>技术类</td>
                        <td>1</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="even">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47318&amp;keywords=python&amp;tid=0&amp;lid=0">CSIG07-基础安全威胁情报分析师</a></td>
                        <td>技术类</td>
                        <td>4</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="odd">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47319&amp;keywords=python&amp;tid=0&amp;lid=0">CSIG07-业务威胁情报分析师</a></td>
                        <td>技术类</td>
                        <td>1</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="even">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47320&amp;keywords=python&amp;tid=0&amp;lid=0">CSIG07-业务威胁情报分析师</a></td>
                        <td>技术类</td>
                        <td>2</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="odd">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47317&amp;keywords=python&amp;tid=0&amp;lid=0">25925-数据挖掘工程师</a></td>
                        <td>技术类</td>
                        <td>2</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="even">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47311&amp;keywords=python&amp;tid=0&amp;lid=0">PCG04-测试开发高级工程师(深圳)</a></td>
                        <td>技术类</td>
                        <td>1</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="odd">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47297&amp;keywords=python&amp;tid=0&amp;lid=0">28603-116 微信支付效能开发工程师(深圳)</a></td>
                        <td>技术类</td>
                        <td>1</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="even">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47299&amp;keywords=python&amp;tid=0&amp;lid=0">28601-微信支付行业缴费开发工程师(深圳)</a></td>
                        <td>技术类</td>
                        <td>1</td>
                        <td>深圳</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="odd">
                        <td class="l square"><a target="_blank" href="position_detail.php?id=47300&amp;keywords=python&amp;tid=0&amp;lid=0">19157-车联物联安全—固件/硬件安全研究员(上海)</a></td>
                        <td>技术类</td>
                        <td>1</td>
                        <td>上海</td>
                        <td>2019-01-27</td>
                    </tr>
                                    <tr class="f">
                        <td colspan="5">
                            <div class="left">共<span class="lightblue total">550</span>个职位</div>
                            <div class="right"><div class="pagenav"><a href="javascript:;" class="noactive" id="prev">上一页</a><a class="active" href="javascript:;">1</a><a href="position.php?keywords=python&amp;start=10#a">2</a><a href="position.php?keywords=python&amp;start=20#a">3</a><a href="position.php?keywords=python&amp;start=30#a">4</a><a href="position.php?keywords=python&amp;start=40#a">5</a><a href="position.php?keywords=python&amp;start=50#a">6</a><a href="position.php?keywords=python&amp;start=60#a">7</a><a href="position.php?keywords=python&amp;start=70#a">...</a><a href="position.php?keywords=python&amp;start=540#a">55</a><a href="position.php?keywords=python&amp;start=10#a" id="next">下一页</a><div class="clr"></div></div></div>
                            <div class="clr"></div>
                        </td>
                    </tr>
                </tbody></table>
    """
    # Parse the sample markup; 'lxml' selects the parser BeautifulSoup uses.
    soup = BeautifulSoup(html, 'lxml')

    # The examples below are disabled on purpose.  They are kept as `#`
    # comments (not as bare triple-quoted strings, which are still evaluated
    # as expression statements); uncomment one at a time to try it.

    # Example 1: every <tr> except the last one (the pagination footer row).
    #     trs = soup.find_all('tr')[:-1]
    #     for tr in trs:
    #         print(tr)

    # Example 2: the third <tr>.  `limit=3` stops the search after three
    # matches, so index 2 is the last row that was collected.
    #     tr = soup.find_all('tr', limit=3)[2]
    #     print(tr)

    # Example 3: every <tr> whose class is "even".  `class` is a reserved word
    # in Python, so the keyword argument is spelled `class_`.
    #     trs = soup.find_all('tr', class_='even')
    #     for tr in trs:
    #         print(tr)

    # Example 4: every <td> whose id is "test" and whose class is "test".
    # The keyword must be spelled `class_`; a misspelling such as `calss_` is
    # not rejected but is used as a filter on an attribute with that literal
    # name, so nothing would match.
    #     tds = soup.find_all('td', id='test', class_='test')
    #     for td in tds:
    #         print(td)

    # Example 5: the href attribute of <a> tags (at most ten matches are
    # collected and the first of them is skipped).
    #     links = soup.find_all('a', limit=10)[1:]
    #     for a in links:
    #         href = a['href']        # option 1: subscript access
    #         href = a.attrs['href']  # option 2: through the attrs dict
    #         print(href)
    
    # Collect every job posting of the table into a list of dicts.
    # The [1:-1] slice drops the first <tr> (column headings) and the last
    # <tr> (total count and pagination links), leaving only the posting rows.
    trs = soup.find_all('tr')[1:-1]
    # NOTE: the names `movies` / `movie` are kept from the original script;
    # each entry describes one job posting.
    movies = []
    for tr in trs:
        # Ways to read the text of a row:
        #   * td.string           - the single string inside one <td>
        #   * tr.strings          - generator over all descendant strings,
        #                           including the whitespace between the cells
        #   * tr.stripped_strings - same, but stripped and with the
        #                           whitespace-only strings left out
        # stripped_strings yields the cells in column order:
        # title, category, headcount, city, publication date.
        infos = list(tr.stripped_strings)
        # Build a fresh dict for every row.  Reusing one dict created before
        # the loop would make every list entry alias the same object, so all
        # records would end up equal to the last row.
        movie = {
            '标题': infos[0],
            '类型': infos[1],
            '人数': infos[2],
            '地点': infos[3],  # city column (index 3)
            '时间': infos[4],  # publication date is the fifth string, not the fourth
        }
        movies.append(movie)
    # Print the finished list once instead of dumping the partially filled
    # list on every iteration.
    print(movies)
    
    获取第三个tr标签.png 获取所有class等于even的tr标签.png 获取所有的tr标签.png 获取所有a标签的href属性.png

    相关文章

      网友评论

          本文标题:bs4简单使用

          本文链接:https://www.haomeiwen.com/subject/jjjnjqtx.html