美文网首页
将bs4.element.Tag转换成string

将bs4.element.Tag转换成string

作者: 日落_3d9f | 来源:发表于2020-01-13 21:10 被阅读0次

    代码如下:

    # -*- coding:utf-8 -*-
    
    from bs4 import BeautifulSoup
    import urllib, urllib2, sys, json, re, os, time, cgi
    import string,time,datetime
    from multiprocessing import Pool
    import pymysql.cursors
    from Queue import Queue
    from random import choice
    from random import Random
    import datetime
    reload(sys)
    sys.setdefaultencoding('utf-8')
    
    if __name__=='__main__':
        # Fetch one enterprise page, read the company name from it, then visit
        # every news article linked from that page and print the type of the
        # extracted article-content node.
        USER_AGENTS = [
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.24) Gecko/20111103 Firefox/3.6.24",
            "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36"
        ]
        # One randomly chosen User-Agent is reused for every request of this run.
        user_agent = choice(USER_AGENTS)
        headers = {'User-Agent':user_agent}

        r = urllib2.Request("https://zzz.www.cn/enterprise/show87836/", headers=headers)
        response = urllib2.urlopen(r)
        try:
            page = response.read()
        finally:
            # Release the socket even if reading the body fails.
            response.close()
        soup = BeautifulSoup(page,features='html.parser')

        # The <h1> text includes the text of a nested <em>; strip that part to
        # get the company name. A missing <em> simply leaves the text as is.
        name_all = soup.select('div.info h1')[0].text
        em_nodes = soup.select('div.info h1 em')
        name = name_all.replace(em_nodes[0].text,"") if em_nodes else name_all  # company name

        connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='',
                                 db='gongchang',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
        try:
            # Collect article URLs in document order, skipping anchors that
            # carry no href (Request() cannot be built from None).
            news_urls = [a.get('href')
                         for a in soup.select('ul.news-list li div.img a')
                         if a.get('href')]

            for news_url in news_urls:
                r3 = urllib2.Request(news_url, headers=headers)
                response3 = urllib2.urlopen(r3)
                try:
                    page3 = response3.read()
                finally:
                    response3.close()
                soup3 = BeautifulSoup(page3,features='html.parser')

                title_nodes = soup3.select('h1#newstitle')
                if not title_nodes:
                    # One malformed article must not abort the whole run.
                    sys.stderr.write("skip %s: no h1#newstitle found\n" % news_url)
                    continue
                news_title = title_nodes[0].text

                # Remove every <a> element from the tree before the content
                # node is looked up, so the content carries no links.
                for anchor in soup3("a"):
                    anchor.extract()

                news_content = soup3.select('div#news-content')
                if not news_content:
                    sys.stderr.write("skip %s: no div#news-content found\n" % news_url)
                    continue
                # Same element the original took with pop(): the last match.
                news_format_content = news_content[-1]

                # This is a bs4.element.Tag, not a string; wrap it in str()
                # before calling string methods such as replace() on it.
                print(type(news_format_content))
        finally:
            # Always give the database connection back, even on a failed fetch.
            connection.close()
    

    调试代码时用replace函数一直出错,后来发现打印出来的数据类型是:

    $ python xinwen.py 
    <class 'bs4.element.Tag'>
    <class 'bs4.element.Tag'>
    <class 'bs4.element.Tag'>
    <class 'bs4.element.Tag'>
    <class 'bs4.element.Tag'>
    <class 'bs4.element.Tag'>
    

    需要将其转换为str类型,即加上

    str(news_format_content)
    

    相关文章

      网友评论

          本文标题:将bs4.element.Tag转换成string

          本文链接:https://www.haomeiwen.com/subject/lkinactx.html