美文网首页
将bs4.element.Tag转换成string

将bs4.element.Tag转换成string

作者: 日落_3d9f | 来源:发表于2020-01-13 21:10 被阅读0次

代码如下:

# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib, urllib2, sys, json, re, os, time, cgi
import string,time,datetime
from multiprocessing import Pool
import pymysql.cursors
from Queue import Queue
from random import choice
from random import Random
import datetime
reload(sys)
sys.setdefaultencoding('utf-8')

if __name__=='__main__':
    # Script entry point: download one company page, collect the news links
    # listed on it, fetch every news page and print the type of the object
    # selected as the article body.
    #
    # urllib2 responses are not context managers in Python 2, so closing()
    # is used to guarantee each HTTP response is released after reading.
    from contextlib import closing

    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.24) Gecko/20111103 Firefox/3.6.24",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36"
    ]
    # One User-Agent is picked at random and reused for every request below.
    user_agent = choice(USER_AGENTS)
    headers = {'User-Agent':user_agent}

    # Fetch and parse the company page.
    r = urllib2.Request("https://zzz.www.cn/enterprise/show87836/", headers=headers)
    with closing(urllib2.urlopen(r)) as response:
        page = response.read()
    soup = BeautifulSoup(page,features='html.parser')

    # The <h1> text includes the text of its nested <em>; strip the <em> part
    # so only the remainder of the heading is kept.
    name_all = soup.select('div.info h1')[0].text
    name_remove = soup.select('div.info h1 em')[0].text
    name = name_all.replace(name_remove,"")  # company name

    # NOTE(review): the connection is not used in the visible part of the
    # script; presumably later code stores the scraped data - confirm.
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='',
                                 db='gongchang',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)
    try:
        # Collect the link target of every news thumbnail, in page order.
        # Anchors without an href are skipped because they cannot be fetched.
        news_links = soup.select('ul.news-list li div.img a')
        news_urls = [link.get('href') for link in news_links if link.get('href')]

        for news_url in news_urls:
            r3 = urllib2.Request(news_url, headers=headers)
            with closing(urllib2.urlopen(r3)) as response3:
                page3 = response3.read()
            soup3 = BeautifulSoup(page3,features='html.parser')
            news_title = soup3.select('h1#newstitle')[0].text

            # Remove every <a> element from the tree before selecting the
            # body, so the selected content contains no links.
            for anchor in soup3("a"):
                anchor.extract()

            # Take the last matching element; raises IndexError when the page
            # has no div#news-content.
            news_content = soup3.select('div#news-content')
            news_format_content = news_content[-1]

            # Prints <class 'bs4.element.Tag'>: the selected body is a Tag,
            # not a string; str(news_format_content) yields its markup.
            print(type(news_format_content))
    finally:
        # Release the database connection even if a request or parse fails.
        connection.close()

调试代码时,调用replace函数一直出错,后来发现打印出来的数据类型是:

$ python xinwen.py 
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>

需要先将其转换为str类型,才能正常使用replace函数,即加上:

str(news_format_content)

相关文章

网友评论

      本文标题:将bs4.element.Tag转换成string

      本文链接:https://www.haomeiwen.com/subject/lkinactx.html