代码如下:
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import urllib, urllib2, sys, json, re, os, time, cgi
import string,time,datetime
from multiprocessing import Pool
import pymysql.cursors
from Queue import Queue
from random import choice
from random import Random
import datetime

# Python 2 only: force the default codec to UTF-8 so implicit str/unicode
# conversions of Chinese page text do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf-8')

if __name__ == '__main__':
    # Pool of desktop browser User-Agent strings; one is picked at random so
    # repeated runs do not always present the same client signature.
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN; rv:1.9.2.24) Gecko/20111103 Firefox/3.6.24",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36"
    ]
    user_agent = choice(USER_AGENTS)
    headers = {'User-Agent': user_agent}

    # Fetch the company profile page.
    r = urllib2.Request("https://zzz.www.cn/enterprise/show87836/", headers=headers)
    response = urllib2.urlopen(r)
    page = response.read()
    soup = BeautifulSoup(page, features='html.parser')

    # Company name = full <h1> text minus the trailing <em> label inside it.
    name_all = soup.select('div.info h1')[0].text
    name_remove = soup.select('div.info h1 em')[0].text
    name = name_all.replace(name_remove, "")  # company name

    # MySQL connection.  NOTE(review): never used or closed in this snippet --
    # presumably the scraped data is inserted further down; verify and add a
    # connection.close() (or `with` block) when the insert code is restored.
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='',
                                 db='gongchang',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Collect the href of every news item on the profile page.
    # FIX: the original copied news_li element-by-element into a list named
    # `reversed_news_arr` without ever reversing it -- a pointless pass-through;
    # iterate the result set directly.  (Empty selections simply skip the loop,
    # so the explicit len() > 0 guards are unnecessary.)
    news_li = soup.select('ul.news-list li div.img a')
    news_urls = [anchor.get('href') for anchor in news_li]

    for news_url in news_urls:
        # Fetch each linked news article with the same randomized headers.
        r3 = urllib2.Request(news_url, headers=headers)
        response3 = urllib2.urlopen(r3)
        page3 = response3.read()
        soup3 = BeautifulSoup(page3, features='html.parser')
        news_title = soup3.select('h1#newstitle')[0].text
        # Strip every <a> tag so embedded links do not pollute the body HTML.
        [s.extract() for s in soup3("a")]
        news_content = soup3.select('div#news-content')
        # BUG FIX (the error this article describes): .pop() yields a
        # bs4.element.Tag, not a str, so later str.replace() calls fail.
        # Convert explicitly before doing any string manipulation.
        news_format_content = str(news_content.pop())
        print(type(news_format_content))
调试代码中用replace函数一直出错,后来发现打印出来的数据类型是:
$ python xinwen.py
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
需要将其转换为str类型,即加上
str(news_format_content)
网友评论