import queue
import threading
import requests
import re
from lxml import etree
import time
import random
import json
# URLs already crawled
urlList = []
# queue of URLs waiting to be crawled
urlsData = queue.Queue()
# failure count per URL
urlError = {}
# index of the current crawler thread
count = 0
# spoofed browser request headers
header = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
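# Note: these headers imitate a desktop Chrome browser so the target site
# serves the crawler the same HTML a real visitor would receive; without a
# browser-like User-Agent many sites answer with an error page instead.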
class Counter(threading.Thread):
    # @summary: initialise a crawler thread.
    # @param lock: shared lock object.
    # @param threadName: thread name.
    # @param requests: the requests module (or a compatible session).
    # @param url: source URL to crawl.
    def __init__(self, lock, threadName, requests, url):
        print(threadName + ' run..')
        super(Counter, self).__init__(name=threadName)
        self.lock = lock
        self.requests = requests
        self.url = url
    def _data_get(self):
        # run one crawl task
        try:
            # fetch the source page
            html = requests.get(self.url, headers=header)
            rs = etree.HTML(html.content)
            # extract the Baidu pan share link from the page
            url = re.findall(r'href="(https://pan.baidu.com/s/.*?|http://pan.baidu.com/s/.*?)"', html.content.decode('utf-8'))
            # extract the Baidu pan share password: "密码" is the Chinese
            # label ("password") on the page; the second group captures up
            # to four word characters after the separator
            password = re.findall(r'密码(:|;|: )(\w{0,4})', html.content.decode('utf-8'))
            name = rs.xpath('//h1/text()')
            try:
                password = password[0][1]
            except IndexError:
                password = ''
            # look up the movie poster on Douban via a Baidu site: search
            try:
                url1 = "http://www.baidu.com/s?"
                # search for the scraped page title (self.name would be the
                # thread name, not the movie title)
                html = requests.get(url1, params={
                    'wd': "site:movie.douban.com {}".format(name[0])
                })
                select = etree.HTML(html.content)
                # saveHtml("text1", html.content)
                a = select.xpath('//h3[@class="t"]/a/@href')
                html = requests.get(a[0])
                select = etree.HTML(html.content)
                # print(html.content)
                ase = select.xpath('//img/@src')
                img = ase[0]
            except BaseException as e:
                print(self.name, 'failed to fetch the Douban movie poster')
                img = ''
            # report and submit the scraped record
            print(name[0])
            rr = requests.post('http://localhost/basic/index.php?r=bian/update', {
                'password': password,
                'url': url[0],
                'img': img,
                'source_url': self.url,
                'name': name[0]
            })
            print(rr.content)
            # message = '''
            # '%s','%s','%s','%s';
            # ''' % (password, url[0], img, name[0])
            # print(message)
        except BaseException as e:
            if self.url in urlError:
                urlError[self.url] = urlError[self.url] + 1
            else:
                urlError[self.url] = 1
            # re-queue a failed URL for at most two retries
            if urlError[self.url] < 3:
                urlsData.put(self.url)
            print('failed to parse the Baidu pan address', self.url, 'failure count', urlError[self.url], e)
            print('tasks remaining', urlsData.qsize())
    def run(self):
        self.lock.acquire()
        try:
            self._data_get()
        finally:
            self.lock.release()
            # release the semaphore slot here rather than inside
            # _data_get(), so a failed crawl cannot leak a slot and
            # eventually deadlock the pool
            threadmax.release()
if __name__ == '__main__':
    threadmax = threading.BoundedSemaphore(100)
    lock = threading.Lock()
    i = 0
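    # Throttle pattern used below: the main loop acquires one semaphore
    # slot before starting each Counter thread, and Counter.run() gives the
    # slot back when the crawl finishes, so at most 100 crawler threads are
    # ever alive at once.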
    try:
        # collect every task URL from the listing pages in a single pass
        for index1 in range(20):
            index = 1038 - index1
            html = requests.get('http://www.xiexingeini.com/page/{}'.format(index), headers=header)
            html = etree.HTML(html.content)
            # all task URLs on this listing page
            urls = html.xpath('//header/h2[@class="entry-title"]/a/@href')
            for url in urls:
                urlsData.put(url)
            print('URLs collected so far', urlsData.qsize())
        print('total tasks:', urlsData.qsize())
        # consume the queue, one crawler thread per URL
        while True:
            uu = urlsData.get()
            i = i + 1
            try:
                threadmax.acquire()
                Counter(lock, "thread-" + str(i), requests, uu).start()
            except BaseException as e:
                print(e)
                # put the URL back so it can be retried later
                urlsData.put(uu)
                # compare against the exception message, not the object
                if str(e) == "can't start new thread":
                    print('failed to start a new thread')
                    time.sleep(180)
                else:
                    print(uu, 'error')
    except BaseException as e:
        print('url error')
        print(e)
# while True:
#     # enqueue some items
#     q1.put('a')
#     q1.put('b')
#
#     # show the queue's contents
#     print(q1.queue)
#
#     # dequeue items: first in, first out
#     print(q1.get())
#     print(q1.queue)
#     print(q1.qsize())
#     print(q1.get())
# When the queue is empty, get() finds no data and blocks the caller.
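# A minimal sketch (commented out, not part of the crawl flow) of how to
# avoid that indefinite block: pass a timeout so an empty queue raises
# queue.Empty instead of hanging the consumer forever.
# try:
#     task = urlsData.get(timeout=5)
# except queue.Empty:
#     print('no task arrived within 5 seconds')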