运用多个模块写了一个爬取直线网AE模板的爬虫:主要用requests和bs4提取数据,用base64将百度云地址解码,用threading多线程和queue实现生产者与消费者关系(异步)访问抓取,并用pymysql将数据导入到MySQL数据库中。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2018-02-25 00:21:54
# @Author : bb (317716008@qq.com)
# @Word : python can change world!
# @Version : python3.6
import requests
from bs4 import BeautifulSoup
import base64
from queue import Queue
from threading import Thread
from time import sleep
import sys
import pymysql
# Browser-style request metadata shared by every fetch in this script; it is
# the value passed as ``header`` to gethtml() and geturl().
# NOTE(review): the Accept, Cookie and User-Agent values contain a literal
# "…", i.e. they look truncated when they were copied - confirm the full
# values.  Host names www.newcger.com although the script requests
# www.linecg.com URLs - TODO confirm which site these were captured from.
header = {
    "Accept": "text/html,application/xhtml+xm…plication/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Connection": "keep-alive",
    "Cookie": "Hm_lvt_e6d53931d3c448a0325f4c2…a0325f4c254adbf071=1519469796",
    "Host": "www.newcger.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; …) Gecko/20100101 Firefox/58.0",
}
# Shared work queue: gethtml() puts detail-page URLs on it, work() takes them off.
q = Queue()
def base64encode(str2):
    """Base64-encode a text string, print the result and return it.

    Args:
        str2: Text to encode; it is converted to UTF-8 bytes first.

    Returns:
        The base64 representation as a ``str``.  Earlier this function only
        printed the value; returning it as well mirrors ``basedecode`` and
        lets callers actually use the result.
    """
    encoded = base64.b64encode(str2.encode('utf-8')).decode('utf-8')
    print(encoded)
    return encoded
#base64encode('abcr34r344r')
def basedecode(str2):
    """Decode a base64 string whose ``=`` padding may be missing.

    Args:
        str2: Base64 text, with or without its trailing padding.

    Returns:
        The decoded value as UTF-8 text.  It is also printed.

    Raises:
        binascii.Error: the input is not decodable base64.
        UnicodeDecodeError: the decoded bytes are not valid UTF-8.
        (Both are ``ValueError`` subclasses.)
    """
    # Two pad characters are appended unconditionally: that is enough for any
    # valid length, and the decoder ignores padding beyond what it needs.
    padded = (str2 + '==').encode('utf-8')
    url = base64.b64decode(padded).decode('utf-8')
    print(url)
    return url
#basedecode('aHR0cHM6Ly9wYW4uYmFpZHUuY29tL3MvMWk2WVBoWHo')
def geturl(url2, header):
    """Scrape one template detail page and insert a row for it into MySQL.

    The page is parsed for three things: its ``<title>`` text (the template
    name), the text of every link whose href has ``search`` as its first path
    segment (the tags), and the base64 ``data-url`` attribute of its
    ``<button>`` elements, decoded with ``basedecode`` (the cloud-drive
    address).  One row is then written to table ``ae``.

    Args:
        url2: URL of the detail page to scrape.
        header: Mapping handed to ``requests.get`` as ``params``.

    Returns:
        None.  Errors raised by the insert are printed, not re-raised.  If no
        ``data-url`` could be decoded, nothing is inserted.
    """
    sleep(1)  # fixed one-second pause before every request

    # NOTE(review): ``header`` is sent as query-string parameters, not as HTTP
    # headers; ``headers=header`` was probably intended.  Left as-is because
    # the module-level dict that callers pass holds values containing "…",
    # which cannot be encoded as HTTP header text - fix the dict first.
    res = requests.get(url2, params=header).content
    soup = BeautifulSoup(res, "html.parser")

    name = soup.title.string
    print(name)  # template name

    # Tags: links shaped like "/search/...".
    tag_list = set()
    for link in soup.find_all('a'):
        parts = link.attrs.get('href', '').split('/')
        if len(parts) > 1 and parts[1] == 'search':
            tag_list.add(link.string)

    # Cloud-drive address: keep the last data-url that decodes cleanly.
    # Buttons without the attribute, or with an undecodable value, are
    # skipped instead of raising or leaving the raw encoded text behind.
    zz = None
    for button in soup.find_all('button'):
        encoded = button.attrs.get('data-url')
        if encoded is None:
            continue
        try:
            zz = basedecode(encoded)
        except (ValueError, TypeError):
            continue

    print(tag_list)  # tags

    if zz is None:
        print('no decodable data-url found on ' + str(url2))
        return

    sql = "insert into ae(ae_name,ae_html,ae_zz,ae_tag,web_name) values(%s,%s,%s,%s,%s)"
    # The connection is opened only once there is something to store, and is
    # always closed, so a failed fetch or parse can no longer leak it.
    conn = pymysql.connect(host='127.0.0.1', port=3306, db='dbname', user='yourname', passwd='yourpassword', charset='utf8')
    try:
        cur = conn.cursor()
        try:
            # The tag set is stored as its str() form, e.g. "{'a', 'b'}".
            cur.execute(sql, (str(name), str(url2), str(zz), str(tag_list), '直线网'))
            conn.commit()
        except Exception as e:
            print(e)
        finally:
            cur.close()
    finally:
        conn.close()
def gethtml(page, header):
    """Queue the detail-page URL of every template linked from one list page.

    Args:
        page: Page number substituted into the ``ae_list_<page>_0_0.html`` URL.
        header: Mapping handed to ``requests.get`` as ``params``.

    Returns:
        None.  Each URL found is printed and put on the module-level queue
        ``q``.
    """
    url = 'http://www.linecg.com/ae_list_' + str(page) + '_0_0.html#c1'
    # NOTE(review): ``header`` goes out as query-string parameters here, not
    # as HTTP headers - ``headers=`` was probably intended; confirm.
    res = requests.get(url, params=header).content
    soup = BeautifulSoup(res, "html.parser")
    for heading in soup.find_all('h2'):
        anchor = heading.a
        href = anchor.attrs.get('href') if anchor is not None else None
        if href is None:
            # An <h2> without a link used to raise and drop every remaining
            # entry on the page; skip just that heading instead.
            continue
        html2 = 'http://www.linecg.com' + str(href)
        print(html2)
        q.put(html2)
def work():
    """Worker loop: take URLs off the shared queue ``q`` and scrape each one.

    Returns when the queue is empty.  A page that fails to scrape is reported
    with ``print`` and skipped, so one bad page does not end the worker.
    """
    from queue import Empty  # local import: the file only imports Queue

    while True:
        # Checking qsize() and then calling a blocking get() races with the
        # other workers: the queue can be drained in between, leaving this
        # thread blocked forever.  A non-blocking get has no such window.
        try:
            html2 = q.get_nowait()
        except Empty:
            return
        try:
            geturl(html2, header)
        except Exception as e:
            print('failed to scrape ' + str(html2) + ': ' + str(e))
def main(start_page=601, end_page=670, workers=10):
    """Fill the queue from a range of list pages, then start the workers.

    Args:
        start_page: First list-page number to read (inclusive).
        end_page: List-page number to stop at (exclusive).
        workers: Number of threads started to run ``work``.

    The defaults reproduce the previously hard-coded values.
    """
    for page in range(start_page, end_page):
        try:
            gethtml(page, header)
        except Exception as e:
            # Only ordinary errors are skipped; the former bare ``except``
            # also swallowed KeyboardInterrupt and SystemExit.
            print('failed to read list page ' + str(page) + ': ' + str(e))
    for _ in range(workers):
        sleep(0.1)  # stagger thread start-up
        t = Thread(target=work)
        t.start()
# Run the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
网友评论