"色情产业是科技发展进步的源泉。"
闲的没事干翻了翻百度云盘的东西,发现个这个

Eh, when did I ever save this thing? Off to Baidu it went.

Well, apparently I've forgotten what it was, but that doesn't affect today's topic.
Poking around the site, it's running WordPress 4.8.2, so exploiting it looks like a lost cause.

Browsing around some more, I found a sitemap
https://www.mygalgame.com/sitemap.html

Looks like I could write a crawler and happily grab every resource on it.
Think of it, do it, as always.
First, a look at the page structure: the <a> tags sit inside <li> tags, the <li> tags inside a <ul>, and the <ul> inside #content, so all we need is to grab the element with id content and then pull each <a> tag's href and string (the demo takes just the href).
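Before the actual script, here's a minimal sketch of that selector logic run against a made-up fragment shaped like the layout just described (the HTML below is illustrative, not the site's real markup):

#coding:utf-8
from bs4 import BeautifulSoup

# hypothetical fragment mirroring the #content > ul > li > a layout
html = """
<div id="content">
  <ul>
    <li><a href="https://www.mygalgame.com/post-1.html">Game 1</a></li>
    <li><a href="https://www.mygalgame.com/post-2.html">Game 2</a></li>
  </ul>
</div>
"""
soup = BeautifulSoup(html, "html5lib")
for tag in soup.find(id='content').find_all('a'):
    print tag['href']  # prints the two made-up links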

sitemap.py
#coding:utf-8
from bs4 import BeautifulSoup
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # force UTF-8 so printing to CMD doesn't error out
url = "https://www.mygalgame.com/sitemap.html"  # target page
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)  # fetch the page
soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
tags_list = soup.find(id='content').find_all('a')  # every <a> under #content
fo = open('1.txt', 'ab')  # open the output file once, not per link
for tag in tags_list:
    m_url = tag['href']
    print m_url
    # write the link as UTF-8 bytes, otherwise the output turns into mojibake
    fo.write(m_url.encode('UTF-8') + '\n')
fo.close()  # close the file
After running it, every link on the page ends up saved to 1.txt in the current directory.
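The analysis above mentioned grabbing both the href and the string; if you also want the anchor text (presumably the game title), a small variant of the same loop would do it:

for tag in tags_list:
    m_url = tag['href']
    title = tag.get_text()  # the anchor text; presumably the game title
    print m_url, title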

With the links in hand, the next step is to analyze one of them.

Since the download link lives on a button tag, the soup lookup can be written like this:

Grabbing the buttons:
def getinf1(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    url_list = soup.find(id='zan-bodyer').find_all('button')  # the download <button> tags
    return str(url_list)  # stringify so the regex can run over it
Once that's fetched, the Baidu Cloud link inside still needs to be pulled out; I went with a regex here.
def getbdy(html):
    # regex out the Baidu Cloud links hiding in the button markup
    return re.findall("=(http://pan\.baidu\.com.*?)'", html)
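One thing worth flagging: findall returns a list, not a string, which is why the main loop later joins it before writing. Judging from the pattern, the button presumably embeds the link right after an = sign and terminates it with a single quote; here's a quick check against a made-up string in the shape the pattern expects (not the site's real markup):

sample = "<button onclick=\"location.href=http://pan.baidu.com/s/1abcDEF'\">Download</button>"
print getbdy(sample)  # -> ['http://pan.baidu.com/s/1abcDEF']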
Next up, handling the note and the link password.
The password:
def getinf2(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    # the first red <span> holds the intro plus the password
    pwd_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[0].text
    return pwd_list.encode('UTF-8')
The note:
def getinf3(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    try:  # some pages have no second red span, so index 1 raises IndexError
        pwd2_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[1].text
        return pwd2_list.encode('UTF-8')
    except IndexError:
        f = open('error.txt', 'ab')  # log the offending URL for manual follow-up
        f.write(url + '\n')
        f.close()
        return ''
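Putting the selectors together, each post body presumably carries two red spans: the first with the intro plus password (what getinf2 grabs), the second with a note (what getinf3 grabs); pages missing the second one are exactly what trips the IndexError. A toy reproduction with made-up markup:

from bs4 import BeautifulSoup

# made-up post body in the shape the selectors expect
html = '''
<div id="zan-bodyer">
  <span style="color: #ff0000;">Download password: abcd</span>
  <span style="color: #ff0000;">Note: unpack with the latest WinRAR</span>
  <button>Download</button>
</div>
'''
soup = BeautifulSoup(html, "html5lib")
spans = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})
print spans[0].text  # -> Download password: abcd
print spans[1].text  # missing on some pages, hence the try/except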
Then the main loop:
j = 0
#read the saved link list back in
with open('1.txt', 'r') as f:
    for url in f.readlines():  # loop over the links
        url = url.strip()  # drop the trailing newline before building the request
        infor = getinf1(url)  # button HTML as a string
        infor2 = getinf2(url)  # password + intro
        bdyinfor = getbdy(infor)  # list of Baidu Cloud links
        strinfor3 = getinf3(url)  # the note, if any
        fo = open('2.txt', 'ab+')  # open the result file
        # everything here is already UTF-8 bytes, so no mojibake on write
        fo.write('\r' + strinfor3 + "--------URL:" + ','.join(bdyinfor) + "--------PWD:" + infor2 + '\r\n')
        fo.close()  # close the file
        print j  # progress counter
        j += 1
        time.sleep(2)  # throttle the requests; see below
The reason for the sleep at the end is simple: this site got knocked flat three times while I was scraping it...
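A flat two-second sleep does the job, but if you'd rather not flatten the site again, one option is a retry wrapper with exponential backoff. This is just a sketch reusing the module-level headers from the full script below, not something the original script does:

def fetch(url, retries=3):
    # back off harder after each failure instead of hammering the server
    for attempt in range(retries):
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=10)
        except urllib2.URLError:
            time.sleep(2 ** attempt)  # wait 1s, 2s, 4s between attempts
    return None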
The full code:
pa.py
#coding:utf-8
import re
from bs4 import BeautifulSoup
import urllib2
import sys
import time
reload(sys)
sys.setdefaultencoding('utf-8')  # force UTF-8 so printing to CMD doesn't error out

# shared request headers, defined once instead of per function
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks

def getinf1(url):
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    url_list = soup.find(id='zan-bodyer').find_all('button')  # the download <button> tags
    return str(url_list)  # stringify so the regex can run over it

def getbdy(html):
    # regex out the Baidu Cloud links hiding in the button markup; findall returns a list
    return re.findall("=(http://pan\.baidu\.com.*?)'", html)

def getinf2(url):
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    # the first red <span> holds the intro plus the password
    pwd_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[0].text
    return pwd_list.encode('UTF-8')

def getinf3(url):
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    try:  # some pages have no second red span, so index 1 raises IndexError
        pwd2_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[1].text
        return pwd2_list.encode('UTF-8')
    except IndexError:
        f = open('error.txt', 'ab')  # log the offending URL for manual follow-up
        f.write(url + '\n')
        f.close()
        return ''

j = 0
#read the saved link list back in
with open('1.txt', 'r') as f:
    for url in f.readlines():  # loop over the links
        url = url.strip()  # drop the trailing newline before building the request
        infor = getinf1(url)  # button HTML as a string
        infor2 = getinf2(url)  # password + intro
        bdyinfor = getbdy(infor)  # list of Baidu Cloud links
        strinfor3 = getinf3(url)  # the note, if any
        fo = open('2.txt', 'ab+')  # open the result file
        # everything here is already UTF-8 bytes, so no mojibake on write
        fo.write('\r' + strinfor3 + "--------URL:" + ','.join(bdyinfor) + "--------PWD:" + infor2 + '\r\n')
        fo.close()  # close the file
        print j  # progress counter
        j += 1
        time.sleep(2)  # throttle the requests; the site fell over without this
1.txt must sit in the same directory as this script.
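Run order, for the record: sitemap.py first to generate 1.txt, then pa.py; the results land in 2.txt, and any page missing the second red span gets its URL logged to error.txt.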