"色情产业是科技发展进步的源泉。"
闲的没事干翻了翻百度云盘的东西,发现个这个

Eh, when did I ever save this thing? Off to Baidu it went.

Well, apparently I've forgotten what it was, but that doesn't affect today's topic.
Poking around the site, it's running WordPress 4.8.2, so exploiting it looks like a lost cause.

Browsing around some more, I found a sitemap
https://www.mygalgame.com/sitemap.html

Looks like I could write a crawler and happily grab every resource on it.
Think of it, do it, as always.
First, a look at the page structure: the <a> tags sit inside <li> tags, the <li> tags inside a <ul>, and the <ul> inside #content, so all we need is to grab the element with id content and then pull each <a> tag's href and string (the demo takes just the href).
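Before the actual script, here's a minimal sketch of that selector logic run against a made-up fragment shaped like the layout just described (the HTML below is illustrative, not the site's real markup):

#coding:utf-8
from bs4 import BeautifulSoup

# hypothetical fragment mirroring the #content > ul > li > a layout
html = """
<div id="content">
  <ul>
    <li><a href="https://www.mygalgame.com/post-1.html">Game 1</a></li>
    <li><a href="https://www.mygalgame.com/post-2.html">Game 2</a></li>
  </ul>
</div>
"""
soup = BeautifulSoup(html, "html5lib")
for tag in soup.find(id='content').find_all('a'):
    print tag['href']  # prints the two made-up links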

sitemap.py
#coding:utf-8
from bs4 import BeautifulSoup
import urllib2
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # force UTF-8 so printing to CMD doesn't error out
url = "https://www.mygalgame.com/sitemap.html"  # target page
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)  # fetch the page
soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
tags_list = soup.find(id='content').find_all('a')  # every <a> under #content
fo = open('1.txt', 'ab')  # open the output file once, not per link
for tag in tags_list:
    m_url = tag['href']
    print m_url
    # write the link as UTF-8 bytes, otherwise the output turns into mojibake
    fo.write(m_url.encode('UTF-8') + '\n')
fo.close()  # close the file
After running it, every link on the page ends up saved to 1.txt in the current directory.
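The analysis above mentioned grabbing both the href and the string; if you also want the anchor text (presumably the game title), a small variant of the same loop would do it:

for tag in tags_list:
    m_url = tag['href']
    title = tag.get_text()  # the anchor text; presumably the game title
    print m_url, title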

With the links in hand, the next step is to analyze one of them.

Since the download link lives on a button tag, the soup lookup can be written like this:

Grabbing the buttons:
def getinf1(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    url_list = soup.find(id='zan-bodyer').find_all('button')  # the download <button> tags
    return str(url_list)  # stringify so the regex can run over it
Once that's fetched, the Baidu Cloud link inside still needs to be pulled out; I went with a regex here.
def getbdy(html):
    # regex out the Baidu Cloud links hiding in the button markup
    return re.findall("=(http://pan\.baidu\.com.*?)'", html)
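One thing worth flagging: findall returns a list, not a string, which is why the main loop later joins it before writing. Judging from the pattern, the button presumably embeds the link right after an = sign and terminates it with a single quote; here's a quick check against a made-up string in the shape the pattern expects (not the site's real markup):

sample = "<button onclick=\"location.href=http://pan.baidu.com/s/1abcDEF'\">Download</button>"
print getbdy(sample)  # -> ['http://pan.baidu.com/s/1abcDEF']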
Next up, handling the note and the link password.
The password:
def getinf2(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    # the first red <span> holds the intro plus the password
    pwd_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[0].text
    return pwd_list.encode('UTF-8')
The note:
def getinf3(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    try:  # some pages have no second red span, so index 1 raises IndexError
        pwd2_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[1].text
        return pwd2_list.encode('UTF-8')
    except IndexError:
        f = open('error.txt', 'ab')  # log the offending URL for manual follow-up
        f.write(url + '\n')
        f.close()
        return ''
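Putting the selectors together, each post body presumably carries two red spans: the first with the intro plus password (what getinf2 grabs), the second with a note (what getinf3 grabs); pages missing the second one are exactly what trips the IndexError. A toy reproduction with made-up markup:

from bs4 import BeautifulSoup

# made-up post body in the shape the selectors expect
html = '''
<div id="zan-bodyer">
  <span style="color: #ff0000;">Download password: abcd</span>
  <span style="color: #ff0000;">Note: unpack with the latest WinRAR</span>
  <button>Download</button>
</div>
'''
soup = BeautifulSoup(html, "html5lib")
spans = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})
print spans[0].text  # -> Download password: abcd
print spans[1].text  # missing on some pages, hence the try/except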
Then the main loop:
j = 0
#read the saved link list back in
with open('1.txt', 'r') as f:
    for url in f.readlines():  # loop over the links
        url = url.strip()  # drop the trailing newline before building the request
        infor = getinf1(url)  # button HTML as a string
        infor2 = getinf2(url)  # password + intro
        bdyinfor = getbdy(infor)  # list of Baidu Cloud links
        strinfor3 = getinf3(url)  # the note, if any
        fo = open('2.txt', 'ab+')  # open the result file
        # everything here is already UTF-8 bytes, so no mojibake on write
        fo.write('\r' + strinfor3 + "--------URL:" + ','.join(bdyinfor) + "--------PWD:" + infor2 + '\r\n')
        fo.close()  # close the file
        print j  # progress counter
        j += 1
        time.sleep(2)  # throttle the requests; see below
The reason for the sleep at the end is simple: this site got knocked flat three times while I was scraping it...
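A flat two-second sleep does the job, but if you'd rather not flatten the site again, one option is a retry wrapper with exponential backoff. This is just a sketch reusing the module-level headers from the full script below, not something the original script does:

def fetch(url, retries=3):
    # back off harder after each failure instead of hammering the server
    for attempt in range(retries):
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=10)
        except urllib2.URLError:
            time.sleep(2 ** attempt)  # wait 1s, 2s, 4s between attempts
    return None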
The full code:
pa.py
#coding:utf-8
import re
from bs4 import BeautifulSoup
import urllib2
import sys
import time
reload(sys)
sys.setdefaultencoding('utf-8')  # force UTF-8 so printing to CMD doesn't error out

# shared request headers, defined once instead of per function
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}  # browser UA header to dodge basic anti-crawler checks

def getinf1(url):
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    url_list = soup.find(id='zan-bodyer').find_all('button')  # the download <button> tags
    return str(url_list)  # stringify so the regex can run over it

def getbdy(html):
    # regex out the Baidu Cloud links hiding in the button markup; findall returns a list
    return re.findall("=(http://pan\.baidu\.com.*?)'", html)

def getinf2(url):
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    # the first red <span> holds the intro plus the password
    pwd_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[0].text
    return pwd_list.encode('UTF-8')

def getinf3(url):
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    try:  # some pages have no second red span, so index 1 raises IndexError
        pwd2_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[1].text
        return pwd2_list.encode('UTF-8')
    except IndexError:
        f = open('error.txt', 'ab')  # log the offending URL for manual follow-up
        f.write(url + '\n')
        f.close()
        return ''

j = 0
#read the saved link list back in
with open('1.txt', 'r') as f:
    for url in f.readlines():  # loop over the links
        url = url.strip()  # drop the trailing newline before building the request
        infor = getinf1(url)  # button HTML as a string
        infor2 = getinf2(url)  # password + intro
        bdyinfor = getbdy(infor)  # list of Baidu Cloud links
        strinfor3 = getinf3(url)  # the note, if any
        fo = open('2.txt', 'ab+')  # open the result file
        # everything here is already UTF-8 bytes, so no mojibake on write
        fo.write('\r' + strinfor3 + "--------URL:" + ','.join(bdyinfor) + "--------PWD:" + infor2 + '\r\n')
        fo.close()  # close the file
        print j  # progress counter
        j += 1
        time.sleep(2)  # throttle the requests; the site fell over without this
1.txt must sit in the same directory as this script.
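Run order, for the record: sitemap.py first to generate 1.txt, then pa.py; the results land in 2.txt, and any page missing the second red span gets its URL logged to error.txt.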