A little Python crawler for scraping galgame goodies [MYGALGAME]


Author: 轻松学Python111 | Published 2018-07-30 22:49

"色情产业是科技发展进步的源泉。"

With nothing better to do, I was digging through my Baidu Cloud drive and stumbled on this.

Huh... when did I ever save something like this? Straight to Baidu it went.


Okay, so I've clearly forgotten what it was, but that doesn't change today's topic.

Poking around the site: it's running WordPress 4.8.2, so exploiting it directly looks like a non-starter.

Looking around a bit more, I found a sitemap:

https://www.mygalgame.com/sitemap.html

Looks like a crawler could happily hoover up every resource on it.

Think it, do it; that's always been my style.

First, the structure of this page: the a tags sit inside li elements, the li elements inside a ul, and the ul inside the element with id "content". So all we need is to grab the element whose id is content and then take each a tag's href and text (the demo below takes the href).
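Before the full script, here is a minimal sketch of that extraction against a made-up fragment (the markup and URL below are hypothetical, just mirroring the structure described above):

#coding:utf-8
from bs4 import BeautifulSoup

# hypothetical fragment shaped like the sitemap page: a tags in li, li in ul, ul in #content
sample = """
<div id="content">
  <ul>
    <li><a href="https://www.mygalgame.com/example-post.html">example post</a></li>
  </ul>
</div>
"""
soup = BeautifulSoup(sample, "html5lib")
for tag in soup.find(id='content').find_all('a'):
    print tag['href'], tag.string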

sitemap.py

#coding:utf-8
from bs4 import BeautifulSoup
import urllib2
import sys

reload(sys)
sys.setdefaultencoding('utf-8')  # force UTF-8 so printing to the CMD console does not throw encoding errors

url = "https://www.mygalgame.com/sitemap.html"  # target URL
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers = {"User-Agent": user_agent}  # send a browser User-Agent to get past basic anti-scraping checks
request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)  # fetch the page
#print response.read()

soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
tags_list = list(soup.find(id='content').find_all('a'))  # every a tag under #content
#print tags_list

for tag in tags_list:
    #print tag
    m_url = tag['href']
    print m_url
    fo = open('1.txt', "ab+")  # open the output file in binary append mode
    # write the link; encode it as UTF-8 first or the output ends up garbled
    fo.write(('\r' + m_url + '\r\n').encode('UTF-8'))
    fo.close()  # close the file

Running it saves every link on that page to 1.txt in the current directory.

With the links in hand, the next step is to look at one of them.

Since the download link lives on a button element, the soup lookup can be written like this.

Getting the button:

def getinf1(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser User-Agent to get past basic anti-scraping checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    url_list = soup.find(id='zan-bodyer').find_all('button')  # grab the button tags inside the post body
    return str(url_list)  # return the button markup as a string for the regex step

Once that markup is fetched, the Baidu Cloud link inside it still has to be pulled out; I went with a regex here.

def getbdy(html):
    # regex-match the Baidu Cloud links embedded in the button markup
    return re.findall("=(http://pan\.baidu\.com.*?)'", html)
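To see what that pattern actually expects, here is a tiny check against a made-up string (the string is a stand-in for whatever attribute the real buttons carry, not copied from the site; the regex wants an = immediately before the link and a ' right after it):

sample = "window.location=http://pan.baidu.com/s/1example'"
print getbdy(sample)  # hypothetical input; prints ['http://pan.baidu.com/s/1example']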

Next, the note text and the share password.

Password:

def getinf2(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser User-Agent to get past basic anti-scraping checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    # the first red span holds the password plus the intro text
    pwd_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[0].text
    return str(pwd_list).encode('UTF-8')

Note:

def getinf3(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser User-Agent to get past basic anti-scraping checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    try:  # the second red span (the note) is missing on some pages
        pwd2_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[1].text
        return pwd2_list.encode('UTF-8')
    except:
        f = open('error.txt', 'ab+')  # log the URL that failed so it can be retried later
        f.write(url + '\n')
        f.close()

Then the main part:

j = 0
# read the link list produced by sitemap.py
with open('1.txt', 'r') as f:
    for url in f.readlines():  # loop over the saved links
        url = url.strip()  # drop the \r\n that sitemap.py wrote around each link
        if not url:
            continue
        infor = getinf1(url)  # button markup for this page
        #print infor
        infor2 = getinf2(url)  # password + intro
        bdyinfor = ",".join(getbdy(str(infor)))  # re.findall returns a list, so join it into one string
        strinfor3 = str(getinf3(url))  # note text ('None' if the page has no second red span)
        fo = open('2.txt', "ab+")  # open the result file in binary append mode
        # encode as UTF-8 before writing or the output ends up garbled
        fo.write(('\r' + strinfor3 + "--------URL:" + bdyinfor + "--------PWD:" + infor2 + '\r\n').encode('UTF-8'))
        fo.close()  # close the file
        print j
        j += 1
        time.sleep(2)  # pause between pages

The reason for the sleep at the end is simple: this site went down three times on me while I was scraping it...
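Since the site is that fragile, a slightly more defensive fetch doesn't hurt either. A minimal sketch (not part of the original script; the fetch helper and its parameters are my own naming) that wraps urllib2.urlopen with a timeout and a couple of retries:

import time
import urllib2

def fetch(url, headers, retries=3, timeout=10):
    # retry a few times with a growing pause so one hiccup does not kill the whole run
    for attempt in range(retries):
        try:
            request = urllib2.Request(url, headers=headers)
            return urllib2.urlopen(request, timeout=timeout)
        except IOError:  # URLError and socket errors both derive from IOError in Python 2
            time.sleep(2 * (attempt + 1))
    return None  # caller has to deal with a page that never came back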

The complete code:

pa.py

#coding:utf-8
import re
from bs4 import BeautifulSoup
import urllib2
import sys
import time

reload(sys)
sys.setdefaultencoding('utf8')  # force UTF-8 so printing to the CMD console does not throw encoding errors

def getinf1(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}  # browser User-Agent to get past basic anti-scraping checks
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)  # fetch the page
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    url_list = soup.find(id='zan-bodyer').find_all('button')  # grab the download buttons
    return str(url_list)

def getbdy(html):
    # regex-match the Baidu Cloud links embedded in the button markup
    return re.findall("=(http://pan\.baidu\.com.*?)'", html)

def getinf2(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    # the first red span holds the password plus the intro text
    pwd_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[0].text
    return str(pwd_list).encode('UTF-8')

def getinf3(url):
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response, "html5lib", from_encoding='utf-8')
    try:  # the second red span (the note) is missing on some pages
        pwd2_list = soup.find_all(name='span', attrs={'style': 'color: #ff0000;'})[1].text
        return pwd2_list.encode('UTF-8')
    except:
        f = open('error.txt', 'ab+')  # log the URL that failed
        f.write(url + '\n')
        f.close()

j = 0
# read the link list produced by sitemap.py
with open('1.txt', 'r') as f:
    for url in f.readlines():  # loop over the saved links
        url = url.strip()  # drop the \r\n that sitemap.py wrote around each link
        if not url:
            continue
        infor = getinf1(url)  # button markup for this page
        infor2 = getinf2(url)  # password + intro
        bdyinfor = ",".join(getbdy(str(infor)))  # re.findall returns a list, so join it into one string
        strinfor3 = str(getinf3(url))  # note text ('None' if the page has no second red span)
        fo = open('2.txt', "ab+")  # open the result file in binary append mode
        # encode as UTF-8 before writing or the output ends up garbled
        fo.write(('\r' + strinfor3 + "--------URL:" + bdyinfor + "--------PWD:" + infor2 + '\r\n').encode('UTF-8'))
        fo.close()
        print j
        j += 1
        time.sleep(2)  # pause between pages; the site has gone down under load before

1.txt has to sit in the same directory as this script.
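One last note: all of this is Python 2 (urllib2, print statements, the reload(sys) hack). If you'd rather run it on Python 3, the fetching step maps onto urllib.request; a minimal sketch of just that part, with the parsing left unchanged:

# Python 3 sketch of the same fetch; only the request API changes, the BeautifulSoup logic stays the same
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

url = "https://www.mygalgame.com/sitemap.html"
headers = {"User-Agent": "Mozilla/5.0"}  # any browser-like UA string works here
response = urlopen(Request(url, headers=headers))
soup = BeautifulSoup(response, "html5lib")
for tag in soup.find(id='content').find_all('a'):
    print(tag['href'])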
