import requests, random, datetime, re, os, time, base64, pymssql
from lxml import etree
from fontTools.ttLib import TTFont
from io import BytesIO
if not os.path.exists("allUrl"):
    os.makedirs("allUrl")
def readfile(path):  # read a file into a list of stripped lines
    with open(path, encoding='utf-8', errors='ignore') as f:
        content = [line.strip() for line in f]
    return content
def savefile(savepath, content):  # append one line to a log file
    with open(savepath, 'a+', encoding='utf8', newline="", errors='ignore') as fp:
        fp.write(content + "\r\n")
def make_font_file(base64_string: str):
    # Decode the base64 font payload and keep a copy on disk for inspection.
    bin_data = base64.decodebytes(base64_string.encode())
    with open('text.otf', 'wb') as f:
        f.write(bin_data)
    return bin_data
def get_num(string, html):
    # Translate obfuscated characters into real digits via the embedded font.
    c = base_64(html)
    ret_list = []
    for char in string:
        decode_num = ord(char)
        glyph = c[decode_num]       # e.g. 'glyph00008'
        num = int(glyph[-2:]) - 1   # trailing glyph index minus one = real digit
        ret_list.append(num)
    return ret_list
def base_64(html):
    # Pull the base64-encoded TTF font out of the page and return its cmap,
    # which maps obfuscated code points to glyph names.
    pattern = re.compile(r"'data:application/font-ttf;charset=utf-8;base64,(.*?)'", re.I)
    base64_str = "".join(pattern.findall(html))
    bin_data = make_font_file(base64_str)
    font = TTFont(BytesIO(bin_data))
    font.saveXML("text.xml")  # dump the font tables for inspection
    c = font['cmap'].tables[0].cmap
    return c
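# A minimal illustration of the decode rule above, with hypothetical cmap
# entries (real code points and glyph names vary with each downloaded font):
def _demo_cmap_decode():
    c = {0x9f92: 'glyph00003', 0x9fa4: 'glyph00008'}   # hypothetical cmap
    for code_point, glyph in c.items():
        print(hex(code_point), '->', int(glyph[-2:]) - 1)  # 'glyph00008' -> 7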
def sqlInfo(data, table):  # insert one row into the database
    conn = pymssql.connect(host="172.30.100.148", user="", password="", database="LiangZB", charset='utf8')
    cur = conn.cursor()
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(table, keys, values)
    try:
        cur.execute(sql, tuple(data.values()))
        conn.commit()
    except Exception as ex:
        print("Error >>>>>", ex, "<<<<< Error")
        savefile("./allUrl/ErUrl.log", str(ex))
        conn.rollback()
    finally:
        cur.close()
        conn.close()
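# Usage sketch for sqlInfo (these values are made up; real rows are the
# AnJuKeItem dicts built in getHTMLText, whose Chinese keys double as the
# column names of the AnJuKe table):
#   sqlInfo({"城市": "苏州", "编号": "123456789"}, "AnJuKe")
# The '%s' placeholders keep the INSERT parameterized, so pymssql escapes the
# values instead of splicing them into the SQL string.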
def getUA():
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"
    ]
    headers = {
        "cookie": "als=0; wmda_uuid=c5544afe7d0808fc59edf66a50d80a4c; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; lps=https%3A%2F%2Fsu.zu.anjuke.com%2F%3Ffrom%3Dnavigation%7C; sessid=822172B8-0C58-5884-73B2-B62464C991EE; ajk_member_captcha=7ee0c936789ed4b80f136496306be275; __xsptplus8=8.23.1563787771.1563787774.2%232%7Csp0.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%230rqgP09ab8EK59L-eUjI82DRSDo0GvqE%23; _ga=GA1.2.1380953827.1562922627; _gid=GA1.2.1974631730.1563787776; 58tj_uuid=3fd91ac1-3f8d-42c9-9372-37c8f91fb5a0; new_uv=19; twe=2; ctid=26; aQQ_ajkguid=CA200195-1F12-F331-0163-0E0C21DC28DB; wmda_session_id_6289197098934=1563844517440-3ce63403-a0fd-d14f; xzfzqtoken=jfw1KwG52vwOvUY%2B7FXZkEz77v59E%2BTTnz6i1wVuH13eI6X6c03vXAKSyKTxfDxXin35brBb%2F%2FeSODvMgkQULA%3D%3D",
        "referer": "https://zhengzhou.anjuke.com/",
        "upgrade-insecure-requests": "1",
        "user-agent": random.choice(USER_AGENTS)}
    return headers
def response(url, num_retries=3):
    try:
        res = requests.get(url, headers=getUA(), allow_redirects=True, timeout=None)
        time.sleep(random.uniform(0.5, 0.8))
        res.raise_for_status()  # raises HTTPError for non-200 responses
        res.encoding = res.apparent_encoding
        if res.encoding in ("utf-8", "UTF-8", "Windows-1254"):
            html = res.content.decode("utf-8", "ignore")
        else:
            html = res.content.decode("GBK", "ignore")
    except requests.HTTPError:
        html = None
        if num_retries > 0:  # non-200 status: retry with one fewer attempt left
            return response(url, num_retries - 1)
    except requests.exceptions.ConnectionError:  # dead URL: do not retry
        return None
    return html
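# Note: response() can return None on both failure paths, so callers must
# treat a None html as "page unavailable" before handing it to parse();
# etree.HTML(None) would raise otherwise.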
def parse(html, regu):
    text = etree.HTML(html).xpath(regu)
    return text
def getHTMLText(url, ss):
    attempts = 0
    success = False
    while attempts < 3 and not success:
        try:
            html = response(url)
            AnJuKeItem = {}
            AnJuKeItem["城市"] = "".join(parse(html, '//div[@class="city-view"]/text()')).strip()
            AnJuKeItem["编号"] = url.split("?")[0].split("fangyuan/")[1]
            AnJuKeItem["租赁方式"] = "".join(parse(html, '//ul[@class="title-label cf"]/li[1]/text()')).strip()
            AnJuKeItem["项目名称"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[1]/text()')).strip()
            AnJuKeItem["区域"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[2]/text()')).strip()
            AnJuKeItem["板块"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[3]/text()')).strip()
            AnJuKeItem["地铁"] = "".join(parse(html, '//ul[@class="title-label cf"]/li[3]/text()')).strip()
            # Digits in the fields below are obfuscated; characters found in
            # type_list are decoded through the page font via get_num().
            AnJuKeItem["房型"] = ""
            horseType_str = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[2]/span[2]//text()')).strip()
            for line in horseType_str:
                if line not in type_list:
                    AnJuKeItem["房型"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["房型"] += str(line)
            AnJuKeItem["面积"] = ""
            area_str = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[3]/span[2]//text()')).replace("平方米", "").strip()
            for line in area_str:
                if line not in type_list:
                    AnJuKeItem["面积"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["面积"] += str(line)
            AnJuKeItem["租金"] = ""
            rent_str = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[1]/span[1]/em//text()')).strip()
            for line in rent_str:
                if line not in type_list:
                    AnJuKeItem["租金"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["租金"] += str(line)
            AnJuKeItem["装修"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[6]/span[2]//text()')).strip()
            AnJuKeItem["朝向"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[4]/span[2]//text()')).strip()
            AnJuKeItem["楼层"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[5]/span[2]//text()')).strip()
            AnJuKeItem["类型"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[7]/span[2]//text()')).strip()
            # Coordinates sit in an inline map-initialisation script.
            pattern = re.compile(r'var instance = new anjuke.Ajax.MapInPV\("map-canvas", {\n(.*?)</script>', re.I | re.S)
            baiduLine_list = "".join(pattern.findall(html)).strip().split("\n")
            AnJuKeItem["百度经度"] = baiduLine_list[0].split(":")[1].replace(",", "").strip()
            AnJuKeItem["百度纬度"] = baiduLine_list[1].split(":")[1].replace(",", "").strip()
            AnJuKeItem["发布时间"] = ""
            tellTime_list = "".join(parse(html, '//div[@class="right-info"]/b/text()')).strip()
            tellTime_str = re.sub(r"年|月|日", "/", tellTime_list)
            for line in tellTime_str:
                if line not in type_list:
                    AnJuKeItem["发布时间"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["发布时间"] += str(line)
            to_day = datetime.datetime.now()
            AnJuKeItem["采集时间"] = "{}/{}/{}".format(to_day.year, to_day.month, to_day.day)
            AnJuKeItem["品牌"] = "".join(parse(html, '//div[@class="broker-line"]/a/@title')).strip()
            AnJuKeItem["采集网站"] = "安居客"
            # Follow the community link (third-level page) for address and year built.
            addressUrl = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[1]/@href')).strip()
            res1 = response(addressUrl)
            try:
                AnJuKeItem["地址"] = "".join(parse(res1, '//div[@class="comm-title"]/h1/span/text()')).split("-")[1].strip()
                AnJuKeItem["年代"] = "".join(parse(res1, '//*[@id="basic-infos-box"]/dl/dd[5]/text()')).strip()
            except Exception as ex:
                cuoWu = "".join(parse(res1, '//*[@id="list-content"]/div[1]/span/text()')).strip()
                if "为您找到" in cuoWu:
                    AnJuKeItem["地址"] = ""
                    AnJuKeItem["年代"] = ""
                else:
                    print("Captcha hit on the third-level page! %s" % addressUrl)
                    print("Exception:", ex)
                    print(input("Solve the captcha, then press Enter: "))
            print("*" * 100)
            print("Second-level page URL:", url)
            print(ss, "\t", AnJuKeItem["城市"], AnJuKeItem["编号"], AnJuKeItem["项目名称"], AnJuKeItem["房型"], AnJuKeItem["装修"], AnJuKeItem["地址"], AnJuKeItem["年代"])
            print("*" * 60)
            table = "AnJuKe"
            sqlInfo(AnJuKeItem, table)
            success = True
        except Exception as ex:
            attempts += 1
            if attempts == 3:
                print(ss, url + "\t" + "second-level URL failed; logged to anJKeUrl.log ...")
                print("Exception:", ex)
                savefile("./allUrl/anJKeUrl.log", url + "|#|" + str(ex) + " level-2")
                print("*" * 60)
                break
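# getHTMLText wraps the whole parse in a 3-attempt loop: any exception
# (missing node, captcha page, font-decode failure) triggers a re-fetch, and
# after the third failure the URL and error are appended to anJKeUrl.log.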
if __name__ == '__main__':
    print("I'm Working ...")
    print('*****\tCurrent time: {}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    st = datetime.datetime.now()
    # The ten glyph characters this font substitutes for digits.
    type_list = ['驋', '龒', '龤', '閏', '麣', '鸺', '龥', '齤', '餼', '鑶']
    city_dict = {"nb": "宁波", "nc": "南昌", "km": "昆明", "nn": "南宁", "gy": "贵阳", "nt": "南通", "su": "苏州", "zz": "郑州", "yz": "扬州", "wlmq": "乌鲁木齐"}
    ss = 0
    for key, value in city_dict.items():
        for i in range(1, 51):  # first 50 listing pages per city
            url = "https://{}.zu.anjuke.com/fangyuan/p{}/".format(key, i)
            attempts = 0
            success = False
            while attempts < 3 and not success:
                try:
                    html = response(url)
                    genUrl_list = parse(html, '//*[@id="list-content"]/div/div[1]/h3/a/@href')
                    print("{} page {} lists {} listing URLs!".format(value, i, len(genUrl_list)))
                    for startUrl in genUrl_list:
                        startUrl = startUrl.split("&")[0]
                        ss += 1
                        getHTMLText(startUrl, ss)
                    success = True
                except Exception as ex:
                    attempts += 1
                    if attempts == 3:
                        print(url + "\t" + "first-level URL failed; logged to anJKeUrl.log ...")
                        print("Exception:", ex)
                        savefile("./allUrl/anJKeUrl.log", url + "|#|" + str(ex) + " level-1")
                        print("*" * 100)
                        break
    print("Done!")
    print('Current time: {}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    et = datetime.datetime.now()
    print('[info] elapsed: %s' % (et - st))
# ================================================================
# Anjuke slide-captcha solver (58.com verifycode API)
# ================================================================
import requests, random, json, re, time, execjs, urllib3
from io import BytesIO
from PIL import Image
urllib3.disable_warnings()
class AJK_Slide_Captcha():
    def __init__(self):
        self.headers = {
            "Referer": "https://www.anjuke.com/captcha-verify/?callback=shield&from=antispam",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }

    def get_sessionId(self, captcha_url):
        # The sessionId is embedded as a hidden form field on the captcha page.
        resp = requests.get(captcha_url, headers=self.headers, verify=False, timeout=None)
        sessionId = re.search('name="sessionId".*?value="(.*?)"', resp.content.decode()).group(1)
        return sessionId
    def get_responseId_bgImgUrl(self, sessionId):
        resp = requests.get("https://verifycode.58.com/captcha/getV3", headers=self.headers, verify=False, timeout=None,
                            params={
                                "callback": "callback",
                                "showType": "embed",
                                "sessionId": sessionId,
                                "_": str(int(time.time() * 1000))
                            })
        # Strip the JSONP wrapper 'callback(...)' before parsing the JSON body.
        captchaData = json.loads(resp.text.replace("callback(", "").replace(")", ""))
        responseId = captchaData["data"]["responseId"]
        bgImgUrl = captchaData["data"]["bgImgUrl"]
        return (responseId, bgImgUrl)
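    # A slightly more defensive JSONP unwrap (a sketch; the replace() above
    # assumes the JSON payload itself never contains ')'):
    #   m = re.search(r'callback\((.*)\)\s*$', resp.text, re.S)
    #   captchaData = json.loads(m.group(1))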
    def get_image(self, bgImgUrl):
        resp = requests.get("https://verifycode.58.com" + bgImgUrl, headers=self.headers, verify=False, timeout=None)
        # resp.content is raw bytes; wrap it in a file-like object for PIL.
        f = BytesIO(resp.content)
        image = Image.open(f)
        return image
    def get_position(self, image):
        # Locate the slide notch in the puzzle background by brightness edges.
        image = image.resize((284, 160))   # match the widget's on-screen size
        image = image.convert('L')         # grayscale
        yuzhi = 150    # brightness threshold ("yuzhi" = threshold)
        yuzhi2 = 40    # minimum horizontal brightness jump
        ll = 10        # run length of pixels that must pass both tests
        for i in range(55, image.size[0] - 20):
            for j in range(0, image.size[1] - 20):
                flag = True
                for l in range(0, ll):
                    pixel = image.getpixel((i, j)) - image.getpixel((i + 1, j + l))
                    if pixel < yuzhi2:
                        flag = False
                for l in range(0, ll):
                    pixel = image.getpixel((i, j + l))
                    if pixel < yuzhi:
                        flag = False
                if flag:
                    return i - 7   # small correction for the notch border
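    # Quick offline check for the detector (the file name is hypothetical):
    #   img = Image.open('captcha_bg.png')
    #   print(AJK_Slide_Captcha().get_position(img))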
    def get_trace(self, xPos, traceTxtPath):
        # Pick a pre-recorded mouse trace whose travel distance matches xPos
        # within one pixel, then jitter a few digits so traces are not replayed verbatim.
        with open(traceTxtPath, 'r') as fp:
            lines = fp.readlines()
        allValueLineList = []
        for line in lines:
            if line.strip() == '':
                continue
            start = int(re.search(r'"(\d+)', line).group(1))
            end = int(re.search(r'(\d+),\d+,\d+\|"', line).group(1))
            if abs((end - start) - xPos) <= 1:
                allValueLineList.append((end - start, line.strip().strip('"')))
        lastXpos, trace = random.choice(allValueLineList)
        changeNumCnt = 0
        while changeNumCnt < 4:
            changeNumCnt += 1
            num = random.choice(range(0, 10))
            try:
                search = random.choice(re.findall(r'(\d+%d)\|' % num, trace))
                subSearch = str(int(search) + random.choice([1, -1]))
                trace = re.sub(search, subSearch, trace)  # apply the perturbation
            except Exception:
                changeNumCnt -= 1
        return (lastXpos, trace)
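    # Judging by the regexes above, each line of CaptchaTrace.txt is assumed
    # to hold one recorded drag as a quoted "x,y,t|x,y,t|..." string; start
    # and end x come from the first and last tuples, and their difference is
    # the distance the slider travelled.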
    def get_fpToken(self):
        res2 = requests.get("https://cdata.58.com/fpToken", headers=self.headers, timeout=None, verify=False,
                            params={
                                "callback": "callback",
                            })
        html2 = res2.content.decode("utf-8", "ignore")
        fpToken = html2.split('"token":"')[1].replace('"})', '').strip()
        return fpToken
    def get_jiami_data(self, responseId, fpToken, lastXpos, trace):
        # Delegate the answer encryption to the site's own JS ("jiami" = encrypt).
        jsCode = execjs.compile(open("./jiami.js", "r").read())
        jiami_data = jsCode.call("getSlideAnswer", responseId, fpToken, lastXpos, trace)
        return jiami_data

    def solve(self, jiami_data, responseId, sessionId):
        response = requests.get("https://verifycode.58.com/captcha/getV3", headers=self.headers, timeout=None, verify=False,
                                params={
                                    "data": jiami_data,
                                    "responseId": responseId,
                                    "sessionId": sessionId,
                                    "_": str(int(time.time() * 1000))
                                })
        return response.text
    def run(self):
        # step 1: pull the sessionId from the captcha page
        sessionId = self.get_sessionId('https://www.anjuke.com/captcha-verify/?callback=shield')
        print('step1: sessionId->', sessionId)
        # step 2: fetch responseId and bgImgUrl
        (responseId, bgImgUrl) = self.get_responseId_bgImgUrl(sessionId)
        print('step2: responseId->', responseId)
        # step 3: download the puzzle background image
        image = self.get_image(bgImgUrl)
        print('step3: image->', image)
        # step 4: calculate the notch position
        position = self.get_position(image)
        print('step4: position->', position)
        # step 5: pick and jitter a matching mouse trace
        (lastXpos, trace) = self.get_trace(position, traceTxtPath='CaptchaTrace.txt')
        print('step5: lastXpos->', lastXpos, "==", 'trace->', trace)
        # step 6: fetch the fingerprint token
        fpToken = self.get_fpToken()
        print('step6: fpToken->', fpToken)
        # step 7: encrypt the answer payload
        jiami_data = self.get_jiami_data(responseId, fpToken, lastXpos, trace)
        print('step7: jiami_data->', jiami_data)
        # step 8: submit the answer
        responseText = self.solve(jiami_data, responseId, sessionId)
        print('\nstep8: final response ->', responseText)

if __name__ == '__main__':
    AJK_Slide_Captcha().run()
# ================================================================
# shclearing.com disclosure PDF downloader
# ================================================================
from urllib.parse import quote
from lxml import etree
import requests, os
def pdf_Dict(monUrl, ss):
    headers = {
        "Cookie": "Hm_lvt_d885bd65f967ea9372fc7200bc83fa81=1568943378; Hm_lpvt_d885bd65f967ea9372fc7200bc83fa81=1568944562",
        "Host": "www.shclearing.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    }
    genUrl = "http://www.shclearing.com/xxpl/fxpl/"
    resp = requests.get(monUrl, headers=headers, timeout=10)
    htmp = resp.content.decode("utf-8", "ignore")
    titleUrl_list = etree.HTML(htmp).xpath('//ul[@class="list"]/li/a/@href')
    sss = 0
    for titleUrl in titleUrl_list:
        sss += 1
        titleUrl = titleUrl.split("./")[1]
        pdfUrl = genUrl + titleUrl
        res = requests.get(pdfUrl, headers=headers, timeout=None)
        html = res.content.decode("utf-8", "ignore")
        # Build the output path year/month/date from the article's date stamp.
        dataTime = "".join(etree.HTML(html).xpath('//*[@id="content"]/span/text()')).replace("日期:", "").strip()
        yearTime = dataTime.split("-")[0]
        monTime = "".join(dataTime.split("-")[:2]).strip()
        pdfurlPath = "./" + yearTime + "/" + monTime + "/" + dataTime + "/"
        if not os.path.exists(pdfurlPath):
            os.makedirs(pdfurlPath)
        # POST parameters are hidden in inline JS as two ';;'-separated lists.
        scriptStr = "".join(etree.HTML(html).xpath('//*[@id="content"]/div[@class="attachments"]//text()')).strip()
        fileNames_list = scriptStr.split("var fileNames = '")[1].split("';")[0].replace("./", "").strip().split(";;")
        descNames_list = scriptStr.split("var descNames = '")[1].split("';")[0].strip().split(";;")
        pdfDict = dict(zip(fileNames_list, descNames_list))
        for FileName, DownName in pdfDict.items():
            print("Page {} item {}! POST params:".format(ss, sss), FileName + '\t' + DownName)
            res1 = requests.post(
                "http://www.shclearing.com/wcm/shch/pages/client/download/download.jsp",
                data={
                    "FileName": FileName,
                    "DownName": quote(DownName)
                },
                headers=headers,
                timeout=None
            )
            pdf = res1.content  # raw bytes of the PDF response
            with open(pdfurlPath + DownName, 'wb') as f:  # binary write
                f.write(pdf)
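# Each detail page embeds its attachments in inline JavaScript as two
# ';;'-separated lists (fileNames and descNames); zipping them pairs the
# server-side file path with the human-readable name, which the download
# endpoint expects URL-quoted in the DownName field.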
if __name__ == '__main__':
    # First five listing pages of the disclosure section.
    monUrl_list = ["http://www.shclearing.com/xxpl/fxpl/index.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_1.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_2.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_3.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_4.html"]
    ss = 0
    for monUrl in monUrl_list:
        ss += 1
        pdf_Dict(monUrl, ss)