(Private) Anjuke slider-captcha code

Author: 朝畫夕拾 | Published 2019-10-08 09:50

import requests, random, datetime, re, os, time, base64, pymssql
from lxml import etree
from fontTools.ttLib import TTFont
from io import BytesIO

if not os.path.exists("allUrl"):
    os.makedirs("allUrl")

def readfile(path):  # read a file into a list of stripped lines
    content = [line.strip() for line in open(path, encoding='utf-8', errors='ignore').readlines()]
    return content

def savefile(savepath, content):  # append one line to a log file
    fp = open(savepath, 'a+', encoding='utf8', newline="", errors='ignore')
    fp.write(content + "\r\n")
    fp.flush()
    fp.close()

def make_font_file(base64_string: str):
    # Decode the base64 font; dump it to text.otf for inspection and return the raw bytes.
    bin_data = base64.decodebytes(base64_string.encode())
    with open('text.otf', 'wb') as f:
        f.write(bin_data)
    return bin_data

def get_num(string, html):
    # Translate each obfuscated character into its real digit via the embedded font's cmap.
    c = base_64(html)
    ret_list = []
    for char in string:
        decode_num = ord(char)
        num = c[decode_num]       # glyph name, e.g. 'glyph00003'
        num = int(num[-2:]) - 1   # last two digits of the name, offset by one
        ret_list.append(num)
    return ret_list

def base_64(html):
    # Extract the base64-encoded TTF from the page source and return its cmap
    # (codepoint -> glyph name) mapping.
    pattern = re.compile(r"'data:application/font-ttf;charset=utf-8;base64,(.*?)'", re.I)
    base64_str = "".join(pattern.findall(html))
    bin_data = make_font_file(base64_str)
    font = TTFont(BytesIO(bin_data))
    font.saveXML("text.xml")  # keep an XML dump of the font tables for debugging
    font = TTFont(BytesIO(make_font_file(base64_str)))
    uniList = font['cmap'].tables[0].ttFont.getGlyphOrder()
    c = font['cmap'].tables[0].ttFont.tables['cmap'].tables[0].cmap
    return c
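
# How the two helpers above fit together: the page renders digits with
# single-use glyphs, so a string such as "龒齤" is unreadable as plain text.
# base_64() rebuilds the embedded font's cmap (codepoint -> glyph name such as
# 'glyph00003'), and get_num() turns each character into int(name[-2:]) - 1,
# i.e. the real digit. A minimal sketch, assuming `html` is an already-fetched
# page source (characters and results are illustrative):
#
#     digits = get_num("龒齤", html)             # e.g. [2, 7]
#     plain = "".join(str(d) for d in digits)   # "27"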

def sqlInfo(data, table):  # insert one record into SQL Server
    conn = pymssql.connect(host="172.30.100.148", user="", password="", database="LiangZB", charset='utf8')
    cur = conn.cursor()
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    sql = "INSERT INTO {0} ({1}) VALUES ({2})".format(table, keys, values)
    try:
        cur.execute(sql, tuple(data.values()))
        conn.commit()
        # print('inserted OK!')
        cur.close()
        conn.close()
    except Exception as ex:
        print("error here >>>>>", ex, "<<<<< error here")
        savefile("./allUrl/ErUrl.log", str(ex))
        conn.rollback()
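
# sqlInfo() builds the INSERT column list straight from the dict keys, so the
# target table's columns must match them exactly. A hypothetical DDL sketch for
# the AnJuKe table used below (column names taken from the dict built in
# getHTMLText; the types are assumptions, not the author's actual schema):
#
#     CREATE TABLE AnJuKe (
#         城市 NVARCHAR(50), 编号 NVARCHAR(50), 租赁方式 NVARCHAR(50),
#         项目名称 NVARCHAR(100), 区域 NVARCHAR(50), 板块 NVARCHAR(50),
#         地铁 NVARCHAR(100), 房型 NVARCHAR(50), 面积 NVARCHAR(50),
#         租金 NVARCHAR(50), 装修 NVARCHAR(50), 朝向 NVARCHAR(50),
#         楼层 NVARCHAR(50), 类型 NVARCHAR(50), 百度经度 NVARCHAR(50),
#         百度纬度 NVARCHAR(50), 发布时间 NVARCHAR(50), 采集时间 NVARCHAR(50),
#         品牌 NVARCHAR(100), 采集网站 NVARCHAR(50), 地址 NVARCHAR(200),
#         年代 NVARCHAR(50)
#     );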

def getUA():
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"
    ]
    headers = {
        "cookie": "als=0; wmda_uuid=c5544afe7d0808fc59edf66a50d80a4c; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; lps=https%3A%2F%2Fsu.zu.anjuke.com%2F%3Ffrom%3Dnavigation%7C; sessid=822172B8-0C58-5884-73B2-B62464C991EE; ajk_member_captcha=7ee0c936789ed4b80f136496306be275; __xsptplus8=8.23.1563787771.1563787774.2%232%7Csp0.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%230rqgP09ab8EK59L-eUjI82DRSDo0GvqE%23; _ga=GA1.2.1380953827.1562922627; _gid=GA1.2.1974631730.1563787776; 58tj_uuid=3fd91ac1-3f8d-42c9-9372-37c8f91fb5a0; new_uv=19; twe=2; ctid=26; aQQ_ajkguid=CA200195-1F12-F331-0163-0E0C21DC28DB; wmda_session_id_6289197098934=1563844517440-3ce63403-a0fd-d14f; xzfzqtoken=jfw1KwG52vwOvUY%2B7FXZkEz77v59E%2BTTnz6i1wVuH13eI6X6c03vXAKSyKTxfDxXin35brBb%2F%2FeSODvMgkQULA%3D%3D",
        "referer": "https://zhengzhou.anjuke.com/",
        "upgrade-insecure-requests": "1",
        "user-agent": random.choice(USER_AGENTS)}
    return headers

def response(url, num_retries=3):
    try:
        res = requests.get(url, headers=getUA(), allow_redirects=True, timeout=None)
        time.sleep(random.uniform(0.5, 0.8))
        res.raise_for_status()  # raises HTTPError on any non-200 status
        res.encoding = res.apparent_encoding
        if res.encoding == "utf-8" or res.encoding == "UTF-8" or res.encoding == "Windows-1254":
            html = res.content.decode("utf-8", "ignore")
        else:
            html = res.content.decode("GBK", "ignore")
    except requests.HTTPError as ex:
        html = None
        if num_retries > 0:  # on a non-200 response, retry with one fewer attempt left
            return response(url, num_retries - 1)
    except requests.exceptions.ConnectionError as ex:  # dead URLs raise ConnectionError; no retry
        return None
    return html
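
# A note on the retry logic above: each HTTPError recurses with num_retries - 1,
# so a URL is requested at most four times in total; once retries are exhausted,
# html is still None and callers must handle that. A hedged usage sketch (the
# URL is just an example listing page built the same way as in __main__):
#
#     html = response("https://zz.zu.anjuke.com/fangyuan/p1/")
#     if html is None:
#         pass  # log and skip this URL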

def parse(html, regu):
    text = etree.HTML(html).xpath(regu)
    return text

def getHTMLText(url, ss):
    attempts = 0
    success = False
    while attempts < 3 and not success:
        try:
            html = response(url)
            AnJuKeItem = {}
            AnJuKeItem["城市"] = "".join(parse(html, '//div[@class="city-view"]/text()')).strip()
            AnJuKeItem["编号"] = url.split("?")[0].split("fangyuan/")[1]
            AnJuKeItem["租赁方式"] = "".join(parse(html, '//ul[@class="title-label cf"]/li[1]/text()')).strip()
            AnJuKeItem["项目名称"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[1]/text()')).strip()
            AnJuKeItem["区域"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[2]/text()')).strip()
            AnJuKeItem["板块"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[3]/text()')).strip()
            AnJuKeItem["地铁"] = "".join(parse(html, '//ul[@class="title-label cf"]/li[3]/text()')).strip()
            # Obfuscated digits: copy plain characters through, decode the rest via get_num().
            AnJuKeItem["房型"] = ""
            houseType_str = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[2]/span[2]//text()')).strip()
            for line in houseType_str:
                if line not in type_list:
                    AnJuKeItem["房型"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["房型"] += str(line)
            AnJuKeItem["面积"] = ""
            area_str = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[3]/span[2]//text()')).replace("平方米", "").strip()
            for line in area_str:
                if line not in type_list:
                    AnJuKeItem["面积"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["面积"] += str(line)
            AnJuKeItem["租金"] = ""
            rent_str = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[1]/span[1]/em//text()')).strip()
            for line in rent_str:
                if line not in type_list:
                    AnJuKeItem["租金"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["租金"] += str(line)
            AnJuKeItem["装修"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[6]/span[2]//text()')).strip()
            AnJuKeItem["朝向"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[4]/span[2]//text()')).strip()
            AnJuKeItem["楼层"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[5]/span[2]//text()')).strip()
            AnJuKeItem["类型"] = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[7]/span[2]//text()')).strip()
            # Baidu coordinates live in an inline map-init script.
            pattern = re.compile(r'var instance = new anjuke.Ajax.MapInPV\("map-canvas", {\n(.*?)</script>', re.I | re.DOTALL | re.M | re.S)
            baiduLine_list = "".join(pattern.findall(html)).strip().split("\n")
            AnJuKeItem["百度经度"] = baiduLine_list[0].split(":")[1].replace(",", "").strip()
            AnJuKeItem["百度纬度"] = baiduLine_list[1].split(":")[1].replace(",", "").strip()
            AnJuKeItem["发布时间"] = ""
            tellTime_list = "".join(parse(html, '//div[@class="right-info"]/b/text()')).strip()
            tellTime_str = re.sub(r"年|月|日", "/", tellTime_list)
            for line in tellTime_str:
                if line not in type_list:
                    AnJuKeItem["发布时间"] += line
                else:
                    line = get_num(line, html)[0]
                    AnJuKeItem["发布时间"] += str(line)
            to_day = datetime.datetime.now()
            AnJuKeItem["采集时间"] = "{}/{}/{}".format(to_day.year, to_day.month, to_day.day)
            AnJuKeItem["品牌"] = "".join(parse(html, '//div[@class="broker-line"]/a/@title')).strip()
            AnJuKeItem["采集网站"] = "安居客"
            # Level-3 page: the community detail page supplies the address and build year.
            addressUrl = "".join(parse(html, '//div[@class="lbox"]/ul[1]/li[8]/a[1]/@href')).strip()
            res1 = response(addressUrl)
            try:
                AnJuKeItem["地址"] = "".join(parse(res1, '//div[@class="comm-title"]/h1/span/text()')).split("-")[1].strip()
                AnJuKeItem["年代"] = "".join(parse(res1, '//*[@id="basic-infos-box"]/dl/dd[5]/text()')).strip()
            except Exception as ex:
                cuoWu = "".join(parse(res1, '//*[@id="list-content"]/div[1]/span/text()')).strip()
                if "为您找到" in cuoWu:  # a search-results page came back instead of a detail page
                    AnJuKeItem["地址"] = ""
                    AnJuKeItem["年代"] = ""
                else:
                    print("Captcha hit on the level-3 page!  %s" % addressUrl)
                    print(Exception, ":", ex)
                    print(input("Solve the captcha, then press Enter: "))
                    print("*" * 100)
            print("Level-2 page URL:", url)
            print(ss, "\t", AnJuKeItem["城市"], AnJuKeItem["编号"], AnJuKeItem["项目名称"], AnJuKeItem["房型"], AnJuKeItem["装修"], AnJuKeItem["地址"], AnJuKeItem["年代"])
            print("*" * 60)
            table = "AnJuKe"
            sqlInfo(AnJuKeItem, table)
            success = True
        except Exception as ex:
            attempts += 1
            if attempts == 3:
                print(ss, url + "\t" + "level-2 URL failed; logged to anJKeUrl.log ...")
                print(Exception, ":", ex)
                savefile("./allUrl/anJKeUrl.log", url + "|#|" + str(ex) + " level-2")
                print("*" * 60)
                break

if __name__ == '__main__':
    print("I'm Working ...")
    print('*****\tCurrent time: {}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    st = datetime.datetime.now()
    # The obfuscated glyph characters the target font maps to digits.
    type_list = ['驋', '龒', '龤', '閏', '麣', '鸺', '龥', '齤', '餼', '鑶']
    city_dict = {"nb": "宁波", "nc": "南昌", "km": "昆明", "nn": "南宁", "gy": "贵阳", "nt": "南通", "su": "苏州", "zz": "郑州", "yz": "扬州", "wlmq": "乌鲁木齐"}
    ss = 0
    for key, value in city_dict.items():
        for i in range(1, 51):  # first 50 listing pages per city
            url = "https://{}.zu.anjuke.com/fangyuan/p{}/".format(key, i)
            attempts = 0
            success = False
            while attempts < 3 and not success:
                try:
                    html = response(url)
                    genUrl_list = parse(html, '//*[@id="list-content"]/div/div[1]/h3/a/@href')
                    print("Site {} page {} has {} listing URLs!".format(value, i, len(genUrl_list)))
                    for startUrl in genUrl_list[:]:
                        startUrl = startUrl.split("&")[0]
                        ss += 1
                        getHTMLText(startUrl, ss)
                    success = True
                except Exception as ex:
                    attempts += 1
                    if attempts == 3:
                        print(url + "\t" + "level-1 URL failed; logged to anJKeUrl.log ...")
                        print(Exception, ":", ex)
                        savefile("./allUrl/anJKeUrl.log", url + "|#|" + str(ex) + " level-1")
                        print("*" * 100)
                        break
    print("Done!")
    print('Current time: {}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    et = datetime.datetime.now()
    print('[info] elapsed: %s' % (et - st))



# --- Script 2: Anjuke slide-captcha solver ---
import requests, random, json, re, time, execjs, urllib3
from io import BytesIO
from io import StringIO
from PIL import Image

urllib3.disable_warnings()

class AJK_Slide_Captcha():
    def __init__(self):
        self.headers = {
            "Referer": "https://www.anjuke.com/captcha-verify/?callback=shield&from=antispam",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }

    def get_sessionId(self, captcha_url):
        # The captcha page embeds a sessionId in a hidden input.
        resp = requests.get(captcha_url, headers=self.headers, verify=False, timeout=None)
        sessionId = re.search('name="sessionId".*?value="(.*?)"', resp.content.decode()).group(1)
        return sessionId

    def get_responseId_bgImgUrl(self, sessionId):
        resp = requests.get("https://verifycode.58.com/captcha/getV3", headers=self.headers, verify=False, timeout=None,
            params={
                "callback": "callback",
                "showType": "embed",
                "sessionId": sessionId,
                "_": str(int(time.time() * 1000))
            })
        captchaData = json.loads(resp.text.replace("callback(", "").replace(")", ""))
        responseId = captchaData["data"]["responseId"]
        bgImgUrl = captchaData["data"]["bgImgUrl"]
        return (responseId, bgImgUrl)
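
    # The getV3 endpoint answers with a JSONP-wrapped payload; only two of its
    # fields are read above. An illustrative (made-up) response body:
    #
    #     callback({"data": {"responseId": "22...", "bgImgUrl": "/captcha/...jpg"}})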

    def get_image(self, bgImgUrl):
        resp = requests.get("https://verifycode.58.com" + bgImgUrl, headers=self.headers, verify=False, timeout=None)
        # resp.content is raw bytes; wrap it in a BytesIO file object for PIL
        f = BytesIO(resp.content)
        image = Image.open(f)
        return image

    def get_position(self, image):
        image = image.resize((284, 160))
        image = image.convert('L')  # grayscale
        yuzhi = 150    # brightness threshold
        yuzhi2 = 40    # contrast threshold against the next column
        ll = 10        # number of pixels checked per column
        for i in range(55, image.size[0] - 20):      # x: 55..263
            for j in range(0, image.size[1] - 20):   # y: 0..139
                flag = True
                for l in range(0, ll):
                    pixel = image.getpixel((i, j)) - image.getpixel((i + 1, j + l))
                    if pixel < yuzhi2: flag = False
                    # pixel = image.getpixel((i - l, j))
                    # if pixel
                for l in range(0, ll):
                    pixel = image.getpixel((i, j + l))
                    if pixel < yuzhi: flag = False
                if flag:
                    cropedimage = image.crop((i, j, i + 30, j + 30))  # kept for debugging
                    return i - 7
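
    # What the scan above does: after resizing to 284x160 and converting to
    # grayscale, it walks every (i, j). A point qualifies as the puzzle gap's
    # left edge when pixel (i, j) is at least yuzhi2 (40) brighter than the ten
    # pixels (i+1, j..j+9) in the next column, and the ten pixels (i, j..j+9)
    # straight below it are all at least yuzhi (150). The thresholds and the
    # final `i - 7` offset look empirical, tuned to this captcha's artwork.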

    def get_trace(self, xPos, traceTxtPath):
        with open(traceTxtPath, 'r+') as fp:
            lines = fp.readlines()
        allValueLineList = []
        for line in lines:
            if line.strip() == '': continue
            start = int(re.search('"(\d+)', line).group(1))
            end = int(re.search('(\d+)\,\d+\,\d+\|"', line).group(1))
            if end - start == xPos or end - start == xPos + 1 or end - start == xPos - 1:
                allValueLineList.append((end - start, line.strip().strip('"')))
        lastXpos, trace = random.choice(allValueLineList)
        changeNumCnt = 0
        while changeNumCnt < 4:
            changeNumCnt += 1
            num = random.choice(range(0, 10))
            try:
                search = random.choice(re.findall('(\d+%d)\|' % num, trace))
                subSearch = str(int(search) + random.choice([1, -1]))
                trace = re.sub(search, subSearch, trace)  # was `line = ...`, which discarded the change
            except:
                changeNumCnt -= 1
        return (lastXpos, trace)
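
    # get_trace() assumes CaptchaTrace.txt holds pre-recorded slider tracks, one
    # quoted "x,y,elapsed|x,y,elapsed|...|" line per track (format inferred from
    # the regexes above; the file itself is not included in the post). It picks
    # a track whose horizontal span matches xPos within 1 px, then nudges four
    # random coordinates by +/-1 so repeated submissions are not byte-identical.
    # An illustrative (made-up) line:
    #
    #     "5,2,17|9,3,34|14,3,51|...|141,1,903|"   -> span = 141 - 5 = 136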

    def get_fpToken(self):
        res2 = requests.get("https://cdata.58.com/fpToken", headers=self.headers, timeout=None, verify=False,
                params={
                    "callback": "callback",
                })
        html2 = res2.content.decode("utf-8", "ignore")
        fpToken = html2.split('"token":"')[1].replace('"})', '').strip()
        return fpToken

    def get_jiami_data(self, responseId, fpToken, lastXpos, trace):
        jsCode = execjs.compile(open("./jiami.js", "r").read())
        jiami_data = jsCode.call("getSlideAnswer", responseId, fpToken, lastXpos, trace)
        return jiami_data
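
    # jiami.js is not included in the post; judging from the call above it must
    # export a getSlideAnswer(responseId, fpToken, xPos, trace) function that
    # encrypts the four values into the `data` query parameter the server checks.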

    def slove(self, jiami_data, responseId, sessionId):
        response = requests.get("https://verifycode.58.com/captcha/getV3", headers=self.headers, timeout=None, verify=False,
            params={
                "data": jiami_data,
                "responseId": responseId,
                "sessionId": sessionId,
                "_": str(int(time.time() * 1000))
            })
        return response.text

    def run(self):
        # Step 1: pull the sessionId out of the captcha page.
        sessionId = self.get_sessionId('https://www.anjuke.com/captcha-verify/?callback=shield')
        print('step1:    sessionId->', sessionId)
        # Step 2: fetch responseId and bgImgUrl.
        (responseId, bgImgUrl) = self.get_responseId_bgImgUrl(sessionId)
        print('step2:    responseId->', responseId)
        # Step 3: download the background image.
        image = self.get_image(bgImgUrl)
        print('step3:    image->', image)
        # Step 4: locate the puzzle gap.
        position = self.get_position(image)
        print('step4:    position->', position)
        # Step 5: pick a matching slider trace.
        (lastXpos, trace) = self.get_trace(position, traceTxtPath='CaptchaTrace.txt')
        print('step5:  lastXpos->', lastXpos, "==", 'trace->', trace)
        # Step 6: fetch the fingerprint token.
        fpToken = self.get_fpToken()
        print('step6:    fpToken->', fpToken)
        # Step 7: encrypt the answer data with jiami.js.
        jiami_data = self.get_jiami_data(responseId, fpToken, lastXpos, trace)
        print('step7:    jiami_data->', jiami_data)
        # Step 8: submit and read the verdict.
        responseText = self.slove(jiami_data, responseId, sessionId)
        print('\nstep8:    final response->', responseText)

if __name__ == '__main__':
    AJK_Slide_Captcha().run()



# --- Script 3: shclearing.com PDF downloader ---
from urllib.parse import quote
from lxml import etree
import requests, os

def pdf_Dict(monUrl, ss):
    headers = {
        "Cookie": "Hm_lvt_d885bd65f967ea9372fc7200bc83fa81=1568943378; Hm_lpvt_d885bd65f967ea9372fc7200bc83fa81=1568944562",
        "Host": "www.shclearing.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
    }
    genUrl = "http://www.shclearing.com/xxpl/fxpl/"
    resp = requests.get(monUrl, headers=headers, timeout=10)
    htmp = resp.content.decode("utf-8", "ignore")
    titleUrl_list = etree.HTML(htmp).xpath('//ul[@class="list"]/li/a/@href')
    sss = 0
    for titleUrl in titleUrl_list:
        sss += 1
        titleUrl = titleUrl.split("./")[1]
        pdfUrl = genUrl + titleUrl

        res = requests.get(pdfUrl, headers=headers, timeout=None)
        html = res.content.decode("utf-8", "ignore")
        # Build the output path from the announcement date.
        dataTime = "".join(etree.HTML(html).xpath('//*[@id="content"]/span/text()')).replace("日期:", "").strip()
        yearTime = dataTime.split("-")[0]
        monTime = "".join(dataTime.split("-")[:2]).strip()
        pdfurlPath = "./" + yearTime + "/" + monTime + "/" + dataTime + "/"
        if not os.path.exists(pdfurlPath):
            os.makedirs(pdfurlPath)
        # POST parameters: file names and display names sit in an inline script.
        scriptStr = "".join(etree.HTML(html).xpath('//*[@id="content"]/div[@class="attachments"]//text()')).strip()
        fileNames_list = scriptStr.split("var fileNames = '")[1].split("';")[0].replace("./", "").strip().split(";;")
        descNames_list = scriptStr.split("var descNames = '")[1].split("';")[0].strip().split(";;")
        pdfDict = dict(zip(fileNames_list, descNames_list))  # file name -> display name
        for FileName, DownName in pdfDict.items():
            print("Page {} item {}! POST params:".format(ss, sss), FileName + '\t' + DownName)
            res1 = requests.post(
                "http://www.shclearing.com/wcm/shch/pages/client/download/download.jsp",
                data={
                    "FileName": FileName,
                    "DownName": quote(DownName)
                },
                headers=headers,
                timeout=None
            )
            pdf = res1.content  # raw PDF bytes
            with open(pdfurlPath + DownName, 'wb') as f:  # binary write
                f.write(pdf)
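
# Note: DownName comes from the page's descNames and is typically Chinese; it is
# percent-encoded with quote() only for the POST body, while the raw string is
# reused unescaped as the local file name when the PDF is written to disk.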

if __name__ == '__main__':
    # First five index pages.
    monUrl_list = ["http://www.shclearing.com/xxpl/fxpl/index.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_1.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_2.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_3.html",
                   "http://www.shclearing.com/xxpl/fxpl/index_4.html"]
    ss = 0
    for monUrl in monUrl_list:
        ss += 1
        pdf_Dict(monUrl, ss)
