美文网首页
大众点评字体加密破解,破解流程与破解代码

大众点评字体加密破解,破解流程与破解代码

作者: sexy_cyber | 来源:发表于2019-01-17 11:48 被阅读162次

    感谢老铁分享:github链接:https://github.com/longxiaofei/dianping

    关于点评字体加密破解流程(来源github):

    1.请求商家评论页,获取请求成功后的结果,解析出字体样式所对应css的url
    2.请求css的url,获取请求成功后的结果,获取字体背景svg所对应的url,并解析出class样式对应的偏移坐标
    3.请求svg的url,获取请求成功后的结果,解析出偏移坐标所对应的字体,并与class的偏移坐标做映射
    4.将原html的字体span标签替换

    代码(代码很简单,重在思路):

    import datetime
    import random
    import time
    import re
    
    # from selenium.webdriver.chrome.options import Options
    # from selenium import webdriver
    from lxml import etree
    import requests
    
    class DianpingComment:
        font_size = 14
        start_y = 23
    
        def __init__(self, shop_id, cookies, delay=7, handle_ban=False):
            self.shop_id = shop_id
            self._delay = delay
            self._cookies = self._format_cookies(cookies)
            self._css_headers = {
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            }
            self._default_headers = {
                'Connection': 'keep-alive',
                'Host': 'www.dianping.com',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            }
            self._cur_request_url = 'http://www.dianping.com/shop/{}/review_all/p1'.format(shop_id)
            if handle_ban:
                print('不想写跳过验证了')
                # self._browser = self._init_browser()
                # self._handle_ban()
    
    
        def run(self):
            self._css_link = self._get_css_link(self._cur_request_url)
            self._font_dict = self._get_font_dict(self._css_link)
            self._get_conment_page()
    
        def _delay_func(self):
            delay_time = random.randint((self._delay-2)*10, (self._delay+2)*10) * 0.1
            time.sleep(delay_time)
    
        def _format_cookies(self, cookies):
            cookies = {cookie.split('=')[0]: cookie.split('=')[1]
            for cookie in cookies.replace(' ', '').split(';')}
                       
            return cookies
    
        def _get_conment_page(self):
            """
                请求评论页,并将<span></span>样式替换成文字
            """
            while self._cur_request_url:
                self._delay_func()
                print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
    
                res = requests.get(self._cur_request_url, headers=self._default_headers, cookies=self._cookies)
                html = res.text
                
                class_set = set()
                for span in re.findall(r'<span class="([a-zA-Z0-9]{5,6})"></span>', html):
                    class_set.add(span)
    
                for class_name in class_set:
                    html = re.sub('<span class="%s"></span>' % class_name, self._font_dict[class_name], html)
    
                doc = etree.HTML(html)
                self._parse_comment_page(doc)
    
                try:
                    self._default_headers['Referer'] = self._cur_request_url
                    next_page_url = 'http://www.dianping.com' + doc.xpath('.//a[@class="NextPage"]/@href')[0]
                except IndexError:
                    next_page_url = None
                self._cur_request_url = next_page_url
    
        def _data_pipeline(self, data):
            """
                处理数据
            """
            print(data)
    
    
        def _parse_comment_page(self, doc):
            """
                解析评论页并提取数据
            """
            for li in doc.xpath('//*[@class="reviews-items"]/ul/li'):
                name = li.xpath('.//a[@class="name"]/text()')[0].strip('\n\r \t')
                try:
                    star = li.xpath('.//span[contains(./@class, "sml-str")]/@class')[0]
                    star = re.search(r'sml-str(\d+)', star)[1]
                except IndexError:
                    star = 0
                time = li.xpath('.//span[@class="time"]/text()')[0].strip('\n\r \t')
                score = ' '.join(map(lambda s: s.strip('\n\r \t'), li.xpath('.//span[@class="score"]//text()')))
                comment = ''.join(li.xpath('.//div[@class="review-words Hide"]/text()')).strip('\n\r \t')
                if not comment:
                    comment = ''.join(li.xpath('.//div[@class="review-words"]/text()')).strip('\n\r \t')
                
                data = {
                    'name': name,
                    'comment': comment,
                    'star': star,
                    'score': score,
                    'time': time,
                }
    
                self._data_pipeline(data)
    
        def _get_css_link(self, url):
            """
                请求评论首页,获取css样式文件
            """
            res = requests.get(url, headers=self._default_headers, cookies=self._cookies)
            html = res.text
            print(html)
            css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', html)
            print(css_link)
            assert css_link
            css_link = 'http:' + css_link[1]
            return css_link
    
        def _get_font_dict(self, url):
            """
                获取css样式对应文字的字典
            """
            res = requests.get(url, headers=self._css_headers)
            html = res.text
            background_image_link = re.search(r'background-image:.*?\((.*?svg)\)', html)
            assert background_image_link
            background_image_link = 'http:' + background_image_link[1]
            html = re.sub(r'span.*?\}', '', html)
            group_offset_list = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', html)
    
            font_dict_by_offset = self._get_font_dict_by_offset(background_image_link)
            font_dict = {}
            
            for class_name, x_offset, y_offset in group_offset_list:
                x_offset = x_offset.replace('.0', '')
                y_offset = y_offset.replace('.0', '')
                font_dict[class_name] = font_dict_by_offset[int(y_offset)][int(x_offset)]
    
            return font_dict
    
        def _get_font_dict_by_offset(self, url):
            """
                获取坐标偏移的文字字典, 会有最少两种形式的svg文件(目前只遇到两种)
            """
            res = requests.get(url, headers=self._css_headers)
            html = res.text
            font_dict = {}
    
            y_list = re.findall(r'd="M0 (\d+?) ', html)
    
            if y_list:
                font_list = re.findall(r'<textPath .*?>(.*?)<', html)
                for i, string in enumerate(font_list):
                    y_offset = self.start_y - int(y_list[i])
    
                    sub_font_dict = {}
                    for j, font in enumerate(string):
                        x_offset = -j*self.font_size
                        sub_font_dict[x_offset] = font
    
                    font_dict[y_offset] = sub_font_dict
    
            else:
                font_list = re.findall(r'<text.*?y="(.*?)">(.*?)<', html)
    
                for y, string in font_list:
                    y_offset = self.start_y - int(y)
                    sub_font_dict = {}
                    for j, font in enumerate(string):
                        x_offset = -j*self.font_size
                        sub_font_dict[x_offset] = font
    
                    font_dict[y_offset] = sub_font_dict
            return font_dict
    
    
    if __name__ == "__main__":
        pass
    
    
    
    

    相关文章

      网友评论

          本文标题:大众点评字体加密破解,破解流程与破解代码

          本文链接:https://www.haomeiwen.com/subject/ozzbdqtx.html