感谢老铁分享,GitHub 链接:https://github.com/longxiaofei/dianping
关于点评字体加密破解流程(来源github):
1.请求商家评论页,获取请求成功后的结果,解析出字体样式所对应css的url
2.请求css的url,获取请求成功后的结果,获取字体背景svg所对应的url,并解析出class样式对应的偏移坐标
3.请求svg的url,获取请求成功后的结果,解析出偏移坐标所对应的字体,并与class的偏移坐标做映射
4.将原html中的字体span标签替换为映射得到的对应文字
代码(代码很简单,重在思路):
import datetime
import random
import time
import re
# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver
from lxml import etree
import requests
class DianpingComment:
    """Crawl the review pages of one Dianping shop and decode obfuscated text.

    Some characters of a review are served as empty
    ``<span class="xxxxx"></span>`` placeholders. The decoding flow is:

    1. Request the review page and extract the URL of the glyph stylesheet.
    2. Request the stylesheet; extract the SVG sprite URL and the background
       offset declared for every class.
    3. Request the SVG, map every offset to a character, and combine that
       with the class offsets into ``class name -> character``.
    4. Substitute the placeholders in each review page before parsing it.
    """

    # Horizontal step between neighbouring characters of one SVG row.
    font_size = 14
    # Value an SVG row's y coordinate is subtracted from to get its CSS offset.
    start_y = 23
    # Seconds to wait for a single HTTP response; without a timeout a stalled
    # connection would block the crawler forever.
    request_timeout = 30

    _USER_AGENT = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.110 Safari/537.36'
    )
    # Placeholder span whose class name selects one glyph.
    _GLYPH_SPAN_RE = re.compile(r'<span class="([a-zA-Z0-9]{5,6})"></span>')
    # Characters trimmed from both ends of every extracted text node.
    _WHITESPACE = '\n\r \t'

    def __init__(self, shop_id, cookies, delay=7, handle_ban=False):
        """Prepare headers, cookies and the first review-page URL.

        Args:
            shop_id: Shop identifier inserted into the review URL.
            cookies: Raw ``Cookie`` header string (``"a=1; b=2"``).
            delay: Centre of the random pause (seconds) before each page.
            handle_ban: Ban handling is not implemented; when true only a
                notice is printed.
        """
        self.shop_id = shop_id
        self._delay = delay
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent': self._USER_AGENT,
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': self._USER_AGENT,
        }
        self._cur_request_url = 'http://www.dianping.com/shop/{}/review_all/p1'.format(shop_id)
        if handle_ban:
            print('不想写跳过验证了')

    def run(self):
        """Build the glyph table, then crawl every review page."""
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_conment_page()

    def _delay_func(self):
        """Sleep for a random time within ``delay ± 2`` seconds, never negative."""
        # Clamp at zero: time.sleep() raises ValueError for negative values,
        # which the unclamped range produced whenever delay < 2.
        shortest = max(0.0, self._delay - 2)
        longest = max(shortest, self._delay + 2)
        time.sleep(random.uniform(shortest, longest))

    def _format_cookies(self, cookies):
        """Convert a raw ``Cookie`` header string into a ``{name: value}`` dict.

        Only the first ``=`` separates name from value, so values that contain
        ``=`` themselves are kept intact. Empty segments (e.g. after a trailing
        ``;``) and segments without ``=`` are ignored.
        """
        jar = {}
        for segment in cookies.split(';'):
            name, separator, value = segment.strip().partition('=')
            name = name.strip()
            if not separator or not name:
                continue
            jar[name] = value.strip()
        return jar

    def _decode_html(self, html):
        """Return *html* with every known glyph placeholder replaced by its character.

        Empty spans whose class is not in the glyph table are left untouched.
        """
        glyphs = self._font_dict

        def _swap(match):
            # A callable replacement is inserted literally, so a glyph is never
            # interpreted as a regex replacement template.
            return glyphs.get(match.group(1), match.group(0))

        return self._GLYPH_SPAN_RE.sub(_swap, html)

    def _get_conment_page(self):
        """Fetch review pages one after another until no next-page link is found.

        Each page is decoded, parsed and passed on through
        ``_parse_comment_page``; the page just fetched becomes the ``Referer``
        of the following request.
        """
        while self._cur_request_url:
            self._delay_func()
            print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            res = requests.get(
                self._cur_request_url,
                headers=self._default_headers,
                cookies=self._cookies,
                timeout=self.request_timeout,
            )
            doc = etree.HTML(self._decode_html(res.text))
            self._parse_comment_page(doc)
            self._default_headers['Referer'] = self._cur_request_url
            next_links = doc.xpath('.//a[@class="NextPage"]/@href')
            if next_links:
                self._cur_request_url = 'http://www.dianping.com' + next_links[0]
            else:
                self._cur_request_url = None

    def _data_pipeline(self, data):
        """Handle one parsed review record; this implementation prints it."""
        print(data)

    def _parse_comment_page(self, doc):
        """Extract every review on the page and hand it to ``_data_pipeline``.

        Each record has the keys ``name``, ``comment``, ``star``, ``score``
        and ``time``. ``star`` is the digit string found after ``sml-str`` in
        the rating span's class, or ``0`` when no rating can be found.
        """
        blank = self._WHITESPACE
        for li in doc.xpath('//*[@class="reviews-items"]/ul/li'):
            name = li.xpath('.//a[@class="name"]/text()')[0].strip(blank)

            star_classes = li.xpath('.//span[contains(./@class, "sml-str")]/@class')
            star_match = re.search(r'sml-str(\d+)', star_classes[0]) if star_classes else None
            star = star_match[1] if star_match else 0

            # Named review_time so the ``time`` module is not shadowed.
            review_time = li.xpath('.//span[@class="time"]/text()')[0].strip(blank)
            score = ' '.join(
                text.strip(blank)
                for text in li.xpath('.//span[@class="score"]//text()')
            )
            # Prefer the "review-words Hide" block; fall back to the plain one.
            comment = ''.join(li.xpath('.//div[@class="review-words Hide"]/text()')).strip(blank)
            if not comment:
                comment = ''.join(li.xpath('.//div[@class="review-words"]/text()')).strip(blank)

            self._data_pipeline({
                'name': name,
                'comment': comment,
                'star': star,
                'score': score,
                'time': review_time,
            })

    @staticmethod
    def _absolute_url(link):
        """Prefix a protocol-relative link (``//host/path``) with ``http:``."""
        return 'http:' + link if link.startswith('//') else link

    def _get_css_link(self, url):
        """Fetch the first review page and return the glyph stylesheet URL.

        Raises:
            AssertionError: if the page holds no ``svgtextcss`` stylesheet link.
        """
        res = requests.get(
            url,
            headers=self._default_headers,
            cookies=self._cookies,
            timeout=self.request_timeout,
        )
        css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', res.text)
        if not css_link:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError('svgtextcss stylesheet link not found in {}'.format(url))
        return self._absolute_url(css_link[1])

    def _get_font_dict(self, url):
        """Return ``{class name: character}`` built from the stylesheet at *url*.

        Raises:
            AssertionError: if the stylesheet names no SVG background image.
            KeyError: if a class offset has no character in the SVG table.
        """
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        css = res.text
        svg_link = re.search(r'background-image:.*?\((.*?svg)\)', css)
        if not svg_link:
            raise AssertionError('SVG background image not found in {}'.format(url))
        # Drop the ``span...{...}`` rules so only per-class offset rules remain.
        css = re.sub(r'span.*?\}', '', css)
        class_offsets = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', css)
        glyph_by_offset = self._get_font_dict_by_offset(self._absolute_url(svg_link[1]))
        font_dict = {}
        for class_name, x_offset, y_offset in class_offsets:
            # Offsets are written like "-14.0"; go through float to get the int.
            row = glyph_by_offset[int(float(y_offset))]
            font_dict[class_name] = row[int(float(x_offset))]
        return font_dict

    def _parse_svg_glyphs(self, svg_text):
        """Build ``{y_offset: {x_offset: character}}`` from SVG source text.

        Two layouts are recognised: rows given as ``<textPath>`` elements whose
        y values come from ``d="M0 <y> ..."`` path data, and rows given as
        ``<text y="...">`` elements.
        """
        path_ys = re.findall(r'd="M0 (\d+?) ', svg_text)
        if path_ys:
            # Rows and path y values are paired by position; a surplus on
            # either side is ignored.
            rows = zip(path_ys, re.findall(r'<textPath .*?>(.*?)<', svg_text))
        else:
            rows = re.findall(r'<text.*?y="(.*?)">(.*?)<', svg_text)

        table = {}
        for y, characters in rows:
            table[self.start_y - int(y)] = {
                -index * self.font_size: character
                for index, character in enumerate(characters)
            }
        return table

    def _get_font_dict_by_offset(self, url):
        """Fetch the SVG sprite at *url* and return its offset-to-character table."""
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        return self._parse_svg_glyphs(res.text)
# Running this file directly does nothing; import the module and drive
# DianpingComment from calling code instead.
if __name__ == '__main__':
    pass
网友评论