美文网首页Python新世界
反爬虫-python3.6抓取猫眼电影信息

反爬虫-python3.6抓取猫眼电影信息

作者: 919b0c54458f | 来源:发表于2018-12-14 10:27 被阅读2次

    思路分解:

    1.页面信息

    url:http://maoyan.com/cinema/24311?poi=164257570

    查看信息发现价格存在乱码现象:

    进群:960410445   即可获取数十套PDF!

    刷新页面,找到乱码所用字体文件(woff格式)的URL并下载。方法:复制该URL,右键单击选择“转到”,等待下载完成,得到的文件即为代码中的baseprice.woff;再次刷新网页,用同样的方法下载新的字体URL,作为待匹配的woff文件,即代码中的maoprice.woff。

    用下面的FontEditor网址打开保存的baseprice.woff文件,如下图:

    FontEditor

    fontstore.baidu.com

    与代码行对应:

    反爬虫字体解析原理:先在网页上下载乱码所用的字体文件baseprice.woff(可以转化为xml,用pycharm打开查看其中的信息),刷新页面后再下载一次字体文件maoprice.woff。对比两个文件可以看到,其中的字形存在一一对应的关系:只要记下baseprice.woff中每个字形编码对应的数字,再用字形去匹配maoprice.woff中的编码,就可以编写代码还原出真实的数字。

    二者的对应关系:

    2.字体解析代码:

    # Reference font saved by hand from the page. The character each of its
    # glyphs draws is written out below in baseUniCode / baseNumList.
    # Raw string: in a normal literal '\U' starts an 8-digit unicode escape
    # (a SyntaxError in Python 3) and '\n' / '\b' would become control characters.
    baseFont = TTFont(r'C:\Users\nanafighting\Desktop\baseprice.woff')

    # Font served with the current page load; its glyph names are what we decode.
    maoyanFont = TTFont('maoprice.woff')

    # Same list the original reached through maoyanFont['cmap'].tables[0].ttFont.
    maoyan_unicode_list = maoyanFont.getGlyphOrder()

    # baseNumList[k] is the character drawn by the glyph named baseUniCode[k]
    # in the reference font.
    baseNumList = ['.', '6', '4', '7', '5', '2', '8', '0', '1', '9', '3']
    baseUniCode = ['x', 'uniF76E', 'uniEACB', 'uniE8D1', 'uniE737', 'uniE9B7',
                   'uniF098', 'uniF4DC', 'uniF85E', 'uniE2F1', 'uniEE4E']

    # Look the reference glyphs up once instead of once per comparison.
    baseGlyphs = [baseFont['glyf'][name] for name in baseUniCode]

    # Outputs, kept index-aligned:
    #   utf8List[k]        UTF-8 bytes of an obfuscated character in the page text
    #   maoyan_num_list[k] the real character it stands for
    maoyan_num_list = []
    utf8List = []

    # Entry 0 of the glyph order is skipped, as in the original loop.
    for glyphName in maoyan_unicode_list[1:]:
        maoyanGlyph = maoyanFont['glyf'][glyphName]
        for baseGlyph, baseNum in zip(baseGlyphs, baseNumList):
            if maoyanGlyph == baseGlyph:
                break
        else:
            # No reference glyph matches: skip it in BOTH lists so that the
            # two lists cannot drift out of step.
            continue

        if glyphName.startswith('uni'):
            # 'uniXXXX' names carry the code point in hex; chr() replaces the
            # original eval() of a hand-built string literal.
            char = chr(int(glyphName[3:], 16))
        else:
            # The glyph named 'x' is treated as the letter itself (the original
            # rewrote that entry to 'uni0078', which is U+0078 'x').
            char = glyphName

        maoyan_num_list.append(baseNum)
        utf8List.append(char.encode("utf-8"))

    3.代码中容易出错的地方:字符串的转换

    # Decode the obfuscated price text of the i-th 'stonefont' element.
    #
    # Pitfall: do not turn the UTF-8 bytes into text with
    #     ''.join('%s' % b for b in data)
    # In Python 3, iterating over bytes yields integers, so that expression
    # produces the decimal value of every byte. Characters that are not in the
    # mapping (a literal '.', spaces, ...) then show up as numbers such as "46"
    # and run into the decoded digits.
    #
    # The obfuscated digits are ordinary characters, so work on the decoded text
    # and replace each obfuscated character with the digit it stands for.
    moviewish = mw[i].get_text()

    # utf8List holds the UTF-8 bytes of each obfuscated character and
    # maoyan_num_list the matching real character, index for index.
    # zip() also avoids reusing `i`, which still indexes `mw` here.
    for glyphBytes, realChar in zip(utf8List, maoyan_num_list):
        moviewish = moviewish.replace(glyphBytes.decode('utf-8'), realChar)

    # Complete code

    import re

    import requests
    from bs4 import BeautifulSoup as bs
    from fontTools.ttLib import TTFont
    from lxml import html  # kept from the original import list; no longer used below


    # Scrape the show listing of a Maoyan cinema page
    class MaoyanSpider:
        """Print the shows listed on one Maoyan cinema page with decoded prices.

        The price text on the page uses a web font whose glyphs are matched
        against a hand-saved reference font to recover the real digits.
        """

        # Reference font saved by hand from the page (see the article text).
        # Raw string: in a normal literal '\U' starts an 8-digit unicode escape
        # (a SyntaxError in Python 3) and '\n' / '\b' would become control chars.
        BASE_FONT_PATH = r'C:\Users\nanafighting\Desktop\baseprice.woff'

        # Where the font of the current page load is saved AND read back from.
        # The original wrote "maoyanprice.woff" but opened 'maoprice.woff'.
        PAGE_FONT_PATH = 'maoprice.woff'

        # BASE_CHARS[k] is the character drawn by the glyph named
        # BASE_GLYPH_NAMES[k] in the reference font.
        BASE_CHARS = ['.', '6', '4', '7', '5', '2', '8', '0', '1', '9', '3']
        BASE_GLYPH_NAMES = ['x', 'uniF76E', 'uniEACB', 'uniE8D1', 'uniE737',
                            'uniE9B7', 'uniF098', 'uniF4DC', 'uniF85E',
                            'uniE2F1', 'uniEE4E']

        # Page initialisation
        def __init__(self):
            """Set the default request headers sent with the page request."""
            self.headers = {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Cache-Control": "max-age=0",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
            }

        # Fetch and print the listing
        def getNote(self):
            """Fetch the cinema page and print name, hall, start time and price.

            Side effects: performs two HTTP requests, writes PAGE_FONT_PATH and
            prints to stdout.

            Returns:
                None.

            Raises:
                IndexError: if no woff font URL is found in the page source.
            """
            url = 'http://maoyan.com/cinema/24311?poi=164257570'
            # 'Referer' is the HTTP header name; the original key was 'refer'.
            host = {'host': 'maoyan.com',
                    'Referer': 'http://maoyan.com/news'}
            # Merge the dicts by unpacking:
            # dict(self.headers.items() + host.items()) fails on Python 3.
            headers = {**self.headers, **host}

            # Fetch the page content
            r = requests.get(url, headers=headers)
            u = r.text

            self._download_page_font(u)
            digit_table = self._build_digit_table()

            soup = bs(u, "html.parser")
            sections = soup.find_all('div', {'class': 'show-list'})
            print('---------------Prices-----------------')

            # These queries do not depend on the loop below, so run them once.
            names = soup.find_all('h3', {'class': 'movie-name'})
            halls = soup.find_all('span', {'class': 'hall'})
            times = soup.find_all('span', {'class': 'begin-time'})
            prices = soup.find_all('span', {'class': 'stonefont'})

            # NOTE(review): the original ran the page-wide queries above once per
            # 'show-list' block and printed the whole listing each time. That
            # repetition is preserved here. The queries were presumably meant to
            # be scoped to each block -- confirm against the page markup first.
            for _ in sections:
                # zip() stops at the shortest list instead of raising IndexError
                # when the four lists differ in length.
                for name_tag, hall_tag, time_tag, price_tag in zip(
                        names, halls, times, prices):
                    moviename = name_tag.get_text()
                    film_ting = hall_tag.get_text()
                    movietime = time_tag.get_text()
                    # Decode on the text itself. The original formatted the
                    # UTF-8 bytes one by one, which in Python 3 yields decimal
                    # byte values (a literal '.' became "46").
                    moviewish = price_tag.get_text().translate(digit_table)
                    print(moviename, film_ting, movietime, moviewish)

        def _download_page_font(self, page_text):
            """Save the woff font referenced in *page_text* to PAGE_FONT_PATH."""
            # Parentheses and the dot must be escaped to match the literal CSS
            # text url('//...woff') format('woff'). Unescaped they are regex
            # groups, so the pattern never matched and findall returned tuples.
            font_urls = re.findall(
                r", url\('(//.*?\.woff)'\) format\('woff'\)", page_text)
            if not font_urls:
                raise IndexError("no woff font URL found in the page source")

            ttf = requests.get("http:" + font_urls[0], stream=True)
            with open(self.PAGE_FONT_PATH, "wb") as font_file:
                for chunk in ttf.iter_content(chunk_size=1024):
                    if chunk:
                        font_file.write(chunk)

        def _build_digit_table(self):
            """Return a str.translate table: obfuscated character -> real one."""
            base_font = TTFont(self.BASE_FONT_PATH)
            page_font = TTFont(self.PAGE_FONT_PATH)

            # Look the reference glyphs up once instead of once per comparison.
            base_glyphs = [base_font['glyf'][name]
                           for name in self.BASE_GLYPH_NAMES]

            mapping = {}
            # Entry 0 of the glyph order is skipped, as in the original loop.
            for glyph_name in page_font.getGlyphOrder()[1:]:
                obfuscated = self._glyph_char(glyph_name)
                if obfuscated is None:
                    continue
                page_glyph = page_font['glyf'][glyph_name]
                for base_glyph, real_char in zip(base_glyphs, self.BASE_CHARS):
                    if page_glyph == base_glyph:
                        # Storing pairs in a dict keeps each obfuscated
                        # character tied to its own match, so an unmatched
                        # glyph cannot shift the others.
                        mapping[obfuscated] = real_char
                        break
            return str.maketrans(mapping)

        @staticmethod
        def _glyph_char(glyph_name):
            """Return the character a glyph name stands for, or None if unknown."""
            if glyph_name.startswith('uni'):
                # 'uniXXXX' carries the code point in hex; chr() replaces the
                # original eval() of a hand-built string literal.
                return chr(int(glyph_name[3:], 16))
            if len(glyph_name) == 1:
                # e.g. the glyph named 'x', which the original rewrote to
                # 'uni0078' (U+0078 is the letter 'x').
                return glyph_name
            return None


    spider = MaoyanSpider()
    # getNote() returns None, so this also prints "None" after the listing,
    # as the original script did.
    print(spider.getNote())

    运行结果:

    相关文章

      网友评论

        本文标题:反爬虫-python3.6抓取猫眼电影信息

        本文链接:https://www.haomeiwen.com/subject/fhpwhqtx.html