python爬取优惠信息

作者: 困困harper | 来源:发表于2017-09-05 22:51 被阅读114次

    大部分银行的官网上都会公布促销信息,有些银行做的很规范,有些做的非常混乱,今天通过python爬取一家相对比较规范的银行促销信息,如下图:

    当我看到这个促销信息,我内心是高兴的,特别的规整。

    我通过chrome浏览器可以查看此页面源码,通过检测发现所有数据都是由ajax获取json来动态加载,这样就省去了遍历页面的麻烦,可以直接解析json数据。

    首先我们通过chrome检测功能中的network来拦截消息,如下图

    我发现(https://creditcard.cmbc.com.cn/fe/getType.gsp?city=%E5%8C%97%E4%BA%AC%E5%B8%82),这个请求的response会返回json,我通过这个请求获取商区信息。然后通过请求(https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp?city=1&dictrict=&latitude=&longitude=&mark=pc&page=1&resolving=&rows=16&typeName=)其中包含了城市ID、商区ID、页码、行数,这里我指定每页行数为10,通过这个请求会拿到当前页的所有促销信息和总行数。我根据总行数和每页10行,计算出需要翻页次数。最后进入促销信息介绍页面,如下图:

    同样json格式获取信息:

    整体代码如下:

    # 导入包
    import os
    import requests
    import json
    from lxml import etree
    from multiprocessing import Pool

    def getHtml(url):
    # 定义req为一个requests请求的对象
    req = requests.get(url)
    # req这个请求对象的status_code方法获取请求的状态码
    status_code = req.status_code
    if (status_code!= 200):
    return 'req_error'
    # print(status_code)
    # 指定网页解码方式
    req.encoding = 'utf-8'
    # 获取网页源码 用html变量接收 text content方法灵活运用
    # https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp?city=1&dictrict=&kindName=&latitude=&longitude=&mark=pc&page=1&resolving=&rows=16&typeName=
    html = req.text
    return html


    #请求url
    def get_fenye(url):
    html = getHtml(url)
    # print(html.strip())
    if(html!='req_error'):
    json_citys = json.loads(html.strip())
    # json_citys = [{'tvalue': '重庆', 'tkey': '28', 'py': 'Z'}]
    # print(type(json_citys))
    for json_city in json_citys:
    # print(json_city['tkey'],json_city['tvalue'],json_city['py'])
    area_url = 'https://creditcard.cmbc.com.cn/fe/getType.gsp?city=%s' % (json_city['tvalue'])
    html2 = getHtml(area_url)
    # print(html2.strip())
    if (html2 != 'req_error'):
    json_areas = json.loads(html2.strip())
    # print(type(json_areas))
    if( isinstance(json_areas, list)):
    businessAreaLists = json_areas[0]['cityList'][0]['businessAreaList']
    # print(type(businessAreaLists),businessAreaLists)
    for businessAreaList in businessAreaLists:
    # print(json_city['tvalue'],businessAreaList['businessAreaId'],businessAreaList['businessAreaName'])
    cx_url = 'https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp?city=%s&dictrict=%s&kindName=&latitude=&longitude=&mark=pc&page=1&resolving=&rows=10&typeName=' % (json_city['tkey'], businessAreaList['businessAreaId'])
    # print(cx_url)
    # https: // creditcard.cmbc.com.cn / fe / common / shop - Business - info.jsp?shopid = 107141

    html3 = getHtml(cx_url)
    if (html3 != 'req_error'):
    json_cxs = json.loads(html3.strip())
    row_cnt = json_cxs[0]['rowCount']
    # print(row_cnt, int(row_cnt / 10))
    if(row_cnt>0):
    for num in range(1,int(row_cnt / 10)+2):
    # print(num)
    fy_url = 'https://creditcard.cmbc.com.cn/fe/getShopListByCityName.gsp?city=%s&dictrict=%s&kindName=&latitude=&longitude=&mark=pc&page=%d&resolving=&rows=10&typeName=' % (json_city['tkey'], businessAreaList['businessAreaId'],num)
    print(fy_url)
    html4 = getHtml(fy_url)
    if (html4 != 'req_error'):
    json_cxsps = json.loads(html4.strip())
    ShopLists = json_cxsps[0]['ShopList']
    # print(type(ShopLists),ShopLists)
    for ShopList in ShopLists:
    tcity = ShopList['tcity']
    tlongitude = ShopList['tlongitude']
    tstatus = ShopList['tstatus']
    tshopId = ShopList['tshopId']
    timgType = ShopList['timgType']
    tlatitude = ShopList['tlatitude']
    tmerchName = ShopList['tmerchName']
    taddress = ShopList['taddress']
    # timgName = ShopList['timgName']
    tdiscount = ShopList['tdiscount']
    # https://creditcard.cmbc.com.cn/fe/find/fingShopByShopName.gsp?mark =pc&resolving=&rowser=&rowserEdition=&shopId=107141
    shop_url = 'https://creditcard.cmbc.com.cn/fe/find/fingShopByShopName.gsp?mark =pc&resolving=&rowser=&rowserEdition=&shopId=%s' % (tshopId)
    html5 = getHtml(shop_url)
    if (html5 != 'req_error'):
    json_shops = json.loads(html5.strip())
    # print(json_shops)
    print(shop_url)
    ShopDetailList = json_shops[0]['ShopDetailList'][0]
    timgName = json_shops[0]['ShopImgList'][0]['timgName']
    # https://creditcard.cmbc.com.cn/
    # print(ShopDetailList)
    print(json_city['tvalue'],businessAreaList['businessAreaName'],tcity, tlongitude, tstatus, tshopId, timgType, tlatitude, tmerchName, taddress,
    timgName, tdiscount,timgName)
    # print(ShopImgList)
    tbriefInfoAdv = ShopDetailList['tbriefInfoAdv']
    ditails = ShopDetailList['ditail']
    content = '名称:' + tmerchName + '\n' + '地址:' + taddress + '\n' + '优惠信息:'+ tdiscount + '\n'

    for ditail in ditails:
    # print(ditail['key'],ditail['value'])
    content = content + ditail['key'] + ':' + ditail['value'] + '\n'
    print('--------------------------------------')
    content = content + '图片地址:' + 'https://creditcard.cmbc.com.cn/'+timgName + '\n' + '--------------------------------------' + '\n'
    writeTxt('cmbc',json_city['tvalue'], businessAreaList['businessAreaName'], content)


    def writeTxt(bank_name,menu_name,short_name,content):
    base_dir = os.path.abspath(__file__)
    parent_dir = os.path.dirname(base_dir)
    menu_dir = os.path.join(parent_dir, bank_name, menu_name)
    if os.path.isdir(menu_dir):
    pass
    else:
    os.makedirs(menu_dir)
    os.chdir(menu_dir)
    file_name = os.path.join(menu_dir, short_name + '.txt')
    with open(file_name, 'a', encoding='utf-8') as file:
    file.write(content)


    if __name__ == '__main__':
    root_url = 'https://creditcard.cmbc.com.cn/fe/getCityList.gsp'
    get_fenye(root_url)

    运行结果:

    后续可以加上线程池方式来加速爬取优惠信息。

    1,增加状态返回码:单进程爬取的时候发现报错,但是重复爬取时又没有错误信息,所以增加响应码来定位问题。

    2,将爬取信息写到文件中,按城市划分目录,地区来命名。

    3,

    相关文章

      网友评论

        本文标题:python爬取优惠信息

        本文链接:https://www.haomeiwen.com/subject/qxgpjxtx.html