美文网首页
爬取appstore应用信息

爬取appstore应用信息

作者: 尧月 | 来源:发表于2018-02-27 15:30 被阅读359次
    #!/usr/bin/env python
    # encoding=utf8  
    import sys  
    
    import bs4
    from bs4 import BeautifulSoup
    import requests
    import json
    import re
    
    # Genre names that parse_genre_page skips without crawling. The entries
    # are compared against the link text on the genre index page and mean
    # roughly: newspapers & magazines, stickers, shopping catalogs.
    gIgnoreGenreList = ['报刊杂志', '贴纸', '商品指南']
        
    # https://www.apple.com/cn/itunes/charts/paid-apps/
    # https://www.apple.com/cn/itunes/charts/free-apps/
    def parse_appstore_page(cate_url, out_file):
        """Scrape an App Store chart page and save the listed apps as JSON.

        The page is expected to contain ``div#main`` holding a ``section``
        whose class attribute is ``section apps grid``, which in turn holds a
        ``ul``. Every element child of that list contributes one record,
        built from the link found inside the item's ``h3``.

        Args:
            cate_url: URL of the chart page, e.g.
                https://www.apple.com/cn/itunes/charts/free-apps/
            out_file: Path of the file to write. A falsy value (empty string
                or None) skips writing.

        Raises:
            AttributeError: if one of the expected elements is missing
                (the lookup returns None and the next access fails).
        """
        html = requests.get(cate_url).content
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        result = soup.find("div", id="main")
        result = result.find("section", class_="section apps grid")
        result = result.find("ul")

        app_result = []
        for child in result.children:
            # Only element nodes are list items; skip text nodes.
            if not isinstance(child, bs4.element.Tag):
                continue
            app_info = child.find("h3").find("a")
            app_result.append({
                "app_name": app_info.string,
                "app_detail_url": app_info.get("href"),
            })

        if out_file:
            print("save result to file :%s" % out_file)
            result_string = json.dumps(app_result, ensure_ascii=False)
            # NOTE: under Python 2, writing non-ASCII text here relies on the
            # utf8 default encoding installed by the __main__ block.
            # The context manager closes the file even if a write fails.
            with open(out_file, "w") as f:
                f.write(result_string)
                f.write('\n')
    
    # https://itunes.apple.com/cn/genre/ios/id36?mt=8
    def parse_genre_page(genre_url, limit = 10, out_file = "genre_result.txt"):
        """Crawl the App Store genre index and save each genre's apps as JSON.

        Walks ``div#main > div#content > div#genre-nav > div.grid3-column``
        and treats every direct ``ul`` child as a column of genre links. For
        each processed genre, the apps are fetched with parse_genre_content.

        Args:
            genre_url: URL of the genre index, e.g.
                https://itunes.apple.com/cn/genre/ios/id36?mt=8
            limit: Maximum number of apps to fetch per genre; passed straight
                to parse_genre_content.
            out_file: Path of the file to write. A falsy value (empty string
                or None) skips writing.

        Raises:
            AttributeError: if one of the expected elements is missing
                (the lookup returns None and the next access fails).
        """
        html = requests.get(genre_url).content
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        result = soup.find("div", id="main")
        result = result.find("div", id="content")
        result = result.find("div", id="genre-nav")
        result = result.find("div", class_="grid3-column")

        cate_result = []
        for one_ul in result.find_all("ul", recursive=False):
            for cate in one_ul.children:
                # Text nodes are string subclasses: calling .find("a") on one
                # is str.find and yields an int, so they must be skipped
                # before any element lookup.
                if not isinstance(cate, bs4.element.Tag):
                    continue

                cate_info = cate.find("a")
                cate_name = cate_info.string

                if cate_name in gIgnoreGenreList:
                    print("ingore cate :" + cate_name)
                    continue

                cate_url = cate_info.get("href")

                print("processing genre %s." % cate_name)
                cate_result.append({
                    "name": cate_name,
                    "url": cate_url,
                    "app_list": parse_genre_content(cate_url, limit),
                })
                # NOTE(review): this stops after the first non-ignored genre
                # of each column, so only one genre per <ul> is crawled.
                # Looks like a leftover from testing -- confirm before
                # removing, since it greatly increases the request count.
                break

        if out_file:
            print("save result to file :%s" % out_file)
            result_string = json.dumps(cate_result, ensure_ascii=False)
            # NOTE: under Python 2, writing non-ASCII text here relies on the
            # utf8 default encoding installed by the __main__ block.
            # The context manager closes the file even if a write fails.
            with open(out_file, "w") as f:
                f.write(result_string)
                f.write('\n')
    
    # https://itunes.apple.com/cn/genre/ios-导航/id6010?mt=8 
    # 返回一个数组 [{app_name:xxxx, app_detail_url:xxxxx}]
    # limit 为app返回数量,-1 为不限制
    def parse_genre_content(content_url, limit = -1):
        """Collect the apps listed on a single genre page.

        Walks ``div#main > div#content > div#selectedgenre >
        div.grid3-column``, then reads the first ``ul`` found under each
        ``div`` inside it. Every element child of such a list yields one
        record; the app's detail page is fetched as well and its fields are
        merged into the record when available.

        Args:
            content_url: URL of the genre page, e.g.
                https://itunes.apple.com/cn/genre/ios-导航/id6010?mt=8
            limit: Maximum number of apps to return. Any value below 0
                means no limit.

        Returns:
            A list of dicts with the keys ``app_name``, ``app_detail_url``
            and ``app_id``, plus whatever parse_detail_page returned.

        Raises:
            AttributeError: if one of the expected container elements is
                missing (the lookup returns None and the next access fails).
        """
        html = requests.get(content_url).content
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        result = soup.find("div", id="main")
        result = result.find("div", id="content")
        result = result.find("div", id="selectedgenre")
        result = result.find("div", class_="grid3-column")

        app_result = []
        for one_div in result.find_all("div"):
            app_ul = one_div.find("ul")
            if app_ul is None:
                # A div that holds no list contributes no apps.
                continue

            for app in app_ul.children:
                # Only element nodes are list items; skip text nodes.
                if not isinstance(app, bs4.element.Tag):
                    continue

                # Stop the whole crawl once the limit is reached, instead of
                # only leaving the current column and scanning the rest.
                if limit > -1 and len(app_result) >= limit:
                    return app_result

                app_info = app.find("a")
                app_url = app_info.get("href")

                one_app = {
                    "app_name": app_info.string,
                    "app_detail_url": app_url,
                    "app_id": parse_appid(app_url),
                }

                app_detail = parse_detail_page(app_url)
                if app_detail is not None:
                    print(app_detail)
                    one_app.update(app_detail)

                app_result.append(one_app)

        return app_result
    
    
    # https://itunes.apple.com/cn/app/高德地图-精准导航-出行必备/id461703208?mt=8
    # {latest_version:x.x.x, update_date:xxxxxxx, system_version:iOS 7.0}
    def parse_detail_page(detail_url):
        """Fetch an app's detail page and extract its version information.

        Args:
            detail_url: URL of the app page, e.g.
                https://itunes.apple.com/cn/app/.../id461703208?mt=8

        Returns:
            A dict with the keys ``update_date``, ``latest_version`` and
            ``system_version``, or None when the page does not contain the
            elements those values are read from.

        Raises:
            AttributeError: if a version label is present but does not match
                the pattern expected by parse_appver / parse_systver.
        """
        html = requests.get(detail_url).content
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        result = soup.find("main", class_="is-app-theme")
        if result is None:
            return None

        # Section holding the latest version label and its release date.
        whats_new = result.find("section", class_="l-content-width section section--bordered whats-new")
        # App information list.
        information = result.find("dl", class_=["information-list", "information-list--app", "medium-columns"])
        if whats_new is None or information is None:
            return None

        time_tag = whats_new.find("time")
        version_tag = whats_new.find("p")
        if time_tag is None or version_tag is None:
            return None

        # The minimum supported iOS version is read from the first span of
        # the fourth div inside the information list.
        info_rows = information.select("div")
        if len(info_rows) < 4:
            return None
        version_spans = info_rows[3].select("div > span")
        if not version_spans:
            return None

        return {
            "update_date": time_tag.string,
            "latest_version": parse_appver(version_tag.string),
            "system_version": parse_systver(version_spans[0].string),
        }
    
    def parse_appid(detail_url):
        """Return the last path segment of *detail_url*, before the query.

        Example: ".../app/name/id461703208?mt=8" yields "id461703208".

        Raises:
            AttributeError: if the pattern does not match, e.g. the URL has
                no "?" query string (re.search returns None).
        """
        # Raw string: "\?" is not a valid string escape, so the non-raw form
        # triggers an invalid-escape warning on current Python versions.
        # Pattern: a run of non-"/" characters with no "/" anywhere after it,
        # immediately followed by "?".
        return re.search(r"[^/]+(?!.*/)(?=[\?]+)", detail_url).group(0)
    
    def parse_systver(version_label):
        """Extract the "major.minor" iOS version from a requirement label.

        Example: "xxx iOS 8.0 asdasdadasd" yields "8.0". Only the first two
        numeric components are captured.

        Raises:
            AttributeError: if the label contains no "iOS" followed by one
                character and a "digits.digits" version (re.search returns
                None).
        """
        # Raw string: "\d" and "\." are not valid string escapes, so the
        # non-raw form triggers an invalid-escape warning on current Python.
        return re.search(r"iOS.(\d+\.\d+)", version_label).group(1)
    
    def parse_appver(version_label):
        """Extract the version number from a label such as "版本 10.5.0".

        Returns everything after the label word and one whitespace character,
        up to the end of the line.

        Raises:
            AttributeError: if the label does not contain the expected
                prefix (re.search returns None).
        """
        # The backslash is doubled because "\s" is not a valid string escape;
        # the compiled pattern is unchanged. A u"" literal (rather than ur"")
        # keeps the line valid on both Python 2 and Python 3.
        return re.search(u"版本\\s(.+)", version_label).group(1)
    
    # Script entry point: crawl the iOS genre index and write the result to
    # appstore-genre.txt.
    if __name__ == '__main__':
    
        # Python 2 only: setdefaultencoding is removed from sys at startup, so
        # sys must be reloaded before it can be called. Making utf8 the
        # default lets the non-ASCII JSON text be written to plain files.
        reload(sys)  
        sys.setdefaultencoding('utf8')   
        # Alternative entry points for the chart pages (disabled):
        # parse_appstore_page('https://www.apple.com/cn/itunes/charts/free-apps/', 'free-apps.txt')
        # parse_appstore_page('https://www.apple.com/cn/itunes/charts/paid-apps/', 'paid-apps.txt')
    
        # The second argument (limit=1) fetches at most one app per genre.
        parse_genre_page('https://itunes.apple.com/cn/genre/ios/id36?mt=8', 1, 'appstore-genre.txt')
    
        # version = re.search("iOS.(\d+\.\d+)", "xxx iOS 8.0 asdasdadasd").group(1)
        # print version
    
    

    相关文章

      网友评论

          本文标题:爬取appstore应用信息

          本文链接:https://www.haomeiwen.com/subject/kckmxftx.html