美文网首页
虎牙、B站网页信息python抓取试试

虎牙、B站网页信息python抓取试试

作者: 千转军师 | 来源:发表于2021-08-24 11:57 被阅读0次

    利用虎牙和b站的网页来抓取用户及其粉丝数量

    使用时要注意:
    (1)在 cmd 命令行下运行 python test.py,后面需要带 4 个参数;
    如果没有输入参数,程序会打印用法提示,如:

    useage: python test.py <roomIdStart[0:10000000]> <roomIdInterval[0:10000]> <multiProcessNum[0:20]> <loopTimes[0:20]>
    param num is less than 5, num is 1
    

    四个参数的含义依次是:起始房间号、搜索组的大小(每个进程负责的房间数)、并发进程数、循环次数
    搜索的总共房间数 = 搜索组的大小 * 进程数 * 循环次数
    (2)结果
    结果放在文件 data/target.txt
    注意要事先创建文件夹 data,否则程序会报错,提示找不到 data/target.txt
    (3)例子(虎牙):

    ================= new record ==================
    time:2022-02-20 10:18:55
    roomIdStart:1000
    roomIdInterval:10
    multiProcessNum:18
    loopTimes:2
    range[1000:1360]
    ===============================================
    1066 电子厂-心态 7945900 1539218884
    1123 奇领颜韵Ycy【万徒】 117598 1199552286636
    ==========================
    总共时18.54秒

    ##################################
    # 每日一抓:  虎牙粉丝排行榜数据 #
    ##################################
    from urllib.request import urlopen
    import sys
    
    
    
    #创建(覆盖)文件
    def create_file(name):
        """Create an empty file at ``name``, truncating it if it already exists."""
        with open(name, "w"):
            pass
    
    #数据存入文件
    def save_data_to_file(fileName, bufUtf8):
        """Append already-encoded bytes to a file.

        Args:
            fileName: Path of the file to append to (created if missing).
            bufUtf8: ``bytes`` to append; callers pass UTF-8 encoded text.
        """
        # The handle used to be left open; the context manager guarantees the
        # data is flushed and the descriptor released before returning.
        with open(fileName, "ab") as f:
            f.write(bufUtf8)
    
    
    
    #抓取网页数据2 虎牙视频网页,抓取订阅数
    #例子: https://v.huya.com/u/1199553057095
    
    def get_subscribe_num_by_idstr(idstr):
        """Read the subscriber count from a Huya video page.

        Requests ``https://v.huya.com/u/<idstr>`` and looks at lines 180-280 of
        the response for a line containing the subscription marker.

        Args:
            idstr: Video-page id as a string; it is appended to the base URL.

        Returns:
            The text between the marker and the end-of-line marker of the last
            matching line, or ``"0"`` when no line in the window matches.
        """
        videoBaseUrl = 'https://v.huya.com/u/'
        subscribeMark = "                        <span>订阅:<em>"
        subscribeEndMark = "</em></span>\r\n"
        result = "0"
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(videoBaseUrl + idstr) as response:
            for cnt, line in enumerate(response, start=1):
                # Only lines 180..280 are inspected, to save time.
                if cnt < 180:
                    continue
                if cnt > 280:
                    # Nothing after the window can change the result, so stop
                    # downloading the rest of the page.
                    break
                line_str = line.decode(encoding="utf-8")
                if subscribeMark in line_str:
                    # The slice assumes the marker starts at column 0 and the
                    # line ends with the end marker.
                    result = line_str[len(subscribeMark): len(line_str) - len(subscribeEndMark)]
        return result
    
    #抓取网页数据
    #例子: https://www.huya.com/298039
    
    def web_content_pro(url, roomId, fileName):
        """Scrape one Huya room page and append the result to ``fileName``.

        Lines 100-200 of the page at ``url`` are searched for the host name, the
        subscriber count and the id of the host's video page (first match of
        each).  When a host name was found, a tab-separated record
        ``roomId, host name, subscribers, video id`` is printed and appended to
        ``fileName`` as UTF-8.  If the subscriber value is still ``"0"`` after
        the scan, it is looked up through ``get_subscribe_num_by_idstr`` and the
        record is tagged with ``[alarm:subscribe=0]``.  When no host name was
        found, only a "not found" message is printed.

        Args:
            url: Full URL of the room page.
            roomId: Room id, used only for the output record and messages.
            fileName: File the record is appended to.
        """
        anchorMark = "<h3 class=\"host-name\" title="
        anchorEndMark = "</h3>\r\n"
        subscribeMark = "        <div class=\"subscribe-count\" id=\"activityCount\">"
        subscribeEndMark = "</div>\r\n"
        videoMark = "            <a class=\"host-video\" href=\"http://v.huya.com/u/"
        videoEndMark = "\" target=\"_blank\"><i></i><em>视频</em></a>\r\n"
        anchor = ""
        subscribe = "0"
        video = ""
        foundAnchor = False
        foundSubscribe = False
        foundVideo = False
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(url) as response:
            for cnt, line in enumerate(response, start=1):
                # Only lines 100..200 are inspected, to save time.
                if cnt < 100:
                    continue
                if cnt > 200:
                    # Past the window nothing more can match; stop downloading.
                    break
                line_str = line.decode(encoding="utf-8")
                length = len(line_str)

                # Host name: text after the first ">" up to the closing tag.
                if not foundAnchor and anchorMark in line_str:
                    foundAnchor = True
                    anchor = line_str[line_str.find(">") + 1: length - len(anchorEndMark)]
                # Subscriber count.
                if not foundSubscribe and subscribeMark in line_str:
                    foundSubscribe = True
                    subscribe = line_str[len(subscribeMark): length - len(subscribeEndMark)]
                # Id of the host's video page.
                if not foundVideo and videoMark in line_str:
                    foundVideo = True
                    video = line_str[len(videoMark): length - len(videoEndMark)]
                if foundAnchor and foundSubscribe and foundVideo:
                    break

        if not anchor:
            print("roomId:" + str(roomId) + "==>【未找到】")
            return

        endMark = ""
        if subscribe == '0':
            # Fall back to the count shown on the video page and flag the record.
            subscribe = get_subscribe_num_by_idstr(video)
            endMark = "\t[alarm:subscribe=0]"
        resultOut = str(roomId) + "\t" + anchor + "\t" + subscribe + "\t" + video + endMark
        print(resultOut)
        save_data_to_file(fileName, (resultOut + "\n").encode(encoding="utf-8"))
        
    
    #抓取数据
    def catch_data(fileName, url, start, num):
        """Scrape ``num`` consecutive room pages starting at room id ``start``.

        Args:
            fileName: File that ``web_content_pro`` appends each record to.
            url: Base URL; the room id is appended to it.
            start: First room id to visit.
            num: Number of consecutive room ids to visit.
        """
        # The unused ``urlTmp`` built from the module-level ``urlStd`` was dead
        # code and is dropped; only the ``url`` argument determines the target.
        for roomId in range(start, start + num):
            web_content_pro(url + str(roomId), roomId, fileName)
            
            
    #并发任务
    from multiprocessing import Process
    from os import getpid
    # Base URL of a Huya live room; the room id is appended to it.
    urlStd = "https://www.huya.com/"
    # Sample values left over from manual experiments (disabled):
    #catchStart = 298039
    #catchStart = 521000
    #catchNum = 2
    # Path prefix of the per-process result files; "<index>.txt" is appended.
    fileForSaveData = "data/fansData"
    
    def catch_data_task(index, start, num):
        """Worker entry point: scrape ``num`` rooms from ``start`` into its own file.

        The result file is ``fileForSaveData`` followed by ``index`` and
        ``.txt``; it is created (or emptied) before scraping begins.
        """
        partFile = "".join((fileForSaveData, str(index), ".txt"))
        create_file(partFile)
        catch_data(partFile, urlStd, start, num)
        
    #目标文件添加内容
    # Path prefix of the per-process result files (repeats the assignment made
    # before catch_data_task with the same value).
    fileForSaveData = "data/fansData"
    # File that receives the run header, the merged per-process results and the
    # final timing line.
    targetFileName = "data/target.txt"
    def target_file_add_content(num):
        """Append the per-process result files to the target file.

        Reads ``fileForSaveData + "<x>.txt"`` for ``x`` in ``0 .. num-1`` in
        order and appends their raw bytes to ``targetFileName``.

        Args:
            num: Number of per-process files to merge.
        """
        # Context managers make sure both handles are closed even when one of
        # the part files is missing or unreadable.
        with open(targetFileName, "ab") as target:
            for x in range(num):
                partName = fileForSaveData + str(x) + ".txt"
                with open(partName, "rb") as part:
                    target.write(part.read())
    
    #主函数
    #注:1000个房间号,20个线程,实测耗时 430s 、 380s
    def task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes):
        """Scan a range of Huya room ids with batches of worker processes.

        A header describing the run is appended to ``targetFileName``.  Then,
        ``loopTimes`` times, ``multiProcessNum`` processes are started; each one
        runs ``catch_data_task`` on its own slice of ``roomIdInterval`` room ids.
        After all processes of a batch have finished, their result files are
        merged into the target file.  Finally the elapsed time is printed and
        appended to the target file.

        Args:
            roomIdStart: First room id of the whole scan.
            roomIdInterval: Number of room ids handled by one process.
            multiProcessNum: Number of processes per batch.
            loopTimes: Number of batches.
        """
        # A single import: the module used to be imported and then shadowed by
        # ``from time import time`` halfway through the function.
        import time

        roomsPerLoop = roomIdInterval * multiProcessNum

        title = "\n================= new record ==================\n"
        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        title += "time:" + timestr + "\n"
        title += "roomIdStart:" + str(roomIdStart) + "\nroomIdInterval:" + str(roomIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
        rangeStr = "range[" + str(roomIdStart) + ":" + str(roomIdStart + roomsPerLoop * loopTimes) + "]"
        title += rangeStr + "\n"
        title += "===============================================\n"
        with open(targetFileName, "ab") as target:
            target.write(title.encode(encoding="utf8"))

        # Timing covers only the scraping and merging, not the header write.
        start = time.time()
        for loop in range(loopTimes):
            loopStart = roomIdStart + loop * roomsPerLoop
            workers = [
                Process(target=catch_data_task, args=(x, loopStart + (x * roomIdInterval), roomIdInterval))
                for x in range(multiProcessNum)
            ]
            for worker in workers:
                worker.start()
            # Wait for the whole batch before merging its result files.
            for worker in workers:
                worker.join()
            target_file_add_content(multiProcessNum)
        end = time.time()

        timeStr = '==========================\n总共时%.2f秒' % (end - start)
        print(timeStr)
        with open(targetFileName, "ab") as target:
            target.write(timeStr.encode(encoding="utf8"))
        
    def main():
        """Parse and validate the command-line arguments, then start the scan.

        Expects four integer arguments: roomIdStart (0..10000000),
        roomIdInterval (0..10000), multiProcessNum (0..20) and loopTimes
        (0..20).  On the first missing, non-integer or out-of-range argument
        the usage text plus a reason is printed and the function returns
        without starting anything.
        """
        usage = "useage: python " + sys.argv[0] + " <roomIdStart[0:10000000]>" + " <roomIdInterval[0:10000]>" + " <multiProcessNum[0:20]>" + " <loopTimes[0:20]>"
        if len(sys.argv) <= 4:
            print(usage + "\nparam num is less than 5, num is " + str(len(sys.argv)))
            return

        # (parameter name, inclusive upper bound), in command-line order.
        limits = (
            ("roomIdStart", 10000000),
            ("roomIdInterval", 10000),
            ("multiProcessNum", 20),
            ("loopTimes", 20),
        )
        values = []
        for position, (paramName, upperBound) in enumerate(limits, start=1):
            raw = sys.argv[position]
            try:
                value = int(raw)
            except ValueError:
                # Previously a non-numeric argument ended in a traceback.
                print(usage + "\nparam " + paramName + " is not an integer, you input ==>" + raw)
                return
            if value < 0 or value > upperBound:
                print(usage + "\nparam " + paramName + " is out of range, you input ==>" + raw)
                return
            values.append(value)

        roomIdStart, roomIdInterval, multiProcessNum, loopTimes = values
        print("your input ==>\nroomIdStart:%d roomIdInterval:%d multiProcessNum:%d loopTimes:%d\n"%(roomIdStart, roomIdInterval, multiProcessNum, loopTimes))
        task_start(roomIdStart, roomIdInterval, multiProcessNum, loopTimes)
        
    #=============================
    # Run the command-line entry point only when this file is executed
    # directly, not when it is imported.
    if __name__ == '__main__':
        main()
        # Disabled debug output of the raw command-line arguments:
        #print(sys.argv[0])
        #print(len(sys.argv))
    
    
    ##################################
    #          测       试       :b站用户粉丝数量信息抓取(多次使用可能被短期禁止访问)    #
    ##################################
    from urllib.request import urlopen
    from multiprocessing import freeze_support,Lock,Process,Value
    import sys
    
    # Shared integer counter (typecode 'i', initial value 0).  task_start hands
    # it to the worker processes, which add 1 for every user id they process.
    g_var_cnt=Value('i',0)
    # Lock the workers hold while incrementing the shared counter.
    g_var_lock=Lock()
    
    
    #创建(覆盖)文件
    def create_file(name):
        """Create ``name`` as an empty file, discarding any existing content."""
        open(name, "w").close()
        
    #向文件写入内容
    def write_to_file(name, bufIn):
        """Append the text ``bufIn`` to the file ``name`` (created if missing)."""
        with open(name, "a") as out:
            out.write(bufIn)
    
    #参考: 获取网页隐藏信息  https://blog.csdn.net/qq_38270802/article/details/90204609
    #"https://space.bilibili.com/10558188"
    #"https://api.bilibili.com/x/space/acc/info?mid=10558188&jsonp=jsonp"
    #"https://api.bilibili.com/x/relation/stat?vmid=10558188&jsonp=jsonp"
    def _text_between(text, startMark, endMark):
        """Return the part of ``text`` between ``startMark`` and the next ``endMark``.

        Returns ``None`` when either marker is missing.
        """
        begin = text.find(startMark)
        if begin == -1:
            return None
        begin += len(startMark)
        end = text.find(endMark, begin)
        if end == -1:
            return None
        return text[begin:end]


    def get_user_info(totle, fileName, rangeStart, rangeEnd, cnt, lock):
        """Fetch nickname, following and follower counts for a range of user ids.

        For every id in ``range(rangeStart, rangeEnd)`` two bilibili API URLs
        are requested (account info and relation stat).  The values are cut out
        of the response text with fixed markers, one tab-separated line
        ``id, name, following, follower`` is written to ``fileName``, and a
        progress line is printed.

        Args:
            totle: Total number of ids handled by all workers of the current
                batch; used only for the progress percentage.
            fileName: Output file; overwritten, written as UTF-8.
            rangeStart: First user id (inclusive).
            rangeEnd: Last user id (exclusive).
            cnt: Shared counter object with a ``value`` attribute, incremented
                once per processed id.
            lock: Lock held while the counter is updated.

        Raises:
            Whatever ``urlopen`` raises for a failed request, and ``ValueError``
            if a count between the markers is not an integer.
        """
        urlForInfoStart = "https://api.bilibili.com/x/space/acc/info?mid="
        urlForInfoEnd = "&jsonp=jsonp"
        nameMarkStart = "\"name\":\""
        nameMarkEnd = "\",\"sex\":\""

        urlForStatStart = "https://api.bilibili.com/x/relation/stat?vmid="
        urlForStatEnd = "&jsonp=jsonp"
        followingMarkStart = "\"following\":"
        followingMarkEnd = ",\"whisper\":"
        followMarkStart = "\"follower\":"
        followMarkEnd = "}}"

        # UTF-8 is explicit because the file is later merged byte-for-byte into
        # a target file whose header is written as UTF-8.
        with open(fileName, "w", encoding="utf-8") as f:
            for x in range(rangeStart, rangeEnd):
                # Reset per user so a failed lookup is reported with the
                # defaults instead of the previous user's values.
                name = "[null]"
                following = 0
                follow = 0

                # Nickname.  (A leftover extra request for a fixed user id that
                # ran on every iteration has been removed.)
                with urlopen(urlForInfoStart + str(x) + urlForInfoEnd) as response:
                    rst = response.read().decode("utf-8")
                found = _text_between(rst, nameMarkStart, nameMarkEnd)
                if found is not None:
                    name = found

                # Following and follower counts.
                with urlopen(urlForStatStart + str(x) + urlForStatEnd) as response:
                    rst = response.read().decode("utf-8")
                found = _text_between(rst, followingMarkStart, followingMarkEnd)
                if found is not None:
                    following = int(found)
                found = _text_between(rst, followMarkStart, followMarkEnd)
                if found is not None:
                    follow = int(found)

                # Update the shared progress counter and read it back while
                # still holding the lock.
                with lock:
                    cnt.value += 1
                    done = cnt.value

                record = "%d\t%s\t%d\t%d" % (x, name, following, follow)
                # "\r" rewrites the same console line; pad to 40 characters so
                # a shorter line fully overwrites a longer previous one.
                outStr = "\r%d%%\t" % (done * 100 / totle) + record
                print(outStr.ljust(40), end="")
                f.write(record + "\n")

        print("")
    
    def get_user_info_task(dataFileName, loopTimes, loop, index, multiProcessNum, start, interval, cnt, lock):
        """Worker entry point: print the batch banner and scrape one id slice.

        The slice ``start .. start + interval`` is written to the file named
        ``dataFileName`` followed by ``index`` and ``.txt``.
        """
        percentDone = loop * 100 / loopTimes
        print("=======================================>task[%d/%d]%d%%"%(loop + 1, loopTimes, percentDone))
        partFile = "".join((dataFileName, str(index), ".txt"))
        batchTotal = multiProcessNum * interval
        get_user_info(batchTotal, partFile, start, start + interval, cnt, lock)
        
    
    def target_file_add_content(targetFileName, dataFileName, num):
        """Append the per-process result files to the target file.

        Reads ``dataFileName + "<x>.txt"`` for ``x`` in ``0 .. num-1`` in order
        and appends their raw bytes to ``targetFileName``.

        Args:
            targetFileName: File to append to (created if missing).
            dataFileName: Path prefix of the per-process files.
            num: Number of per-process files to merge.
        """
        # Context managers make sure both handles are closed even when one of
        # the part files is missing or unreadable.
        with open(targetFileName, "ab") as target:
            for x in range(num):
                partName = dataFileName + str(x) + ".txt"
                with open(partName, "rb") as part:
                    target.write(part.read())
    
    def task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes):
        """Scan a range of bilibili user ids with batches of worker processes.

        A header describing the run is appended to ``data/target.txt``.  Then,
        ``loopTimes`` times, ``multiProcessNum`` processes are started; each one
        runs ``get_user_info_task`` on its own slice of ``userIdInterval`` ids.
        After every batch the per-process files are merged into the target
        file.  Finally a summary line is printed and appended to the target
        file.

        Args:
            userIdStart: First user id of the whole scan.
            userIdInterval: Number of user ids handled by one process.
            multiProcessNum: Number of processes per batch.
            loopTimes: Number of batches.
        """
        import time
        start = time.time()
        dataFileName = "data/biliData"
        targetFileName = "data/target.txt"
        totalUsers = userIdInterval * multiProcessNum * loopTimes

        title = "\n================= new record ==================\n"
        timestr = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        title += "time:" + timestr + "\n"
        title += "userIdStart:" + str(userIdStart) + "\nuserIdInterval:" + str(userIdInterval) + "\nmultiProcessNum:" + str(multiProcessNum) + "\nloopTimes:" + str(loopTimes) + "\n"
        rangeStr = "range[" + str(userIdStart) + ":" + str(userIdStart + totalUsers) + "]"
        title += rangeStr + "\n"
        title += "===============================================\n"
        with open(targetFileName, "ab") as target:
            target.write(title.encode(encoding="utf8"))

        # The shared counter is reset for every batch because the workers use
        # it for the per-batch progress percentage; the run total is therefore
        # accumulated here instead of being read from the counter at the end.
        searched = 0
        for loop in range(loopTimes):
            g_var_cnt.value = 0
            loopStart = userIdStart + loop * userIdInterval * multiProcessNum
            workers = [
                Process(target=get_user_info_task, args=(dataFileName, loopTimes, loop, i, multiProcessNum, loopStart + (i * userIdInterval), userIdInterval, g_var_cnt, g_var_lock,))
                for i in range(multiProcessNum)
            ]
            for worker in workers:
                worker.start()
            # Wait for the whole batch before merging its result files.
            for worker in workers:
                worker.join()
            searched += g_var_cnt.value
            target_file_add_content(targetFileName, dataFileName, multiProcessNum)
        end = time.time()

        timeStr = '\n==========================\n总共时%.2f秒, 搜索%d位用户[%d:%d], 实际搜索%d(访问可能被禁止)' % (end - start, totalUsers, userIdStart, userIdStart + totalUsers, searched)
        print(timeStr)
        # Written as UTF-8 bytes, like the header, so the target file does not
        # end up with mixed encodings on platforms whose default is not UTF-8.
        with open(targetFileName, "ab") as target:
            target.write(timeStr.encode(encoding="utf8"))
    
    
    def main():
        """Parse and validate the command-line arguments, then start the scan.

        Expects four integer arguments: userIdStart (0..100000000),
        userIdInterval (0..10000), multiProcessNum (0..20) and loopTimes
        (0..20).  On the first missing, non-integer or out-of-range argument
        the usage text plus a reason is printed and the function returns
        without starting anything.
        """
        usage = "useage: python " + sys.argv[0] + " <userIdStart[0:100000000]>" + " <userIdInterval[0:10000]>" + " <multiProcessNum[0:20]>" + " <loopTimes[0:20]>"
        if len(sys.argv) <= 4:
            print(usage + "\nparam num is less than 5, num is " + str(len(sys.argv)))
            return

        # (parameter name, inclusive upper bound), in command-line order.
        limits = (
            ("userIdStart", 100000000),
            ("userIdInterval", 10000),
            ("multiProcessNum", 20),
            ("loopTimes", 20),
        )
        values = []
        for position, (paramName, upperBound) in enumerate(limits, start=1):
            raw = sys.argv[position]
            try:
                value = int(raw)
            except ValueError:
                # Previously a non-numeric argument ended in a traceback.
                print(usage + "\nparam " + paramName + " is not an integer, you input ==>" + raw)
                return
            if value < 0 or value > upperBound:
                print(usage + "\nparam " + paramName + " is out of range, you input ==>" + raw)
                return
            values.append(value)

        userIdStart, userIdInterval, multiProcessNum, loopTimes = values
        print("============================================================")
        print("                       task  start                          ")
        print("userIdStart:%d userIdInterval:%d multiProcessNum:%d loopTimes:%d"%(userIdStart, userIdInterval, multiProcessNum, loopTimes))
        print("============================================================")
        task_start(userIdStart, userIdInterval, multiProcessNum, loopTimes)
    
    
    # Run the command-line entry point only when this file is executed
    # directly, not when it is imported.
    if __name__ == '__main__':
        main()
    
    

    相关文章

      网友评论

          本文标题:虎牙、B站网页信息python抓取试试

          本文链接:https://www.haomeiwen.com/subject/ilililtx.html