美文网首页
中国大学排名定向爬虫

中国大学排名定向爬虫

作者: David5262 | 来源:发表于2019-10-28 23:00 被阅读0次
    # coding=gbk
    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    class Ranking:
        """Download a university-ranking page, extract its table, write it to a file.

        Intended call order: ``getHtml`` -> ``urlList`` -> ``printList``.
        """

        # Column lists most recently handed to urlList(); printList() falls back
        # to them when it is not given explicit lists.
        _columns = None

        @staticmethod
        def _cell(value):
            """Return *value* as text, mapping None to an empty string."""
            return "" if value is None else str(value)

        def getHtml(self, url):
            """Fetch *url* and return the response body as text.

            The response encoding is set from ``apparent_encoding`` before the
            body is decoded.

            Returns:
                The page text, or ``""`` when the request fails (connection
                error, timeout after 30 s, or an HTTP error status).
            """
            try:
                r = requests.get(url, timeout=30)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                return r.text
            except requests.RequestException as e:
                # Only request failures are treated as "no page"; anything else
                # (e.g. KeyboardInterrupt) is allowed to propagate.
                print(e)
                return ""

        def urlList(self, numberList, schoolList, scoreList, html):
            """Append the rank, school name and score of each table row to the lists.

            Rows are read from the ``<tbody class="hidden_zhpm">`` element of
            *html*; cells 0, 1 and 3 of every row supply the three values.  Rows
            with fewer than four cells are skipped so that the three lists always
            stay the same length.

            The lists are modified in place and remembered for printList().

            Returns:
                The tuple ``(numberList, schoolList, scoreList)``.  It is returned
                even when parsing fails; in that case the error is printed.
            """
            self._columns = (numberList, schoolList, scoreList)
            try:
                soup = BeautifulSoup(html, "html.parser")
                # Match the tbody tag that carries the expected class attribute.
                tbodyInfo = soup.find("tbody", attrs={'class': 'hidden_zhpm'})
                if isinstance(tbodyInfo, bs4.element.Tag):
                    # Every tr under the tbody is one ranking entry.
                    for trTag in tbodyInfo.find_all('tr'):
                        tdList = trTag.find_all('td')
                        if len(tdList) < 4:
                            continue
                        # get_text() also copes with cells that wrap their text
                        # in several child nodes, where .string would be None.
                        count, schoolName, scores = (
                            tdList[i].get_text(strip=True) for i in (0, 1, 3)
                        )
                        numberList.append(count)
                        schoolList.append(schoolName)
                        scoreList.append(scores)
            except Exception as e:
                # Best effort: report the problem and hand back what was collected.
                print(e)
            return numberList, schoolList, scoreList

        def printList(self, num, numberList=None, schoolList=None, scoreList=None):
            """Write the first *num* ranking rows to ``data2.txt`` (UTF-8, overwritten).

            Each row is written as three tab-separated columns centred in a width
            of 10; the school name is padded with the full-width space
            (``chr(12288)``) so that CJK names line up.

            Args:
                num: Maximum number of rows to write.  Values larger than the
                    available data are clamped instead of raising IndexError.
                numberList, schoolList, scoreList: Columns to write.  Any list
                    left as None is taken from the lists last passed to urlList().
            """
            remembered = self._columns or ([], [], [])
            given = (numberList, schoolList, scoreList)
            columns = [
                fallback if column is None else column
                for column, fallback in zip(given, remembered)
            ]
            # zip() stops at the shortest column, so uneven lists cannot overrun.
            rows = list(zip(*columns))[:max(num, 0)]
            fill = chr(12288)
            try:
                with open("data2.txt", 'w', encoding='utf-8') as f:
                    f.writelines(
                        "{0:^10}\t{1:{3}^10}\t{2:^10}\n".format(
                            self._cell(number), self._cell(school), self._cell(score), fill
                        )
                        for number, school, score in rows
                    )
            except OSError as e:
                print(e)
    if __name__ == '__main__':
        # Page holding the 2019 ranking table.
        url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2019.html"

        # Accumulators for the rank, school-name and score columns.
        numberList, schoolList, scoreList = [], [], []

        # Download the page, parse the table into the three lists, then write
        # as many rows as were collected.
        rank = Ranking()
        html = rank.getHtml(url)
        rank.urlList(numberList, schoolList, scoreList, html)
        rowCount = len(numberList)
        rank.printList(rowCount)
    

    相关文章

      网友评论

          本文标题:中国大学排名定向爬虫

          本文链接:https://www.haomeiwen.com/subject/kcbivctx.html