美文网首页
简单爬虫练习:爬虫文章

简单爬虫练习:爬虫文章

作者: 绛珠仙靖 | 来源:发表于2019-12-27 15:18 被阅读0次
    #author: Jingke
    from bs4 import BeautifulSoup
    import ssl
    from urllib.request import Request, urlopen
    import urllib.request
    
    
    class Scrape_news():
        """Scrape article links from a web page."""

        @classmethod
        def url_link(cls, url, *args, **kwargs):
            """Fetch *url* and return absolute URLs of its article links.

            Args:
                url: Address of the page to download.
                *args, **kwargs: Forwarded unchanged to
                    ``BeautifulSoup.find_all`` to select the tags to inspect
                    (for example ``"a"``).

            Returns:
                A list of absolute URLs, one per selected tag whose ``href``
                contains the substring ``"articles"``. Entries keep the order
                in which ``find_all`` returned the tags; duplicates are kept.
                The list is also printed to stdout.
            """
            # Imported locally so this block does not depend on the file's
            # top-level import list.
            from urllib.parse import urljoin

            # NOTE(security): this turns off TLS certificate verification for
            # every default HTTPS context in the process, not just for this
            # request. Kept because the original relies on it; remove it if
            # the target site serves a valid certificate.
            ssl._create_default_https_context = ssl._create_unverified_context

            # Install a process-wide opener with a browser-like User-Agent;
            # it applies to later urlopen() calls as well.
            opener = urllib.request.build_opener()
            opener.addheaders = [('User-Agent',
                                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
            urllib.request.install_opener(opener)
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

            # Close the HTTP response as soon as the document is parsed.
            with urlopen(req) as web:
                # Name the parser explicitly so the result does not depend on
                # which optional parsers happen to be installed.
                bsObj = BeautifulSoup(web, 'html.parser')

            links = []
            for tag in bsObj.find_all(*args, **kwargs):
                href = tag.get('href')
                # Tags without an href attribute yield None; skip them instead
                # of failing on the substring test.
                if href and "articles" in href:
                    # urljoin resolves the href against the page URL, so a
                    # root-relative path no longer produces a double slash.
                    links.append(urljoin(url, href))
            print(links)
            return links
    
    
    if __name__ == '__main__':
        # Demo run: inspect every <a> tag on the site's front page.
        # To target headline elements instead, pass
        # "h3", {"class": "smart-dotdotdot"} as the selector arguments.
        front_page = 'http://www.qdaily.com'
        Scrape_news.url_link(front_page, "a")
    
    

    result:
    ['http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64935.html', 'http://www.qdaily.com//articles/64924.html', 'http://www.qdaily.com//articles/64933.html', 'http://www.qdaily.com//articles/64934.html', 'http://www.qdaily.com//articles/64923.html', 'http://www.qdaily.com//articles/64921.html', 'http://www.qdaily.com//articles/64930.html', 'http://www.qdaily.com//articles/64931.html', 'http://www.qdaily.com//articles/64927.html', 'http://www.qdaily.com//articles/64922.html', 'http://www.qdaily.com//articles/64929.html', 'http://www.qdaily.com//articles/64928.html', 'http://www.qdaily.com//articles/64925.html', 'http://www.qdaily.com//articles/64926.html', 'http://www.qdaily.com//articles/64919.html', 'http://www.qdaily.com//articles/64920.html', 'http://www.qdaily.com//articles/64904.html']

    ------------------------------------------------------------------------------------------------------------------#

    #author: Jingke
    
    class Scrape_news():
        """Scrape the text of selected elements from a web page."""

        @classmethod
        def url_link(cls, url, *args, **kwargs):
            """Fetch *url* and return the text of every selected tag.

            Args:
                url: Address of the page to download.
                *args, **kwargs: Forwarded unchanged to
                    ``BeautifulSoup.find_all`` to select the tags whose text
                    is collected (for example ``"h3"`` plus an attribute
                    filter).

            Returns:
                A list with the ``get_text()`` result of each selected tag,
                in the order ``find_all`` returned them. The list is also
                printed to stdout.
            """
            # NOTE(security): this turns off TLS certificate verification for
            # every default HTTPS context in the process, not just for this
            # request. Kept because the original relies on it; remove it if
            # the target site serves a valid certificate.
            ssl._create_default_https_context = ssl._create_unverified_context

            # Install a process-wide opener with a browser-like User-Agent;
            # it applies to later urlopen() calls as well.
            opener = urllib.request.build_opener()
            opener.addheaders = [('User-Agent',
                                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
            urllib.request.install_opener(opener)
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

            # Close the HTTP response as soon as the document is parsed.
            with urlopen(req) as web:
                # Name the parser explicitly so the result does not depend on
                # which optional parsers happen to be installed.
                bsObj = BeautifulSoup(web, 'html.parser')

            titles = [tag.get_text() for tag in bsObj.find_all(*args, **kwargs)]
            print(titles)
            return titles
    
    
    if __name__ == '__main__':
        # Demo run: collect the text of the front page's <h3> elements that
        # match the attribute filter below.
        headline_filter = {"class": "smart-dotdotdot"}
        Scrape_news.url_link('http://www.qdaily.com', "h3", headline_filter)
    

    result:
    ['重新认识人性的可能,如何看待 18 世纪英国平民文化?',
    '两次世界大战之间的日本陆军,他们如何走向战争?',
    '艾滋病如何在美国被发现,又怎样展现人性的复杂?',
    '卢梭研究经典,我们该如何理解卢梭的孤独?',
    '社交媒体和数字技术的发展,如何改变传统人际关系?',
    '如果爱情让人自身和自身保持同一,那它可能是什么?',
    '130 幅城市复原图,如何重现古地中海文明?',
    '从 1931 到 1945 年,日本人的思想发生了什么转变?',
    '百年以来,什么是中国文人论政的报国情怀?']

    相关文章

      网友评论

          本文标题:简单爬虫练习:爬虫文章

          本文链接:https://www.haomeiwen.com/subject/zxsdoctx.html