美文网首页
爬虫实战2——古诗

爬虫实战2——古诗

作者: AlexDM | 来源:发表于2017-01-12 10:12 被阅读15次

    前一篇文章记录了如何获取网页内的古诗网址、作者和古诗题目,接下来就是下载古诗的内容了。

    # coding:utf-8
    
    import urllib2,re
    from bs4 import BeautifulSoup
    import bs4
    
    def retrive_tangshi_300():
        """Scrape the Tang-poem index page and list the poems linked from it.

        Downloads http://www.gushiwen.org/gushi/tangshi.aspx, walks the direct
        children of every ``<div class="guwencont2">`` and keeps the tags whose
        text has the form ``title(author)``.

        Returns:
            A list of dicts, one per matching tag, with the keys ``'url'``
            (the tag's ``href`` attribute), ``'title'`` and ``'author'``.
            The list is empty when nothing on the page matches.

        Raises:
            urllib2.URLError: if the index page cannot be fetched.
        """
        url = 'http://www.gushiwen.org/gushi/tangshi.aspx'
        r = urllib2.urlopen(url)
        try:
            html = r.read()
        finally:
            # Release the HTTP connection even if the read fails.
            r.close()
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        # Compiled once, outside the loops: "<title>(<author>)".
        pattern = re.compile(r'(.*)\((.*)\)')

        shige_list = []
        for tag in soup.find_all('div', class_="guwencont2"):
            for t in tag.children:
                # Children also include bare text nodes; only tags can be links.
                if not isinstance(t, bs4.element.Tag):
                    continue
                # .string is None when the tag has no single text child, and
                # a tag without href cannot be followed; skip both instead of
                # raising TypeError / KeyError.
                text = t.string
                href = t.get('href')
                if text is None or href is None:
                    continue
                m = pattern.match(text)
                if m:
                    shige_list.append({
                        'url': href,
                        'title': m.group(1),
                        'author': m.group(2),
                    })
        return shige_list
    
    def load_poem(poems):
        """Download one poem's page and store its text on the poem record.

        Args:
            poems: a dict describing a single poem. Its ``'url'`` value is
                appended to ``http://www.gushiwen.org`` to build the page
                address.

        Returns:
            The same dict, mutated in place. ``'content'`` is set to the text
            of the last ``<p align="center">`` element on the page. When the
            page has no such element, ``'content'`` is set to an empty string
            unless the dict already carries a value for it, so callers can
            always read the key.

        Raises:
            KeyError: if ``poems`` has no ``'url'`` entry.
            urllib2.URLError: if the page cannot be fetched.
        """
        u = 'http://www.gushiwen.org' + poems['url']
        r = urllib2.urlopen(u)
        try:
            html = r.read()
        finally:
            # Release the HTTP connection even if the read fails.
            r.close()
        soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')

        # find_all only yields Tag objects, so no per-item type check is needed.
        tags = soup.find_all('p', align='center')
        if tags:
            # When several paragraphs match, the last one wins.
            poems['content'] = tags[-1].get_text()
        else:
            # Guarantee the key exists without clobbering a previous value.
            poems.setdefault('content', u'')
        return poems
    
    
    if __name__ == '__main__':
        r = retrive_tangshi_300()
        for i in range(3):
            #print r[i]
            sg = load_poem(r[i])
            print sg['title'],sg['author'],sg['content']
        #print r[0]
    

    相关文章

      网友评论

          本文标题:爬虫实战2——古诗

          本文链接:https://www.haomeiwen.com/subject/lfsdbttx.html