美文网首页
requests_html库的一些简单使用

requests_html库的一些简单使用

作者: xin激流勇进 | 来源:发表于2018-07-08 15:04 被阅读0次

    requests_html常用方法

    from requests_html import HTMLSession
    
    # Open a browser-like session and fetch the cnblogs "recommended news" page.
    session = HTMLSession()
    r = session.get('https://news.cnblogs.com/n/recommend')

    # Select every <a> that is a direct child of an <h2 class="news_entry">.
    news = r.html.find('h2.news_entry > a')
    for entry in news:
        # Headline text first, then the element's absolute_links value.
        print(entry.text)
        print(entry.absolute_links)
    

    加载js,下载漫画图片

    %%time
    # `display` and `Image` are IPython helpers. Import them inside this cell so
    # it runs on its own instead of raising NameError when executed before the
    # cell that imports them further down.
    from IPython.display import display, Image

    # Relies on the `session` object created in the first snippet.
    for i in range(1, 16):
        r = session.get('http://www.gugu5.com/n/14178/556176.html?p=%s' % i)
        # Run the page's JavaScript so the picture element is present in r.html.
        r.html.render()
        src = r.html.find('#qTcms_pic', first=True).attrs['src']
        print(src)
        display(Image(url=src))
    

    http://html.python-requests.org/

    小例子

    from requests_html import HTMLSession
    from IPython.display import display, Image
    
    session = HTMLSession()
    
    %%time
    # Walk pages 1 through 14 of the chapter, rendering each one so that the
    # JavaScript-inserted picture element can be looked up.
    for i in range(1, 15):
        page_url = 'http://www.gugu5.com/n/14178/531259.html?p=%s' % i
        r = session.get(page_url)
        r.html.render()

        # The comic picture is the element with id "qTcms_pic".
        picture = r.html.find('#qTcms_pic', first=True)
        src = picture.attrs['src']

        # Show the picture inline, then print the page label.
        display(Image(url=src))
        print('第%s页' % i)
    

    爬取猫眼top100电影

    import requests
    from bs4 import BeautifulSoup
    import csv
    
    
    def get_page(url, timeout=10):
        """Download *url* and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests.get`` may block indefinitely.

        Returns:
            The decoded body of the response (``Response.text``).

        Raises:
            requests.exceptions.HTTPError: If the server answers with a 4xx or
                5xx status code.
            requests.exceptions.Timeout: If the server does not answer within
                *timeout* seconds.
        """
        headers = {
            'Host': 'maoyan.com',
            'Referer': 'http://maoyan.com/board',
            'User-Agent': 'Mozilla/5.0'
        }

        r = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly on an error response instead of handing an error page to
        # the parser as if it were a ranking page.
        r.raise_for_status()
        return r.text
    
    
    def _strip_label(text):
        """Return the part of a ``label:value`` string after the last colon."""
        # Treat the full-width colon like the ASCII one so the label is removed
        # whichever of the two the page uses.
        return text.strip().replace(':', ':').split(':')[-1].strip()


    def parse(content):
        """Extract the movie rows from one ranking page.

        Args:
            content: HTML source of the page.

        Returns:
            A list with one ``[rank, title, stars, releasetime, score]`` list
            per ``<dd>`` inside ``<dl class="board-wrapper">``. The list is
            empty when the document contains no such ``<dl>``.
        """
        soup = BeautifulSoup(content, 'lxml')
        dl = soup.find('dl', attrs={'class': 'board-wrapper'})
        if dl is None:
            # No ranking list in this document; return no rows rather than
            # failing with AttributeError on ``None.find_all``.
            return []

        movies = []
        for dd in dl.find_all('dd'):
            rank = dd.find('i').text
            title = dd.find('div', attrs={'class': "movie-item-info"}).find('a').text
            stars = _strip_label(dd.find('p', attrs={'class': 'star'}).text)
            releasetime = _strip_label(dd.find('p', attrs={'class': 'releasetime'}).text)
            score = dd.find('p', attrs={'class': 'score'}).text

            movies.append([rank, title, stars, releasetime, score])

        return movies
    
    
    def write(rows, file, encoding='utf-8'):
        """Append *rows* to the CSV file at *file*.

        Args:
            rows: Iterable of rows, each an iterable of field values.
            file: Path of the CSV file. It is opened in append mode, so it is
                created when missing and existing content is kept.
            encoding: Text encoding of the file. It is fixed explicitly so that
                non-ASCII fields are written the same way on every platform
                instead of depending on the locale's default encoding.
        """
        # newline='' lets the csv module control line endings itself.
        with open(file, 'a', newline='', encoding=encoding) as f:
            writer = csv.writer(f)
            writer.writerows(rows)
    
    
    def main():
        """Fetch the ten ranking pages and append their rows to ``1.csv``."""
        base_url = 'http://maoyan.com/board/4?offset='
        # Offsets 0, 10, ..., 90 address the ten pages in turn.
        for offset in range(0, 100, 10):
            page_url = base_url + str(offset)
            print(page_url)
            rows = parse(get_page(page_url))
            write(rows, '1.csv')


    if __name__ == '__main__':
        main()
    

    相关文章

      网友评论

          本文标题:requests_html库的一些简单使用

          本文链接:https://www.haomeiwen.com/subject/szsouftx.html