美文网首页
爬豆瓣电影信息12-02

爬豆瓣电影信息12-02

作者: 张文超ai | 来源:发表于2018-12-02 21:59 被阅读0次

    示例一:爬取豆瓣即将上映的影片信息,并保存为 data.html

    import html

    import requests
    from bs4 import BeautifulSoup  # HTML parser used to walk the listing page

    # Script: fetch Douban's "coming soon" listing for Chengdu and render the
    # entries as a Bootstrap-styled HTML table in data.html.

    # Static page fragments.  Whitespace inside these literals is written to
    # data.html verbatim, so their layout is kept exactly as it is here.
    PAGE_HEADER = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="UTF-8">
        <title>豆瓣电影即将上映影片信息</title>
        <link href="https://cdn.bootcss.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet">
    </head>
    <body>
    <h2 class="text-center">豆瓣电影即将上映影片信息</h2>
    <table class="table table-striped table-hover mx-auto text-center">
        <thead>
            <tr>
                <th>影片名</th>
                <th>上映日期</th>
                <th>影片类型</th>
                <th>地区</th>
                <th>关注者数量</th>
            </tr>
        </thead>
        <tbody>
    """
    # Placeholders, in order: link, name, date, type, area, follower count.
    ROW_TEMPLATE = """
            <tr>
                <td><a href="{}">{}</a></td>
                <td>{}</td>
                <td>{}</td>
                <td>{}</td>
                <td>{}</td>
            </tr>
        """
    PAGE_FOOTER = """
         </tbody>
    </table>
    </body>
    </html>
    """

    # Request the page.  The timeout keeps a stalled connection from hanging the
    # script forever, and raise_for_status() stops us from parsing an error page.
    url = "https://movie.douban.com/cinema/later/chengdu/"
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Decode the body as UTF-8 ourselves and hand the text to BeautifulSoup.
    # Alternative: pass the raw bytes and name the encoding instead:
    #   soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')
    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')

    # Outermost container that holds every movie entry.
    all_movies = soup.find('div', id="showing-soon")
    if all_movies is None:
        # find() returns None when nothing matches; report that clearly instead
        # of failing later with an AttributeError on None.
        raise SystemExit('div#showing-soon not found in the response from ' + url)

    # The context manager closes data.html even if handling an entry raises.
    with open('data.html', 'w', encoding="utf-8") as html_file:
        html_file.write(PAGE_HEADER)
        for each_movie in all_movies.find_all('div', class_="item"):
            all_a_tag = each_movie.find_all('a')
            all_li_tag = each_movie.find_all('li')
            if len(all_a_tag) < 2 or len(all_li_tag) < 4:
                # The entry lacks the tags indexed below; skip it rather than
                # abort the whole export with an IndexError.
                continue
            # Name and link are read from the second <a>; the four <li> items
            # are read as date, type, area and follower count, in that order.
            movie_name = all_a_tag[1].text
            movie_href = all_a_tag[1]['href']
            movie_date = all_li_tag[0].text
            movie_type = all_li_tag[1].text
            movie_area = all_li_tag[2].text
            # Drop the "想看" suffix so only the count text remains.
            movie_lovers = all_li_tag[3].text.replace("想看", '')
            print('名字:{},链接:{},日期:{},类型:{},地区:{}, 关注者:{}'.format(
                movie_name, movie_href, movie_date, movie_type, movie_area, movie_lovers))
            # Scraped text is untrusted: escape it (quotes included) before it is
            # placed into element content and the href attribute.
            html_file.write(ROW_TEMPLATE.format(
                html.escape(movie_href),
                html.escape(movie_name),
                html.escape(movie_date),
                html.escape(movie_type),
                html.escape(movie_area),
                html.escape(movie_lovers),
            ))
        html_file.write(PAGE_FOOTER)
    print("write_finished!")
    

    示例二:爬取同样的影片信息,并保存为 CSV 文件(data.csv)

    import csv

    import requests
    from bs4 import BeautifulSoup  # HTML parser used to walk the listing page

    # Script: fetch Douban's "coming soon" listing for Chengdu and export the
    # entries to data.csv, one row per movie.

    # Request the page.  The timeout keeps a stalled connection from hanging the
    # script forever, and raise_for_status() stops us from parsing an error page.
    url = "https://movie.douban.com/cinema/later/chengdu/"
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Decode the body as UTF-8 ourselves and hand the text to BeautifulSoup.
    # Alternative: pass the raw bytes and name the encoding instead:
    #   soup = BeautifulSoup(response.content, 'lxml', from_encoding='utf-8')
    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')

    # Outermost container that holds every movie entry.
    all_movies = soup.find('div', id="showing-soon")
    if all_movies is None:
        # find() returns None when nothing matches; report that clearly instead
        # of failing later with an AttributeError on None.
        raise SystemExit('div#showing-soon not found in the response from ' + url)

    # gb18030 is a GBK-compatible encoding that can represent every Unicode
    # character, so a title containing a character outside GBK no longer aborts
    # the export with UnicodeEncodeError.  newline='' is what the csv module
    # expects so that it controls line endings itself.  The context manager
    # closes the file even if handling an entry raises.
    with open('data.csv', 'w', encoding="gb18030", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["影片名", "链接", "上映日期", "影片类型", "地区", "关注者"])  # header row
        for each_movie in all_movies.find_all('div', class_="item"):
            all_a_tag = each_movie.find_all('a')
            all_li_tag = each_movie.find_all('li')
            if len(all_a_tag) < 2 or len(all_li_tag) < 4:
                # The entry lacks the tags indexed below; skip it rather than
                # abort the whole export with an IndexError.
                continue
            # Name and link are read from the second <a>; the four <li> items
            # are read as date, type, area and follower count, in that order.
            movie_name = all_a_tag[1].text
            movie_href = all_a_tag[1]['href']
            movie_date = all_li_tag[0].text
            movie_type = all_li_tag[1].text
            movie_area = all_li_tag[2].text
            # Drop the "想看" suffix so only the count text remains.
            movie_lovers = all_li_tag[3].text.replace("想看", '')
            print('名字:{},链接:{},日期:{},类型:{},地区:{}, 关注者:{}'.format(
                movie_name, movie_href, movie_date, movie_type, movie_area, movie_lovers))
            writer.writerow([movie_name, movie_href, movie_date, movie_type, movie_area, movie_lovers])

    print("write_finished!")
    

    相关文章

      网友评论

          本文标题:爬豆瓣电影信息12-02

          本文链接:https://www.haomeiwen.com/subject/pgbscqtx.html