东秦03

作者: __method__ | 来源:发表于2021-04-13 20:15 被阅读0次

XPath 语法

# 读取本地
# pip install lxml
from lxml import html
# Load the whole local page into memory; the file is closed again as soon
# as its text has been read.
with open('index.html', mode='r', encoding='utf-8') as f:
    data = f.read()

# Build an element tree that XPath queries can be run against.
selector = html.fromstring(data)

# text() selects the text content of the matched tag.
h1_matches = selector.xpath('/html/body/h1/text()')
h1 = h1_matches[0]
print(h1)

# @name selects the value of the attribute called `name`.
href_matches = selector.xpath('/html/body/a/@href')
a = href_matches[0]
print(a)

src_matches = selector.xpath('/html/body/img/@src')
link = src_matches[0]
print(link)

豆瓣 Top250

import requests
from lxml import html
from matplotlib import pyplot as plt
import pandas as pd
# Scrape the Douban Top 250 list: for every movie collect its title, score,
# rating-count text and poster URL, save the poster under ./imgs/, draw a pie
# chart of how many movies share each score, and dump everything to a CSV.

# Local import: only this script needs it, to create the poster directory.
import os

# Sent with every request so the site sees an ordinary browser.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10
# Prefix shared by every XPath that reads from a movie's "info" column.
INFO_XPATH = 'div[@class="item"]/div[@class="info"]'

movie_ls = []
# score -> number of movies with that score. It is created once, before the
# page loop, so the tally covers all pages instead of being reset to the
# last page only.
counts = {}

# The poster files are written into this directory, so make sure it exists.
os.makedirs('./imgs', exist_ok=True)

# The list is paginated 25 movies at a time: start=0, 25, ..., 225.
for i in range(0, 250, 25):
    url = "https://movie.douban.com/top250?start={}&filter=".format(i)
    # Add the request headers so the request looks like it comes from a browser.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Show the detected encoding and the HTTP status for this page.
    print(response.encoding)
    print(response.status_code)
    data = response.text
    selector = html.fromstring(data)
    # "//" starts the search from anywhere in the document.
    # Text of a tag:       //tag1[@attr="value"]/tag2[@attr="value"]/text()
    # Attribute of a tag:  //tag1[@attr="value"]/tag2[@attr="value"]/@attr_name
    ol_list = selector.xpath('//div[@id="content"]//ol/li')
    print(len(ol_list))
    for movie in ol_list:
        # First <span> inside the title link.
        movie_name = movie.xpath(INFO_XPATH + '/div[@class="hd"]/a/span[1]/text()')[0]
        print(movie_name)
        # Second <span> of the "star" row: the score text.
        movie_score = movie.xpath(
            INFO_XPATH + '/div[@class="bd"]/div[@class="star"]/span[2]/text()')[0]
        print(movie_score)
        # Fourth <span> of the "star" row: the rating-count text.
        movie_evals = movie.xpath(
            INFO_XPATH + '/div[@class="bd"]/div[@class="star"]/span[4]/text()')[0]
        print(movie_evals)
        movie_img_link = movie.xpath('div[@class="item"]/div[@class="pic"]/a/img/@src')[0]
        print(movie_img_link)
        # Download the poster; .content gives the raw bytes. The same browser
        # headers are sent as for the page requests.
        img_data = requests.get(
            movie_img_link, headers=headers, timeout=REQUEST_TIMEOUT).content
        # "wb" = write binary.
        with open('./imgs/{}.jpg'.format(movie_name), mode='wb') as f:
            f.write(img_data)
        counts[movie_score] = counts.get(movie_score, 0) + 1
        movie_ls.append({
            "movie_name": movie_name,
            "movie_score": movie_score,
            "movie_evals": movie_evals,
            "movie_img_link": movie_img_link,
        })

# Share of each score among all scraped movies, e.g. what percentage of the
# 250 movies are rated 9.1. autopct prints that percentage on each wedge.
num_ls = list(counts.values())
score_ls = list(counts.keys())
plt.pie(num_ls, labels=score_ls, autopct='%1.1f%%')
plt.show()

df = pd.DataFrame(movie_ls)
df.to_csv('doubantop250.csv')

相关文章

网友评论

      本文标题:东秦03

      本文链接:https://www.haomeiwen.com/subject/tobclltx.html