Preface
I have been studying data mining lately and wanted a hands-on example to practice on, but I had no source data. So I used Python's powerful scraping tools to collect some myself.
1. Preparation
- Get familiar with Python's requests library (a minimal fetch-and-parse sketch follows this list)
- Find an entry point to IMDB movie-rating data online
- Get familiar with XPath syntax and look up reference examples
- Draw up the field names and contents to be scraped
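Before diving in, here is a minimal sketch of the requests-plus-lxml pattern the rest of this post relies on: fetch a page, parse it, evaluate an XPath. The URL and the XPath here are illustrative placeholders, not the exact ones used in the core code.

import requests
from lxml import etree

# Illustrative entry point; any IMDB list page works the same way
resp = requests.get('https://www.imdb.com/chart/top/',
                    headers={'User-Agent': 'Mozilla/5.0'},
                    timeout=10)
root = etree.HTML(resp.content)
# xpath() always returns a list of matches (possibly empty)
titles = root.xpath('//h3/a/text()')
print(titles[:5])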
2. Scraping approach
- Analyze the entry page where the data is obtained
- Use Chrome DevTools to grab the XPath for each field
- Handle exceptions based on test results (see the guard-pattern sketch after this list)
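The last point matters in practice: an XPath that matches nothing returns an empty list, and indexing it blindly raises IndexError. Every field in the core code is therefore guarded with the pattern below (toy markup, for illustration only):

from lxml import etree

item = etree.HTML('<div><h3><a> Movie </a></h3></div>')  # toy markup

# Take the first match only when the list is non-empty, else fall back to a blank
matches = item.xpath('.//h3/a/text()')
name = matches[0].strip() if matches else ' '
print(repr(name))  # 'Movie'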
Core code
import ssl

import requests
from lxml import etree

# Work around local certificate errors by disabling verification
# on the default HTTPS context
ssl._create_default_https_context = ssl._create_unverified_context

session = requests.Session()
# `url` is the IMDB list page to scrape; its definition is omitted from this excerpt
req = session.get(url)
# Set the page encoding
req.encoding = 'utf8'
# Parse req.content into an lxml Element
root = etree.HTML(req.content)
# Select every movie block on the page (absolute path copied from Chrome DevTools)
items = root.xpath('//*[@id="main"]/div/div/div[3]/div')


def first_text(node, xp):
    # Return the first match of `xp` under `node`, stripped, or ' ' when nothing matches
    matches = node.xpath(xp)
    return matches[0].strip() if matches else ' '


for item in items:
    try:
        url = item.xpath('./div[2]/a/@href')[0]
        item_year = first_text(item, './div[3]/h3/span[2]/text()')
        imdb_id = first_text(item, './div[3]/h3/span[1]/text()')
        imdb_name = first_text(item, './div[3]/h3/a/text()')
        runtime = first_text(item, './div[3]/p[1]/span[3]/text()')
        # A comma means span[3] holds the genre list, so the runtime sits in span[1]
        if ',' in runtime:
            runtime = first_text(item, './div[3]/p[1]/span[1]/text()')
        style = first_text(item, './div[3]/p[1]/span[5]/text()')
        # When span[5] is blank, the genres sit in span[3] instead
        if not style.strip():
            style = first_text(item, './div[3]/p[1]/span[3]/text()')
        style = style.replace(',', '|')
        rating_num = first_text(item, './div[3]/div/div[1]/strong/text()')
        matescore = first_text(item, './div[3]/div/div[3]/span/text()')
        text_muted = first_text(item, './div[3]/p[2]/text()')
        director = first_text(item, './div[3]/p[3]/a[1]/text()')
        director_url = first_text(item, './div[3]/p[3]/a[1]/@href')
        actor_one_url = first_text(item, './div[3]/p[3]/a[2]/@href')
        actor_one_name = first_text(item, './div[3]/p[3]/a[2]/text()')
        actor_two_url = first_text(item, './div[3]/p[3]/a[3]/@href')
        actor_two_name = first_text(item, './div[3]/p[3]/a[3]/text()')
        actor_three_url = first_text(item, './div[3]/p[3]/a[4]/@href')
        actor_three_name = first_text(item, './div[3]/p[3]/a[4]/text()')
        actor_four_url = first_text(item, './div[3]/p[3]/a[5]/@href')
        actor_four_name = first_text(item, './div[3]/p[3]/a[5]/text()')
        vote_count = first_text(item, './div[3]/p[4]/span[2]/text()')
        gross = first_text(item, './div[3]/p[4]/span[5]/text()')
    except IndexError:
        # Skip entries that have no detail link
        continue
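Since the goal is a dataset for data-mining practice, the extracted fields need to land somewhere. Below is a hedged sketch using the standard csv module; the file name and field order are my own choices, as the excerpt above stops before the saving step.

import csv

FIELDS = ['imdb_id', 'imdb_name', 'item_year', 'runtime', 'style',
          'rating_num', 'matescore', 'text_muted', 'director', 'director_url',
          'actor_one_name', 'actor_one_url', 'actor_two_name', 'actor_two_url',
          'actor_three_name', 'actor_three_url', 'actor_four_name',
          'actor_four_url', 'vote_count', 'gross', 'url']

def save_rows(rows, path='imdb_movies.csv'):
    # `rows` is a list of dicts keyed by FIELDS, collected inside the scraping loop
    with open(path, 'w', newline='', encoding='utf8') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)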