Web Scraping Practice
- Scraping book listings from Dangdang
from lxml import html
import requests
# pandas is needed for the CSV export: pip install pandas
import pandas as pd


def spider(isbn):
    """Scrape Dangdang search results for one book."""
    # isbn: International Standard Book Number
    url = "http://search.dangdang.com/?key={}&act=input".format(isbn)
    print(url)
    # Fetch the page source; a browser User-Agent keeps the request from being rejected
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
    html_data = requests.get(url, headers=headers).text
    # Parse the HTML so it can be queried with XPath
    selector = html.fromstring(html_data)
    # Each <li> under the result container is one listing
    ul_list = selector.xpath('//div[@id="search_nature_rg"]/ul/li')
    print('{} sellers offer this book'.format(len(ul_list)))
    book_info_list = []
    for li in ul_list:
        # Title and purchase link come from attributes of the first <a>
        title = li.xpath('a/@title')[0]
        link = li.xpath('a/@href')[0]
        # Current price, e.g. '¥45.60'
        price = li.xpath('p[@class="price"]/span[@class="search_now_price"]/text()')[0]
        # Drop the currency symbol so the price can be converted to float later
        price = price.replace('¥', '')
        # XPath patterns used above:
        #   //tag[@attr="value"]/.../text()  -> text content
        #   //tag[@attr="value"]/.../@attr   -> attribute value
        # Seller name; listings without one are sold by Dangdang itself
        # store = li.xpath('p[@class="search_shangjia"]/a[1]/text()')
        store = li.xpath('p[4]/a/@title')
        if len(store) == 0:
            store = '当当自营'
        else:
            store = store[0]
        book_info_list.append({
            "title": title,
            "price": price,
            "link": link,
            "store": store
        })
    # Sort the listings from most to least expensive
    book_info_list.sort(key=lambda x: float(x["price"]), reverse=True)
    for i in book_info_list:
        print(i)
    # Convert the list of dicts to a DataFrame and save it as CSV
    df = pd.DataFrame(book_info_list)
    df.to_csv('当当图书信息.csv')


isbn = input('Enter the ISBN to look up: ')
spider(isbn)
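The two XPath patterns noted in the comments above, `.../text()` for element text and `.../@attr` for an attribute value, can be tried on a small inline fragment. The snippet, URL, and values below are made up purely for illustration:

from lxml import html

# A made-up fragment shaped like one search-result entry
snippet = ('<div>'
           '<a href="http://example.com/book/1" title="Example Book">Example Book</a>'
           '<p class="price"><span class="search_now_price">¥45.60</span></p>'
           '</div>')
node = html.fromstring(snippet)

print(node.xpath('a/@title'))   # ['Example Book']  -- @attr selects an attribute value
print(node.xpath('a/text()'))   # ['Example Book']  -- text() selects element text
print(node.xpath('p[@class="price"]/span[@class="search_now_price"]/text()'))  # ['¥45.60']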
- Scraping upcoming movies from Douban
from lxml import html
import requests
import pandas as pd


def spider(location):
    """Scrape Douban's 'coming soon' list for one city."""
    url = "https://movie.douban.com/cinema/later/{}/".format(location)
    print(url)
    # Fetch the page source; Douban may reject requests without a browser User-Agent
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
    html_data = requests.get(url, headers=headers).text
    # Parse the HTML for XPath queries
    selector = html.fromstring(html_data)
    # Each child <div> of #showing-soon is one upcoming movie
    ul_list = selector.xpath('//div[@id="showing-soon"]/div')
    print('{} movies are coming soon'.format(len(ul_list)))
    movie_info_list = []
    for i in ul_list:
        movie_name = i.xpath('div/h3/a/text()')[0]
        print(movie_name)
        movie_date = i.xpath('div/ul/li[1]/text()')[0]
        print(movie_date)
        movie_type = i.xpath('div/ul/li[2]/text()')[0]
        print(movie_type)
        movie_country = i.xpath('div/ul/li[3]/text()')[0]
        print(movie_country)
        # Number of people who marked the movie as "want to watch"
        movie_people = i.xpath('div/ul/li[@class="dt last"]/span/text()')[0]
        print(movie_people)
        movie_info_list.append({
            "movie_name": movie_name,
            "movie_date": movie_date,
            "movie_type": movie_type,
            "movie_country": movie_country,
            "movie_people": movie_people
        })
    # movie_people is still a string here, so this sorts the text, not the number
    movie_info_list.sort(key=lambda x: x["movie_people"], reverse=True)
    for i in movie_info_list:
        print(i)
    df = pd.DataFrame(movie_info_list)
    # Save as CSV
    df.to_csv('豆瓣.csv')


location = input("Enter the city to look up: ")
spider(location)
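Because movie_people is scraped as text, the sort above compares strings rather than numbers. Assuming the counts look like '1234人想看' (an assumption about the page, not verified here), the digits can be pulled out first; a minimal sketch with stand-in data:

import re

def want_count(text):
    """Extract the leading digits from a count string, e.g. '1234人想看' -> 1234."""
    match = re.search(r'\d+', text)
    return int(match.group()) if match else 0

# Stand-in data; inside spider() the same key function would replace the string sort
movie_info_list = [{"movie_name": "A", "movie_people": "987人想看"},
                   {"movie_name": "B", "movie_people": "12345人想看"}]
movie_info_list.sort(key=lambda x: want_count(x["movie_people"]), reverse=True)
print(movie_info_list)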
1) The data-structure pattern commonly used in scraping
# The common pattern: a list of dicts, one dict per record
from random import randint

li = []
for i in range(10):
    # li.append("商家{}".format(i))
    li.append({
        "store": "商家{}".format(i),   # "商家" = seller
        "price": randint(300, 500)
    })

# Print every record
for x in li:
    print(x)

# Sort the sellers by price
li.sort(key=lambda x: x['price'])
print('========================================')
print('============= after sorting ============')
print('========================================')
for x in li:
    print(x)
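The same list-of-dicts shape drops straight into pandas, which is how both spiders above end up with a CSV file. A minimal sketch; the filename stores.csv is arbitrary:

import pandas as pd
from random import randint

# Same shape as the list built above: one dict per record
li = [{"store": "商家{}".format(i), "price": randint(300, 500)} for i in range(3)]

# Keys become column names, each dict becomes one row
df = pd.DataFrame(li)
print(df)

# index=False leaves the row-number column out of the CSV
df.to_csv('stores.csv', index=False)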
2) Downloading images
# Downloading a single image
# The image address comes from the <img> tag's src attribute:
# http://b-ssl.duitang.com/uploads/blog/201312/04/20131204184148_hhXUT.jpeg
import requests

url = 'http://b-ssl.duitang.com/uploads/blog/201312/04/20131204184148_hhXUT.jpeg'
response = requests.get(url)
print(response.status_code)
# response.text    -> str   (decoded text, for HTML or JSON)
# response.content -> bytes (raw bytes, for images and other binary data)
img_info = response.content
print(img_info)
# Reading a text file, for comparison:
# with open('index1.html', 'r', encoding='UTF-8') as f:
#     print(f.read())
# Write in binary mode ('wb'); bytes need no encoding step
with open('mm.jpg', 'wb') as f:
    f.write(img_info)
# Writing text instead:
# text = '不好意思'
# with open('xiaoshuo.txt', 'w', encoding='UTF-8') as f:
#     f.write(text)
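A quick way to see the response.text versus response.content difference described above is to compare their types on any response; example.com here is only a placeholder URL:

import requests

response = requests.get('http://example.com')
print(type(response.text), response.text[:20])        # <class 'str'>   decoded HTML text
print(type(response.content), response.content[:20])  # <class 'bytes'> raw bytes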
3) Downloading and naming images in bulk
# Download a batch of images, naming each file after its movie
import os
import requests
from random import randint

# A single download looks like this:
# url = ''
# img_bytes = requests.get(url).content
# with open('xx.png', 'wb') as f:
#     f.write(img_bytes)

url1 = 'http://5b0988e595225.cdn.sohucs.com/images/20190917/10dd465a62b64513a38b24bd4735da6a.jpeg'
url2 = 'http://pics1.baidu.com/feed/fd039245d688d43f2b9ef37459037a1f0ef43b26.jpeg?token=790b4a63424ff91158de106833f44ba6&s=1DA4E8155E317A075CAD58D1030010B0'
movie_info_list = [
    {'movie_name': '中国机长', 'img_url': url1},
    {'movie_name': '天气之子', 'img_url': url2}
]

# Make sure the output directory exists before writing into it
os.makedirs('./images', exist_ok=True)

# Download every image, one file per movie
for movie in movie_info_list:
    img_link = movie['img_url']
    response = requests.get(img_link)
    # Only write the file if the request succeeded
    if response.status_code == 200:
        with open('./images/{}.jpg'.format(movie['movie_name']), 'wb') as f:
            f.write(response.content)
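The randint import above is never used in this version. One plausible use, sketched here as a guess rather than the original intent, is a random fallback filename for records that lack a usable name (unique_name is a hypothetical helper):

from random import randint

def unique_name(movie):
    """Use the movie name when present, otherwise a random numeric name."""
    name = movie.get('movie_name')
    return name if name else 'img_{}'.format(randint(100000, 999999))

# Preview the target paths for the list defined above
for movie in movie_info_list:
    print('./images/{}.jpg'.format(unique_name(movie)))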