爬虫流程:爬虫的原理:url -> html -> model (洗数据) -> 分析
- 依赖的包
requests // 用于发送请求,获取页面信息
pyquery // pyquery库是jQuery的Python实现,将响应内容转化为PyQuery对象,实现css选择(分析页面) - 获取页面数据
- 循环url
import os
import requests
from pyquery import PyQuery as pq
as
子句可以为导入的名称指定一个更短的别名;
class Model(object):
    """Base class that gives every model a readable, multi-line repr."""

    def __repr__(self):
        """Return the class name followed by one ``attr=(value)`` line per instance attribute."""
        class_name = self.__class__.__name__
        # One entry per attribute currently stored on the instance.
        fields = ['{}=({})'.format(key, value) for key, value in self.__dict__.items()]
        return '\n<{} \n {}>'.format(class_name, '\n '.join(fields))
- 基类,用于调整爬取的数据结构,注意后面的
return
,能够返回真正的数据,不然打印出来的类全是类型,socket
第三章有截图; -
__repr__()
方法不用调用,print 输出时,自动调用这个方法,也称为魔法函数; - 类属性:
__class__.__name__
: 返回类名
__dict__
:返回属性的字典集合 - () 的使用
- 三个
\n
:
字符串都有 join() 方法,参数是要连接的元素序列
class Movie(Model):
    """Container for the fields scraped for one movie entry."""

    def __init__(self):
        """Initialise every field to an empty default."""
        self.name = ''
        self.score = 0
        self.quote = ''
        self.cover_url = ''
        self.ranking = 0
定义属性(字段),存储数据。
def movie_from_div(div):
    """Build a ``Movie`` from the markup of a single list entry.

    Args:
        div: The element (or markup) for one movie entry; it is wrapped in a
            PyQuery object so CSS selectors can be applied to it.

    Returns:
        A ``Movie`` whose fields are filled from the entry's markup.
    """
    entry = pq(div)
    movie = Movie()
    movie.name = entry('.title').text()
    # NOTE(review): score and ranking are stored as the text extracted from
    # the page, not converted to numbers, although Movie defaults them to 0.
    movie.score = entry('.rating_num').text()
    movie.quote = entry('.inq').text()
    movie.cover_url = entry('img').attr('src')
    movie.ranking = entry('.pic em').text()
    return movie
每次想要进行 css 选择,都需要先用 pq() 进行包装。上一个是针对整个页面,这个只是针对 div 内的元素;
文本的获取用 .text() 方法
属性的获取用 .attr() 方法
如果目标元素没有 class 或 id 标记,那么可以通过父元素向下查找
def movies_from_url(url, timeout=10):
    """Download a page and return the movies found on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list with one ``Movie`` per element matching ``.item`` on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is not silently parsed into an empty list.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    document = pq(response.content)
    return [movie_from_div(item) for item in document('.item')]
requests.get()
下载 url
对应的页面,再通过 content
属性获得页面内容(html),这两步完成页面的下载。
pq(page)
获得支持 css
语法的对象
def main():
    """Scrape the first page of the Top 250 list and print the result."""
    url = 'https://movie.douban.com/top250'
    movies = movies_from_url(url)
    print('top250 movies', movies)


if __name__ == '__main__':
    main()
通过观察 url 规律,可以爬取多个页面
def main():
    """Scrape every page of the Top 250 list, printing the movies of each page."""
    # Clicking "next page" on the site shows the pattern: the ``start`` query
    # parameter grows by 25 per page.
    for start in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start={}'.format(start)
        movies = movies_from_url(url)
        print('top250 movies', movies)
基础爬虫之将数据保存至数据库mongodb
import os
import requests
from pyquery import PyQuery as pq
from pymongo import MongoClient
class Model(object):
    """Base class adding a readable repr and MongoDB persistence to models."""

    # Database handle shared by all models. MongoClient() is called without
    # arguments, so pymongo's default connection settings apply.
    db = MongoClient().web16_4_pachong

    def __repr__(self):
        """Return the class name followed by one ``attr : (value)`` line per instance attribute."""
        class_name = self.__class__.__name__
        fields = ['{0} : ({1})'.format(key, value) for key, value in self.__dict__.items()]
        return '\n<{0} \n {1}>'.format(class_name, '\n '.join(fields))

    def save(self):
        """Store this object's attributes in the collection named after its class.

        ``Collection.save`` was deprecated in PyMongo 3.0 and removed in 4.0,
        so its insert-or-replace behaviour is spelled out explicitly here.
        """
        collection = self.db[self.__class__.__name__]
        document = self.__dict__
        if '_id' in document:
            # Already stored once: overwrite the existing document.
            collection.replace_one({'_id': document['_id']}, document, upsert=True)
        else:
            # insert_one adds the generated '_id' to the dict it receives,
            # which is the instance's own __dict__.
            collection.insert_one(document)
class Movie(Model):
    """Model describing the fields of one movie entry."""

    @classmethod
    def valid_names(cls):
        """Return the field definitions as ``(name, type, default)`` tuples."""
        return [
            # (field name, type, default value)
            ('name', str, ''),
            ('score', int, 0),
            ('quote', str, ''),
            ('cover_url', str, ''),
            ('ranking', int, 0),
        ]
def movie_from_div(div):
    """Build a ``Movie`` from the markup of a single list entry and save it.

    Args:
        div: The element (or markup) for one movie entry; it is wrapped in a
            PyQuery object so CSS selectors can be applied to it.

    Returns:
        The ``Movie`` that was filled in and saved.
    """
    entry = pq(div)
    movie = Movie()
    movie.name = entry('.title').text()
    # NOTE(review): score and ranking are stored as the text extracted from
    # the page, not converted to numbers.
    movie.score = entry('.rating_num').text()
    movie.quote = entry('.inq').text()
    movie.cover_url = entry('img').attr('src')
    movie.ranking = entry('.pic em').text()
    # Persist the movie as soon as all of its fields are set.
    movie.save()
    return movie
def movies_from_url(url, timeout=10):
    """Download a page and return the movies found on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A list with one ``Movie`` per element matching ``.item`` on the page.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is not silently parsed into an empty list.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    document = pq(response.content)
    return [movie_from_div(item) for item in document('.item')]
def main():
    """Scrape the first page of the Top 250 list and print the result."""
    url = 'https://movie.douban.com/top250'
    movies = movies_from_url(url)
    print('top250 movies', movies)


if __name__ == '__main__':
    main()
网友评论