如果你想特别关注某个知乎用户,他的每一个赞,每一个关注,每一个回答都不想放过,那么请看下文,自己搭建爬虫,邮件实时提醒。
https://github.com/Lyndon1994/robot
按照下图新建文件夹,并在其中创建 config.py、listen.py 和 remind.html 三个文件:

config.py
# MongoDB credentials and host; interpolated into the mongodb:// URI by listen.py.
MONGO_USER = 'root'
MONGO_PASSWORD = 'password'
MONGO_HOST = '127.0.0.1'

# Sender account: display name, address, password and SMTP server.
EMAIL_FROM_NAME = 'Robot'
EMAIL_FROM = 'serveremail@126.com'
EMAIL_PASSWORD = 'password'
EMAIL_SERVER = 'smtp.126.com'

# Recipient of the reminder mails: address and display name.
EMAIL_TO = 'toemail@qq.com'
EMAIL_TO_NAME = '亲爱的用户'
listen.py
#!/usr/bin/python3.4
# -*- coding: utf-8 -*-
# 订阅某知乎用户动态更新,发送邮件提醒
import json
from jinja2 import Template
import time, requests
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import os
import config
import smtplib
import lxml.html
from pymongo import MongoClient
import traceback
import logging
# Log line layout: timestamp, level, module name, message.
FORMAT = '%(asctime)-15s %(levelname)s:%(module)s:%(message)s'
logging.basicConfig(format=FORMAT, level=logging.INFO)

# Browser identities available for the outgoing HTTP request.
IOS_USER_AGENT = (
    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) '
    'AppleWebKit/601.1.46 (KHTML, like Gecko) '
    'Version/9.0 Mobile/13B143 Safari/601.1'
)
FF_USER_AGENT = (
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) '
    'Gecko/20100101 Firefox/53.0'
)
class ListenZhihu:
    """Poll a Zhihu user's activity page and e-mail every activity not seen before.

    Activities that were mailed are stored in the ``activities`` collection of
    the ``zhihu`` MongoDB database under the activity id, which is how later
    runs recognise them as already handled.
    """

    # Seconds to wait on the network before giving up. The script is meant to
    # be started repeatedly by a scheduler, so a hung connection must not
    # block forever.
    REQUEST_TIMEOUT = 30
    SMTP_TIMEOUT = 30

    def __init__(self, client=None):
        """Use *client* if given, otherwise connect with the ``config`` settings.

        :param client: an existing MongoDB client object; when ``None`` a
            ``MongoClient`` is created from ``config.MONGO_USER``,
            ``config.MONGO_PASSWORD`` and ``config.MONGO_HOST``.
        """
        if client is None:
            # NOTE(review): the credentials are interpolated verbatim, so they
            # must already be percent-escaped if they contain URI-reserved
            # characters -- confirm against the deployed config.
            client = MongoClient('mongodb://%s:%s@%s' % (
                config.MONGO_USER, config.MONGO_PASSWORD, config.MONGO_HOST))
        self.client = client
        self.db = self.client.zhihu

    def get_activity(self, activity_id):
        """Return the stored record for *activity_id*, or ``None`` if it is new."""
        return self.db.activities.find_one({'_id': activity_id})

    def remind(self, item):
        """E-mail *item* and then record it as handled.

        The record is written only after the mail was sent, so a failed send
        leaves the activity unrecorded and it is picked up again on the next
        run. Failures are logged rather than raised so the remaining
        activities of the same pass are still processed.

        :param item: activity dict; must contain ``'id'``. It is mutated: the
            ``'_id'`` key is set to the activity id before insertion.
        """
        try:
            self.send_email(item)
            item['_id'] = item['id']
            self.db.activities.insert_one(item)
        except Exception:
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate; logging.exception records the traceback.
            logging.exception('处理动态 %s 失败...', item.get('id'))

    def listen_activities(self, url='https://www.zhihu.com/people/wei-jing-ji-44/activities'):
        """Fetch the activity page at *url* and send a reminder for each new activity.

        The page is expected to carry its state as JSON in the ``data-state``
        attribute of ``div#data``; when that element is missing nothing is
        done.

        :param url: activities page of the user to watch.
        """
        headers = {'user-agent': FF_USER_AGENT}
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        tree = lxml.html.fromstring(response.content)
        data = tree.cssselect('div#data')
        if not data:
            logging.debug('没有获取到数据...')
            return

        entities = json.loads(data[0].attrib['data-state'])['entities']
        for activity_id, activity_data in entities['activities'].items():
            if self.get_activity(activity_id) is not None:
                continue
            logging.info('读取到一条新动态...')
            try:
                item = self._build_item(activity_id, activity_data, entities)
            except (KeyError, TypeError):
                # One activity with an unexpected shape must not prevent the
                # others on the page from being reported.
                logging.exception('动态 %s 数据格式异常,已跳过...', activity_id)
                continue
            self.remind(item)

    @staticmethod
    def _build_item(activity_id, activity_data, entities):
        """Flatten one activity and its target entity into the dict that is mailed and stored.

        :param activity_id: key of the activity in ``entities['activities']``.
        :param activity_data: the activity's own data.
        :param entities: the page's ``entities`` mapping, used to look up the
            answer or question the activity points at.
        :raises KeyError: if an expected field is missing.
        """
        actor = activity_data['actor']
        target_ref = activity_data['target']
        schema = target_ref['schema']
        target_key = str(target_ref['id'])

        item = {
            'id': activity_id,
            'action_text': activity_data['actionText'],
            'created_time': activity_data['createdTime'],
            'actor_name': actor['name'],
            'actor_url': 'https://www.zhihu.com/people/%s/activities' % actor['urlToken'],
            'schema': schema,
        }

        # Only answers and questions get title/url/author details; any other
        # schema is reported with the common fields alone.
        if schema == 'answer':
            target = entities['answers'][target_key]
            item['author'] = target['author']
            item['title'] = target['question']['title']
            item['content'] = target['content']
            item['url'] = 'https://www.zhihu.com/question/%s/answer/%s' % (
                target['question']['id'], target['id'])
            item['voteupCount'] = target['voteupCount']
        elif schema == 'question':
            target = entities['questions'][target_key]
            item['author'] = target['author']
            item['title'] = target['title']
            item['url'] = 'https://www.zhihu.com/question/%s' % target['id']
        return item

    def send_email(self, item):
        """Render ``remind.html`` for *item* and send it via the configured SMTP server.

        :param item: activity dict; ``'created_time'`` (epoch seconds) and
            ``'actor_name'`` are read here, the rest is used by the template.
        :raises OSError: if the template cannot be read or the connection fails.
        :raises smtplib.SMTPException: if the server rejects the session or message.
        """
        def _format_addr(s):
            # Encode the display name so non-ASCII names survive in the header.
            name, addr = parseaddr(s)
            return formataddr((Header(name, 'utf-8').encode(), addr))

        from_addr = config.EMAIL_FROM
        to_addr = config.EMAIL_TO
        created_time = time.strftime('%m-%d %H:%M', time.localtime(item['created_time']))

        # The template sits next to this script. The encoding is given
        # explicitly because the template contains Chinese text and the
        # locale default may not be UTF-8 (e.g. when started from cron).
        template_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'remind.html')
        with open(template_path, encoding='utf-8') as f:
            template = Template(f.read())
        send_data = template.render(item=item, created_time=created_time)

        msg = MIMEText(send_data, 'html', 'utf-8')
        msg['From'] = _format_addr('%s <%s>' % (config.EMAIL_FROM_NAME, from_addr))
        msg['To'] = _format_addr('%s <%s>' % (config.EMAIL_TO_NAME, to_addr))
        msg['Subject'] = Header('%s有了新动态' % item['actor_name'], 'utf-8').encode()

        # The context manager closes the connection even when starttls, login
        # or sendmail raises.
        with smtplib.SMTP(config.EMAIL_SERVER, 25, timeout=self.SMTP_TIMEOUT) as server:
            server.starttls()
            server.login(from_addr, config.EMAIL_PASSWORD)
            server.sendmail(from_addr, [to_addr], msg.as_string())
        logging.info('邮件发送成功...')
if __name__ == '__main__':
    # Run a single polling pass with the default client and target URL.
    listener = ListenZhihu()
    listener.listen_activities()
remind.html
<!DOCTYPE html>
{#- Reminder e-mail body. Rendered by listen.py with two variables:
    `item` (the activity dict) and `created_time` (a pre-formatted string).
    These comments use a leading "-" so they add no whitespace to the output. #}
<html lang="zh">
<head>
<meta charset="UTF-8">
<title>{{ item['actor_name'] }}有新动态</title>
</head>
<body>
<style>
body {
line-height: 1.7;
}
a {
text-decoration: none;
}
</style>
{#- Header line: who did what, with the activity time floated to the right. #}
<div class="header" style="color: #999;"><a href="{{item['actor_url']}}" style="color: #999;">{{item['actor_name']}}</a>
{{item['action_text']}} <span style="float: right;">{{created_time}}</span>
</div>
<div class="content">
{#- Title linking to the target of the activity. #}
<span class="title"><a href="{{item['url']}}" style="color: #259">{{item['title']}}</a></span>
{#- Author details are shown only when the item carries an 'author' entry. #}
{% if item.get('author')%}
<div class="author">
<a href="{{ 'https://www.zhihu.com/people/%s'%item['author']['urlToken'] }}" style="color: #222;">{{item['author']['name']}}</a>
<span style="color: #999;">{{item['author']['headline']}}</span>
</div>
{% endif %}
{#- Vote count is shown only when 'voteupCount' is present and non-zero. #}
{% if item.get('voteupCount')%}
<span class="like" style="color: #999;">{{item.get('voteupCount','')}}人赞同了该回答</span>
{% endif %}
{#- Body text of the target; empty when the item has no 'content'. #}
<div style="margin-top: 10px">{{item.get('content','')}}</div>
</div>
</body>
</html>
另外,图中的 r.json 文件不需要创建;zhihu.log 是自动生成的日志文件。
接下来利用Linux的crontab发起一个定时任务:
* 6-23 * * * python3.4 /path/to/zhihu/listen.py >> /tmp/zhihu.log 2>&1
每天6点到23点,每分钟抓取一次。
注意:有些用户的动态需要登录后才能抓取,这种情况下需要先用 Python 实现登录功能,本文暂不提供登录部分的代码。
网友评论