一、Chrome浏览器相关:
- “查看源代码”里能看到的数据,可以直接通过程序请求当前 URL 获取(GET 请求)。
- Elements 里的 HTML 代码不等于请求返回值,只能作为辅助。
- 查看请求的具体信息,包括方法、headers、参数,复制到程序里使用。
二、具体实现代码
import requests
from lxml import etree
class Login:
    """Simulate a GitHub browser login and print the dashboard feed.

    The constructor fetches the login page once and keeps the parsed HTML so
    the hidden form fields (authenticity token, timestamp, timestamp secret)
    can be read from it when the login form is submitted.
    """

    # Seconds to wait for any single HTTP request; without a timeout
    # ``requests`` waits indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Open a session and download/parse the GitHub login page."""
        self.headers = {
            'Referer': 'https://github.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
            'Host': 'github.com'
        }
        self.login_url = 'https://github.com/login'
        self.post_url = 'https://github.com/session'
        # A Session keeps cookies between the login-page GET, the credential
        # POST and the later feed GET.
        self.session = requests.Session()
        self.response = self.session.get(
            self.login_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        self.selector = etree.HTML(self.response.text)

    def token(self):
        """Return the ``authenticity_token`` value from the login form.

        Raises:
            IndexError: if the login page contains no such input.
        """
        matches = self.selector.xpath('//div[@class="auth-form px-3"]//input[@name="authenticity_token"]/@value')
        if not matches:
            raise IndexError('authenticity_token input not found on the login page')
        return matches[0]

    def get_timestamp(self):
        """Return the XPath matches for the hidden ``timestamp`` input.

        The result is the list produced by ``xpath`` (empty when the input
        is absent); it is passed as-is as a form value in ``login``.
        """
        return self.selector.xpath('//input[@name="timestamp"]/@value')

    def get_timestamp_secret(self):
        """Return the XPath matches for the hidden ``timestamp_secret`` input.

        Same list shape as ``get_timestamp``.
        """
        return self.selector.xpath('//input[@name="timestamp_secret"]/@value')

    def get_ga_id(self):
        """Return the Google Analytics id sent with the login form."""
        # Hard-coded value; presumably copied from a real browser session.
        return '422801072.1583054032'

    def login(self, email, password):
        """Submit the login form and fetch the dashboard feed.

        Args:
            email: value for the form's ``login`` field.
            password: value for the form's ``password`` field.

        Returns:
            The dashboard-feed response body as text, or ``None`` when either
            the login POST or the feed GET does not answer with status 200.
        """
        params = {
            'commit': 'Sign in',
            'utf8': '✓',
            'authenticity_token': self.token(),
            'ga_id': self.get_ga_id(),
            'login': email,
            'password': password,
            # Field names must not contain spaces around the hyphens.
            'webauthn-support': 'supported',
            'webauthn-iuvpaa-support': 'unsupported',
            'timestamp': self.get_timestamp(),
            'timestamp_secret': self.get_timestamp_secret()
        }
        response = self.session.post(
            self.post_url,
            data=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return None

        feed_url = 'https://github.com/dashboard-feed'
        feed_response = self.session.get(
            feed_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        if feed_response.status_code != 200:
            return None
        return feed_response.text

    def parse(self, data):
        """Print one line per ``watch_started`` entry found in the feed HTML.

        Args:
            data: feed HTML as returned by ``login``; ``None`` or an empty
                string (failed login/fetch) prints nothing.
        """
        if not data:
            return
        entries = etree.HTML(data).xpath('//div[@class="watch_started"]')
        for element in entries:
            # The lookup yields a list that is empty for some entries.
            rows = element.xpath('.//div[@class="d-flex flex-items-baseline"]')
            if not rows:
                continue
            stripped = (text.strip() for text in rows[0].xpath('.//text()'))
            # Drop blank text nodes and keep only the first three fields;
            # the fourth (the date) is not wanted.
            fields = [text for text in stripped if text][:3]
            # Every field is followed by a single space, including the last.
            print(''.join(field + ' ' for field in fields))

    def run(self):
        """Log in with the placeholder credentials and print the parsed feed."""
        data = self.login('email', 'password')
        self.parse(data)
# Script entry point: run the login-and-print flow only when executed directly.
if __name__ == '__main__':
    Login().run()
输出结果如下:
Germey starred docker-library/python
Germey starred google/cadvisor
Germey starred bitnami/bitnami-docker-postgresql
Germey starred cnbattle/douyin
ChenglongChen starred ChineseGLUE/ChineseGLUE
ChenglongChen starred FLHonker/Awesome-Knowledge-Distillation
ChenglongChen starred iCGY96/awesome_OpenSetRecognition_list
ChenglongChen starred google-research/fixmatch
ChenglongChen starred AtmaHou/Task-Oriented-Dialogue-Dataset-Survey
iamseancheney starred scalingexcellence/scrapybook-2nd-edition
ChenglongChen starred zhmiao/OpenLongTailRecognition-OLTR
ChenglongChen starred dkozlov/awesome-knowledge-distillation
ChenglongChen starred thunlp/TLNN
iamseancheney starred kingname/GeneralNewsExtractor
Shawn1993 starred sebastianruder/NLP-progress
iamseancheney starred seathiefwang/FaceRecognition-tensorflow
网友评论