简单爬网页
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

# Scrape one listing page of knewone.com and print image/title/link per item.
base_url = 'https://knewone.com/'
url = 'https://knewone.com/?page=2'

# A timeout keeps the script from hanging forever on a stalled connection;
# raise_for_status() stops us from parsing an HTTP error page as a listing.
wb_data = requests.get(url, timeout=10)
wb_data.raise_for_status()
soup = BeautifulSoup(wb_data.text, 'lxml')

# Full selector copied from the browser inspector, for reference:
#   #wrapper > ul > li:nth-child(39) > article > header > a > img
imgs = soup.select('article > header > a > img')
# The title and the link live on the same <a> element, so select it once:
#   #wrapper > ul > li:nth-child(39) > article > section > h4 > a
anchors = soup.select('article > section > h4 > a')

# NOTE(review): zip() pairs the two lists by position, so this assumes every
# article has both an image and a title anchor -- confirm against the page.
for img, anchor in zip(imgs, anchors):
    data = {
        'img': img.get('src'),
        'title': anchor.get('title'),
        # urljoin avoids the double slash that plain concatenation produces
        # for root-relative hrefs, and leaves absolute hrefs untouched.
        'link': urljoin(base_url, anchor.get('href')),
    }
    print(data)
如果是动态异步加载的网页,需要在“审查元素”里打开Network面板,选择其中的XHR分类,然后让页面继续加载信息,就可以在请求地址里看到尾缀(例如 ?page=2)了。
假设我们想要探测如下网页的变化,看看作者有没有更新。首先,网页地址:
https://github.com/lennylxx/ipv6-hosts
截图:
对应的api为:https://api.github.com/repos/lennylxx/ipv6-hosts
打开以后会有如下的JSON代码,很像python里面的字典:
{
"id": 21858929,
"node_id": "MDEwOlJlcG9zaXRvcnkyMTg1ODkyOQ==",
"name": "ipv6-hosts",
"full_name": "lennylxx/ipv6-hosts",
"owner": {
"login": "lennylxx",
"id": 5811576,
"node_id": "MDQ6VXNlcjU4MTE1NzY=",
"avatar_url": "https://avatars3.githubusercontent.com/u/5811576?v=4",
"gravatar_id": "",
"url": "https://api.github.com/users/lennylxx",
"html_url": "https://github.com/lennylxx",
"followers_url": "https://api.github.com/users/lennylxx/followers",
"following_url": "https://api.github.com/users/lennylxx/following{/other_user}",
"gists_url": "https://api.github.com/users/lennylxx/gists{/gist_id}",
"starred_url": "https://api.github.com/users/lennylxx/starred{/owner}{/repo}",
"subscriptions_url": "https://api.github.com/users/lennylxx/subscriptions",
"organizations_url": "https://api.github.com/users/lennylxx/orgs",
"repos_url": "https://api.github.com/users/lennylxx/repos",
"events_url": "https://api.github.com/users/lennylxx/events{/privacy}",
"received_events_url": "https://api.github.com/users/lennylxx/received_events",
"type": "User",
"site_admin": false
},
"private": false,
"html_url": "https://github.com/lennylxx/ipv6-hosts",
"description": null,
"fork": false,
"url": "https://api.github.com/repos/lennylxx/ipv6-hosts",
"forks_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/forks",
"keys_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/keys{/key_id}",
"collaborators_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/collaborators{/collaborator}",
"teams_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/teams",
"hooks_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/hooks",
"issue_events_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/issues/events{/number}",
"events_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/events",
"assignees_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/assignees{/user}",
"branches_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/branches{/branch}",
"tags_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/tags",
"blobs_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/blobs{/sha}",
"git_tags_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/tags{/sha}",
"git_refs_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/refs{/sha}",
"trees_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/trees{/sha}",
"statuses_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/statuses/{sha}",
"languages_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/languages",
"stargazers_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/stargazers",
"contributors_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/contributors",
"subscribers_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/subscribers",
"subscription_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/subscription",
"commits_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/commits{/sha}",
"git_commits_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/git/commits{/sha}",
"comments_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/comments{/number}",
"issue_comment_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/issues/comments{/number}",
"contents_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/contents/{+path}",
"compare_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/compare/{base}...{head}",
"merges_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/merges",
"archive_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/{archive_format}{/ref}",
"downloads_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/downloads",
"issues_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/issues{/number}",
"pulls_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/pulls{/number}",
"milestones_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/milestones{/number}",
"notifications_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/notifications{?since,all,participating}",
"labels_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/labels{/name}",
"releases_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/releases{/id}",
"deployments_url": "https://api.github.com/repos/lennylxx/ipv6-hosts/deployments",
"created_at": "2014-07-15T12:36:53Z",
"updated_at": "2018-07-04T07:31:08Z",
"pushed_at": "2018-06-22T01:57:04Z",
"git_url": "git://github.com/lennylxx/ipv6-hosts.git",
"ssh_url": "git@github.com:lennylxx/ipv6-hosts.git",
"clone_url": "https://github.com/lennylxx/ipv6-hosts.git",
"svn_url": "https://github.com/lennylxx/ipv6-hosts",
"homepage": "",
"size": 7345,
"stargazers_count": 2858,
"watchers_count": 2858,
"language": "Python",
"has_issues": true,
"has_projects": true,
"has_downloads": true,
"has_wiki": true,
"has_pages": false,
"forks_count": 861,
"mirror_url": null,
"archived": false,
"open_issues_count": 12,
"license": {
"key": "mit",
"name": "MIT License",
"spdx_id": "MIT",
"url": "https://api.github.com/licenses/mit",
"node_id": "MDc6TGljZW5zZTEz"
},
"forks": 861,
"open_issues": 12,
"watchers": 2858,
"default_branch": "master",
"network_count": 861,
"subscribers_count": 313
}
更新时间在哪里?
在上述JSON文件里,标注了"updated_at": "2018-07-04T07:31:08Z",
这就是更新时间。
如果想要看网页是否变化,就对更新时间进行检测即可。
import time
import webbrowser

import requests

# Poll a repository's metadata and open its page in the browser whenever the
# 'updated_at' timestamp moves forward.
# The repository endpoint returns a JSON object with an 'updated_at' field;
# a '/users/<name>/starred' endpoint returns a list and cannot be indexed by
# that key.
api = 'https://api.github.com/repos/lennylxx/ipv6-hosts'
web_page = 'https://github.com/lennylxx/ipv6-hosts'
last_update = None

while True:
    # Re-fetch on every pass; a value read once before the loop never changes.
    all_info = requests.get(api, timeout=10).json()
    cur_update = all_info['updated_at']
    print(cur_update)

    if not last_update:
        # First pass: remember the baseline timestamp.
        last_update = cur_update

    # Timestamps look like '2018-07-04T07:31:08Z'; in that fixed-width form
    # plain string comparison orders them chronologically.
    if last_update < cur_update:
        webbrowser.open(web_page)
        # Advance the baseline so the page is opened once per update.
        last_update = cur_update

    time.sleep(600)
对比几个热门库的热度
这里可以直接使用GitHub现成的搜索API(文档:https://developer.github.com/v3/search/),我用的是其中的查询参数q。
以django为例,https://api.github.com/search/repositories?q=django,这是django相关的项目,api有一个好处,那就是简单,以json呈现。requests的响应对象有.json()方法,可以把json转化为python的字典、列表等等。
再比如topic内容是Django的,都有现成的api可以用:https://api.github.com/search/repositories?q=topic:django
那么使用的时候应该这样去做:
#https://api.github.com/search/repositories?q=topic:django
#https://api.github.com/search/repositories?q=django
#get_names -- check_repos
import requests
def get_names():
    """Prompt for repository names and read them from standard input.

    Prints a one-line prompt, then reads a single line and splits it on
    whitespace.

    Returns:
        A list of the names in the order they were typed; an empty list
        when the line is blank.
    """
    print('Separate each name with Space')
    names = input()
    return names.split()
def check_repos(names):
    """Print stars, forks and topic count for each name via GitHub search.

    For every name, the first hit of a repository search supplies the star
    and fork counts, and the total number of repositories tagged with
    ``topic:<name>`` is reported as the "Ecosys" figure.

    Args:
        names: iterable of search terms (e.g. ``['flask', 'django']``).

    Raises:
        requests.HTTPError: if GitHub answers with an error status.
    """
    search_api = 'https://api.github.com/search/repositories'
    for name in names:
        # Passing the query through ``params`` lets requests URL-encode it,
        # so names with spaces or special characters stay a valid query.
        repo_resp = requests.get(search_api, params={'q': name}, timeout=10)
        repo_resp.raise_for_status()
        # JSON -> dict -> dict['items'] is a list; its first element looks
        # like {"name": "django", "stargazers_count": 34961, ...}
        items = repo_resp.json().get('items', [])
        if not items:
            # A search with no hits would otherwise crash on items[0].
            print(name)
            print('No repository found')
            print('-------------------')
            continue
        repo_info = items[0]
        stars = repo_info['stargazers_count']
        forks = repo_info['forks_count']

        ecosys_resp = requests.get(
            search_api, params={'q': 'topic:' + name}, timeout=10
        )
        ecosys_resp.raise_for_status()
        ecosys_info = ecosys_resp.json()['total_count']

        print(name)
        print('Stars:' + str(stars))
        print('Forks:' + str(forks))
        print('Ecosys:' + str(ecosys_info))
        print('-------------------')
if __name__ == '__main__':
    # Only prompt for input and query the network when run as a script,
    # not when this module is imported.
    names = get_names()
    check_repos(names)
输出结果:
>>>Separate each name with Space
flask django sanic bottle
flask
Stars:37174
Forks:11015
Ecosys:6734
-------------------
django
Stars:34965
Forks:14861
Ecosys:10212
-------------------
sanic
Stars:9640
Forks:895
Ecosys:158
-------------------
bottle
Stars:5528
Forks:1125
Ecosys:117
-------------------
网友评论