Here is the Python script first:
# -*- coding: utf-8 -*-
import csv
import re
from urllib.parse import urljoin

import chardet
import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/70.0.3538.77 Safari/537.36',
}


def web_link_collection(page_url):
    """Collect every <a href> on the page and return [url, status, anchor text] rows."""
    lis = []
    try:
        resp = requests.get(page_url, headers=header, timeout=3)
    except Exception as e:
        print('error:', e)
    else:
        # Detect the page encoding so non-UTF-8 pages decode correctly
        encoding = chardet.detect(resp.content)['encoding']
        resp.encoding = encoding
        # Capture the href value and the anchor text of every <a> tag
        link_list = re.findall(r'<a.*?href=["\'](.*?)["\'].*?>(.*?)</a>', resp.text, re.S | re.I)
        for link in link_list:
            # Resolve relative links against the page URL
            url = urljoin(page_url, link[0])
            sc = url_status(url.strip())
            text = link[1]
            print([url, sc, text])
            lis.append([url, sc, text])
    return lis


def url_status(url):
    """Request the URL without following redirects and return its status code."""
    sc = None  # stays None if the request fails
    try:
        res = requests.get(url, headers=header, allow_redirects=False, timeout=3)
    except Exception as e:
        print('error:', e)
    else:
        sc = res.status_code
        if sc == 301 or sc == 302:
            # For redirects, append the target URL after the status code
            loc_url = res.headers['Location']
            sc = '%s#%s' % (sc, loc_url)
    return sc


if __name__ == '__main__':
    url = ''  # page to check
    lis = web_link_collection(url)
    with open('result.csv', 'w', encoding='utf-8', newline='') as file:
        cw = csv.writer(file)
        cw.writerow(['URL', 'Status_code', 'Anchor text'])
        for line in lis:
            cw.writerow(line)
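For illustration, here is roughly what one row of result.csv might look like when a link returns a 301 redirect (the URLs below are made up, not real output):

URL,Status_code,Anchor text
https://example.com/old-page,301#https://example.com/new-page,Old product page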
Script notes
- What it does: collects every link on a page and writes them to a CSV file.
- Output format: URL - status code (for a 301 or 302, the redirect target is also recorded) - anchor text, as in the sample row after the script above.
- What it is useful for:
  - Finding whether any links return 500, 404, 301, 302 and similar statuses.
  - For text links, checking that the anchor text is set correctly; for image links, checking whether the alt attribute is present (see the sketch after this list).
  - Deciding whether certain links need rel="nofollow", a title="" attribute, or should be replaced with JS.
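The script above only records anchor text, so the alt check for image links has to be done separately. Below is a minimal sketch of that idea using the same requests/re approach; the function name find_img_missing_alt and its regexes are my own illustration, not part of the original script:

# -*- coding: utf-8 -*-
import re

import requests

header = {'User-Agent': 'Mozilla/5.0'}


def find_img_missing_alt(page_url):
    """Return the src of every <img> on the page that has no alt attribute."""
    resp = requests.get(page_url, headers=header, timeout=3)
    missing = []
    # Grab each complete <img ...> tag, then inspect its attributes
    for img_tag in re.findall(r'<img\b[^>]*>', resp.text, re.I):
        if not re.search(r'\balt\s*=', img_tag, re.I):
            src = re.search(r'src=["\'](.*?)["\']', img_tag, re.I)
            missing.append(src.group(1) if src else img_tag)
    return missing


if __name__ == '__main__':
    for src in find_img_missing_alt(''):  # page to check
        print('missing alt:', src)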
Further reading: https://searchengineland.com/heres-what-happened-when-i-followed-googlebot-for-3-months-308674