教程来自 B 站 MOOC《Python网络爬虫与信息提取》,使用 requests 和 re 模块
issue : 获取url内容需要在登录前提下
solution: 使用 cookies 登录
issue: Max retries exceeded with url 报错
solution: 由于开启了 Fiddler,关闭即可
issue : 获取url内容需要在登录前提下
solution: 使用 cookies 登录
issue: Max retries exceeded with url 报错
solution: 由于开启了 Fiddler,关闭即可,仅限本次报错
# Request headers sent with every page fetch.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en",
    # Desktop Chrome user-agent string, split across lines for readability.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/83.0.4103.61 Safari/537.36"
    ),
    # Placeholder value: replace it with the Cookie header of your own
    # logged-in browser session before running the script.
    "Cookie": "获得个人cookies",
}
# Fetch the source of the page at a URL.
def getHTMLText(url):
    """Return the decoded body of ``url``, or ``""`` when the request fails.

    The request is sent with the module-level ``HEADERS`` and a 30-second
    timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or an empty string if ``requests`` reported an
        error (connection problem, timeout, non-2xx status, ...). In that
        case ``"cant get html"`` is printed as well.

    Only ``requests.RequestException`` is handled here; anything else
    (including Ctrl-C) propagates to the caller instead of being swallowed.
    """
    try:
        r = requests.get(url=url, headers=HEADERS, timeout=30)
        r.raise_for_status()
        # Decode using the encoding detected from the body instead of the
        # one taken from the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("cant get html")
        return ""
# Extract the wanted fields from a page.
def parsePage(ilt, html):
    """Append ``[price, title]`` pairs found in ``html`` to ``ilt``.

    Prices are read from ``"view_price":"..."`` entries and titles from
    ``"raw_title":"..."`` entries; the n-th price is paired with the n-th
    title. If the two counts differ, the surplus entries are ignored.

    Args:
        ilt: List that receives one ``[price, title]`` list per item. It is
            mutated in place.
        html: Page source to scan. An empty or ``None`` value adds nothing.

    Returns:
        None.
    """
    import json  # local import: used only to unquote the matched titles

    if not html:
        return

    # The capture group yields the bare number, so no unquoting is needed.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # Titles are captured together with their quotes so that json.loads can
    # decode escape sequences; this replaces eval() on scraped text.
    quoted_titles = re.findall(r'"raw_title":(".*?")', html)

    for price, quoted in zip(prices, quoted_titles):
        try:
            title = json.loads(quoted)
        except ValueError:
            # Not a valid JSON string (e.g. a dangling backslash): keep the
            # raw text between the quotes rather than dropping the item.
            title = quoted[1:-1]
        ilt.append([price, title])
# Print the collected items as a table.
def printGoodList(ilt):
    """Print a header row followed by one numbered row per item.

    Args:
        ilt: Sequence of items; element 0 of each item is printed in the
            price column and element 1 in the name column.

    Returns:
        None. Output goes to standard output.
    """
    # Tab-separated columns with minimum widths 4, 8 and 16.
    tplt = '{:4}\t{:8}\t{:16}'
    print(tplt.format("num", 'price', 'goodsname'))
    # Rows are numbered from 1.
    for count, g in enumerate(ilt, start=1):
        print(tplt.format(count, g[0], g[1]))
# Entry point.
def main():
    """Prompt for a search term and page count, then crawl and print results.

    Reads the product name and the number of result pages from standard
    input, fetches each page with ``getHTMLText``, collects the items with
    ``parsePage`` and prints them with ``printGoodList``.

    Raises:
        ValueError: If the page count entered is not an integer.
    """
    from urllib.parse import quote  # local import: only needed to build the URL

    goods = input("输入商品名称")
    depth = int(input("输入解析页面个数"))
    # Percent-encode the term so characters such as '&', '#' or spaces
    # cannot break the query string.
    start_url = 'https://s.taobao.com/search?q=' + quote(goods, safe='')
    infoList = []
    for i in range(depth):
        # The 's' offset grows by 44 per page -- presumably the number of
        # results per page; verify against the site.
        url = start_url + '&s=' + str(44 * i)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception as exc:
            # Best effort: a failing page is reported and skipped so the
            # remaining pages are still processed. Ctrl-C is not caught.
            print("skipping page {}: {}".format(i + 1, exc))
            continue
    printGoodList(infoList)
网友评论