# -*- coding: utf-8 -*-
cookie="anonymid=jxcn09d5-vd52v0; depovince=GUZ; _r01_=1; ick_login=d41bb8a9-056b-41a7-b187-7c706f0f8702; ick=39b091b8-f882-499b-992b-34a682d3469a; JSESSIONID=abcHJrhG1CAIo64PJRrUw; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244004; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244011; wp_fold=0; td_cookie=18446744069457827825; jebecookies=e9de8580-fb15-4891-b7c3-7c08ebb41f5c|||||; _de=AE9934B6C85831351B86F7DDD5B20F8A; p=b25ab0edb69343d7f80c5e481864b8c30; first_login_flag=1; ln_uact=18620028487; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=b8a0d848e228dd51d2c84609f814495b0; societyguester=b8a0d848e228dd51d2c84609f814495b0; id=971298880; xnsid=6e5116da; ver=7.0; loginfrom=null"
cookies= {}
# 提取键值对 请求头中携带cookie必须是一个字典,所以要把原生的cookie字符串转换成cookie字典
for cookiein cookie.split(';'):
key, value= cookie.split("=", 1)
cookies[key] = value
print(cookies)
print("*"*100)
print()
cookies= "anonymid=jxcn09d5-vd52v0; depovince=GUZ; _r01_=1; ick_login=d41bb8a9-056b-41a7-b187-7c706f0f8702; ick=39b091b8-f882-499b-992b-34a682d3469a; JSESSIONID=abcHJrhG1CAIo64PJRrUw; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244004; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244011; wp_fold=0; td_cookie=18446744069457827825; jebecookies=e9de8580-fb15-4891-b7c3-7c08ebb41f5c|||||; _de=AE9934B6C85831351B86F7DDD5B20F8A; p=b25ab0edb69343d7f80c5e481864b8c30; first_login_flag=1; ln_uact=18620028487; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=b8a0d848e228dd51d2c84609f814495b0; societyguester=b8a0d848e228dd51d2c84609f814495b0; id=971298880; xnsid=6e5116da; ver=7.0; loginfrom=null"
cookies= {i.split("=")[0]: i.split("=")[1] for iin cookies.split("; ")}
print(cookies)
要爬取的网页数据只有在登陆之后才能获取,所以我从浏览器中copy了登录后的cookie到scrapy项目settings文件的请求头中,但是程序执行完之后发现并没有获取到数据,控制台打印出来的debug信息提示需要登录,也就是说我在请求头中添加的cookie并没有效果。后来在网上查了资料,发现如果要携带cookie的话是需要设置的,如下所示:
在settings文件第37行的地方,默认是注释掉的,我们只需要解注释, 并把 Fales 改为 True 就可以了,程序会默认使用settings文件请求头中的cookie。
COOKIES_ENABLED= True ### 默认 Fales
网上看到一篇文章说是有三种设置cookie的方式,上面说的只是其中一种,也是相对小白的一种,有兴趣的可以看看这篇文章,只是写的有点不是很详细,刚入门的话有点难理解。
# -*- coding: utf-8 -*-
import scrapy, re
class LoginSpider(scrapy.Spider):
name ='login' allowed_domains = ['renren.com']
start_urls = ['http://www.renren.com/971298880/profile']
def start_requests(self):
""" 根据cookies模拟登陆人人网,注意settings.py文件的cookies必须是开启的 """
cookies="anonymid=jxcn09d5-vd52v0; depovince=GUZ; _r01_=1; ick_login=d41bb8a9-056b-41a7-b187-7c706f0f8702; ick=39b091b8-f882-499b-992b-34a682d3469a; JSESSIONID=abcHJrhG1CAIo64PJRrUw; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244004; jebe_key=9ca5b44f-aaec-4180-962e-bf7581ad6e5e%7Cc1d85b293dafa0e44367ceed107b877e%7C1561517245262%7C1%7C1561517244011; wp_fold=0; td_cookie=18446744069457827825; jebecookies=e9de8580-fb15-4891-b7c3-7c08ebb41f5c|||||; _de=AE9934B6C85831351B86F7DDD5B20F8A; p=b25ab0edb69343d7f80c5e481864b8c30; first_login_flag=1; ln_uact=18620028487; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; t=b8a0d848e228dd51d2c84609f814495b0; societyguester=b8a0d848e228dd51d2c84609f814495b0; id=971298880; xnsid=6e5116da; ver=7.0; loginfrom=null" cookies = {i.split("=")[0]:i.split("=")[1]foriincookies.split("; ")}
yield scrapy.Request(self.start_urls[0], callback=self.parse, cookies=cookies)
def parse(self, response):
print(re.findall("新用户28487",response.body.decode()))
yield scrapy.Request( callback=self.parse_detail )
#这里不需要再添加cookies,因为settings.py文件中开启了,只需要添加一次,
下次请求自动携带'http://www.renren.com/971298880/profile?v=info_timeline',
def parse_detail(self,response):#详情页
print(re.findall("新用户28487", response.body.decode()))
网友评论