1.目标网站：腾讯招聘（https://careers.tencent.com）
2.动态加载目标
json源.png
目标json的url:https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1556900266027&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn
3.scrapy代码
TencentSpider.py
import scrapy
import json
import time
class TencentSpider(scrapy.Spider):
    """Crawl the paginated job-listing JSON API of careers.tencent.com.

    The spider starts at page 1 and follows the listing one page at a time,
    yielding every entry of ``Data.Posts`` as a plain dict. Pagination stops
    when a page contains no posts or when ``max_page`` is reached, whichever
    comes first.
    """

    name = 'tencent01'
    allowed_domains = ['careers.tencent.com']

    def __init__(self, *args, max_page=339, **kwargs):
        """Set up the first request.

        Args:
            *args: Forwarded to ``scrapy.Spider.__init__``.
            max_page: Highest page index to request. Defaults to 339, the
                limit that used to be hard-coded. May be given on the command
                line (``-a max_page=50``), in which case it arrives as a
                string and is converted here.
            **kwargs: Forwarded to ``scrapy.Spider.__init__``; accepting them
                keeps the spider constructible when Scrapy passes spider
                arguments.
        """
        super().__init__(*args, **kwargs)
        self.max_page = int(max_page)
        self.offset = 1  # index of the page currently being requested
        self.url = self.get_url(self.offset)
        self.start_urls = [self.url]

    def get_url(self, offset):
        """Return the API URL for page number ``offset`` (1-based).

        The ``timestamp`` parameter is sent in milliseconds, matching the
        13-digit value seen in requests captured from the site.
        """
        return ('https://careers.tencent.com/tencentcareer/api/post/Query?'
                'timestamp={}&'
                'countryId=&'
                'cityId=&'
                'bgIds=&'
                'productId=&'
                'categoryId=&'
                'parentCategoryId=&'
                'attrId=&'
                'keyword=&'
                'pageIndex={}'
                '&pageSize=10&language=zh-cn&area=cn').format(
                    int(time.time() * 1000), offset)

    def parse(self, response):
        """Yield every post of one result page, then request the next page.

        Args:
            response: Scrapy response whose body is the API's JSON document.

        Yields:
            dict: one entry of ``Data.Posts`` per job posting.
            scrapy.Request: the request for the following page, if any.
        """
        result = json.loads(response.body)
        # ``Data`` or ``Posts`` may be missing/null once the listing is
        # exhausted; treat that as "no more results" instead of crashing.
        data = result.get('Data') or {}
        posts = data.get('Posts') or []
        if not posts:
            self.logger.info(
                'No posts on page %s; stopping pagination.', self.offset)
            return

        for post in posts:
            yield post

        if self.offset < self.max_page:
            self.offset += 1
            yield scrapy.Request(self.get_url(self.offset), callback=self.parse)
pipelines.py
class TecentjobPipeline(object):
    """Item pipeline that stores every item as one JSON line in ``tencent.json``."""

    def __init__(self):
        # Opened in binary mode: process_item encodes each line to UTF-8 itself.
        self.filename = open("tencent.json", mode='wb')

    def process_item(self, item, spider):
        """Serialize ``item`` to a UTF-8 JSON line, write it, and return the item.

        ``ensure_ascii=False`` keeps non-ASCII characters readable in the
        output file instead of escaping them as ``\\uXXXX``.
        """
        serialized = json.dumps(dict(item), ensure_ascii=False)
        line = serialized + "\n"
        self.filename.write(line.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file."""
        self.filename.close()
items.py
class TecentjobItem(scrapy.Item):
    """Scrapy item declaring the fields kept for one job posting.

    NOTE(review): the field names presumably mirror the keys of each post
    object returned by the careers API -- confirm against a live response.
    """

    # Identifier fields.
    Id = scrapy.Field()
    PostId = scrapy.Field()
    RecruitPostId = scrapy.Field()
    SourceID = scrapy.Field()

    # Posting text and link.
    RecruitPostName = scrapy.Field()
    Responsibility = scrapy.Field()
    PostURL = scrapy.Field()

    # Name-style descriptive fields.
    BGName = scrapy.Field()
    CategoryName = scrapy.Field()
    ProductName = scrapy.Field()
    CountryName = scrapy.Field()
    LocationName = scrapy.Field()

    # Flags and last-update time.
    IsCollect = scrapy.Field()
    IsValid = scrapy.Field()
    LastUpdateTime = scrapy.Field()
4.CMD命令
scrapy crawl tencent01
网友评论