美文网首页
webmagic爬取B站用户信息

webmagic爬取B站用户信息

作者: 请不要酱紫 | 来源:发表于2018-10-15 15:52 被阅读0次

      通过谷歌浏览器开发者模式找到了B站用户信息的接口:https://space.bilibili.com/ajax/member/GetInfo 。接口请求方式为post,需传入参数mid;mid为用户id,通过测试推测mid在后台是自增生成的。

      开发环境: JDK1.8+Spring Boot + webmagic + mysql + mybatis plus

      在测试过程中发现 B站后台对接口调用的来源会检测限制,故在请求接口时添加请求头Referer:https://space.bilibili.com,关键代码如下:

    public class BiliSpider2 implements PageProcessor {
    
        @Autowired
        private SpiderService spiderService;
    
        private Site site = Site.me().setTimeOut(10000).setRetryTimes(3)
        .setSleepTime(10000).setCharset("UTF-8");
    
      @Override
        public void process(Page page) {
            String status = new JsonPathSelector("$.status").select(page.getRawText());
            if(Boolean.valueOf(status)){
                try {
                    String mid = new JsonPathSelector("$.data.mid").select(page.getRawText());
                    String regtime = new JsonPathSelector("$.data.regtime").select(page.getRawText());
                    String name = new JsonPathSelector("$.data.name").select(page.getRawText());
                    String sex = new JsonPathSelector("$.data.sex").select(page.getRawText());
                    String rank = new JsonPathSelector("$.data.rank").select(page.getRawText());
                    String face = new JsonPathSelector("$.data.face").select(page.getRawText());
                    String spacesta = new JsonPathSelector("$.data.spacesta").select(page.getRawText());
                    String birthday = new JsonPathSelector("$.data.birthday").select(page.getRawText());
                    String sign = new JsonPathSelector("$.data.sign").select(page.getRawText());
                    String currentLevel = new JsonPathSelector("$.data.level_info.current_level").select(page.getRawText());
                    String vipType = new JsonPathSelector("$.data.vip.vipType").select(page.getRawText());
                    String vipStatus = new JsonPathSelector("$.data.vip.vipStatus").select(page.getRawText());
                    String fansbadge = new JsonPathSelector("$.data.fans_badge").select(page.getRawText());
                    BiliUser biliUser = new BiliUser()
                            .setMid(Integer.valueOf(mid))
                            .setName(name)
                            .setBirthday(birthday)
                            .setCurrentLevel(Integer.valueOf(currentLevel))
                            .setFace(face)
                            .setFansBadge(Boolean.valueOf(fansbadge))
                            .setRank(Integer.valueOf(rank))
                            .setSex(sex)
                            .setSpacesta(spacesta)
                            .setSign(sign)
                            .setVipType(Integer.valueOf(vipType))
                            .setVipStatus(Integer.valueOf(vipStatus))
                            .setRegtime(DateUtils.dateConvertToLocalDateTime(new Date(Integer.valueOf(regtime))));
                    page.putField("user", biliUser);
                  
                }
            }
        }
    
      @Override
       public Site getSite() {
            Set<Integer> acceptStatCode = new HashSet<>();
            acceptStatCode.add(200);
            site = site.setAcceptStatCode(acceptStatCode)
                    .addHeader("Content-Type", "application/x-www-form-urlencoded")
                    .setUserAgent(UserAgentUtils.radomUserAgent());
    
            return site;
        }
    }
    

      这里是最简单的,当然可以在其中添加一些其他的规则,例如爬取返回的页面并不是我们期望的页面,当爬取过于频繁时,有的网站会对ip限制,返回错误页面,而在下载器中我们没有去验证,也不方便在其中验证,我们就可以在这里去处理。

      这里我将其集成到 Spring Boot 中:

    /**
     * Manages the lifecycle of the Bilibili user {@link Spider}: builds the spider
     * at startup, enqueues POST requests for a range of user ids, wires a
     * proxy-rotating downloader, and exposes start/stop controls.
     */
    @Component
    @Slf4j
    public class SpiderService {
        @Autowired
        private BiliPipeline biliPipeline;
        @Autowired
        private BiliSpider2 biliSpider2;
        @Autowired
        private BiliUserService biliUserService;
        @Autowired
        private ProxyIpService proxyIpService;

        private Spider spider;

        /** Builds the shared Spider instance once the Spring context is ready. */
        @PostConstruct
        private void init(){
            spider = Spider.create(biliSpider2)
                    .addPipeline(biliPipeline)
                    .thread(10);
        }

        /**
         * Enqueues {@code count} GetInfo requests starting just past the highest
         * mid already stored, attaches a proxy-backed downloader, and starts the
         * spider (webmagic's {@code start()} is asynchronous).
         *
         * @param count number of consecutive user ids to crawl
         */
        public void start(Integer count){
            biliPipeline.clean();
            Integer maxMid = biliUserService.getMaxMid();
            // Use the injected SLF4J logger instead of System.out.println.
            log.info("resuming crawl from maxMid={}", maxMid);
            if(maxMid == null){
                maxMid = 0;
            }
            for (int i = maxMid; i < maxMid+count; i++) {
                Request request = new Request("https://space.bilibili.com/ajax/member/GetInfo");
                request.setMethod(HttpConstant.Method.POST);
                Map<String, Object> map = new HashMap<>();
                map.put("mid", i+1);
                request.setRequestBody(HttpRequestBody.form(map,"utf-8"));
                // B站 checks the Referer header; omit it and the API rejects the call.
                request.addHeader("Referer", "https://space.bilibili.com");
                spider.addRequest(request);
            }
            spider.setDownloader(buildProxyDownloader());
            spider.start();
        }

        /**
         * Creates a downloader that rotates through the fastest stored proxies
         * (up to 1000, ordered by connect speed).
         */
        private HttpClientDownloader buildProxyDownloader(){
            HttpClientDownloader downloader = new HttpClientDownloader();
            List<ProxyIp> list = proxyIpService.list(new QueryWrapper<ProxyIp>()
                    .orderByAsc("connect_speed")
                    .last("limit 1000"));
            List<Proxy> proxyList = new ArrayList<>();
            for (ProxyIp proxyIp : list) {
                proxyList.add(new Proxy(proxyIp.getIp(), proxyIp.getPort()));
            }
            downloader.setProxyProvider(SimpleProxyProvider.from(proxyList.toArray(new Proxy[0])));
            return downloader;
        }

        /** Stops the running spider. */
        public void stop(){
            spider.stop();
        }

        public Spider getSpider() {
            return spider;
        }
    }
    
    /** REST endpoints for starting and stopping the Bilibili user spider. */
    @Slf4j
    @RestController
    public class BiliController {

        @Autowired
        private SpiderService spiderService;

        /**
         * Kicks off a crawl of {@code count} users; falls back to 100 when
         * no count is supplied.
         */
        @GetMapping("/start/{count}")
        public void start(@PathVariable Integer count){
            Integer effectiveCount = (count == null) ? 100 : count;
            spiderService.start(effectiveCount);
        }

        /** Halts the currently running crawl. */
        @GetMapping("/stop")
        public void stop(){
            spiderService.stop();
        }

    }
    

    相关文章

      网友评论

          本文标题:webmagic爬取B站用户信息

          本文链接:https://www.haomeiwen.com/subject/ojvfzftx.html