美文网首页
webmagic实战使用

webmagic实战使用

作者: 老姚1987 | 来源:发表于2020-01-04 19:16 被阅读0次

    一.引入依赖包

    <!-- WebMagic core artifact. -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-core</artifactId>
      <version>0.4.3</version>
      </dependency>
      <!-- WebMagic extension artifact.
           NOTE(review): presumably the source of the JsonPathSelector and
           FileCacheQueueScheduler imports used by the processor below; confirm. -->
      <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-extension</artifactId>
      <version>0.4.3</version>
      </dependency>
    

    二.代码

    package com.pz998.quartz.spider;
    
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.commons.collections.CollectionUtils;
    import org.apache.commons.lang.StringUtils;
    import org.eclipse.jetty.util.MultiMap;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    import com.pz998.rpc.model.entity.BdDepartmentDiseaseRelaRpc;
    import com.pz998.rpc.model.entity.BdDepartmentRpc;
    import com.pz998.rpc.model.entity.BdDiseaseDoctorRelaRpc;
    import com.pz998.rpc.model.entity.BdDoctorRpc;
    import com.pz998.rpc.model.entity.BdHospitalRpc;
    
    import net.minidev.json.JSONArray;
    import net.minidev.json.JSONObject;
    import net.minidev.json.parser.JSONParser;
    import net.minidev.json.parser.ParseException;
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.processor.PageProcessor;
    import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
    import us.codecraft.webmagic.selector.JsonPathSelector;
    import us.codecraft.xsoup.Xsoup;
    
    /**
     * WebMagic {@link PageProcessor} that collects hospital, department and doctor data from
     * yi.baidu.com.
     *
     * <p>{@link #process(Page)} tests the page URL against one pattern per page type, in a fixed
     * order, and hands the page to the first matching handler. Each handler publishes what it
     * extracted through {@code page.putField(...)} and queues follow-up pages through
     * {@code page.addTargetRequest(...)}:
     * hospital list -> hospital info + hospital departments -> department detail + doctor list
     * -> doctor detail.
     */
    public class YiBaiduProcessor implements PageProcessor {

        private static final String START_URL = "https://yi.baidu.com/pc/hospital/list?cityId=371&pageSize=10&page=1";

        private static final String HOSPITAL_DETAIL_URL = "https://yi\\.baidu\\.com/pc/hospital/index\\?zt=pcpinzhuan&zt_ext=&pvid=\\d+&key=\\S+";

        private static final String HOSPITAL_LIST_URL = "https://yi\\.baidu\\.com/pc/hospital/list\\?cityId=\\d++&pageSize=10&page=\\d++";

        private static final String HOSPITAL_INFO_URL ="https://yi\\.baidu\\.com/pc/hospital/info\\?key=\\S+";

        private static final String DEPT_INFO_URL = "https://yi\\.baidu\\.com/pc/admindepartment/detail\\?zt=\\w+&zt_ext=&pvid=\\d+&hosId=\\d+&adminDepartId=\\d+";

        private static final String HOSPITAL_DEPT_URL ="https://yi\\.baidu\\.com/pc/hospital/alldep\\?key=\\S+";

        private static final String DOCTOR_LIST_URL = "https://yi\\.baidu\\.com/pc/admindepartment/doctorlist\\?diseaseId=0&medTitle=0&serviceType=0&page=\\d+&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=\\d+&hosId=\\d+";

        private static final String DOCTOR_INFO_URL = "https://yi\\.baidu\\.com/pc/doctor/detailpage\\?zt=\\w+&zt_ext=&pvid=0&doctorId=\\d+";

        // Example hospital-info URL (the key is the URL-encoded hospital name):
        //https://yi.baidu.com/pc/hospital/info?key=%E6%AD%A6%E6%B1%89%E5%B8%82%E5%A6%87%E5%A5%B3%E5%84%BF%E7%AB%A5%E5%8C%BB%E7%96%97%E4%BF%9D%E5%81%A5%E4%B8%AD%E5%BF%83
        // Example hospital-departments URL prefix:
        //https://yi.baidu.com/pc/hospital/alldep?key=

        /** Number of doctor-list pages (1..N) queued for every department. */
        private static final int DOCTOR_LIST_PAGES = 5;

        private final Site site = Site.me();

        /** Value of the JSON {@code status} field that marks a successful response. */
        public static final String STATE_SUCCESS = "0";

        /** Maps the {@code cityId} URL parameter to the city name stored on each hospital. */
        public static final Map<String,String> CITY_MAP = new HashMap<String,String>();

        static {
            CITY_MAP.put("371","武汉");
            CITY_MAP.put("1", "北京");
            CITY_MAP.put("2", "上海");
            CITY_MAP.put("84","广州");
        }

        /**
         * Dispatches the page to the handler of the first URL pattern it matches.
         * Pages matching no pattern are ignored.
         *
         * @param page the downloaded page
         */
        @Override
        public void process(Page page) {
            String url = page.getUrl().toString();
            if (page.getUrl().regex(HOSPITAL_LIST_URL).match()) {
                processHospitalList(page, url);
            } else if (page.getUrl().regex(HOSPITAL_INFO_URL).match()) {
                processHospitalInfo(page, url);
            } else if (page.getUrl().regex(HOSPITAL_DEPT_URL).match()) {
                processHospitalDepartments(page, url);
            } else if (page.getUrl().regex(DEPT_INFO_URL).match()) {
                processDepartmentInfo(page, url);
            } else if (page.getUrl().regex(DOCTOR_LIST_URL).match()) {
                processDoctorList(page, url);
            } else if (page.getUrl().regex(DOCTOR_INFO_URL).match()) {
                processDoctorInfo(page, url);
            }
        }

        /**
         * Handles the hospital-list JSON response: publishes {@code bdHospitalList} and queues the
         * info and department pages of every hospital. Any exception is printed and the page is
         * abandoned (best effort).
         */
        private void processHospitalList(Page page, String url) {
            try {
                String rawText = page.getRawText();
                String state = new JsonPathSelector("$.status").select(rawText);
                if (!STATE_SUCCESS.equals(state)) {
                    return;
                }
                // Kept as List<?>: the elements are cast to JSONObject below, so they must not be
                // read through the List<String> element type.
                List<?> hospitalList = new JsonPathSelector("$.data.hospitalList[*]").selectList(rawText);
                String cityId = urlParams(url).getString("cityId");
                if (!CollectionUtils.isNotEmpty(hospitalList)) {
                    return;
                }
                String city = CITY_MAP.get(cityId);
                List<BdHospitalRpc> bdHospitalList = new ArrayList<BdHospitalRpc>();
                for (Object obj : hospitalList) {
                    JSONObject jsonObj = (JSONObject) obj;
                    String name = (String) jsonObj.get("name");
                    System.out.println("name:"+name);
                    bdHospitalList.add(buildHospital(jsonObj, name, city));

                    // The hospital name is the key of both follow-up pages.
                    // NOTE(review): the name is appended without URL-encoding; confirm the
                    // downloader copes with non-ASCII query values.
                    page.addTargetRequest("https://yi.baidu.com/pc/hospital/info?key="+name);
                    page.addTargetRequest("https://yi.baidu.com/pc/hospital/alldep?key="+name);
                }
                page.putField("bdHospitalList", bdHospitalList);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /** Builds one hospital record from an element of the hospital-list response. */
        private static BdHospitalRpc buildHospital(JSONObject jsonObj, String name, String city) {
            String address = (String) jsonObj.get("address");
            String level = (String) jsonObj.get("level");
            String phone = (String) jsonObj.get("phone");
            String grade = (String) jsonObj.get("grade");
            String imageUrl = (String) jsonObj.get("logo");
            String routeLink = (String) jsonObj.get("routeLink");

            // The "location" parameter of the route link is split on ','; the first part is
            // stored as latitude and the second as longitude.
            String location = urlParams(routeLink).getString("location");
            String latitude = "";
            String longitude = "";
            if (StringUtils.isNotEmpty(location)) {
                String[] parts = location.split(",");
                latitude = parts.length > 0 ? parts[0] : "";
                longitude = parts.length > 1 ? parts[1] : "";
            }

            BdHospitalRpc hospital = new BdHospitalRpc();
            hospital.setSourceId(name);
            hospital.setName(name);
            hospital.setAddress(address);
            hospital.setLevel(level);
            hospital.setPhone(phone);
            hospital.setImageUrl(imageUrl);
            hospital.setLatitude(latitude);
            hospital.setLongitude(longitude);
            hospital.setScore(grade);
            hospital.setCity(city);
            // Counters are stringified without assuming a concrete numeric type, so one value
            // parsed as Long (or sent as a string) no longer aborts the whole page.
            hospital.setIsMedicalInsurance(stringOrEmpty(jsonObj.get("insurance")));
            hospital.setHighQualityDoctorNum(stringOrEmpty(jsonObj.get("doctorNum")));
            hospital.setFinishedServiceNum(stringOrEmpty(jsonObj.get("serveNum")));
            hospital.setPatientCommentNum(stringOrEmpty(jsonObj.get("commentNum")));
            return hospital;
        }

        /**
         * Handles the hospital-info HTML page: publishes {@code bdHospitalRpc} holding up to five
         * description paragraphs, keyed by the hospital name taken from the URL. Any exception is
         * printed and the page is abandoned (best effort).
         */
        private void processHospitalInfo(Page page, String url) {
            try {
                String hosName = urlParams(url).getString("key");
                BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
                List<String> contextList = page.getHtml().xpath("ul[@class='container-list-info']/li[@class='ys-util-margin-b35']/p[@class='ys-util-text-smaller ys-util-margin-t9 ys-util-margin-b30']/text()").all();
                if (CollectionUtils.isNotEmpty(contextList)) {
                    // Paragraphs are mapped by position; missing trailing ones become "".
                    bdHospitalRpc.setContent(getOrEmpty(contextList, 0));
                    bdHospitalRpc.setHistory(getOrEmpty(contextList, 1));
                    bdHospitalRpc.setCharacteristicDept(getOrEmpty(contextList, 2));
                    bdHospitalRpc.setTeam(getOrEmpty(contextList, 3));
                    bdHospitalRpc.setHonor(getOrEmpty(contextList, 4));
                }

                bdHospitalRpc.setSourceId(hosName);
                page.putField("bdHospitalRpc", bdHospitalRpc);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Handles the hospital-departments HTML page: publishes {@code hosTopDept} (the key
         * departments of the hospital) and {@code departmentList}, and queues each department's
         * detail page plus its first {@link #DOCTOR_LIST_PAGES} doctor-list pages. Any exception
         * is printed and the page is abandoned (best effort).
         */
        private void processHospitalDepartments(Page page, String url) {
            try {
                String hosName = urlParams(url).getString("key");
                String topDepts = "";
                List<String> tableHtml = page.getHtml().xpath("div[@class='container-common-office']/table[@class='ys-util-margin-b15 list-office ys-util-border-big']").all();
                List<BdDepartmentRpc> departmentList = new ArrayList<BdDepartmentRpc>();
                for (String html : tableHtml) {
                    Document document = Jsoup.parse(html);
                    String platDept = Xsoup.select(document, "td[@class='primary-office']/h4/text()").get();
                    List<String> hospitalDepts = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']").list();
                    List<String> hospitalDeptNames = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']/text()").list();

                    if (StringUtils.isEmpty(platDept)) {
                        // A table without a primary-office heading lists the key departments.
                        topDepts = com.pz998.quartz.utils.StringUtils.listToString(hospitalDeptNames);
                        continue;
                    }

                    // Otherwise the table lists the regular departments under platDept.
                    for (String deptAnchorHtml : hospitalDepts) {
                        Document deptDocument = Jsoup.parse(deptAnchorHtml);
                        String deptName = Xsoup.select(deptDocument, "a/text()").get();
                        String deptHref = Xsoup.select(deptDocument, "a/@href").get();
                        MultiMap<String> deptParams = urlParams(deptHref);
                        String deptId = deptParams.getString("adminDepartId");
                        String hosId = deptParams.getString("hosId");

                        BdDepartmentRpc bdDepart = new BdDepartmentRpc();
                        bdDepart.setSourceId(deptId);
                        bdDepart.setName(deptName);
                        bdDepart.setParentSource(platDept);
                        bdDepart.setHospitalSource(hosName);
                        departmentList.add(bdDepart);

                        // Queue the department detail page.
                        page.addTargetRequest(deptHref);
                        // Queue the department's doctor-list pages.
                        for (int i = 1; i <= DOCTOR_LIST_PAGES; i++) {
                            String doctorUrl = "https://yi.baidu.com/pc/admindepartment/doctorlist?diseaseId=0&medTitle=0&serviceType=0&page="+i+"&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId="+deptId+"&hosId="+hosId;
                            page.addTargetRequest(doctorUrl);
                        }
                    }
                }

                BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
                bdHospitalRpc.setSourceId(hosName);
                System.out.println("重点科室:"+topDepts);
                bdHospitalRpc.setCharacteristicFaculty(topDepts);
                page.putField("hosTopDept", bdHospitalRpc);
                page.putField("departmentList", departmentList);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Handles the department-detail HTML page: publishes {@code bdDepartmentRpc} keyed by the
         * {@code adminDepartId} URL parameter.
         */
        private void processDepartmentInfo(Page page, String url) {
            String deptPhone = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-normal-height']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
            String deptAddress = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t8 ys-util-text-normal']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
            String content = page.getHtml().xpath("div[@class='office-info']/p[@class='ys-util-text-smaller ys-util-margin-t15 office-info-total']/text()").toString();
            String titleDescr = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-min-height']/h3[@class='ys-util-text-min ys-util-margin-r12']/text()").toString();

            String deptId = urlParams(url).getString("adminDepartId");

            BdDepartmentRpc bdDepartmentRpc = new BdDepartmentRpc();
            bdDepartmentRpc.setAddress(deptAddress);
            bdDepartmentRpc.setPhone(deptPhone);
            bdDepartmentRpc.setContent(content);
            bdDepartmentRpc.setSourceId(deptId);
            bdDepartmentRpc.setTitleDescr(titleDescr);
            page.putField("bdDepartmentRpc", bdDepartmentRpc);
        }

        /**
         * Handles the doctor-list JSON response: publishes
         * {@code bdDepartmentDiseaseRelaRpcList} (filled only from page 1) and, when the payload
         * contains doctors, {@code bdDoctorList} and {@code bdDiseaseDoctorRelaList}; queues every
         * doctor's detail page.
         */
        private void processDoctorList(Page page, String url) {
            String rawText = page.getRawText();
            String status = new JsonPathSelector("$.status").select(rawText);
            if (!STATE_SUCCESS.equals(status)) {
                return;
            }
            String data = new JsonPathSelector("$.data[*]").select(rawText);
            if (data == null) {
                return;
            }

            MultiMap<String> params = urlParams(url);
            String deptId = params.getString("adminDepartId");
            String hosId = params.getString("hosId");
            String pageNum = params.getString("page");

            // null when the payload cannot be parsed into a JSON object
            JSONObject dataJo = parseJsonObject(data);

            List<BdDepartmentDiseaseRelaRpc> departmentDiseaseList = new ArrayList<BdDepartmentDiseaseRelaRpc>();
            if ("1".equals(pageNum)) {
                // The disease selector is only read from the first page.
                collectDepartmentDiseases(dataJo, hosId, deptId, departmentDiseaseList);
            }
            page.putField("bdDepartmentDiseaseRelaRpcList", departmentDiseaseList);

            // dataJo is null after a parse failure; dereferencing it here used to throw a
            // NullPointerException out of process().
            if (dataJo == null || !dataJo.containsKey("doctorList")) {
                return;
            }
            List<?> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(rawText);
            if (!CollectionUtils.isNotEmpty(doctorList)) {
                return;
            }

            // Doctor records.
            List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>();
            // Doctor <-> disease relations.
            List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>();
            for (Object o : doctorList) {
                JSONObject doctorJo = (JSONObject) o;

                // Doctor certification marks.
                String identifyMarkStr = "";
                if (doctorJo.containsKey("doctorIdentify")) {
                    List<String> identifyMarkList = new JsonPathSelector("$.doctorIdentify[*].identifyMark").selectList(doctorJo.toJSONString());
                    identifyMarkStr = com.pz998.quartz.utils.StringUtils.listToString(identifyMarkList);
                }

                String doctorName = (String) doctorJo.get("doctorName");
                String doctorTitle = (String) doctorJo.get("doctorTitle");
                Object commentScore = doctorJo.get("commentScore");
                String doctorSkill = (String) doctorJo.get("doctorSkill");
                String allTimeHref = (String) doctorJo.get("allTimeHref");
                String doctorPhoto = (String) doctorJo.get("doctorPhoto");

                // Queue the doctor detail page; its doctorId parameter is the doctor's source id.
                page.addTargetRequest(allTimeHref);
                String doctorId = urlParams(allTimeHref).getString("doctorId");

                BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
                bdDoctorRpc.setHospitalSourceId(hosId);
                bdDoctorRpc.setDepartmentSourceId(deptId);
                bdDoctorRpc.setSourceId(doctorId);
                bdDoctorRpc.setName(doctorName);
                bdDoctorRpc.setPracticeTitle(doctorTitle);
                bdDoctorRpc.setRecommendScore(stringOrEmpty(commentScore));
                bdDoctorRpc.setDiseaseTag(doctorSkill);
                bdDoctorRpc.setImageUrl(doctorPhoto);
                bdDoctorRpc.setIdentifyMark(identifyMarkStr);
                bdDoctorList.add(bdDoctorRpc);

                JSONArray treatPatientArray = (JSONArray) doctorJo.get("treatPatient");
                if (CollectionUtils.isNotEmpty(treatPatientArray)) {
                    for (Object treatPatient : treatPatientArray) {
                        JSONObject treatPatientJo = (JSONObject) treatPatient;
                        String diseaseName = (String) treatPatientJo.get("diseaseName");
                        BdDiseaseDoctorRelaRpc relation = new BdDiseaseDoctorRelaRpc();
                        relation.setDiseaseSourceId(diseaseName);
                        relation.setDoctorSourceId(doctorId);
                        bdDiseaseDoctorRelaList.add(relation);
                    }
                }
            }

            page.putField("bdDiseaseDoctorRelaList", bdDiseaseDoctorRelaList);
            page.putField("bdDoctorList", bdDoctorList);
        }

        /**
         * Appends one department-disease relation per entry of the first {@code selectorList}
         * group, skipping the "全部" entry. Does nothing when {@code dataJo} is null or the
         * selector is missing or empty.
         */
        private static void collectDepartmentDiseases(JSONObject dataJo, String hosId, String deptId,
                List<BdDepartmentDiseaseRelaRpc> target) {
            JSONArray diseaseArray = dataJo == null ? null : (JSONArray) dataJo.get("selectorList");
            if (!CollectionUtils.isNotEmpty(diseaseArray)) {
                return;
            }
            JSONObject firstGroup = (JSONObject) diseaseArray.get(0);
            JSONArray diseaseList = (JSONArray) firstGroup.get("list");
            if (!CollectionUtils.isNotEmpty(diseaseList)) {
                return;
            }
            for (Object disease : diseaseList) {
                JSONObject diseaseJo = (JSONObject) disease;
                String itemName = (String) diseaseJo.get("itemName");
                if ("全部".equals(itemName)) {
                    continue;
                }
                BdDepartmentDiseaseRelaRpc relation = new BdDepartmentDiseaseRelaRpc();
                relation.setHospitalSourceId(hosId);
                relation.setDepartmentSourceId(deptId);
                relation.setDiseaseSource(itemName);
                target.add(relation);
            }
        }

        /**
         * Handles the doctor-detail HTML page: publishes {@code bdDoctorRpc} with the doctor's
         * introduction and up to three scores, keyed by the {@code doctorId} URL parameter.
         */
        private void processDoctorInfo(Page page, String url) {
            String doctorId = urlParams(url).getString("doctorId");

            BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
            bdDoctorRpc.setSourceId(doctorId);

            String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString();
            bdDoctorRpc.setIntro(experience);

            List<String> commentList = page.getHtml().xpath("ul[@class='summary-comment']/li/p[@class='ys-util-text-default ys-util-text-smaller']/i[@class='comment-score ys-util-text-primary ys-util-text-big']/text()").all();
            if (CollectionUtils.isNotEmpty(commentList)) {
                // Scores are mapped by position; missing trailing ones become "".
                bdDoctorRpc.setRecommendScore(getOrEmpty(commentList, 0));
                bdDoctorRpc.setTreatmentEffectScore(getOrEmpty(commentList, 1));
                bdDoctorRpc.setAttitudeScore(getOrEmpty(commentList, 2));
            }

            page.putField("bdDoctorRpc", bdDoctorRpc);
        }

        /** Returns the query parameters of {@code url} as parsed by the project's URL helper. */
        private static MultiMap<String> urlParams(String url) {
            return com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        }

        /**
         * Parses {@code json}; returns null (after printing the error) when parsing fails, and
         * null when the parsed value is not a JSON object.
         */
        private static JSONObject parseJsonObject(String json) {
            try {
                Object parsed = new JSONParser().parse(json);
                return parsed instanceof JSONObject ? (JSONObject) parsed : null;
            } catch (ParseException e) {
                e.printStackTrace();
                return null;
            }
        }

        /** Returns {@code value.toString()}, or "" when {@code value} is null. */
        private static String stringOrEmpty(Object value) {
            return value == null ? "" : value.toString();
        }

        /** Returns the element at {@code index}, or "" when the list is shorter than that. */
        private static String getOrEmpty(List<String> values, int index) {
            return values.size() > index ? values.get(index) : "";
        }

        /** Returns the site configuration used by the spider. */
        @Override
        public Site getSite() {
            return site;
        }

        /** Starts the crawl from {@link #START_URL} with 10 threads. */
        public static void main(String[] args) {
            Spider.create(new YiBaiduProcessor()).addUrl(START_URL).thread(10).run();
        }
    }
    
    • 上述代码采集百度医生数据,采集线路进入医院列表-->医院详情-->科室列表-->科室详情-->医生列表-->医生详情

    • 每个 else if 匹配一类页面地址,即上面说的采集链路上的一个采集节点

    • 采集相应数据时会将网站的原始关系映射一并采集过来,在构建本地存储对象时从采集链接中获取原始标识,如医院、医生的 id 值

      如下代码
          }else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){
      

      MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId");

      BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId);

      String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); if(StringUtils.isEmpty(experience)){ experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10 doctor-info-total']/text()").toString(); } bdDoctorRpc.setIntro(experience); System.out.println("experience:"+experience);

    • 解析Ajax json结果

      List<String> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());
      

    if(CollectionUtils.isNotEmpty(doctorList)){ //收集医生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集医生与疾病关系信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(String o:doctorList){ JSONObject doctorJo = JSON.parseObject(o);

    • 针对元素特征一样的元素集 如li 列表 table 表格 需要依次获取其中的内容

    学习视频

    image image.gif

    复制链接,在浏览器打开
    tomcat源码解析
    https://study.163.com/course/introduction/1209535854.htm

    Springmvc源码解析
    https://study.163.com/course/introduction/1209536851.htm

    dubbo源码解析
    https://study.163.com/course/introduction/1209648816.htm

    相关文章

      网友评论

          本文标题:webmagic实战使用

          本文链接:https://www.haomeiwen.com/subject/sgjfactx.html