一.引入依赖包
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.4.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.4.3</version>
</dependency>
二.代码
package com.pz998.quartz.spider;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.eclipse.jetty.util.MultiMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.pz998.rpc.model.entity.BdDepartmentDiseaseRelaRpc;
import com.pz998.rpc.model.entity.BdDepartmentRpc;
import com.pz998.rpc.model.entity.BdDiseaseDoctorRelaRpc;
import com.pz998.rpc.model.entity.BdDoctorRpc;
import com.pz998.rpc.model.entity.BdHospitalRpc;
import net.minidev.json.JSONArray;
import net.minidev.json.JSONObject;
import net.minidev.json.parser.JSONParser;
import net.minidev.json.parser.ParseException;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.xsoup.Xsoup;
/**
 * WebMagic {@link PageProcessor} that crawls hospital, department and doctor data from yi.baidu.com.
 *
 * <p>The crawl starts at the hospital list of one city and follows this chain:
 * hospital list -> hospital info / all departments -> department detail -> doctor list -> doctor detail.
 * {@link #process(Page)} matches the page URL against one regular expression per page type and hands
 * the page to the corresponding private helper. Each helper stores the entities it extracted in the
 * page's result fields and queues the follow-up URLs.
 *
 * <p>Identifiers of the source site (hospital name, department id, doctor id) are taken from the URL
 * query string so that records produced by different page types can be joined later.
 */
public class YiBaiduProcessor implements PageProcessor {

    private static final String START_URL = "https://yi.baidu.com/pc/hospital/list?cityId=371&pageSize=10&page=1";

    // Regular expressions identifying each page type handled by process().
    private static final String HOSPITAL_DETAIL_URL = "https://yi\\.baidu\\.com/pc/hospital/index\\?zt=pcpinzhuan&zt_ext=&pvid=\\d+&key=\\S+";
    private static final String HOSPITAL_LIST_URL = "https://yi\\.baidu\\.com/pc/hospital/list\\?cityId=\\d++&pageSize=10&page=\\d++";
    // Example: https://yi.baidu.com/pc/hospital/info?key=%E6%AD%A6%E6%B1%89%E5%B8%82%E5%A6%87%E5%A5%B3%E5%84%BF%E7%AB%A5%E5%8C%BB%E7%96%97%E4%BF%9D%E5%81%A5%E4%B8%AD%E5%BF%83
    private static final String HOSPITAL_INFO_URL = "https://yi\\.baidu\\.com/pc/hospital/info\\?key=\\S+";
    private static final String DEPT_INFO_URL = "https://yi\\.baidu\\.com/pc/admindepartment/detail\\?zt=\\w+&zt_ext=&pvid=\\d+&hosId=\\d+&adminDepartId=\\d+";
    // Example: https://yi.baidu.com/pc/hospital/alldep?key=
    private static final String HOSPITAL_DEPT_URL = "https://yi\\.baidu\\.com/pc/hospital/alldep\\?key=\\S+";
    // The parameter is "&regionId"; it had been corrupted to "®ionId" by an HTML-entity decoding of "&reg".
    private static final String DOCTOR_LIST_URL = "https://yi\\.baidu\\.com/pc/admindepartment/doctorlist\\?diseaseId=0&medTitle=0&serviceType=0&page=\\d+&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=\\d+&hosId=\\d+";
    private static final String DOCTOR_INFO_URL = "https://yi\\.baidu\\.com/pc/doctor/detailpage\\?zt=\\w+&zt_ext=&pvid=0&doctorId=\\d+";

    /** Number of doctor-list pages (1..N) queued for every department. */
    private static final int DOCTOR_LIST_PAGES = 5;

    private final Site site = Site.me();

    /** Value of the JSON {@code status} field that marks a successful Ajax response. */
    public static final String STATE_SUCCESS = "0";

    /** Maps the site's {@code cityId} query parameter to the city name stored on a hospital. */
    public static final Map<String, String> CITY_MAP = new HashMap<String, String>();

    static {
        CITY_MAP.put("371", "武汉");
        CITY_MAP.put("1", "北京");
        CITY_MAP.put("2", "上海");
        CITY_MAP.put("84", "广州");
    }

    /**
     * Dispatches the page to the handler of the first URL pattern it matches.
     * Pages matching none of the patterns are ignored.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String url = page.getUrl().toString();
        if (page.getUrl().regex(HOSPITAL_LIST_URL).match()) {
            processHospitalList(page, url);
        } else if (page.getUrl().regex(HOSPITAL_INFO_URL).match()) {
            processHospitalInfo(page, url);
        } else if (page.getUrl().regex(HOSPITAL_DEPT_URL).match()) {
            processHospitalDepartments(page, url);
        } else if (page.getUrl().regex(DEPT_INFO_URL).match()) {
            processDepartmentInfo(page, url);
        } else if (page.getUrl().regex(DOCTOR_LIST_URL).match()) {
            processDoctorList(page, url);
        } else if (page.getUrl().regex(DOCTOR_INFO_URL).match()) {
            processDoctorInfo(page, url);
        }
    }

    /**
     * Parses the hospital-list JSON, stores the hospitals under {@code "bdHospitalList"} and queues
     * the info page and the all-departments page of every hospital. Any exception is printed and
     * the page is abandoned.
     */
    private void processHospitalList(Page page, String url) {
        try {
            String state = new JsonPathSelector("$.status").select(page.getRawText());
            if (!STATE_SUCCESS.equals(state)) {
                return;
            }
            // Iterated as Object and cast to JSONObject below; declared List<?> rather than
            // List<String> so the compiler inserts no String cast on the elements.
            List<?> hospitalList = new JsonPathSelector("$.data.hospitalList[*]").selectList(page.getRawText());
            MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
            String cityId = resultMap.getString("cityId");
            if (CollectionUtils.isEmpty(hospitalList)) {
                return;
            }
            // The city is the same for every hospital on this page.
            String city = CITY_MAP.get(cityId);
            List<BdHospitalRpc> bdHospitalList = new ArrayList<BdHospitalRpc>();
            for (Object obj : hospitalList) {
                JSONObject jsonObj = (JSONObject) obj;
                String name = (String) jsonObj.get("name");
                System.out.println("name:" + name);

                // The coordinates are carried in the "location" parameter of the route link.
                String routeLink = (String) jsonObj.get("routeLink");
                MultiMap<String> routeLinkMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(routeLink);
                String location = routeLinkMap.getString("location");
                String latitude = "";
                String longitude = "";
                if (StringUtils.isNotEmpty(location)) {
                    String[] locationArray = location.split(",");
                    latitude = locationArray.length > 0 ? locationArray[0] : "";
                    longitude = locationArray.length > 1 ? locationArray[1] : "";
                }

                BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
                // The hospital name doubles as its source id (later pages are keyed by name).
                bdHospitalRpc.setSourceId(name);
                bdHospitalRpc.setName(name);
                bdHospitalRpc.setAddress((String) jsonObj.get("address"));
                bdHospitalRpc.setLevel((String) jsonObj.get("level"));
                bdHospitalRpc.setPhone((String) jsonObj.get("phone"));
                bdHospitalRpc.setImageUrl((String) jsonObj.get("logo"));
                bdHospitalRpc.setLatitude(latitude);
                bdHospitalRpc.setLongitude(longitude);
                bdHospitalRpc.setScore((String) jsonObj.get("grade"));
                bdHospitalRpc.setCity(city);
                // Numeric fields are stringified without assuming a concrete Number subtype, so one
                // unexpected value (e.g. a Long) no longer aborts the whole page with a ClassCastException.
                bdHospitalRpc.setIsMedicalInsurance(toStringOrEmpty(jsonObj.get("insurance")));
                bdHospitalRpc.setHighQualityDoctorNum(toStringOrEmpty(jsonObj.get("doctorNum")));
                bdHospitalRpc.setFinishedServiceNum(toStringOrEmpty(jsonObj.get("serveNum")));
                bdHospitalRpc.setPatientCommentNum(toStringOrEmpty(jsonObj.get("commentNum")));
                bdHospitalList.add(bdHospitalRpc);

                page.addTargetRequest("https://yi.baidu.com/pc/hospital/info?key=" + name);
                page.addTargetRequest("https://yi.baidu.com/pc/hospital/alldep?key=" + name);
            }
            page.putField("bdHospitalList", bdHospitalList);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the descriptive paragraphs of a hospital info page and stores them under
     * {@code "bdHospitalRpc"}. The paragraphs are mapped by position: overview, history,
     * characteristic departments, medical team, honours.
     */
    private void processHospitalInfo(Page page, String url) {
        try {
            MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
            String hosName = resultMap.getString("key");
            BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
            List<String> contextList = page.getHtml().xpath("ul[@class='container-list-info']/li[@class='ys-util-margin-b35']/p[@class='ys-util-text-smaller ys-util-margin-t9 ys-util-margin-b30']/text()").all();
            if (CollectionUtils.isNotEmpty(contextList)) {
                bdHospitalRpc.setContent(elementOrEmpty(contextList, 0));
                bdHospitalRpc.setHistory(elementOrEmpty(contextList, 1));
                bdHospitalRpc.setCharacteristicDept(elementOrEmpty(contextList, 2));
                bdHospitalRpc.setTeam(elementOrEmpty(contextList, 3));
                bdHospitalRpc.setHonor(elementOrEmpty(contextList, 4));
            }
            bdHospitalRpc.setSourceId(hosName);
            page.putField("bdHospitalRpc", bdHospitalRpc);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Parses the all-departments page of a hospital. A table without a platform department heading
     * is treated as the key-department table and only its department names are recorded; every
     * other table yields one department record per link, and each department's detail page and
     * first {@link #DOCTOR_LIST_PAGES} doctor-list pages are queued. Results are stored under
     * {@code "hosTopDept"} and {@code "departmentList"}.
     */
    private void processHospitalDepartments(Page page, String url) {
        try {
            MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
            String hosName = resultMap.getString("key");
            String topDepts = "";
            List<String> tableHtml = page.getHtml().xpath("div[@class='container-common-office']/table[@class='ys-util-margin-b15 list-office ys-util-border-big']").all();
            List<BdDepartmentRpc> departmentList = new ArrayList<BdDepartmentRpc>();
            for (String html : tableHtml) {
                Document document = Jsoup.parse(html);
                String platDept = Xsoup.select(document, "td[@class='primary-office']/h4/text()").get();
                List<String> hospitalDepts = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']").list();
                List<String> hospitalDeptNames = Xsoup.select(document, "td[@class='secondary-office']/dl/dd/h4/a[@class='a-hover ys-util-text-normal']/text()").list();
                if (StringUtils.isEmpty(platDept)) {
                    // Key-department table: keep only the joined department names.
                    topDepts = com.pz998.quartz.utils.StringUtils.listToString(hospitalDeptNames);
                    continue;
                }
                // Regular department table: one record per department link.
                for (String d : hospitalDepts) {
                    Document deptDocument = Jsoup.parse(d);
                    String deptName = Xsoup.select(deptDocument, "a/text()").get();
                    String deptHref = Xsoup.select(deptDocument, "a/@href").get();
                    MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(deptHref);
                    String deptId = deptResultMap.getString("adminDepartId");
                    String hosId = deptResultMap.getString("hosId");

                    BdDepartmentRpc bdDepart = new BdDepartmentRpc();
                    bdDepart.setSourceId(deptId);
                    bdDepart.setName(deptName);
                    bdDepart.setParentSource(platDept);
                    bdDepart.setHospitalSource(hosName);
                    departmentList.add(bdDepart);

                    // Queue the department detail page.
                    page.addTargetRequest(deptHref);
                    // Queue the department's doctor-list pages.
                    for (int i = 1; i <= DOCTOR_LIST_PAGES; i++) {
                        String doctorUrl = "https://yi.baidu.com/pc/admindepartment/doctorlist?diseaseId=0&medTitle=0&serviceType=0&page=" + i + "&pageSize=8&provId=0&cityId=0&regionId=0&adminDepartId=" + deptId + "&hosId=" + hosId;
                        page.addTargetRequest(doctorUrl);
                    }
                }
            }
            BdHospitalRpc bdHospitalRpc = new BdHospitalRpc();
            bdHospitalRpc.setSourceId(hosName);
            System.out.println("重点科室:" + topDepts);
            bdHospitalRpc.setCharacteristicFaculty(topDepts);
            page.putField("hosTopDept", bdHospitalRpc);
            page.putField("departmentList", departmentList);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts phone, address, description and title of a department detail page and stores them
     * under {@code "bdDepartmentRpc"}, keyed by the {@code adminDepartId} URL parameter.
     */
    private void processDepartmentInfo(Page page, String url) {
        String deptPhone = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-normal-height']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
        String deptAddress = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t8 ys-util-text-normal']/label[@class='ys-util-text-normal ys-util-margin-l10']/text()").toString();
        String content = page.getHtml().xpath("div[@class='office-info']/p[@class='ys-util-text-smaller ys-util-margin-t15 office-info-total']/text()").toString();
        String titleDescr = page.getHtml().xpath("div[@class='summary-left']/div[@class='summary-row ys-util-margin-t12 ys-util-text-min-height']/h3[@class='ys-util-text-min ys-util-margin-r12']/text()").toString();
        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String deptId = deptResultMap.getString("adminDepartId");

        BdDepartmentRpc bdDepartmentRpc = new BdDepartmentRpc();
        bdDepartmentRpc.setAddress(deptAddress);
        bdDepartmentRpc.setPhone(deptPhone);
        bdDepartmentRpc.setContent(content);
        bdDepartmentRpc.setSourceId(deptId);
        bdDepartmentRpc.setTitleDescr(titleDescr);
        page.putField("bdDepartmentRpc", bdDepartmentRpc);
    }

    /**
     * Parses one page of a department's doctor-list JSON. On page 1 the disease filter list is
     * turned into department/disease relations; on every page the doctors and their
     * doctor/disease relations are collected and each doctor's detail page is queued.
     */
    private void processDoctorList(Page page, String url) {
        String status = new JsonPathSelector("$.status").select(page.getRawText());
        if (!STATE_SUCCESS.equals(status)) {
            return;
        }
        String data = new JsonPathSelector("$.data[*]").select(page.getRawText());
        if (data == null) {
            return;
        }
        MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String deptId = deptResultMap.getString("adminDepartId");
        String hosId = deptResultMap.getString("hosId");
        String pageNum = deptResultMap.getString("page");

        JSONObject dataJo = null;
        try {
            dataJo = (JSONObject) new JSONParser().parse(data);
        } catch (ParseException e) {
            e.printStackTrace();
        }

        List<BdDepartmentDiseaseRelaRpc> departmentDiseaseRelaList = new ArrayList<BdDepartmentDiseaseRelaRpc>();
        // The disease filter is only read from the first page.
        if ("1".equals(pageNum) && dataJo != null) {
            collectDepartmentDiseases(dataJo, hosId, deptId, departmentDiseaseRelaList);
        }
        page.putField("bdDepartmentDiseaseRelaRpcList", departmentDiseaseRelaList);

        // dataJo is null when parsing failed above; previously this dereference threw a NullPointerException.
        if (dataJo != null && dataJo.containsKey("doctorList")) {
            collectDoctors(page, hosId, deptId);
        }
    }

    /**
     * Appends one department/disease relation per entry of the first selector's list,
     * skipping the catch-all "全部" (all) entry.
     */
    private void collectDepartmentDiseases(JSONObject dataJo, String hosId, String deptId,
            List<BdDepartmentDiseaseRelaRpc> target) {
        JSONArray diseaseArray = (JSONArray) dataJo.get("selectorList");
        if (CollectionUtils.isEmpty(diseaseArray)) {
            return;
        }
        JSONObject firstSelector = (JSONObject) diseaseArray.get(0);
        JSONArray diseaseList = (JSONArray) firstSelector.get("list");
        if (CollectionUtils.isEmpty(diseaseList)) {
            return;
        }
        for (Object disease : diseaseList) {
            String itemName = (String) ((JSONObject) disease).get("itemName");
            if ("全部".equals(itemName)) {
                continue;
            }
            BdDepartmentDiseaseRelaRpc rela = new BdDepartmentDiseaseRelaRpc();
            rela.setHospitalSourceId(hosId);
            rela.setDepartmentSourceId(deptId);
            rela.setDiseaseSource(itemName);
            target.add(rela);
        }
    }

    /**
     * Collects the doctors of the current doctor-list page under {@code "bdDoctorList"} and their
     * treated diseases under {@code "bdDiseaseDoctorRelaList"}, queuing every doctor detail page.
     * Nothing is stored when the doctor list is empty.
     */
    private void collectDoctors(Page page, String hosId, String deptId) {
        // See processHospitalList for why this is List<?> and not List<String>.
        List<?> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());
        if (CollectionUtils.isEmpty(doctorList)) {
            return;
        }
        List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>();
        List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>();
        for (Object o : doctorList) {
            JSONObject doctorJo = (JSONObject) o;

            // Certification marks of the doctor, joined into one string.
            String identifyMarkStr = "";
            if (doctorJo.containsKey("doctorIdentify")) {
                List<String> identifyMarkList = new JsonPathSelector("$.doctorIdentify[*].identifyMark").selectList(doctorJo.toJSONString());
                identifyMarkStr = com.pz998.quartz.utils.StringUtils.listToString(identifyMarkList);
            }

            // Queue the doctor detail page; its URL also carries the doctor id.
            String allTimeHref = (String) doctorJo.get("allTimeHref");
            page.addTargetRequest(allTimeHref);
            MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(allTimeHref);
            String doctorId = resultMap.getString("doctorId");

            BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
            bdDoctorRpc.setHospitalSourceId(hosId);
            bdDoctorRpc.setDepartmentSourceId(deptId);
            bdDoctorRpc.setSourceId(doctorId);
            bdDoctorRpc.setName((String) doctorJo.get("doctorName"));
            bdDoctorRpc.setPracticeTitle((String) doctorJo.get("doctorTitle"));
            bdDoctorRpc.setRecommendScore(toStringOrEmpty(doctorJo.get("commentScore")));
            bdDoctorRpc.setDiseaseTag((String) doctorJo.get("doctorSkill"));
            bdDoctorRpc.setImageUrl((String) doctorJo.get("doctorPhoto"));
            bdDoctorRpc.setIdentifyMark(identifyMarkStr);
            bdDoctorList.add(bdDoctorRpc);

            JSONArray treatPatientArray = (JSONArray) doctorJo.get("treatPatient");
            if (CollectionUtils.isNotEmpty(treatPatientArray)) {
                for (Object treatPatient : treatPatientArray) {
                    String diseaseName = (String) ((JSONObject) treatPatient).get("diseaseName");
                    BdDiseaseDoctorRelaRpc rela = new BdDiseaseDoctorRelaRpc();
                    rela.setDiseaseSourceId(diseaseName);
                    rela.setDoctorSourceId(doctorId);
                    bdDiseaseDoctorRelaList.add(rela);
                }
            }
        }
        page.putField("bdDiseaseDoctorRelaList", bdDiseaseDoctorRelaList);
        page.putField("bdDoctorList", bdDoctorList);
    }

    /**
     * Extracts the introduction and the three comment scores (recommendation, treatment effect,
     * attitude — by position) of a doctor detail page and stores them under {@code "bdDoctorRpc"}.
     */
    private void processDoctorInfo(Page page, String url) {
        MultiMap<String> resultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url);
        String doctorId = resultMap.getString("doctorId");
        BdDoctorRpc bdDoctorRpc = new BdDoctorRpc();
        bdDoctorRpc.setSourceId(doctorId);

        String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString();
        if (StringUtils.isEmpty(experience)) {
            // Some pages render the introduction with an extra "doctor-info-total" class.
            experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10 doctor-info-total']/text()").toString();
        }
        bdDoctorRpc.setIntro(experience);

        List<String> commentList = page.getHtml().xpath("ul[@class='summary-comment']/li/p[@class='ys-util-text-default ys-util-text-smaller']/i[@class='comment-score ys-util-text-primary ys-util-text-big']/text()").all();
        if (CollectionUtils.isNotEmpty(commentList)) {
            bdDoctorRpc.setRecommendScore(elementOrEmpty(commentList, 0));
            bdDoctorRpc.setTreatmentEffectScore(elementOrEmpty(commentList, 1));
            bdDoctorRpc.setAttitudeScore(elementOrEmpty(commentList, 2));
        }
        page.putField("bdDoctorRpc", bdDoctorRpc);
    }

    /** Returns {@code value.toString()}, or the empty string when {@code value} is {@code null}. */
    private static String toStringOrEmpty(Object value) {
        return value == null ? "" : value.toString();
    }

    /** Returns the element at {@code index}, or the empty string when the list is shorter. */
    private static String elementOrEmpty(List<String> list, int index) {
        return list.size() > index ? list.get(index) : "";
    }

    /** Returns the crawl site configuration (WebMagic defaults). */
    @Override
    public Site getSite() {
        return site;
    }

    /** Runs the crawler from the start URL with 10 threads. */
    public static void main(String[] args) {
        Spider.create(new YiBaiduProcessor()).addUrl(START_URL).thread(10).run();
    }
}
-
上述代码采集百度医生数据,采集线路进入医院列表-->医院详情-->科室列表-->科室详情-->医生列表-->医生详情
-
每个 else if 分支匹配一类页面地址,即上面所说的采集链路上的一个采集节点。
-
采集相应数据时,会将网站的原始关联关系一并采集过来;在构建本地存储对象时,从采集链接的参数中获取相应的标识,如医院、医生的 id 值。
如果代码 }else if(page.getUrl().regex(DOCTOR_INFO_URL).match()){
MultiMap<String> deptResultMap = com.pz998.quartz.utils.StringUtils.getUrlParamer(url); String doctorId = deptResultMap.getString("doctorId");
BdDoctorRpc bdDoctorRpc = new BdDoctorRpc(); bdDoctorRpc.setSourceId(doctorId);
String experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10']/text()").toString(); if(StringUtils.isEmpty(experience)){ experience = page.getHtml().xpath("div[@class='doctor-experience']/div[@class='ys-util-text-smaller ys-util-margin-t10 doctor-info-total']/text()").toString(); } bdDoctorRpc.setIntro(experience); System.out.println("experience:"+experience);
-
解析Ajax json结果
List<String> doctorList = new JsonPathSelector("$.data.doctorList[*]").selectList(page.getRawText());
if(CollectionUtils.isNotEmpty(doctorList)){ //收集医生信息 List<BdDoctorRpc> bdDoctorList = new ArrayList<BdDoctorRpc>(); //收集医生与疾病关系信息 List<BdDiseaseDoctorRelaRpc> bdDiseaseDoctorRelaList = new ArrayList<BdDiseaseDoctorRelaRpc>(); for(String o:doctorList){ JSONObject doctorJo = JSON.parseObject(o);
- 针对元素特征一样的元素集 如li 列表 table 表格 需要依次获取其中的内容
学习视频
image image.gif
复制链接,在浏览器打开
tomcat源码解析
https://study.163.com/course/introduction/1209535854.htm
Springmvc源码解析
https://study.163.com/course/introduction/1209536851.htm
dubbo源码解析
https://study.163.com/course/introduction/1209648816.htm
网友评论