领导知道我在学 Python,周五中午安排我去爬取网上的数据,供后台的同事使用。由于当时手头有事,就暂时搁置了;周末在家没事,就打开电脑,用 Java 和 Python 两种语言各写了一遍——Python 真是香!
Python 我也还在自学当中……
#Python用到的库:
#requests:请求
#BeautifulSoup:html解析
#csv、codecs:csv文件操作
import requests
from bs4 import BeautifulSoup
import csv
import codecs
# Scrape the body part -> disease -> department pages and write one CSV row
# per (body part, disease, department) combination.
output_path = '/Users/zuixia/Desktop/人体疾病科室对应表.csv'
base_url = 'http://ts3.daxiang91.com'
url_body_part = 'http://ts3.daxiang91.com/TestNew/Guide/GetSymptom?crowd=1&body=1'
url_disease = 'http://ts3.daxiang91.com/TestNew/Guide/GetSymptomByBody'
# NOTE(review): hard-coded session cookie copied from a browser session;
# presumably it expires, so refresh it if the department pages come back empty.
dep_cookies = {'ElephantSC': '34f2ngsofaaj2jzkqabuijda'}

# The `with` block guarantees the file is flushed and closed even when a
# request or parse step raises half-way through the crawl.
# newline='' lets the csv module control line endings itself.
with open(output_path, 'w', encoding='utf-8', newline='') as file_csv:
    writer = csv.writer(file_csv)
    writer.writerow([r'身体部位', '疾病/科室'])

    # Level 1: every body part listed on the entry page.
    res_body = requests.get(url_body_part)
    soup_body = BeautifulSoup(res_body.text, 'lxml')
    bodyParts = soup_body.find_all('div', {'class': 'am-list-item list bg'})

    for bodyPart in bodyParts:
        body_name = bodyPart.get_text().strip()

        # Level 2: diseases/symptoms belonging to this body part.
        param = {"bodyId": bodyPart.get('data-bodyid'), "crowd": '1'}
        res_disease = requests.post(url_disease, params=param)
        soup_disease = BeautifulSoup(res_disease.text, 'lxml')
        diseases = soup_disease.find_all('a', {'class': 'am-list-item list'})

        for disease in diseases:
            disease_name = disease.get_text().strip()

            # Level 3: departments linked from the disease entry.
            dep_link = disease.get('href')
            res_dep = requests.get(base_url + dep_link, cookies=dep_cookies)
            soup_dep = BeautifulSoup(res_dep.text, 'lxml')
            deps = soup_dep.find_all('div', {'class': 'am-list-item list'})

            for dep in deps:
                # Fall back to the element text when the attribute is missing,
                # instead of failing on `str + None`.
                dep_name = dep.get('data-deptname') or dep.get_text().strip()
                writer.writerow([body_name, disease_name + ',' + dep_name])

print("输出完成啦!!!!")
Java 的写法:
用 Java 的时候,我没有直接去写 Excel 文件,而是先用 JavaBean 接收数据,转成 JSON 之后再用工具类写入 Excel;否则也可以像 Python 版本那样,在 for 循环里直接写文件。
所以就多了一个 JavaBean。
用到的库:阿里的 fastjson、Apache POI 的 HSSFWorkbook、jsoup
import com.alibaba.fastjson.JSON;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Crawls the body part -> disease -> department pages with Jsoup, collects the
 * result into a {@link DataBean}, serialises it to JSON and hands the JSON to
 * {@link JSONToExcel#creatExc(String)}.
 */
public class Main {
    /** Host that every relative disease link is resolved against. */
    private static final String BASE_URL = "http://ts3.daxiang91.com";

    /**
     * Timeout handed to every Jsoup request. The value is kept from the original
     * code; interpreted as milliseconds it is roughly 11.5 days.
     * NOTE(review): consider lowering this to something like 30-60 seconds.
     */
    private static final int TIMEOUT_MILLIS = 1000000000;

    static DataBean databean = new DataBean();
    static ArrayList<DataBean.BodyBean> bodyBeans = new ArrayList<>();
    /** Diseases of the body part currently being parsed; replaced for each body part. */
    static ArrayList<DataBean.BodyBean.Disease> diseaseList = new ArrayList<>();

    /**
     * Program entry point: runs the crawl and reports any I/O failure on stderr.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            parsePersonBodyTag();
        } catch (IOException e) {
            System.err.println("Crawl failed: " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Level 1: loads the body-part page, parses every element carrying a
     * {@code data-bodyid} attribute, then exports the collected data.
     *
     * @throws IOException if a page cannot be fetched or the Excel file cannot be written
     */
    private static void parsePersonBodyTag() throws IOException {
        Document doc = Jsoup.connect(BASE_URL + "/TestNew/Guide/GetSymptom?crowd=1&body=1")
                .timeout(TIMEOUT_MILLIS)
                .get();
        Elements elements = doc.getElementsByAttribute("data-bodyid");
        for (Element element : elements) {
            DataBean.BodyBean bodyBean = new DataBean.BodyBean();
            bodyBean.setBodyPart(element.text());
            // parseInt avoids the needless Integer boxing of valueOf.
            bodyBean.setBodyId(Integer.parseInt(element.attr("data-bodyid")));
            parseDiseases(bodyBean);
        }
        databean.setBodyBeanList(bodyBeans);
        String jsonString = JSON.toJSONString(databean);
        System.out.println(jsonString);
        JSONToExcel.creatExc(jsonString);
    }

    /**
     * Level 2: posts the body id, parses every disease entry of that body part and
     * registers the completed body part in {@link #bodyBeans}.
     *
     * @param bodyBean body part whose diseases are fetched; its disease list is set here
     * @throws IOException if a page cannot be fetched
     */
    private static void parseDiseases(DataBean.BodyBean bodyBean) throws IOException {
        // Fresh list per body part so diseases never leak between body parts.
        diseaseList = new ArrayList<>();
        Document doc = Jsoup.connect(BASE_URL + "/TestNew/Guide/GetSymptomByBody")
                .data("bodyId", String.valueOf(bodyBean.getBodyId()))
                .data("crowd", "1")
                .timeout(TIMEOUT_MILLIS)
                .post();
        Elements elements = doc.getElementsByClass("am-list-item list");
        for (Element element : elements) {
            diseaseList.add(parseDepList(element));
        }
        bodyBean.setDiseaseList(diseaseList);
        bodyBeans.add(bodyBean);
    }

    /**
     * Level 3: follows the disease link and collects the departments listed on
     * the target page.
     *
     * @param element disease entry; its text is the disease name and its
     *                {@code href} the relative link to the department page
     * @return the disease with its department list filled in
     * @throws IOException if the department page cannot be fetched
     */
    private static DataBean.BodyBean.Disease parseDepList(Element element) throws IOException {
        DataBean.BodyBean.Disease disease = new DataBean.BodyBean.Disease();
        disease.setDisName(element.text());
        String url = element.attr("href");
        // NOTE(review): hard-coded session cookie; presumably it has to be refreshed
        // once the session expires.
        Document doc = Jsoup.connect(BASE_URL + url)
                .cookie("ElephantSC", "34f2ngsofaaj2jzkqabuijda")
                .timeout(TIMEOUT_MILLIS)
                .get();
        Elements lists = doc.getElementsByClass("am-list-item list");
        ArrayList<DataBean.BodyBean.Disease.Department> depList = new ArrayList<>();
        for (Element list : lists) {
            DataBean.BodyBean.Disease.Department department = new DataBean.BodyBean.Disease.Department();
            department.setDepName(list.text());
            depList.add(department);
        }
        disease.setDepList(depList);
        return disease;
    }
}
import java.util.List;
/**
 * Root container for the scraped data: a list of body parts, each holding its
 * diseases, each of which holds its departments.
 */
public class DataBean {
    private List<BodyBean> bodyBeanList;

    public void setBodyBeanList(List<BodyBean> bodyBeanList) {
        this.bodyBeanList = bodyBeanList;
    }

    public List<BodyBean> getBodyBeanList() {
        return bodyBeanList;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("DataBean{");
        text.append("bodyBeanList=").append(bodyBeanList);
        text.append('}');
        return text.toString();
    }

    /** First level of data: a body part. */
    public static class BodyBean {
        /** Name of the body part. */
        private String bodyPart;
        private int bodyId;
        /** Diseases that belong to this body part. */
        private List<Disease> diseaseList;

        public void setBodyPart(String bodyPart) {
            this.bodyPart = bodyPart;
        }

        public String getBodyPart() {
            return bodyPart;
        }

        public void setBodyId(int bodyId) {
            this.bodyId = bodyId;
        }

        public int getBodyId() {
            return bodyId;
        }

        public void setDiseaseList(List<Disease> diseaseList) {
            this.diseaseList = diseaseList;
        }

        public List<Disease> getDiseaseList() {
            return diseaseList;
        }

        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("BodyBean{");
            text.append("bodyPart='").append(bodyPart).append('\'');
            text.append(", bodyId=").append(bodyId);
            text.append(", diseaseList=").append(diseaseList);
            text.append('}');
            return text.toString();
        }

        /** Second level of data: a disease under a body part. */
        public static class Disease {
            /** Name of the disease. */
            private String disName;
            /** Departments associated with this disease. */
            private List<Department> depList;

            public void setDisName(String disName) {
                this.disName = disName;
            }

            public String getDisName() {
                return disName;
            }

            public void setDepList(List<Department> depList) {
                this.depList = depList;
            }

            public List<Department> getDepList() {
                return depList;
            }

            @Override
            public String toString() {
                StringBuilder text = new StringBuilder("Disease{");
                text.append("disName='").append(disName).append('\'');
                text.append(", depList=").append(depList);
                text.append('}');
                return text.toString();
            }

            /** Third level of data: a department listed for a disease. */
            public static class Department {
                /** Name of the department. */
                private String depName;

                public void setDepName(String depName) {
                    this.depName = depName;
                }

                public String getDepName() {
                    return depName;
                }

                @Override
                public String toString() {
                    StringBuilder text = new StringBuilder("Department{");
                    text.append("depName='").append(depName).append('\'');
                    text.append('}');
                    return text.toString();
                }
            }
        }
    }
}
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.CellRangeAddress;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import java.util.Set;
/**
 * Writes the scraped body part / disease / department hierarchy, supplied as a
 * JSON string, into a flat three-column {@code .xls} sheet.
 */
public class JSONToExcel {
    /** Destination of the generated workbook. */
    private static final String OUTPUT_PATH = "/Users/zuixia/Desktop/demo.xls";

    /**
     * Parses {@code json} into a {@link DataBean} and writes one sheet row per
     * (body part, disease, department) combination below a header row.
     * Body part and disease names are repeated on every row; cells are not merged.
     *
     * @param json JSON representation of a {@link DataBean}
     * @throws IOException if the output file cannot be created or written
     */
    public static void creatExc(String json) throws IOException {
        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFSheet sheet = wb.createSheet("sheet");
        writeHeader(sheet);

        DataBean dataBean = JSON.parseObject(json, DataBean.class);
        // A null bean (presumably what the parser yields for null/empty JSON - confirm)
        // produces a header-only sheet instead of a NullPointerException.
        if (dataBean != null) {
            writeRows(sheet, dataBean.getBodyBeanList());
        }

        // try-with-resources closes the stream even when wb.write throws.
        try (FileOutputStream output = new FileOutputStream(OUTPUT_PATH)) {
            wb.write(output);
            output.flush();
        }
        System.out.println("输出成功!!!");
    }

    /** Creates row 0 holding the three column titles. */
    private static void writeHeader(HSSFSheet sheet) {
        HSSFRow header = sheet.createRow(0);
        header.createCell(0).setCellValue("人体部位");
        header.createCell(1).setCellValue("疾病名称");
        header.createCell(2).setCellValue("科室名称");
    }

    /** Flattens the three-level hierarchy into data rows starting at row index 1. */
    private static void writeRows(HSSFSheet sheet, List<DataBean.BodyBean> bodyBeanList) {
        if (bodyBeanList == null) {
            return;
        }
        int nextRow = 1;
        for (DataBean.BodyBean bodyBean : bodyBeanList) {
            List<DataBean.BodyBean.Disease> diseaseList = bodyBean.getDiseaseList();
            if (diseaseList == null) {
                continue;
            }
            for (DataBean.BodyBean.Disease disease : diseaseList) {
                List<DataBean.BodyBean.Disease.Department> depList = disease.getDepList();
                if (depList == null) {
                    continue;
                }
                for (DataBean.BodyBean.Disease.Department department : depList) {
                    HSSFRow row = sheet.createRow(nextRow++);
                    row.createCell(0).setCellValue(bodyBean.getBodyPart());
                    row.createCell(1).setCellValue(disease.getDisName());
                    row.createCell(2).setCellValue(department.getDepName());
                }
            }
        }
    }
}
由以上可见,就算忽略那个 JavaBean,Java 要做的工作也比 Python 多很多,代码量也大了不少。
Python 里用到的库都是用 pip3 安装的(我用的是 Python 3.7 和 pip3),敲几行命令就可以了;而 Java 要自己去下载 jar 包或者配置 Maven。
网友评论