美文网首页
一个简单的爬虫,对比Python和Java

一个简单的爬虫,对比Python和Java

作者: 卖梦为生_若愚 | 来源:发表于2019-04-16 09:54 被阅读0次

领导知道我在学 Python,周五中午安排我去爬取网上的数据,供后台的同事使用。由于当时手头有事,就暂时搁置了;周末在家没事做,便打开电脑,用 Java 和 Python 两种语言各写了一遍——Python 真是香!
Python 我也还在自学当中……

#Python用到的库:
#requests:请求
#BeautifulSoup:html解析
#csv、codecs:csv文件操作
import requests
from bs4 import BeautifulSoup
import csv
import codecs

# Scrape body part -> disease -> department from the symptom guide and write
# one CSV row per (body part, disease, department) combination.
base_url = 'http://ts3.daxiang91.com'
url_body_part = base_url + '/TestNew/Guide/GetSymptom?crowd=1&body=1'
url_disease = base_url + '/TestNew/Guide/GetSymptomByBody'
# Seconds to wait for each HTTP response; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 30

# The context manager closes (and therefore flushes) the CSV file even when a
# request or a parse step raises part-way through the crawl.
with codecs.open('/Users/zuixia/Desktop/人体疾病科室对应表.csv', 'w+', 'utf-8') as file_csv:
    writer = csv.writer(file_csv)
    writer.writerow([r'身体部位', '疾病/科室'])

    res_body = requests.get(url_body_part, timeout=REQUEST_TIMEOUT)
    soup_body = BeautifulSoup(res_body.text, 'lxml')
    bodyParts = soup_body.find_all('div', {'class': 'am-list-item list bg'})
    for bodyPart in bodyParts:
        # The body part label is the same for every row produced below.
        body_name = bodyPart.get_text().strip()
        param = {"bodyId": bodyPart.get('data-bodyid'), "crowd": '1'}
        # POST request whose values travel in the query string (params=),
        # not in a form body.
        res_disease = requests.post(url_disease, params=param, timeout=REQUEST_TIMEOUT)
        soup_disease = BeautifulSoup(res_disease.text, 'lxml')
        diseases = soup_disease.find_all('a', {'class': 'am-list-item list'})
        for disease in diseases:
            disease_name = disease.get_text().strip()
            dep_link = disease.get('href')
            # NOTE(review): hard-coded cookie value; presumably copied from a
            # browser session and may stop working -- confirm before reuse.
            res_dep = requests.get(
                base_url + dep_link,
                cookies={'ElephantSC': '34f2ngsofaaj2jzkqabuijda'},
                timeout=REQUEST_TIMEOUT,
            )
            soup_dep = BeautifulSoup(res_dep.text, 'lxml')
            deps = soup_dep.find_all('div', {'class': 'am-list-item list'})
            for dep in deps:
                # Disease and department share the second column, joined by a comma.
                writer.writerow([body_name, disease_name + ',' + dep.get('data-deptname')])
print("输出完成啦!!!!")

Java的写法:
用 Java 的时候,我没有直接去写 Excel 文件,而是先用 JavaBean 接收数据,转成 JSON 之后再用工具类写入 Excel。否则也可以像 Python 的写法那样,在 for 循环里直接写文件。
所以就多了一个 JavaBean。
用到的库:阿里的 fastjson、Apache POI(HSSFWorkbook)、jsoup

import com.alibaba.fastjson.JSON;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;

/**
 * Crawler entry point. Walks body parts, then the diseases listed for each body part,
 * then the departments listed for each disease, collects everything into a
 * {@link DataBean}, serialises it to JSON and passes that JSON to {@link JSONToExcel}.
 */
public class Main {
    /** Host that serves every page requested by this crawler. */
    private static final String BASE_URL = "http://ts3.daxiang91.com";
    /** Page whose elements carry the {@code data-bodyid} attribute. */
    private static final String BODY_PART_URL = BASE_URL + "/TestNew/Guide/GetSymptom?crowd=1&body=1";
    /** Endpoint that is POSTed a body id and answers with the disease entries. */
    private static final String DISEASE_URL = BASE_URL + "/TestNew/Guide/GetSymptomByBody";
    /**
     * Value handed to {@code getElementsByClass} for disease and department entries.
     * NOTE(review): the value contains a space; confirm it selects the intended elements.
     */
    private static final String ITEM_CLASS = "am-list-item list";
    /** jsoup timeout in milliseconds (about 11.5 days), i.e. practically unlimited. */
    private static final int TIMEOUT_MILLIS = 1000000000;
    /**
     * Cookie sent with every department page request.
     * NOTE(review): hard-coded value, presumably copied from a browser session - confirm it is still valid.
     */
    private static final String COOKIE_NAME = "ElephantSC";
    private static final String COOKIE_VALUE = "34f2ngsofaaj2jzkqabuijda";

    static DataBean databean = new DataBean();
    static ArrayList<DataBean.BodyBean> bodyBeans = new ArrayList<>();
    /** Diseases of the body part currently being processed; replaced for every body part. */
    static ArrayList<DataBean.BodyBean.Disease> diseaseList = new ArrayList<>();

    /** Runs the crawl and prints the stack trace if an I/O error stops it. */
    public static void main(String[] args) {
        try {
            parsePersonBodyTag();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads every element that has a {@code data-bodyid} attribute, crawls its diseases,
     * then prints the collected data as JSON and passes it to {@code JSONToExcel.creatExc}.
     *
     * @throws IOException if a page cannot be fetched or the export fails
     */
    private static void parsePersonBodyTag() throws IOException {
        Document doc = Jsoup.connect(BODY_PART_URL).timeout(TIMEOUT_MILLIS).get();
        for (Element element : doc.getElementsByAttribute("data-bodyid")) {
            DataBean.BodyBean bodyBean = new DataBean.BodyBean();
            bodyBean.setBodyPart(element.text());
            bodyBean.setBodyId(Integer.parseInt(element.attr("data-bodyid")));
            parseDiseases(bodyBean);
        }
        databean.setBodyBeanList(bodyBeans);
        String jsonString = JSON.toJSONString(databean);
        System.out.println(jsonString);
        JSONToExcel.creatExc(jsonString);
    }

    /**
     * Fetches the disease entries of one body part, attaches them to {@code bodyBean}
     * and appends {@code bodyBean} to {@link #bodyBeans}.
     *
     * @param bodyBean body part whose id is sent to the server
     * @throws IOException if a page cannot be fetched
     */
    private static void parseDiseases(DataBean.BodyBean bodyBean) throws IOException {
        // Start a fresh list so diseases of the previous body part are not carried over.
        diseaseList = new ArrayList<>();
        Document doc = Jsoup.connect(DISEASE_URL)
                .data("bodyId", String.valueOf(bodyBean.getBodyId()))
                .data("crowd", "1")
                .timeout(TIMEOUT_MILLIS)
                .post();
        for (Element element : doc.getElementsByClass(ITEM_CLASS)) {
            parseDepList(bodyBean, element);
        }
        bodyBean.setDiseaseList(diseaseList);
        bodyBeans.add(bodyBean);
    }

    /**
     * Follows the {@code href} of one disease entry, collects the text of every department
     * entry on that page and appends the resulting disease to {@link #diseaseList}.
     *
     * @param bodyBean body part the disease belongs to (not read by this method)
     * @param element  disease entry providing the disease name and the link to follow
     * @throws IOException if the department page cannot be fetched
     */
    private static void parseDepList(DataBean.BodyBean bodyBean, Element element) throws IOException {
        DataBean.BodyBean.Disease disease = new DataBean.BodyBean.Disease();
        disease.setDisName(element.text());
        String url = element.attr("href");
        Document doc = Jsoup.connect(BASE_URL + url)
                .cookie(COOKIE_NAME, COOKIE_VALUE)
                .timeout(TIMEOUT_MILLIS)
                .get();
        ArrayList<DataBean.BodyBean.Disease.Department> depList = new ArrayList<>();
        for (Element item : doc.getElementsByClass(ITEM_CLASS)) {
            DataBean.BodyBean.Disease.Department department = new DataBean.BodyBean.Disease.Department();
            department.setDepName(item.text());
            depList.add(department);
        }
        disease.setDepList(depList);
        diseaseList.add(disease);
    }
}
import java.util.List;

/**
 * Root container of the crawl result: a list of body parts, each holding its
 * diseases, each of which holds its departments.
 */
public  class DataBean {

    private List<BodyBean> bodyBeanList;

    public void setBodyBeanList(List<BodyBean> bodyBeanList) {
        this.bodyBeanList = bodyBeanList;
    }

    public List<BodyBean> getBodyBeanList() {
        return bodyBeanList;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("DataBean{");
        text.append("bodyBeanList=").append(bodyBeanList);
        return text.append('}').toString();
    }

    /** First level: a body part. */
    public static class BodyBean {

        // Name of the body part.
        private String bodyPart;
        private int bodyId;
        // Diseases listed under this body part.
        private List<Disease> diseaseList;

        public void setBodyPart(String bodyPart) {
            this.bodyPart = bodyPart;
        }

        public String getBodyPart() {
            return bodyPart;
        }

        public void setBodyId(int bodyId) {
            this.bodyId = bodyId;
        }

        public int getBodyId() {
            return bodyId;
        }

        public void setDiseaseList(List<Disease> diseaseList) {
            this.diseaseList = diseaseList;
        }

        public List<Disease> getDiseaseList() {
            return diseaseList;
        }

        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("BodyBean{");
            text.append("bodyPart='").append(bodyPart).append('\'');
            text.append(", bodyId=").append(bodyId);
            text.append(", diseaseList=").append(diseaseList);
            return text.append('}').toString();
        }

        /** Second level: a disease under a body part. */
        public static class Disease {

            // Name of the disease.
            private String disName;
            // Departments associated with this disease.
            private List<Department> depList;

            public void setDisName(String disName) {
                this.disName = disName;
            }

            public String getDisName() {
                return disName;
            }

            public void setDepList(List<Department> depList) {
                this.depList = depList;
            }

            public List<Department> getDepList() {
                return depList;
            }

            @Override
            public String toString() {
                StringBuilder text = new StringBuilder("Disease{");
                text.append("disName='").append(disName).append('\'');
                text.append(", depList=").append(depList);
                return text.append('}').toString();
            }

            /** Third level: a department associated with a disease. */
            public static class Department {

                // Name of the department.
                private String depName;

                public void setDepName(String depName) {
                    this.depName = depName;
                }

                public String getDepName() {
                    return depName;
                }

                @Override
                public String toString() {
                    StringBuilder text = new StringBuilder("Department{");
                    text.append("depName='").append(depName).append('\'');
                    return text.append('}').toString();
                }
            }
        }
    }
}

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.util.CellRangeAddress;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import java.util.Set;

/** Writes the crawl result, supplied as JSON, to an .xls workbook. */
public class JSONToExcel {
    /**
     * Binds {@code json} to a {@link DataBean} and writes one sheet row per
     * (body part, disease, department) combination below a header row.
     * The workbook is saved to {@code /Users/zuixia/Desktop/demo.xls}.
     *
     * @param json JSON text that is parsed into a {@code DataBean}
     * @throws IOException if the output file cannot be opened or written
     */
    public static void creatExc(String json) throws IOException {
        HSSFWorkbook wb = new HSSFWorkbook();
        HSSFSheet sheet = wb.createSheet("sheet");

        // Header row.
        HSSFRow header = sheet.createRow(0);
        header.createCell(0).setCellValue("人体部位");
        header.createCell(1).setCellValue("疾病名称");
        header.createCell(2).setCellValue("科室名称");

        DataBean dataBean = JSON.parseObject(json, DataBean.class);

        // Index of the last row written; data rows start directly below the header.
        // Body part and disease are repeated on every row; cells are not merged.
        int rowIndex = 0;
        for (DataBean.BodyBean bodyBean : dataBean.getBodyBeanList()) {
            for (DataBean.BodyBean.Disease disease : bodyBean.getDiseaseList()) {
                for (DataBean.BodyBean.Disease.Department department : disease.getDepList()) {
                    rowIndex++;
                    HSSFRow row = sheet.createRow(rowIndex);
                    row.createCell(0).setCellValue(bodyBean.getBodyPart());
                    row.createCell(1).setCellValue(disease.getDisName());
                    row.createCell(2).setCellValue(department.getDepName());
                }
            }
        }

        // try-with-resources closes the stream even when writing the workbook fails.
        try (FileOutputStream output = new FileOutputStream("/Users/zuixia/Desktop/demo.xls")) {
            wb.write(output);
            output.flush();
        }
        System.out.println("输出成功!!!");
    }
}

由以上可见,就算忽略我那个 JavaBean,Java 要做的工作也比 Python 多很多,代码量也大了很多。
Python 里用到的库都是用 pip3 安装的(我用的是 Python 3.7 和 pip3),敲几行命令就可以了;而 Java 要自己去下载 jar 包,或者配置 Maven。

相关文章

  • java爬虫与python爬虫谁更强?

    java爬虫与python爬虫的对比: python做爬虫语法更简单,代码更简洁。java的语法比python严格...

  • 一个简单的爬虫,对比Python和Java

    领导知道我在学python,周五中午安排我去爬网上的数据,然后给后台的同事使用。由于当时有点事,就暂时搁置,周末在...

  • 各语言简单爬虫

    各语言简单爬虫 Python 简单爬虫 golang简单爬虫

  • 【工具】echarts+kuno+分词

    数据: python爬虫:微博爬虫、借助'出书啦'爬微信知乎Java爬虫:Java微博爬虫 时间轴: JAVA时间...

  • Java爬虫:用java爬取小说

    Java也能做爬虫。 现在提到爬虫人第一个想到的就是python,其实使用Java编写爬虫也是很好的选择, 下面给...

  • Python爬虫入门(01) -- 10行代码实现一个爬虫

    跟我学习Python爬虫系列开始啦。带你简单快速高效学习Python爬虫。 一、快速体验一个简单爬虫 以抓取简书首...

  • 一个简单的python爬虫程序

    python|网络爬虫 概述 这是一个简单的python爬虫程序,仅用作技术学习与交流,主要是通过一个简单的实际案...

  • 一篇文章学习 Python 网络爬虫

    一、爬虫开发基础 爬虫基础分为 Python 基础,网页常识和网页分析三部分。 学习爬虫需要有简单的 Python...

  • 爬个小逗图

    python小爬虫 python这门胶水语言, 已经是趋势了,使用范围太广,用它做爬虫比Java和OC方便太多, ...

  • Python网络爬虫

    Python开发简单爬虫(Python2.X版本,Eclipse工具) 一、爬虫介绍 爬虫调度端:启动、停止爬虫,...

网友评论

      本文标题:一个简单的爬虫,对比Python和Java

      本文链接:https://www.haomeiwen.com/subject/psdvwqtx.html