Java Crawler (HttpClient)

Author: 吹奏一池春水 | Published 2019-03-17 23:04

    A web crawler's core job is to fetch page content and then filter and analyze it according to specific needs.
    For the fetching side, assume the requirement is to crawl an entire website, saving the fetched files to local disk grouped by type, with configurable limits such as the maximum crawl depth, the maximum number of links, and the range of content types to fetch.
    This article shows a simple Spring Boot implementation that uses a Kafka topic as the crawl queue.

    Task creation endpoints

    Two endpoints are provided: one creates a crawl task from a website crawl configuration, the other looks up a task's status by task id. There is no endpoint for detailed results; those can be inspected directly in the database.
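
    As a quick usage sketch, the task/add endpoint defined below could be exercised with Java 11's built-in HTTP client. The port, field values, and class name here are illustrative assumptions, not part of the project:

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    public class AddTaskExample {

        public static void main(String[] args) throws Exception {
            // Form fields mirror the WebsiteTask properties bound by the controller.
            String form = "url=https://example.com&maxLevel=3&maxCount=1000&outerLevel=1&range=HTML,IMAGE";
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create("http://localhost:8080/task/add"))
                    .header("Content-Type", "application/x-www-form-urlencoded")
                    .POST(HttpRequest.BodyPublishers.ofString(form))
                    .build();
            HttpResponse<String> response = HttpClient.newHttpClient()
                    .send(request, HttpResponse.BodyHandlers.ofString());
            // Prints a JSON map with the new task id and a success message.
            System.out.println(response.body());
        }
    }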

    @RestController
    public class CrawlerTaskController {
    
        @Autowired
        private WebsiteTaskService websiteTaskService;
    
        @Autowired
        private WebsiteTaskDao websiteTaskDao;
    
        @Autowired
        private TaskProducer taskProducer;
    
        @PostMapping("task/add")
        @ResponseBody
        public Map<String, Object> addWebsiteTask(WebsiteTask item) {
            item.setTaskCount(1);
            websiteTaskService.put(item);
            UrlTask task = new UrlTask();
            task.setUrl(item.getUrl());
            task.setParentId(-1);
            task.setRootId(item.getId());// id is assigned when the website task is saved above
            task.setLevel(0);
            taskProducer.sendUrlTask(task);
            Map<String, Object> map = new HashMap<>();
            map.put("id", item.getId());
            map.put("message", "爬虫任务添加成功!");
            return map;
        }
    
        @PostMapping("task/get")
        @ResponseBody
        public WebsiteTask getWebsiteTask(int id) {
            return websiteTaskDao.findById(id).orElse(null);// avoid NoSuchElementException for unknown ids
        }
    }
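
    The TaskProducer injected above is not shown in the article. Since the crawl queue is a Kafka topic, a minimal sketch could serialize each UrlTask to JSON and publish it, with a matching listener that deserializes messages and runs them as CrawlerTask instances (shown later). The topic name, group id, JSON serialization, and pool size below are all assumptions, not the author's actual code:

    import java.io.IOException;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.kafka.annotation.KafkaListener;
    import org.springframework.kafka.core.KafkaTemplate;
    import org.springframework.stereotype.Component;

    import com.fasterxml.jackson.databind.ObjectMapper;

    @Component
    class TaskProducer {

        private static final String TOPIC = "url-task-topic";// assumed topic name

        @Autowired
        private KafkaTemplate<String, String> kafkaTemplate;
        private final ObjectMapper mapper = new ObjectMapper();

        /** Serialize the task to JSON and publish it to the crawl queue. */
        public void sendUrlTask(UrlTask task) {
            try {
                kafkaTemplate.send(TOPIC, mapper.writeValueAsString(task));
            } catch (IOException e) {
                throw new IllegalStateException("failed to serialize UrlTask", e);
            }
        }
    }

    @Component
    class TaskConsumer {

        @Autowired
        private WebsiteTaskService websiteTaskService;
        @Autowired
        private UrlTaskDao urlTaskDao;
        @Autowired
        private TaskProducer taskProducer;
        private final ObjectMapper mapper = new ObjectMapper();
        private final ExecutorService executor = Executors.newFixedThreadPool(8);// assumed pool size

        /** Deserialize each queued task and crawl it on a worker thread. */
        @KafkaListener(topics = "url-task-topic", groupId = "crawler")
        public void onUrlTask(String message) throws IOException {
            UrlTask task = mapper.readValue(message, UrlTask.class);
            executor.submit(new CrawlerTask(task, websiteTaskService, urlTaskDao, taskProducer));
        }
    }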
    

    Website task entity

    @Entity
    @EntityListeners(AuditingEntityListener.class)
    public class WebsiteTask {
    
        @Id
        @GeneratedValue(strategy = GenerationType.IDENTITY)
        private int id;
        @Column(length = 1024)
        private String url;// website url, usually the home page link
        private int maxLevel;// maximum crawl depth
        private int maxCount;// maximum number of links to crawl
        private int outerLevel;// maximum depth at which outbound links are still followed
        private String range;// crawlable content-type range
        private int taskCount;// number of url tasks created
        private int finishCount;// number of url tasks finished
        private int state = 1;// state: 1 = running; 2 = finished
        @CreatedDate
        private Date createTime;// creation time
        private Date finishTime;// finish time
    
        @Transient
        private List<String> ranges;
    
        ......
    }
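
    The getRanges() accessor is called later in CrawlerTask but is elided above. Given the persistent range string and the transient ranges list, a plausible sketch, purely an assumption with range holding comma-separated PageFormat names such as "HTML,IMAGE,PDF", would be:

    // Hypothetical accessors inside WebsiteTask (requires java.util.Arrays):
    public List<String> getRanges() {
        if (ranges == null && range != null) {
            ranges = Arrays.asList(range.split(","));// parse lazily from the stored column
        }
        return ranges;
    }

    public void setRange(String range) {
        this.range = range;
        this.ranges = null;// force re-parsing on the next getRanges() call
    }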
    

    URL task entity

    @Entity
    @EntityListeners(AuditingEntityListener.class)
    public class UrlTask {
    
        @Id
        @GeneratedValue(strategy = GenerationType.IDENTITY)
        private int id;
        private int parentId;// id of the parent page task
        private int rootId;// id of the website task
        @Column(length = 1024)
        private String url;
        private String contentType;// content type
        private long contentLength;// content length
        private int level;// current crawl depth
        private long useMillis;// crawl duration in milliseconds
        private int respCode;// response status code
        private String remark;// remark / error message
        private String filePath;// path of the file saved to disk
        @CreatedDate
        private Date createTime;// creation time
    
        ......
    }
    

    Crawled page model

    public class WebPageModel {
    
        public int respCode = 200;// response status code
        public String message;// error message
        public Document document;// Jsoup Document of the html page
        public String encoding;// page encoding
        public String contentType;// content type
        public long contentLength;// content length
        public String filePath;// file path
        public String fileExt;// file extension
    
        public PageFormat format = PageFormat.OTHER;
    
        public enum PageFormat {
            HTML, IMAGE, AUDIO, VIDEO, TXT, WORD, EXCEL, PPT, PDF, COMPRESS, APK, IPA, OTHER
        }
    
        public void updateFormat() {
            String type = contentType;
            if (ContentTypeUtil.OCTET_STREAM_TYPE.equalsIgnoreCase(contentType)) {
                type = ContentTypeUtil.getContentType(fileExt);
            }
            if (ContentTypeUtil.isHtml(type)) {
                format = PageFormat.HTML;
            } else if (ContentTypeUtil.isImage(type)) {
                format = PageFormat.IMAGE;
            } else if (ContentTypeUtil.isAudio(type)) {
                format = PageFormat.AUDIO;
            } else if (ContentTypeUtil.isVideo(type)) {
                format = PageFormat.VIDEO;
            } else if (ContentTypeUtil.isTxt(type)) {
                format = PageFormat.TXT;
            } else if (ContentTypeUtil.isWord(type)) {
                format = PageFormat.WORD;
            } else if (ContentTypeUtil.isExcel(type)) {
                format = PageFormat.EXCEL;
            } else if (ContentTypeUtil.isPpt(type)) {
                format = PageFormat.PPT;
            } else if (ContentTypeUtil.isPdf(type)) {
                format = PageFormat.PDF;
            } else if (ContentTypeUtil.isCompress(type)) {
                format = PageFormat.COMPRESS;
            } else if (ContentTypeUtil.isApk(type)) {
                format = PageFormat.APK;
            } else if (ContentTypeUtil.isIpa(type)) {
                format = PageFormat.IPA;
            }
        }
    }
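
    ContentTypeUtil is another helper the article does not list. Judging from its call sites, it maps MIME types and file extensions to the PageFormat categories above; a hypothetical minimal version of two of its checks, based on standard MIME types rather than the author's code, might look like this:

    public class ContentTypeUtil {

        // Generic binary type: the real format must be inferred from the file extension.
        public static final String OCTET_STREAM_TYPE = "application/octet-stream";

        /** True if the MIME type denotes an html page. */
        public static boolean isHtml(String type) {
            return "text/html".equalsIgnoreCase(type) || "application/xhtml+xml".equalsIgnoreCase(type);
        }

        /** True if the MIME type denotes an image. */
        public static boolean isImage(String type) {
            return type != null && type.toLowerCase().startsWith("image/");
        }
    }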
    

    URL deduplication

    public class UrlDuplicateFilter {
    
        private final Object lock = new Object();
        private final Set<String> set = new HashSet<>();
        private final int maxCount; // maximum number of distinct urls to keep
    
        public UrlDuplicateFilter(int maxCount) {
            this.maxCount = maxCount;
        }
    
        /**
         * Records the url and returns true if it is new; returns false for blank or
         * already-seen urls, or once the limit is reached.
         */
        public boolean filter(String url) {
            if (StringUtils.isBlank(url)) {
                return false;
            }
            synchronized (lock) {
                if (reachMaxCount() || set.contains(url)) {
                    return false;
                }
                set.add(url);
            }
            return true;
        }
    
        /**
         * True once the number of recorded urls has reached the limit.
         */
        private boolean reachMaxCount() {
            return set.size() >= maxCount;
        }
    }
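
    Note that the filter above keeps every accepted url in an in-memory HashSet, so memory grows linearly up to maxCount. For crawls much larger than this design targets, a Bloom filter bounds memory at the cost of a small false-positive rate (a few never-seen urls may be wrongly skipped). A sketch using Guava, offered as an alternative rather than part of the original project:

    import java.nio.charset.StandardCharsets;

    import com.google.common.hash.BloomFilter;
    import com.google.common.hash.Funnels;

    public class BloomUrlFilter {

        private final BloomFilter<String> seen;

        public BloomUrlFilter(int expectedUrls) {
            // ~1% false positives: a small fraction of unseen urls will be wrongly skipped.
            this.seen = BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), expectedUrls, 0.01);
        }

        /** Returns true if the url was (probably) not seen before, and records it. */
        public synchronized boolean filter(String url) {
            if (seen.mightContain(url)) {
                return false;
            }
            seen.put(url);
            return true;
        }
    }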
    

    Crawling a single URL

    public class CrawlerTask implements Runnable {
    
        private static final Logger LOG = LoggerFactory.getLogger(CrawlerTask.class);
        private UrlTask task;
        private WebsiteTaskService websiteTaskService;
        private UrlTaskDao urlTaskDao;
        private TaskProducer taskProducer;
    
        public CrawlerTask(UrlTask task, WebsiteTaskService websiteTaskService, UrlTaskDao urlTaskDao,
                           TaskProducer taskProducer) {
            this.task = task;
            this.websiteTaskService = websiteTaskService;
            this.urlTaskDao = urlTaskDao;
            this.taskProducer = taskProducer;
        }
    
        @Override
        public void run() {
            long millis = System.currentTimeMillis();
            WebsiteTask website = websiteTaskService.getWebsiteTask(task.getRootId());
            // fetch the url content
            WebPageModel page = PageDownloadUtil.executeGet(task.getUrl(), website.getRanges());
            task.setContentLength(page.contentLength);
            task.setContentType(page.contentType);
            task.setRespCode(page.respCode);
            task.setRemark(page.message);
            task.setFilePath(page.filePath);
            task.setUseMillis(System.currentTimeMillis() - millis);
            urlTaskDao.saveAndFlush(task);
    
        if (page.document != null && task.getLevel() < website.getMaxLevel()) {// only extract links from successfully parsed html pages
                Set<String> childUrls = new UrlExtract(page.document, task.getUrl()).extractFromA().extractFromFrame()
                        .extractFromIframe().extractFromImg().getUrls();
                if (!childUrls.isEmpty()) {
                    UrlDuplicateFilter dupFilter = websiteTaskService.getUrlDuplicateFilter(task.getRootId());
                    int addCount = 0;
                    for (String childUrl : childUrls) {
                        if (CrawlerUtil.isOuterUrl(task.getUrl(), childUrl) && task.getLevel() >= website.getOuterLevel()) {
                            continue;
                        }
                        // deduplicate the extracted child links
                        if (dupFilter.filter(childUrl)) {
                            UrlTask childTask = new UrlTask();
                            childTask.setUrl(childUrl);
                            childTask.setParentId(task.getId());
                            childTask.setRootId(task.getRootId());
                            childTask.setLevel(task.getLevel() + 1);
                            taskProducer.sendUrlTask(childTask);
                            addCount++;
                        }
                    }
                    // update the created-task count
                    websiteTaskService.addTaskCount(task.getRootId(), addCount);
                }
            }
            // update the finished-task count
            websiteTaskService.addFinishCount(task.getRootId());
            LOG.info(String.format("crawl time=%sms, url=%s", System.currentTimeMillis() - millis, task.getUrl()));
        }
    }
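
    UrlExtract is also omitted from the listing. Modeled on its fluent call chain and the Jsoup Document it receives, a minimal sketch could resolve links from a, frame, iframe, and img elements via Jsoup's abs: attribute resolution. This class is an assumption based on those call sites, not the author's code:

    import java.util.HashSet;
    import java.util.Set;

    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class UrlExtract {

        private final Document document;
        private final Set<String> urls = new HashSet<>();

        public UrlExtract(Document document, String baseUrl) {
            this.document = document;
            if (document != null) {
                document.setBaseUri(baseUrl);// make abs: resolution work against the page url
            }
        }

        public UrlExtract extractFromA() { return extract("a[href]", "abs:href"); }

        public UrlExtract extractFromFrame() { return extract("frame[src]", "abs:src"); }

        public UrlExtract extractFromIframe() { return extract("iframe[src]", "abs:src"); }

        public UrlExtract extractFromImg() { return extract("img[src]", "abs:src"); }

        private UrlExtract extract(String selector, String attr) {
            if (document != null) {
                for (Element e : document.select(selector)) {
                    String url = e.attr(attr);// absolute url, empty if unresolvable
                    if (url.startsWith("http")) {// drops javascript:, mailto:, etc.
                        urls.add(url);
                    }
                }
            }
            return this;
        }

        public Set<String> getUrls() {
            return urls;
        }
    }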
    

    Page download utility

    public class PageDownloadUtil {
    
        private static final Logger LOG = LoggerFactory.getLogger(PageDownloadUtil.class);
        private static final int MAX_HTML_LENGTH = 20 * 1024 * 1024;// html pages are capped at 20M
        private static final int MAX_FILE_LENGTH = 500 * 1024 * 1024;// other attachment types are capped at 500M
        private static final String FOLDER_NAME = "d:/temp/" + UUID.randomUUID().toString().replace("-", "") + "/";
        private static final AtomicInteger INDEX = new AtomicInteger();
        private static final CloseableHttpClient client = HttpClientUtil.createHttpClient();
        // custom error codes for failures without an http status
        private static final Map<Integer, String> CODE_MAP = new HashMap<>();
        static {
            CODE_MAP.put(-501, "URI parse error");
            CODE_MAP.put(-502, "HTTP protocol error");
            CODE_MAP.put(-503, "unknown host");
            CODE_MAP.put(-504, "HTTP connection error");
            CODE_MAP.put(-505, "network I/O error");
            CODE_MAP.put(-506, "page parse error");
            CODE_MAP.put(-507, "unsupported encoding");
            CODE_MAP.put(-508, "content length over limit");
            CODE_MAP.put(-509, "content type outside the crawl range");
        }
    
        public static WebPageModel executeGet(String url, List<String> ranges) {
            WebPageModel page = new WebPageModel();
            int redirectTimes = 0;
            boolean redirect;
            URI uri = CrawlerUtil.urlConvertToUri(url);
            if (uri == null) {
                page.respCode = -501;
                page.message = CODE_MAP.get(page.respCode);
                return page;
            }
            do {
                redirectTimes++;
                redirect = false;
                HttpGet method = new HttpGet(uri);
                HttpClientUtil.setHeader(method, url);
                CloseableHttpResponse response = null;
                long millis = System.currentTimeMillis();
                try {
                    response = client.execute(method);
                    page.respCode = response.getStatusLine().getStatusCode();
                    if (page.respCode == HttpStatus.SC_OK) {
                        download(page, url, response, ranges);
                    } else if (page.respCode >= 300 && page.respCode < 400) {// redirect
                        Header[] locationHeader = response.getHeaders("location");
                        if (locationHeader != null && locationHeader.length > 0) {
                            String redirectUrl = locationHeader[0].getValue();
                            if (StringUtils.isNotBlank(redirectUrl) && !url.equals(redirectUrl)) {
                                uri = CrawlerUtil.urlConvertToUri(redirectUrl);
                                redirect = true;
                            }
                        }
                    }
                } catch (ClientProtocolException e) {
                    LOG.error("", e);
                    page.respCode = -502;
                    page.message = CODE_MAP.get(page.respCode);
                } catch (UnknownHostException e) {
                    LOG.error("", e);
                    page.respCode = -503;
                    page.message = CODE_MAP.get(page.respCode);
                } catch (HttpHostConnectException e) {
                    LOG.error("", e);
                    page.respCode = -504;
                    page.message = CODE_MAP.get(page.respCode);
                } catch (IOException e) {// on a timeout, retry the connection up to 3 times
                    redirectTimes++;
                    redirect = true;
                    LOG.error(String.format("connection attempt %s failed, elapsed=%sms", redirectTimes / 2,
                            System.currentTimeMillis() - millis), e);
                    page.respCode = -505;
                    page.message = CODE_MAP.get(page.respCode);
                } finally {
                    if (response != null) {
                        EntityUtils.consumeQuietly(response.getEntity());
                        try {
                            response.close();
                        } catch (IOException e) {
                            LOG.error("responseclose", e);
                        }
                    }
                    method.releaseConnection();
                }
            } while (redirect && redirectTimes <= 5);
            return page;
        }
    
        private static void download(WebPageModel page, String url, CloseableHttpResponse response,
                                     List<String> ranges) {
            HttpEntity entity = response.getEntity();
        page.contentLength = entity.getContentLength();// unreliable: often returns -1; re-assigned below
    
        // ContentType.getOrDefault(entity).getMimeType() may fail on unsupported charset values, so the mimeType is extracted manually
            Header header = entity.getContentType();
            if (header != null) {
                HeaderElement[] headerElements = header.getElements();
                if (headerElements != null && headerElements.length > 0) {
                    page.contentType = headerElements[0].getName();
                }
            }
            if (ContentTypeUtil.OCTET_STREAM_TYPE.equalsIgnoreCase(page.contentType)) {
                page.fileExt = HttpClientUtil.getOctetStreamFileExt(url, response);
            } else if (page.contentType == null) {
                // no contentType in the header: infer it from the url suffix
                if (url.lastIndexOf("/") > 8) {// index > 8 skips the slashes in the scheme, e.g. "https://"
                    String name = url.substring(url.lastIndexOf("/"));
                    if (name.contains(".")) {
                        page.contentType = ContentTypeUtil.getContentType(name.substring(name.lastIndexOf(".")));
                    }
                }
            }
            page.updateFormat();
    
            if (ranges.contains(page.format.toString())) {
                if (page.format == WebPageModel.PageFormat.HTML) {
                    if (page.contentLength == 0 || page.contentLength > MAX_HTML_LENGTH) {
                        page.respCode = -508;
                        page.message = CODE_MAP.get(page.respCode);
                        return;
                    }
                    try {
                        String html = null;
                        Document document = null;
                        String charset = null;
                        if (header != null) {
                            charset = CrawlerUtil.judgeCharset(header.toString());
                        }
                        if (charset != null) {
                            html = EntityUtils.toString(entity, charset);
                            document = Jsoup.parse(html);
                        } else {
                            byte[] data = EntityUtils.toByteArray(entity);
                            html = new String(data, CrawlerUtil.UTF_8);
                            document = Jsoup.parse(html);
                            charset = CrawlerUtil.getCharsetFromMeta(document);
                            if (charset != null && !CrawlerUtil.UTF_8.equals(charset)) {
                                html = new String(data, charset);
                                document = Jsoup.parse(html);
                            }
                        }
                        byte[] data = html.getBytes(CrawlerUtil.UTF_8);
                        page.contentLength = data.length;
                        if (page.contentLength <= 0 || page.contentLength > MAX_HTML_LENGTH) {
                            page.respCode = -508;
                            page.message = CODE_MAP.get(page.respCode);
                            return;
                        }
    
                        page.encoding = CrawlerUtil.UTF_8;
                        page.document = document;
    
                        createFilePath(page);
                        HttpClientUtil.exportDataAsFile(data, page.filePath);
                    } catch (ParseException e) {
                        LOG.error("", e);
                        page.respCode = -506;
                        page.message = CODE_MAP.get(page.respCode);
                    } catch (UnsupportedEncodingException e) {
                        LOG.error("", e);
                        page.respCode = -507;
                        page.message = CODE_MAP.get(page.respCode);
                    } catch (IOException e) {
                        LOG.error("", e);
                        page.respCode = -505;
                        page.message = CODE_MAP.get(page.respCode);
                    }
                } else {// non-html content is downloaded to disk directly
                    if (page.contentLength == 0 || page.contentLength > MAX_FILE_LENGTH) {
                        page.respCode = -508;
                        page.message = CODE_MAP.get(page.respCode);
                        return;
                    }
                    if (page.fileExt == null) {
                        page.fileExt = ContentTypeUtil.getExtendFileName(page.contentType);
                    }
                    createFilePath(page);
                    HttpClientUtil.exportEntityAsFile(entity, page.filePath);
                }
            } else {
                page.respCode = -509;
                page.message = CODE_MAP.get(page.respCode);
            }
        }
    
        private static void createFilePath(WebPageModel page) {
            String filePath = FOLDER_NAME + page.format.toString() + "/";
            File file = new File(filePath);
            file.mkdirs();
            String ext = page.fileExt == null ? "" : page.fileExt;// guard: fileExt may be null (e.g. html pages), avoid appending "null"
            page.filePath = filePath + INDEX.getAndIncrement() + ext;
        }
    }
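
    Because executeGet follows 3xx redirects by hand, the shared client must have automatic redirect handling disabled. HttpClientUtil is not listed either; a minimal sketch of its createHttpClient method only (the timeout values and pool sizes are assumptions) might be:

    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;

    public class HttpClientUtil {

        public static CloseableHttpClient createHttpClient() {
            RequestConfig config = RequestConfig.custom()
                    .setConnectTimeout(10_000)// connect timeout, ms (assumed)
                    .setSocketTimeout(30_000)// read timeout, ms (assumed)
                    .setConnectionRequestTimeout(10_000)
                    .build();
            return HttpClients.custom()
                    .disableRedirectHandling()// executeGet handles 3xx manually
                    .setMaxConnTotal(200)// shared across all crawler threads (assumed)
                    .setMaxConnPerRoute(20)
                    .setDefaultRequestConfig(config)
                    .build();
        }
    }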
    

    Project repository

    The full project code can be viewed and downloaded from my GitHub: https://github.com/DexterQY/website-crawler
