swift-perfect-crawler

Author: 遥遥领先M | Published 2017-04-14 14:21

Now that we have Perfect up and running, let's try crawling some aggregated index data.

A tiny crawler

Create a new file named Index.swift with the following code:

import Foundation

class Index {

    // The root URL passed in to crawl
    private var url:String
    // The crawl result (a JSON string)
    internal var results = ""


    // Initializer: takes the URL to crawl
    init(url:String){
        self.url = url
    }


    // Start crawling
    internal func start(){
        do{
            try handleData(urlString: url)
        }catch{
            debugPrint(error)
        }
    }



    private func handleData(urlString:String) throws {

        if let url = URL(string:urlString) {

            debugPrint("开始获取信息")

            DispatchQueue.global().sync{
                    do{
                        //通过创建Scanner
                        let scanner = Scanner(string: try String(contentsOf:url))

                        var (head,foot) = ("<div class=\"rank-wrapper row\">","<div class=\"footer text-center\">")
                        let content = self.scanWith(head:head,foot:foot,scanner:scanner)

                        var contents  = content.components(separatedBy:"<div class=\"col-xs-12 col-sm-6 col-md-4\">")
                        contents.removeFirst()

                        var data = ""
                        for item in contents {
                             (head,foot) = ("<div class=\"rank-item text-center\">","</a>")
                            let itemString = self.scanWith(head: head, foot: foot, string: item)

                            let href = self.scanWith(head: "<a href=", foot: " class=\"view-wrapper data-view\">", string: itemString).replacingOccurrences(of: "\"", with: "")
                            let crown = self.scanWith(head: "src=", foot: "/>", string: itemString).replacingOccurrences(of: "\"", with: "")
                            let rank_word = self.scanWith(head: "<p class=\"rank-word\">", foot: "</p>", string: itemString)
                            let rank_tag = self.scanWith(head: "<span class=\"rank-tag\">", foot: "</span>", string: itemString)

                            data += "{\"crown\":\"\(urlString)\(crown)\",\"rank_word\":\"\(rank_word)\",\"rank_tag\":\"\(rank_tag)\",\"href\":\"\(href)\"},"

                        }

                        results += "\"code\":\(200),\"data\":[\(data)]"

                    }catch{
                        debugPrint(error)
                    }
            }
            
            debugPrint("获取信息结束")

            results = results.replace(of: ",", with: "")
            results = results.characters.count > 0 ? "{\(results)}" : ""

        }else{
            throw crawlerError(msg:"Failed to initialize the query URL")
        }
    }


    // Extract the HTML between `head` and `foot` with a Scanner (the scanned chunk includes `head`, which is stripped afterwards)
    func scanWith(head:String,foot:String,scanner:Scanner)->String
    {
        var str:NSString?

        scanner.scanUpTo(head, into: nil)
        scanner.scanUpTo(foot, into: &str)

        return str == nil ? "" : str!.replacingOccurrences(of: head, with: "")
    }

    // Extract the substring between `head` and `foot` (searching for `foot` from the end of the string)
    func scanWith(head:String,foot:String,string:String)->String
    {
        guard let startIndex = string.range(of: head)?.upperBound,
              let endIndex = string.range(of: foot, options: .backwards)?.lowerBound,
              startIndex <= endIndex else {
            return ""
        }

        return string.substring(with: startIndex..<endIndex)
    }

}


// A custom error type for the crawler
public struct crawlerError:Error
{
    var message:String

    init(msg:String)
    {
        message = msg
    }
}


extension String
{
    // Trim the given characters (e.g. whitespace) from both ends of the string
    func trim(string:String) -> String
    {
        return self == "" ? "" : self.trimmingCharacters(in: CharacterSet(charactersIn: string))
    }
    // Replace the last occurrence of `pre`, looking only at the final two characters
    // (used above to strip the trailing comma before the closing bracket)
    func replace(of pre:String,with next:String)->String
    {
        guard characters.count >= 2 else { return self }
        return replacingOccurrences(of: pre, with: next, options: .backwards, range: index(endIndex, offsetBy: -2)..<endIndex)
    }

}
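To make the marker-based slicing concrete, here is a small illustrative example of `scanWith(head:foot:string:)` on a hand-written fragment (the fragment only mimics the shape of one rank item; it is not copied from the real page):

let fragment = "<div class=\"rank-item text-center\">" +
    "<p class=\"rank-word\">微信</p>" +
    "<span class=\"rank-tag\">生活</span></a>"

let index = Index(url: "http://zhishuweixin.com")
let word = index.scanWith(head: "<p class=\"rank-word\">", foot: "</p>", string: fragment)
let tag  = index.scanWith(head: "<span class=\"rank-tag\">", foot: "</span>", string: fragment)

print(word) // 微信
print(tag)  // 生活

Because `foot` is searched from the end of the string, each string passed in should contain only one record, which is why handleData first splits the page on the per-item <div class="col-xs-12 col-sm-6 col-md-4"> marker before extracting fields.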

Adding a route

Add a route in main.swift:

func dataHandler(request:HTTPRequest,_ response:HTTPResponse)
{
    // Create a crawler and run one crawl per request
    let crawler = Index(url:"http://zhishuweixin.com")
    // Start crawling
    crawler.start()
    // If any data was crawled, append it to the response
    response.setHeader(.contentType, value: "text/html;charset=UTF-8")
    response.appendBody(string: crawler.results.characters.count > 0 ? crawler.results : "")
    response.completed()
}

// Register the route
routes.add(method: .get, uri: "/data", handler:dataHandler)
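For reference, a minimal main.swift sketch showing where this route registration fits, assuming the Perfect 2.x setup from the previous post (port 8181 matches the URL used below; the error-handling pattern follows Perfect's standard template):

import PerfectLib
import PerfectHTTP
import PerfectHTTPServer

// Route table containing the crawler endpoint defined above
var routes = Routes()
routes.add(method: .get, uri: "/data", handler: dataHandler)

// Configure and start the HTTP server on port 8181
let server = HTTPServer()
server.serverPort = 8181
server.addRoutes(routes)

do {
    try server.start()
} catch PerfectError.networkError(let err, let msg) {
    print("Network error thrown: \(err) \(msg)")
}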

Then run the server and open http://127.0.0.1:8181/data in a browser.
It returns the data we want, and our first tiny crawler is done.
JSON data

{
  "code": 200,
  "data": [
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "白百何",
      "rank_tag": "人物",
      "href": "/tag/ren-wu"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "香奈儿",
      "rank_tag": "化妆品",
      "href": "/tag/hua-zhuang-pin"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "香奈儿",
      "rank_tag": "奢侈品",
      "href": "/tag/she-chi-pin"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "人民的名义",
      "rank_tag": "娱乐",
      "href": "/tag/yu-le"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "朗读者",
      "rank_tag": "小说",
      "href": "/tag/xiao-shuo"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "人民的名义",
      "rank_tag": "热点",
      "href": "/tag/re-dian"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "微信",
      "rank_tag": "生活",
      "href": "/tag/sheng-huo"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "速度与激情8",
      "rank_tag": "电影",
      "href": "/tag/dian-ying"
    },
    {
      "crown": "http://zhishuweixin.com/assets/royal-crown.png",
      "rank_word": "中国银行",
      "rank_tag": "股票",
      "href": "/tag/gu-piao"
    }
  ]
}
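As a purely illustrative client-side sketch (not part of the article's server code), this is one way such a payload could be parsed with JSONSerialization; the sample string below is abridged from the output above:

import Foundation

// Abridged sample of the payload returned by /data
let body = "{\"code\":200,\"data\":[{\"crown\":\"http://zhishuweixin.com/assets/royal-crown.png\"," +
    "\"rank_word\":\"微信\",\"rank_tag\":\"生活\",\"href\":\"/tag/sheng-huo\"}]}"

if let data = body.data(using: .utf8),
   let object = try? JSONSerialization.jsonObject(with: data, options: []),
   let json = object as? [String: Any],
   let items = json["data"] as? [[String: Any]] {
    for item in items {
        // Print each keyword and its category tag
        print(item["rank_word"] ?? "", item["rank_tag"] ?? "")
    }
}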
