我们已经让Perfect跑起来了,那我们来尝试下爬取聚合指数数据
一个小小的爬虫
我们新建一个Index.swift
文件,代码如下
import Foundation
/// Crawls an aggregate-index page and exposes the scraped result as a JSON string.
class Index {
    // Base URL of the page to crawl (also used as the prefix for image paths).
    private var url: String
    // Scraped output; JSON text on success, empty string otherwise.
    internal var results = ""

    /// Creates a crawler for the given base URL.
    init(url: String) {
        self.url = url
    }

    /// Starts one crawl; errors are logged rather than propagated.
    internal func start() {
        do {
            try handleData(urlString: url)
        } catch {
            debugPrint(error)
        }
    }

    /// Downloads the page, extracts each ranking item and builds `results`.
    /// - Throws: `crawlerError` when the URL string cannot be parsed.
    private func handleData(urlString: String) throws {
        guard let url = URL(string: urlString) else {
            throw crawlerError(msg: "查询URL初始化失败")
        }
        debugPrint("开始获取信息")
        // NOTE: .sync blocks the calling thread, so this effectively runs inline;
        // kept for parity with the original call site.
        DispatchQueue.global().sync {
            do {
                // Scan the raw HTML between the ranking list and the footer.
                let scanner = Scanner(string: try String(contentsOf: url))
                var (head, foot) = ("<div class=\"rank-wrapper row\">", "<div class=\"footer text-center\">")
                let content = self.scanWith(head: head, foot: foot, scanner: scanner)
                // Each ranking card lives in its own column div; drop the prefix before the first card.
                var contents = content.components(separatedBy: "<div class=\"col-xs-12 col-sm-6 col-md-4\">")
                if !contents.isEmpty {
                    contents.removeFirst()
                }
                var data = ""
                for item in contents {
                    (head, foot) = ("<div class=\"rank-item text-center\">", "</a>")
                    let itemString = self.scanWith(head: head, foot: foot, string: item)
                    let href = self.scanWith(head: "<a href=", foot: " class=\"view-wrapper data-view\">", string: itemString).replacingOccurrences(of: "\"", with: "")
                    let crown = self.scanWith(head: "src=", foot: "/>", string: itemString).replacingOccurrences(of: "\"", with: "")
                    let rank_word = self.scanWith(head: "<p class=\"rank-word\">", foot: "</p>", string: itemString)
                    let rank_tag = self.scanWith(head: "<span class=\"rank-tag\">", foot: "</span>", string: itemString)
                    data += "{\"crown\":\"\(urlString)\(crown)\",\"rank_word\":\"\(rank_word)\",\"rank_tag\":\"\(rank_tag)\",\"href\":\"\(href)\"},"
                }
                results += "\"code\":\(200),\"data\":[\(data)]"
            } catch {
                debugPrint(error)
            }
        }
        debugPrint("获取信息结束")
        // Strip the trailing comma left by the item loop and wrap in braces.
        // Guarded: when the download failed `results` is empty and the original
        // `replace(of:with:)` trapped on index(endIndex, offsetBy: -2).
        if !results.isEmpty {
            results = results.replace(of: ",", with: "")
            results = "{\(results)}"
        }
    }

    /// Scanner-based extraction: returns the text between `head` and `foot`.
    func scanWith(head: String, foot: String, scanner: Scanner) -> String {
        var str: NSString?
        // Position the scanner at `head`, then capture everything up to `foot`.
        scanner.scanUpTo(head, into: nil)
        scanner.scanUpTo(foot, into: &str)
        // The captured text still starts with `head` itself; strip it.
        return str == nil ? "" : str!.replacingOccurrences(of: head, with: "")
    }

    /// Range-based extraction: returns the substring after the first `head`
    /// and before the last `foot`; empty string when either marker is missing.
    func scanWith(head: String, foot: String, string: String) -> String {
        // The original force-unwrapped both ranges (crashing on missing markers)
        // and passed a meaningless Locale(identifier: "<") to range(of:).
        guard let headRange = string.range(of: head),
              let footRange = string.range(of: foot, options: .backwards) else {
            return ""
        }
        let startIndex = headRange.upperBound
        let endIndex = footRange.lowerBound
        // Markers present but in the wrong order: nothing to extract.
        guard startIndex <= endIndex else { return "" }
        return String(string[startIndex..<endIndex])
    }
}
// Custom error type for crawler failures.
// Conforms to LocalizedError so `error.localizedDescription` shows `message`
// instead of an opaque default description.
public struct crawlerError: Error, LocalizedError {
    // Human-readable description of what went wrong.
    var message: String

    init(msg: String) {
        message = msg
    }

    /// Surfaces `message` through the standard LocalizedError API.
    public var errorDescription: String? {
        return message
    }
}
extension String {
    // Strips the given characters (e.g. whitespace) from both ends of the string.
    func trim(string: String) -> String {
        return self == "" ? "" : self.trimmingCharacters(in: CharacterSet(charactersIn: string))
    }

    // Replaces the last occurrence of `pre` found within the final two
    // characters of the string. Used by the crawler to strip the trailing ","
    // left after the last JSON item.
    func replace(of pre: String, with next: String) -> String {
        // The original unconditionally computed index(endIndex, offsetBy: -2),
        // which traps on strings shorter than two characters; clamp the window.
        let window = Swift.min(2, count)
        guard window > 0 else { return self }
        let start = index(endIndex, offsetBy: -window)
        return replacingOccurrences(of: pre, with: next, options: .backwards, range: start..<endIndex)
    }
}
添加路由
我们在main.swift中添加如下处理函数和路由
/// Handles GET /data: runs one crawl and writes the JSON result to the response.
func dataHandler(request: HTTPRequest, _ response: HTTPResponse) {
    // Create and run one crawler per request.
    // Fixed: the original referenced a nonexistent `movieCrawler` type with an
    // unrelated URL; the crawler class defined above is `Index`, and the sample
    // output (crown image URLs) comes from zhishuweixin.com.
    let crawler = Index(url: "http://zhishuweixin.com/")
    crawler.start()
    // Return whatever was scraped; `results` is already "" when the crawl failed,
    // so the original `characters.count > 0 ? results : ""` ternary was redundant.
    response.setHeader(.contentType, value: "text/html;charset=UTF-8")
    response.appendBody(string: crawler.results)
    response.completed()
}
// Register the route.
routes.add(method: .get, uri: "/data", handler: dataHandler)
然后运行服务器,在浏览器输入http://127.0.0.1:8181/data
然后就返回了我们想要的数据了,第一个小小的爬虫就完成了
JSON数据
{
"code": 200,
"data": [
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "白百何",
"rank_tag": "人物",
"href": "/tag/ren-wu"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "香奈儿",
"rank_tag": "化妆品",
"href": "/tag/hua-zhuang-pin"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "香奈儿",
"rank_tag": "奢侈品",
"href": "/tag/she-chi-pin"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "人民的名义",
"rank_tag": "娱乐",
"href": "/tag/yu-le"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "朗读者",
"rank_tag": "小说",
"href": "/tag/xiao-shuo"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "人民的名义",
"rank_tag": "热点",
"href": "/tag/re-dian"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "微信",
"rank_tag": "生活",
"href": "/tag/sheng-huo"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "速度与激情8",
"rank_tag": "电影",
"href": "/tag/dian-ying"
},
{
"crown": "http://zhishuweixin.com/assets/royal-crown.png",
"rank_word": "中国银行",
"rank_tag": "股票",
"href": "/tag/gu-piao"
}
]
}
网友评论