package main
import (
    "fmt"
    "io/ioutil"
    "net/http"
    "regexp"
    "strconv"
    "time"

    "github.com/360EntSecGroup-Skylar/excelize"
)

type Spider1 struct {
    url    string
    header map[string]string
}
// get_html_header performs a GET request with the spider's headers and returns the response body as a string.
func (spider1 *Spider1) get_html_header() string {
    // Create an http.Client.
    client := &http.Client{}
    // Build the GET request for the spider's URL.
    req, err := http.NewRequest("GET", spider1.url, nil)
    if err != nil {
        fmt.Println("failed to build request:", err)
        return ""
    }
    // Copy the spider's header map onto the request.
    for key, value := range spider1.header {
        req.Header.Add(key, value)
    }
    // Execute the request.
    resp, err := client.Do(req)
    if err != nil {
        fmt.Println("request failed:", err)
        return ""
    }
    // Close the body when done.
    defer resp.Body.Close()
    // Read the whole response body.
    html, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        fmt.Println("failed to read response body:", err)
        return ""
    }
    return string(html)
}
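// parse1 crawls all ten pages of the Douban Top 250 list, extracts the movie name,
// rating, and number of ratings with regular expressions, and writes the results to an Excel file.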
func parse1() {
    // Browser-like request headers so the request looks like a normal page visit.
    header := map[string]string{
        "Host":                      "movie.douban.com",
        "Connection":                "keep-alive",
        "Cache-Control":             "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent":                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36",
        "Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer":                   "https://movie.douban.com/top250",
    }
    // Create the Excel workbook and write the header row (movie name, rating, number of ratings).
    xlsx := excelize.NewFile()
    xlsx.SetCellValue("Sheet1", "A1", "电影名称")
    xlsx.SetCellValue("Sheet1", "B1", "评分")
    xlsx.SetCellValue("Sheet1", "C1", "评价人数")
    // Row counter; incremented by 1 for each parsed movie.
    count := 1
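    // Starting at 1 means the first data row lands in row 2, just below the header row.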
    // Crawl each of the 10 pages (25 movies per page) and write the results to the workbook.
    for i := 0; i < 250/25; i++ {
        fmt.Println("fetching page " + strconv.Itoa(i) + "....")
        url := "https://movie.douban.com/top250?start=" + strconv.Itoa(i*25) + "&filter="
        // Build a Spider1 for this page.
        spider1 := &Spider1{url, header}
        // Fetch the page HTML.
        html := spider1.get_html_header()
        // Number of ratings: capture the text before "评价" inside a <span>.
        pattern2 := `<span>(.*?)评价</span>`
        rp2 := regexp.MustCompile(pattern2) // panics if the pattern is invalid
        // Find every match in the page; -1 means no limit on the number of matches.
        find_txt2 := rp2.FindAllStringSubmatch(html, -1)
        // Rating: capture the value of the v:average span.
        pattern3 := `property="v:average">(.*?)</span>`
        rp3 := regexp.MustCompile(pattern3)
        find_txt3 := rp3.FindAllStringSubmatch(html, -1)
        // Movie name: capture the alt text of the poster image.
        pattern4 := `<img width="100" alt="(.*?)" src="`
        rp4 := regexp.MustCompile(pattern4)
        find_txt4 := rp4.FindAllStringSubmatch(html, -1)
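        // Each element of find_txt2/3/4 is []string{fullMatch, firstCaptureGroup},
        // so index [1] below pulls out the captured text. The three slices are
        // expected to line up, one entry per movie on the page.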
        // Cell references for the current row.
        var A string
        var B string
        var C string
        // Write the scraped values into the workbook.
        for j := 0; j < len(find_txt2); j++ {
            count++ // advance to the next row for each movie
            A = "A" + strconv.Itoa(count)
            B = "B" + strconv.Itoa(count)
            C = "C" + strconv.Itoa(count)
            // Echo the values to the console.
            fmt.Printf("%s %s %s \n", find_txt2[j][1], find_txt3[j][1], find_txt4[j][1])
            // Write movie name, rating, and number of ratings into columns A, B, C.
            xlsx.SetCellValue("Sheet1", A, find_txt4[j][1])
            xlsx.SetCellValue("Sheet1", B, find_txt3[j][1])
            xlsx.SetCellValue("Sheet1", C, find_txt2[j][1])
        }
    }
    // Save the workbook.
    err := xlsx.SaveAs("d:/mySpider.xlsx")
    if err != nil {
        fmt.Println("failed to save file:", err)
    }
}
func main() {
    start := time.Now()
    parse1()
    elapsed := time.Since(start)
    fmt.Println("crawl finished, total time:", elapsed)
}
The scraped output (screenshot omitted).
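To sanity-check the three regular expressions without hitting the network, they can be run against a small hand-written HTML fragment. The snippet below is a standalone sketch for that purpose only; the fragment and the values in it are made up for illustration and are not taken from an actual Douban page.

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // A made-up fragment that mimics the markup the crawler's patterns look for.
    fragment := `<img width="100" alt="某电影" src="poster.jpg">` +
        `<span class="rating_num" property="v:average">9.0</span>` +
        `<span>123456人评价</span>`

    name := regexp.MustCompile(`<img width="100" alt="(.*?)" src="`)
    rating := regexp.MustCompile(`property="v:average">(.*?)</span>`)
    votes := regexp.MustCompile(`<span>(.*?)评价</span>`)

    // FindAllStringSubmatch returns one []string per match:
    // element [0] is the whole match, element [1] is the first capture group.
    fmt.Println(name.FindAllStringSubmatch(fragment, -1)[0][1])   // 某电影
    fmt.Println(rating.FindAllStringSubmatch(fragment, -1)[0][1]) // 9.0
    fmt.Println(votes.FindAllStringSubmatch(fragment, -1)[0][1])  // 123456人
}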