1.建表
首先在数据库中建好对应的表,设置对应的字段
-- movie_info: one row per crawled movie.
-- `id` is the auto-increment primary key; `movie_id`, `_create_time` and
-- `_modify_time` each get a secondary index.
-- NOTE(review): the crawler stores every starring name joined by "/" in
-- `movie_main_character`, which is only varchar(50) here — confirm the length.
-- NOTE(review): `movie_on_time` is a timestamp, while the Go model declares
-- Movie_on_time as string filled with scraped text — confirm the value parses.
CREATE TABLE `movie_info` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`movie_id` int(11) unsigned NOT NULL COMMENT '电影id',
`movie_name` varchar(100) COMMENT '电影名称',
`movie_pic` varchar(200) COMMENT '电影图片',
`movie_director` varchar(50) COMMENT '电影导演',
`movie_writer` varchar(50) COMMENT '电影编剧',
`movie_country` varchar(50) COMMENT '电影产地',
`movie_language` varchar(50) COMMENT '电影语言',
`movie_main_character` varchar(50) COMMENT '电影主演',
`movie_type` varchar(50) COMMENT '电影类型',
`movie_on_time` timestamp DEFAULT CURRENT_TIMESTAMP COMMENT '电影上映时间',
`movie_span` varchar(20) COMMENT '电影时长',
`movie_grade` varchar(5) COMMENT '电影评分',
`remark` varchar(500) DEFAULT '' COMMENT '备注',
`_create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
`_modify_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '修改时间',
`_status` tinyint(1) DEFAULT '1',
PRIMARY KEY (`id`),
KEY `idx_movie_id` (`movie_id`),
KEY `idx_create_time` (`_create_time`),
KEY `idx_modify_time` (`_modify_time`)
) ENGINE=InnoDB AUTO_INCREMENT=20 DEFAULT CHARSET=utf8 COMMENT='电影信息表';
2.新建项目
到目录src下,使用bee new crawl_movie,创建新项目
image.png
3.爬虫代码(静态)
思路:
1.连接redis,以链接 https://movie.douban.com/subject/25827935/ 为入口,将其加入待爬取的url队列
2.开启循环,只要url的队列中还有url,那么就取出url,进行爬取
3.获取到url中的html页面元素(html源码),根据正则表达式匹配到电影信息,存入数据库
4.提取页面html中其他电影的相应链接,这里也是通过正则过滤的(电影链接有一定的规则),并把这些电影的链接存入redis的url队列中。后续在for循环中取出这些url即可继续爬取电影内容
5.将爬取过的链接存入redis的一个集合(set)中,用来标记去重(相当于黑名单),不会再继续爬取这些url
crawlMovie.go
package controllers
import (
"crawl_movie/models"
"time"
"github.com/astaxie/beego"
"github.com/astaxie/beego/httplib"
)
// CrawlMovieController is the controller that hosts the CrawlMovie action.
// It embeds beego.Controller and adds no fields of its own.
type CrawlMovieController struct {
beego.Controller
}
// CrawlMovie crawls Douban movie pages starting from a fixed entry URL and
// stores every movie it recognises.
//
// The entry URL is pushed onto the Redis URL queue. Then, until the queue is
// empty, the handler pops a URL, skips it if it is already in the visited set,
// downloads the page, extracts the movie fields with the models.GetMovie*
// helpers, saves the movie when a name was found, enqueues every movie link on
// the page (also echoing it to the HTTP response), marks the URL as visited and
// sleeps one second before the next request.
//
// A page that fails to download is logged and skipped instead of aborting the
// whole crawl; a failed insert is logged as well.
//
// Only static HTML can be crawled this way. Dynamically rendered data (for
// example parts of JD pages) would need a headless browser such as phantomjs.
func (c *CrawlMovieController) CrawlMovie() {
	// Connect to Redis.
	models.ConnectRedis("127.0.0.1:6379")

	// Entry URL of the crawl.
	sUrl := "https://movie.douban.com/subject/25827935/"
	models.PutinQueue(sUrl)

	// Keep going until the URL queue is drained.
	for models.GetQueueLength() != 0 {
		sUrl = models.PopfromQueue()
		// Skip URLs that were already crawled.
		if models.IsVisit(sUrl) {
			continue
		}

		rsp := httplib.Get(sUrl)
		// User-Agent and Cookie are set to avoid a 403 from Douban.
		// NOTE(review): this is a hard-coded browser session cookie; consider
		// moving it to configuration.
		rsp.Header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0")
		rsp.Header("Cookie", `bid=gFP9qSgGTfA; __utma=30149280.1124851270.1482153600.1483055851.1483064193.8; __utmz=30149280.1482971588.4.2.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118221"; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1483064193%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_id.100001.4cf6=5afcf5e5496eab22.1482413017.7.1483066280.1483057909.; __utma=223695111.1636117731.1482413017.1483055857.1483064193.7; __utmz=223695111.1483055857.6.5.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _vwo_uuid_v2=BDC2DBEDF8958EC838F9D9394CC5D9A0|2cc6ef7952be8c2d5408cb7c8cce2684; ap=1; viewed="1006073"; gr_user_id=e5c932fc-2af6-4861-8a4f-5d696f34570b; __utmc=30149280; __utmc=223695111; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1483064193; __utmb=223695111.0.10.1483064193`)
		sMovieHtml, err := rsp.String()
		if err != nil {
			// One unreachable page must not take down the whole crawl (and
			// the HTTP handler with it). The URL is not marked as visited, so
			// it is retried if another page links to it again.
			beego.Error("crawl: fetching", sUrl, "failed:", err)
			time.Sleep(time.Second)
			continue
		}

		// Record the movie only when the page actually is a movie page.
		// A fresh value per page keeps data of one movie from leaking into
		// the next one.
		if name := models.GetMovieName(sMovieHtml); name != "" {
			movieInfo := models.MovieInfo{
				Movie_name:           name,
				Movie_director:       models.GetMovieDirector(sMovieHtml),
				Movie_main_character: models.GetMovieMainCharacters(sMovieHtml),
				Movie_type:           models.GetMovieGenre(sMovieHtml),
				Movie_on_time:        models.GetMovieOnTime(sMovieHtml),
				Movie_grade:          models.GetMovieGrade(sMovieHtml),
				Movie_span:           models.GetMovieRunningTime(sMovieHtml),
			}
			if _, err := models.AddMovie(&movieInfo); err != nil {
				beego.Error("crawl: saving movie from", sUrl, "failed:", err)
			}
		}

		// Enqueue every movie link found on this page.
		for _, link := range models.GetMovieUrls(sMovieHtml) {
			models.PutinQueue(link)
			c.Ctx.WriteString("<br>" + link + "</br>")
		}

		// Remember that sUrl has been crawled.
		models.AddToSet(sUrl)

		// Throttle requests to one per second.
		time.Sleep(time.Second)
	}
	c.Ctx.WriteString("end of crawl!")
}
movie_info.go,进行数据库操作,将电影信息入库
package models
import (
_ "github.com/go-sql-driver/mysql"
"github.com/astaxie/beego/orm"
"regexp"
"strings"
)
var (
db orm.Ormer
)
// MovieInfo is the ORM model for one crawled movie; it is registered with the
// beego ORM in init and inserted by AddMovie.
// NOTE(review): field names look like they mirror the movie_info table columns
// through the ORM's naming convention — confirm before renaming any of them.
type MovieInfo struct{
Id int64 // primary key; AddMovie resets it to 0 before inserting
Movie_id int64
Movie_name string
Movie_pic string
Movie_director string
Movie_writer string
Movie_country string
Movie_language string
Movie_main_character string // starring names joined with "/"
Movie_type string // genres joined with "/"
Movie_on_time string
Movie_span string
Movie_grade string
}
// init configures the beego ORM for this package: it turns on SQL debug
// output, registers the "default" MySQL database, registers the MovieInfo
// model and creates the package-level Ormer used by AddMovie.
//
// A failed database registration panics right here with the underlying error
// instead of being silently dropped.
// NOTE(review): the DSN, including credentials, is hard-coded; consider
// loading it from configuration.
func init() {
	// In debug mode the ORM prints every SQL statement it executes.
	orm.Debug = true
	if err := orm.RegisterDataBase("default", "mysql", "root:123@tcp(127.0.0.1:3306)/test?charset=utf8", 30); err != nil {
		panic("models: register database: " + err.Error())
	}
	orm.RegisterModel(new(MovieInfo))
	db = orm.NewOrm()
}
// AddMovie inserts movie_info through the package-level Ormer and returns
// whatever db.Insert reports (an id and an error).
// The Id field of movie_info is reset to 0 before the insert.
func AddMovie(movie_info *MovieInfo) (int64, error) {
	movie_info.Id = 0
	return db.Insert(movie_info)
}
// movieDirectorRe matches an anchor tagged rel="v:directedBy" and captures its
// text. It is compiled once instead of on every call.
var movieDirectorRe = regexp.MustCompile(`<a.*?rel="v:directedBy">(.*?)</a>`)

// GetMovieDirector returns the text of the first rel="v:directedBy" anchor in
// movieHtml, or "" when the page has none (including an empty page).
func GetMovieDirector(movieHtml string) string {
	// Only the first director is reported, so stop at the first match.
	m := movieDirectorRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieNameRe matches the span tagged property="v:itemreviewed" and captures
// its text. It is compiled once instead of on every call.
var movieNameRe = regexp.MustCompile(`<span\s*property="v:itemreviewed">(.*?)</span>`)

// GetMovieName returns the text of the first property="v:itemreviewed" span in
// movieHtml, or "" when the page has none (including an empty page).
func GetMovieName(movieHtml string) string {
	m := movieNameRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieStarringRe matches an anchor tagged rel="v:starring" and captures its
// text. It is compiled once instead of on every call.
var movieStarringRe = regexp.MustCompile(`<a.*?rel="v:starring">(.*?)</a>`)

// GetMovieMainCharacters returns the text of every rel="v:starring" anchor in
// movieHtml joined with "/", or "" when the page has none.
// Anchors with empty text are skipped so the result never contains doubled or
// dangling separators.
func GetMovieMainCharacters(movieHtml string) string {
	matches := movieStarringRe.FindAllStringSubmatch(movieHtml, -1)
	names := make([]string, 0, len(matches))
	for _, m := range matches {
		if m[1] != "" {
			names = append(names, m[1])
		}
	}
	return strings.Join(names, "/")
}
// movieGradeRe matches the strong element tagged property="v:average" and
// captures its text. It is compiled once instead of on every call.
var movieGradeRe = regexp.MustCompile(`<strong.*?property="v:average">(.*?)</strong>`)

// GetMovieGrade returns the text of the first property="v:average" element in
// movieHtml, or "" when the page has none.
func GetMovieGrade(movieHtml string) string {
	m := movieGradeRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieGenreRe matches a span tagged property="v:genre" and captures its text.
// It is compiled once instead of on every call.
var movieGenreRe = regexp.MustCompile(`<span.*?property="v:genre">(.*?)</span>`)

// GetMovieGenre returns the text of every property="v:genre" span in movieHtml
// joined with "/", or "" when the page has none.
// Spans with empty text are skipped so the result never contains doubled or
// dangling separators.
func GetMovieGenre(movieHtml string) string {
	matches := movieGenreRe.FindAllStringSubmatch(movieHtml, -1)
	genres := make([]string, 0, len(matches))
	for _, m := range matches {
		if m[1] != "" {
			genres = append(genres, m[1])
		}
	}
	return strings.Join(genres, "/")
}
// movieOnTimeRe matches a span tagged property="v:initialReleaseDate" (with
// any further attributes) and captures its text. It is compiled once instead
// of on every call.
var movieOnTimeRe = regexp.MustCompile(`<span.*?property="v:initialReleaseDate".*?>(.*?)</span>`)

// GetMovieOnTime returns the text of the first
// property="v:initialReleaseDate" span in movieHtml, or "" when the page has
// none. The text is returned as found on the page, without date parsing.
func GetMovieOnTime(movieHtml string) string {
	m := movieOnTimeRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieRuntimeRe matches a span tagged property="v:runtime" (with any further
// attributes) and captures its text. It is compiled once instead of on every
// call.
var movieRuntimeRe = regexp.MustCompile(`<span.*?property="v:runtime".*?>(.*?)</span>`)

// GetMovieRunningTime returns the text of the first property="v:runtime" span
// in movieHtml, or "" when the page has none.
func GetMovieRunningTime(movieHtml string) string {
	m := movieRuntimeRe.FindStringSubmatch(movieHtml)
	if m == nil {
		return ""
	}
	return m[1]
}
// movieURLRe matches an anchor whose href starts with
// https://movie.douban.com/ and captures the full URL. The dots of the host
// are escaped so that look-alike hosts are not accepted, and the pattern is
// compiled once instead of on every call.
var movieURLRe = regexp.MustCompile(`<a.*?href="(https://movie\.douban\.com/.*?)"`)

// GetMovieUrls returns, in page order and without de-duplication, every
// https://movie.douban.com/ link found in an anchor href of movieHtml.
// It returns nil when the page contains no such link.
func GetMovieUrls(movieHtml string) []string {
	matches := movieURLRe.FindAllStringSubmatch(movieHtml, -1)
	if len(matches) == 0 {
		return nil
	}
	movieSets := make([]string, 0, len(matches))
	for _, m := range matches {
		movieSets = append(movieSets, m[1])
	}
	return movieSets
}
redis.go,对url队列和已访问url集合进行操作
package models
import (
"github.com/astaxie/goredis"
)
// Redis keys used by the crawler.
const (
// URL_QUEUE is the key of the list holding URLs still to be crawled.
URL_QUEUE = "url_queue"
// URL_VISIT_SET is the key of the set holding URLs already crawled.
URL_VISIT_SET = "url_visit_set"
)
// client is the package-level Redis client shared by all helpers below;
// its address is set by ConnectRedis.
var (
client goredis.Client
)
// ConnectRedis points the package-level Redis client at addr.
// It only stores the address on the client.
func ConnectRedis(addr string) {
	client.Addr = addr
}
// PutinQueue pushes url onto the left end of the URL_QUEUE list.
// The result of the push is not inspected.
func PutinQueue(url string) {
	payload := []byte(url)
	client.Lpush(URL_QUEUE, payload)
}
// PopfromQueue pops one entry from the right end of the URL_QUEUE list and
// returns it as a string. It panics when the pop reports an error.
func PopfromQueue() string {
	raw, popErr := client.Rpop(URL_QUEUE)
	if popErr == nil {
		return string(raw)
	}
	panic(popErr)
}
// GetQueueLength returns the length of the URL_QUEUE list.
// When the length query fails, it reports 0.
func GetQueueLength() int {
	if n, err := client.Llen(URL_QUEUE); err == nil {
		return n
	}
	return 0
}
// AddToSet adds url to the URL_VISIT_SET set.
// The result of the add is not inspected.
func AddToSet(url string) {
	member := []byte(url)
	client.Sadd(URL_VISIT_SET, member)
}
// IsVisit reports whether url is a member of the URL_VISIT_SET set.
// When the membership query fails, it reports false.
func IsVisit(url string) bool {
	member, err := client.Sismember(URL_VISIT_SET, []byte(url))
	return err == nil && member
}
网友评论