美文网首页
Go语言爬取豆瓣电影排名

Go语言爬取豆瓣电影排名

作者: 大学渣PG | 来源:发表于2018-11-25 01:34 被阅读0次

    前面学习了常用的正则表达式和Go语言的IO,今天将会用它们实现一个简单的爬虫,来爬取豆瓣电影排名

    1.首先对豆瓣网电影排名网页URL进行分析

    这是前三页的URL
    https://movie.douban.com/top250?start=0&filter=
    https://movie.douban.com/top250?start=25&filter=
    https://movie.douban.com/top250?start=50&filter=

    经过分析可以得到一个通用的表达式(i为页码,从0到9;每页25部电影,所以start=i*25)

    "https://movie.douban.com/top250?start="+strconv.Itoa(i*25)+"&filter="
    
    2.分析网页源码

    明确自己需要哪些数据


    image.png

    写出需要数据的正则表达式

    电影名:<span class="title">([\u4E00-\u9FA5]+)</span>
    导演:(.+)&nbsp;&nbsp;&nbsp;
    国家和类型:&nbsp;/&nbsp;([\u4E00-\u9FA5]+.*)&nbsp;/&nbsp;([\u4E00-\u9FA5]+.*)[^</span>]$
    分数:<span class="rating_num" property="v:average">([0-9]+\.[0-9]+)</span>
    评价人数:<span>([0-9]+)人评价</span>
    //由于Go语言的regexp包使用RE2语法,不支持\uXXXX这种转义,匹配汉字要写成\p{Han},所以后面的代码会用[\p{Han}]替换[\u4E00-\u9FA5]
    
    3.完成代码
    package main
    
    import (
        "fmt"
        "io"
        "log"
        "net/http"
        "os"
        "regexp"
        "strconv"
        "strings"
    )
    
    // httpRsp2String issues an HTTP GET for url and returns the whole response
    // body as a string.
    //
    // It returns a non-nil error when the request itself fails, when the server
    // answers with a non-2xx status, or when reading the body fails part-way.
    // The returned string is always empty when the error is non-nil.
    func httpRsp2String(url string) (string, error) {
        rsp, err := http.Get(url)
        if err != nil {
            return "", err
        }
        defer rsp.Body.Close()

        // An error page would otherwise be handed to the caller as if it were
        // the requested document.
        if rsp.StatusCode < 200 || rsp.StatusCode > 299 {
            return "", fmt.Errorf("GET %s: unexpected status %s", url, rsp.Status)
        }

        // io.Copy stops on EOF and reports any other read error, so a broken
        // connection can no longer leave the read loop spinning forever, and
        // strings.Builder avoids re-copying the accumulated text on each chunk.
        var body strings.Builder
        if _, err := io.Copy(&body, rsp.Body); err != nil {
            return "", fmt.Errorf("reading body of %s: %w", url, err)
        }
        return body.String(), nil
    }
    
    // writeRanking creates (or truncates) the file at path and writes a header
    // line followed by the first rows entries of the given columns, one movie
    // per line, numbered from 1. Every column must hold at least rows entries.
    // The first write or close error is returned.
    func writeRanking(path string, rows int, filmNames, countrys, directors, kinds, scores, commentNums []string) (err error) {
        file, err := os.Create(path)
        if err != nil {
            return err
        }
        // Report a failed Close too, but never let it hide an earlier error.
        defer func() {
            if cerr := file.Close(); err == nil {
                err = cerr
            }
        }()

        const rowFormat = "%-5s|%-50s|%-20s|%-40s|%-40s|%-5s|%-20s\n"
        if _, err = fmt.Fprintf(file, rowFormat, "排名", "电影名", "国家", "导演", "类型", "分数", "评价人数"); err != nil {
            return err
        }
        for i := 0; i < rows; i++ {
            if _, err = fmt.Fprintf(file, rowFormat, strconv.Itoa(i+1), filmNames[i], countrys[i], directors[i], kinds[i], scores[i], commentNums[i]); err != nil {
                return err
            }
        }
        return nil
    }

    // main downloads the ten pages of the Douban Top 250 list, extracts each
    // column (title, director, score, number of ratings, country, genre) with
    // its own regular expression, and writes the result as a table to
    // ./result.txt.
    func main() {
        const (
            pageCount = 10 // pages to fetch
            pageSize  = 25 // value the start= parameter advances by per page
        )

        // Compile every pattern once instead of once per page.
        filmNameRegex := regexp.MustCompile(`<img width="100" alt="(?s:(.*?))" src="`)
        directorRegex := regexp.MustCompile(`导演:\s*(.+)...<br>`)
        scoreRegex := regexp.MustCompile(`<span class="rating_num" property="v:average">([0-9]+\.[0-9]+)</span>`)
        commentNumRegex := regexp.MustCompile(`<span>([0-9]+)人评价</span>`)
        countryAndKindRegex := regexp.MustCompile(`&nbsp;/&nbsp;([\p{Han}]+.*)&nbsp;/&nbsp;([\p{Han}]+.*)`)

        var filmNames, directors, commentNums, scores, countrys, kinds []string
        for i := 0; i < pageCount; i++ {
            url := "https://movie.douban.com/top250?start=" + strconv.Itoa(i*pageSize) + "&filter="
            result, err := httpRsp2String(url)
            if err != nil {
                log.Fatalf("fetching %s: %v", url, err)
            }

            for _, v := range filmNameRegex.FindAllStringSubmatch(result, -1) {
                filmNames = append(filmNames, v[1])
            }

            for _, v := range directorRegex.FindAllStringSubmatch(result, -1) {
                // The captured text also contains the cast, introduced by
                // "主演"; keep only what precedes the first "主".
                tmpStr := strings.Replace(v[1], `&nbsp;`, "", -1)
                tmpStr = strings.Split(tmpStr, "主")[0]
                directors = append(directors, tmpStr)
            }

            for _, v := range scoreRegex.FindAllStringSubmatch(result, -1) {
                scores = append(scores, v[1])
            }

            for _, v := range commentNumRegex.FindAllStringSubmatch(result, -1) {
                commentNums = append(commentNums, v[1])
            }

            for _, v := range countryAndKindRegex.FindAllStringSubmatch(result, -1) {
                countrys = append(countrys, v[1])
                kinds = append(kinds, v[2])
            }
        }

        // The columns are collected independently, so a pattern that misses an
        // entry leaves its column short. Write only as many rows as every
        // column can supply (at most pageCount*pageSize) instead of indexing
        // past the end of a slice.
        counts := []int{len(filmNames), len(countrys), len(directors), len(kinds), len(scores), len(commentNums)}
        rows := pageCount * pageSize
        aligned := true
        for _, n := range counts {
            if n != counts[0] {
                aligned = false
            }
            if n < rows {
                rows = n
            }
        }
        if !aligned {
            log.Printf("warning: column sizes differ (name, country, director, kind, score, comments = %v); writing %d rows that may be misaligned", counts, rows)
        }

        if err := writeRanking("./result.txt", rows, filmNames, countrys, directors, kinds, scores, commentNums); err != nil {
            log.Fatal(err)
        }
    }
    
    

    相关文章

      网友评论

          本文标题:Go语言爬取豆瓣电影排名

          本文链接:https://www.haomeiwen.com/subject/ymquqqtx.html