美文网首页
Golang高并发抓取HTML图片

Golang高并发抓取HTML图片

作者: veeeeeeeeeeee | 来源:发表于2019-05-12 13:39 被阅读0次

    版权所有,转载请注明:http://www.lenggirl.com/language/go-picture.html

    使用准备

    1.安装Golang

    2.下载爬虫包

    go get -v github.com/hunterhug/marmot/expert
    go get -v github.com/hunterhug/marmot/miner
    go get -v github.com/hunterhug/parrot/util
    

    程序

    该程序只能抓取HTML中 src 属性值以 http:// 或 https:// 开头的图片(即地址必须带有协议头), 其他如 data-src 属性中的地址, 或混淆在JS中的图片地址均无法抓取

    See: https://github.com/hunterhug/marmot/blob/master/example/lesson/lesson6.go

    package main
    
    import (
        "errors"
        "fmt"
        "net/url"
        "strings"
    
        "github.com/hunterhug/marmot/expert"
        "github.com/hunterhug/marmot/miner"
        "github.com/hunterhug/parrot/util"
    )
    
    var (
        // MinerNum is the number of groups the picture list is split into by
        // CatchPicture; each group is downloaded by its own goroutine.
        MinerNum = 5

        // ProxyAddress is the value handed to miner.New whenever a worker is
        // created. It is nil unless set, e.g. to "socks5://127.0.0.1:1080".
        ProxyAddress interface{}
    )
    
    // main runs an endless interactive loop: it asks for a page URL and a target
    // directory, echoes the choice, and hands both to CatchPicture. An error
    // returned by CatchPicture is printed and the loop starts over.
    func main() {
        // You can Proxy!
        // ProxyAddress = "socks5://127.0.0.1:1080"

        const separator = "---------------------------------------------"

        fmt.Println(`Welcome: Input "url" and picture keep "dir"`)
        for {
            fmt.Println(separator)
            // The second argument to util.Input is presumably the fallback used
            // when the user enters nothing -- confirm against the util package.
            pageURL := util.Input(`URL(Like: "http://publicdomainarchive.com")`, "http://publicdomainarchive.com")
            saveDir := util.Input(`DIR(Default: "./picture")`, "./picture")
            fmt.Printf("You will keep %s picture in dir %s\n", pageURL, saveDir)
            fmt.Println(separator)

            // Start the crawl; a failure is reported but does not end the program.
            if err := CatchPicture(pageURL, saveDir); err != nil {
                fmt.Println("Error:" + err.Error())
            }
        }
    }
    
    // CatchPicture fetches the HTML page at picture_url, collects the picture
    // URLs reported by expert.FindPicture, and saves each picture into dir.
    //
    // The picture list is split into MinerNum groups with util.DevideStringList
    // and every group is downloaded by its own goroutine, using a worker taken
    // from miner.Pool (one is created and stored there when missing).
    //
    // It returns an error when picture_url cannot be parsed, dir cannot be
    // created, a worker cannot be constructed, the page cannot be fetched, the
    // page yields no pictures ("empty"), or the list cannot be split. A failure
    // on an individual picture is printed and that picture is skipped; it does
    // not stop the remaining downloads. The function returns only after every
    // download goroutine has finished.
    func CatchPicture(picture_url string, dir string) error {
        // Check valid
        if _, err := url.Parse(picture_url); err != nil {
            return err
        }

        // Make dir!
        if err := util.MakeDir(dir); err != nil {
            return err
        }

        // New a worker to get url
        worker, err := miner.New(ProxyAddress)
        if err != nil {
            return err
        }

        result, err := worker.SetUrl(picture_url).SetUa(miner.RandomUa()).Get()
        if err != nil {
            return err
        }

        // Find all picture
        pictures := expert.FindPicture(string(result))

        // Empty, What a pity!
        if len(pictures) == 0 {
            return errors.New("empty")
        }

        // Divide pictures into several groups, one per download goroutine
        groups, err := util.DevideStringList(pictures, MinerNum)
        if err != nil {
            return err
        }

        // Resolve every worker before starting any goroutine, so that a failure
        // here can return without leaving downloads running in the background.
        workers := make(map[int]*miner.Worker, len(groups))
        for num := range groups {
            name := util.IS(num)
            pooled, ok := miner.Pool.Get(name)
            if !ok {
                // No? set one!
                created, err := miner.New(ProxyAddress)
                if err != nil {
                    return err
                }
                created.SetUa(miner.RandomUa())
                miner.Pool.Set(name, created)
                pooled = created
            }
            workers[num] = pooled
        }

        // Channel for completion signals: exactly one per goroutine. Counting
        // goroutines instead of pictures keeps the wait below correct no matter
        // how many pictures a goroutine skips.
        done := make(chan struct{}, len(groups))

        // Go at the same time
        for num, imgs := range groups {
            // Go save picture!
            go func(imgs []string, worker *miner.Worker, num int) {
                // Signal completion on every exit path.
                defer func() { done <- struct{}{} }()

                for _, img := range imgs {
                    // Check, May be Pass
                    if _, err := url.Parse(img); err != nil {
                        continue
                    }

                    // Change Name of our picture
                    filename := strings.Replace(util.ValidFileName(img), "#", "_", -1)
                    target := dir + "/" + filename

                    // Exist?
                    if util.FileExist(target) {
                        fmt.Println("File Exist:" + target)
                        continue
                    }

                    // Not exist? Download it. A failed picture must not drop
                    // the rest of this goroutine's batch.
                    imgsrc, e := worker.SetUrl(img).Get()
                    if e != nil {
                        fmt.Println("Download " + img + " error:" + e.Error())
                        continue
                    }

                    // Save it!
                    if e = util.SaveToFile(target, imgsrc); e != nil {
                        fmt.Println("Save " + target + " error:" + e.Error())
                        continue
                    }
                    fmt.Printf("SP%d: Keep in %s/%s\n", num, dir, filename)
                }
            }(imgs, workers[num], num)
        }

        // Every goroutine should report back
        for i := 0; i < len(groups); i++ {
            <-done
        }

        return nil
    }
    

    代码说明均已写在注释中, 运行后输出如下:

    superpika@superpika-chen-110:~/code/src/github.com/hunterhug/marmot/example/lesson$ go run lesson6.go 
    
            Welcome: Input "url" and picture keep "dir"
    
            
            
    ---------------------------------------------
    URL(Like: "http://publicdomainarchive.com")
    
    DIR(Default: "./picture")
    
    You will keep http://publicdomainarchive.com picture in dir ./picture
    ---------------------------------------------
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_modern.jpg
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_google_dark.png
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-003-667x1000-192684_667x675.jpg
    
    superpika@superpika-chen-110:~/code/src/github.com/hunterhug/marmot/example/lesson$ go run lesson6.go 
    
            Welcome: Input "url" and picture keep "dir"
    
            
            
    ---------------------------------------------
    URL(Like: "http://publicdomainarchive.com")
    
    DIR(Default: "./picture")
    
    You will keep http://publicdomainarchive.com picture in dir ./picture
    ---------------------------------------------
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_modern.jpg
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_google_dark.png
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-003-667x1000-192684_667x675.jpg
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_powered-by-wp-engine.png
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_divi.png
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-002-1000x667.jpg
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_public-domain-mark.png
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos008-1000x625.jpg
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_09_03_Weekly.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-054-1000x667.jpg
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_10_03_instagram_dark.png
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_vintage.jpg
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-001-1000x667.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-070-1000x667.jpg
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_twitter02_dark.png
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos001-1000x750-167066_1000x675.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-035-1000x667.jpg
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_facebook_dark.png
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-060-1000x667.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-013-1000x667.jpg
    ---------------------------------------------
    URL(Like: "http://publicdomainarchive.com")
    
    
    
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_powered-by-wp-engine.png
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_divi.png
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-002-1000x667.jpg
    SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_public-domain-mark.png
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos008-1000x625.jpg
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_09_03_Weekly.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-054-1000x667.jpg
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_10_03_instagram_dark.png
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_vintage.jpg
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-001-1000x667.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-070-1000x667.jpg
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_twitter02_dark.png
    SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos001-1000x750-167066_1000x675.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-035-1000x667.jpg
    SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_facebook_dark.png
    SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-060-1000x667.jpg
    SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-013-1000x667.jpg
    ---------------------------------------------
    URL(Like: "http://publicdomainarchive.com")
    

    [图片上传失败...(image-685ed8-1557639583356)]

    相关文章

      网友评论

          本文标题:Golang高并发抓取HTML图片

          本文链接:https://www.haomeiwen.com/subject/yircaqtx.html