Breadth-First Crawling with Anonymous Goroutines (Improved)

Author: 哆啦在这A梦在哪 | Published 2018-06-28 15:05

The program below fetches the root page at https://docs.hacknode.org/gopl-zh/, collects every link it contains, then downloads each linked page in its own anonymous goroutine and saves the body to a local text file.

    package main

    import (
        "fmt"
        "io/ioutil"
        "log"
        "net/http"
        "os"
        "regexp"
        "strings"
        "sync"

        "golang.org/x/net/html"
    )

    // ground waits for the fetch goroutines; str is the root URL to crawl.
    var (
        ground sync.WaitGroup
        str    = "https://docs.hacknode.org/gopl-zh/"
    )

    // init installs a recover handler; note that a deferred recover here only
    // covers panics raised inside init itself, not the rest of the program.
    func init() {
        defer func() {
            if err := recover(); err != nil {
                log.Fatal("recover error is :", err)
            }
        }()
    }

    // CreatFile appends the page body bt to url<i>.txt under the project directory.
    func CreatFile(bt []byte, i string) {
        // os.O_WRONLY is required alongside O_CREATE|O_APPEND; without it the
        // file is opened read-only and the Write below fails.
        f, err := os.OpenFile("F:/MyGo/src/waitground_user/url"+i+".txt", os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0666)
        if err != nil {
            log.Fatal(err)
        }
        defer f.Close()
        if _, err = f.Write(bt); err != nil {
            log.Fatal(err)
        }
    }

    // GetURLInfomation downloads URL and returns the raw response body.
    func GetURLInfomation(URL string) (bt []byte) {
        resp, err := http.Get(URL)
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()
        if resp.StatusCode != http.StatusOK {
            log.Fatal("Can't connect:", URL)
        }
        bt, err = ioutil.ReadAll(resp.Body)
        if err != nil {
            log.Fatal(err)
        }
        return bt
    }
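
One caveat worth noting: http.Get goes through http.DefaultClient, which has no timeout, so a stalled server can hang a fetch goroutine indefinitely. A minimal sketch of the usual remedy, assuming a "time" import is added and httpClient (a name introduced here, not in the original) replaces the http.Get calls:

    // httpClient bounds each request to 30 seconds end to end.
    var httpClient = &http.Client{Timeout: 30 * time.Second}

    // e.g. resp, err := httpClient.Get(URL)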

    // GetURLInfomationAdress fetches URL, parses the HTML, and returns every
    // distinct link found in <a href="..."> tags, resolved against the page URL.
    func GetURLInfomationAdress(URL string) []string {
        resp, err := http.Get(URL)
        if err != nil {
            log.Fatal(err)
        }
        defer resp.Body.Close()
        if resp.StatusCode != http.StatusOK {
            log.Fatal("Can't connect:", URL)
        }
        // Parse the response body into an HTML node tree.
        doc, err := html.Parse(resp.Body)
        if err != nil {
            log.Fatal(err)
        }
        var links []string
        ForOneNode := func(n *html.Node) { // handle a single node
            if n.Type == html.ElementNode && n.Data == "a" {
                for _, a := range n.Attr {
                    if a.Key != "href" {
                        continue
                    }
                    link, err := resp.Request.URL.Parse(a.Val)
                    if err != nil {
                        log.Fatal(err)
                    }
                    if CheckURL(link.String(), links) {
                        links = append(links, link.String()) // this append could be replaced by fetching the URL concurrently
                    }
                }
            }
        }
        ForEachNode(doc, ForOneNode, nil)
        return links
    }

    // ForEachNode walks the tree rooted at n, calling pre before visiting a
    // node's children and post afterwards. (Despite the article's title, this
    // recursion is a depth-first traversal, not breadth-first.)
    func ForEachNode(n *html.Node, pre, post func(n *html.Node)) {
        if pre != nil {
            pre(n)
        }
        for c := n.FirstChild; c != nil; c = c.NextSibling {
            ForEachNode(c, pre, post)
        }
        if post != nil {
            post(n)
        }
    }
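
ForEachNode, as noted in its comment, is a depth-first recursion. If a strictly breadth-first, level-by-level walk is wanted, as the title suggests, a queue-based variant would look roughly like this sketch (BFSEachNode is a hypothetical name, not part of the original program):

    // BFSEachNode visits nodes level by level using an explicit queue.
    func BFSEachNode(root *html.Node, visit func(n *html.Node)) {
        queue := []*html.Node{root}
        for len(queue) > 0 {
            n := queue[0]
            queue = queue[1:]
            visit(n)
            // Enqueue the children so they are visited after this level.
            for c := n.FirstChild; c != nil; c = c.NextSibling {
                queue = append(queue, c)
            }
        }
    }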

    // checkRegexp runs the regular expression reg against cont; style selects
    // the first match (0), all matches as strings (1), or all matches as bytes.
    // It is not called anywhere in this version of the program.
    func checkRegexp(cont string, reg string, style int) (result interface{}) {
        check := regexp.MustCompile(reg)
        switch style {
        case 0:
            result = check.FindString(cont)
        case 1:
            result = check.FindAllString(cont, -1)
        default:
            result = check.FindAll([]byte(cont), -1)
        }
        return
    }
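
Since checkRegexp is never called here, a usage example may help. For illustration only (the pattern and the page/hrefs names are hypothetical, not from the original post), a call like this would collect every href attribute from a downloaded body; with style 1 the result is a []string:

    // page is assumed to hold a downloaded body, e.g. from GetURLInfomation.
    hrefs := checkRegexp(string(page), `href="[^"]*"`, 1).([]string)
    fmt.Println(hrefs)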

    // CheckURL reports whether link is absent from links, i.e. whether it
    // still needs to be recorded.
    func CheckURL(link string, links []string) bool {
        for _, s := range links {
            if s == link {
                return false
            }
        }
        return true
    }
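
CheckURL scans the whole slice for every candidate, which is O(n) per call. For larger crawls, a map used as a set is the usual alternative; a minimal sketch, assuming a package-level seen map and a markIfNew helper (neither is in the original code):

    // seen records every URL collected so far; lookups are O(1).
    var seen = make(map[string]bool)

    // markIfNew reports whether link was new, recording it as a side effect.
    func markIfNew(link string) bool {
        if seen[link] {
            return false
        }
        seen[link] = true
        return true
    }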

    func main() {
        bt := GetURLInfomationAdress(str) // first pass: collect all link addresses from the root page
        fmt.Println("first finish")
        // fmt.Println(bt[1])
        // f := GetURLInfomation(bt[1])
        // CreatFile(f, strconv.Itoa(1))
        for _, t := range bt {
            t := t // per-iteration copy so each goroutine sees its own URL
            ground.Add(1)
            go func() { // visit every address and fetch its content
                defer ground.Done()
                fname := strings.Split(t, "/")
                ff := fname[len(fname)-1]
                fmt.Println("address:", t)
                fmt.Println(ff)
                ft := strings.Split(ff, ".")
                fft := ft[0]
                fmt.Println(fft) // use the page name as the file name
                p := GetURLInfomation(t)
                CreatFile(p, fft)
            }()
        }
        ground.Wait() // block until every fetch goroutine has finished writing
    }
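
Finally, main starts one goroutine per link, which is fine for a page-sized site but can exhaust sockets or file descriptors on a large one. A common pattern is a buffered channel used as a counting semaphore; a sketch under the assumption that the fetch-and-save body of the goroutine above is wrapped as shown (tokens and fetchAndSave are hypothetical names):

    // tokens caps the number of simultaneous fetches at 20.
    var tokens = make(chan struct{}, 20)

    // fetchAndSave downloads one page while holding a semaphore slot.
    func fetchAndSave(t string) {
        tokens <- struct{}{}        // acquire a slot before fetching
        defer func() { <-tokens }() // release it when done
        p := GetURLInfomation(t)
        CreatFile(p, "page")        // illustrative file name only
    }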
