美文网首页
Go小项目-扫描子域名

Go小项目-扫描子域名

作者: c4a1d989518e | 来源:发表于2018-03-15 14:09 被阅读44次

    项目结果

    代码的功能
    运行结果

    项目代码

    代码如下(从起始 URL 出发,按广度优先抓取同一域名下的页面链接,并输出 sitemap 格式的 XML):

    package main
    
    import (
        "encoding/xml"
        "flag"
        "fmt"
        "io"
        "net/http"
        "net/url"
        "os"
        "strings"
    
        "github.com/gophercises/link"
    )
    
    /*
       1. GET the webpage
       2. Parse all the links on the page
       3. Build proper URLs from those links
       4. Filter out any links with a different domain
       5. Find all pages (BFS)
       6. Print out XML
    */
    
    // xmlns is the XML namespace written into the xmlns attribute of the
    // generated urlset root element.
    const xmlns = "http://www.sitemaps.org/schemas/sitemap/0.9"
    
    // loc is a single sitemap entry. It is encoded as the content of one "url"
    // element (see urlset.Urls).
    type loc struct {
        // Value is the page URL; it is encoded as a nested "loc" element.
        Value string `xml:"loc"`
    }
    
    // urlset is the root of the XML document printed by main.
    type urlset struct {
        // Urls holds one entry per crawled page; each is encoded as a "url" element.
        Urls []loc `xml:"url"`
        // Xmlns is encoded as the "xmlns" attribute of the root element.
        Xmlns string `xml:"xmlns,attr"`
    }
    
    func main() {
        urlFlag := flag.String("url", "https://gophercises.com", "the url that you want to build a sitemap for")
        maxDepth := flag.Int("depth", 10, "the maximum number of links deep to traverse")
        flag.Parse()
    
        pages := bfs(*urlFlag, *maxDepth)
        toXml := urlset{
            Xmlns: xmlns,
        }
        for _, page := range pages {
            toXml.Urls = append(toXml.Urls, loc{page})
        }
    
        fmt.Print(xml.Header)
        enc := xml.NewEncoder(os.Stdout)
        enc.Indent("", "  ")
        if err := enc.Encode(toXml); err != nil {
            panic(err)
        }
        fmt.Println()
    }
    
    // bfs crawls breadth-first starting at urlStr and returns every page it
    // visited. The result is collected from a map, so its order is unspecified.
    //
    // maxDepth is the number of link hops to follow away from urlStr: 0 yields
    // only urlStr itself, and a negative value yields an empty result. Outgoing
    // links of a page are obtained by calling get.
    func bfs(urlStr string, maxDepth int) []string {
        seen := make(map[string]struct{})
        // q is the level currently being visited; nq collects the next level.
        var q map[string]struct{}
        nq := map[string]struct{}{
            urlStr: {},
        }
        for i := 0; i <= maxDepth; i++ {
            q, nq = nq, make(map[string]struct{})
            if len(q) == 0 {
                break
            }
            for page := range q {
                if _, ok := seen[page]; ok {
                    continue
                }
                seen[page] = struct{}{}
                if i == maxDepth {
                    // Links found on the deepest level would never be visited,
                    // so fetching these pages would only waste requests.
                    continue
                }
                for _, next := range get(page) {
                    nq[next] = struct{}{}
                }
            }
        }
        ret := make([]string, 0, len(seen))
        for page := range seen {
            ret = append(ret, page)
        }
        return ret
    }
    
    func get(urlStr string) []string {
        resp, err := http.Get(urlStr)
        if err != nil {
            return []string{}
        }
        defer resp.Body.Close()
        reqUrl := resp.Request.URL
        baseUrl := &url.URL{
            Scheme: reqUrl.Scheme,
            Host:   reqUrl.Host,
        }
        base := baseUrl.String()
        return filter(hrefs(resp.Body, base), withPrefix(base))
    }
    
    // hrefs extracts the links from the document read from r (via link.Parse)
    // and returns them as absolute URLs, in document order.
    //
    // base is expected to look like "scheme://host". Links are handled by form:
    //   - "//host/path" (protocol-relative) is given the scheme of base;
    //   - "/path" is appended to base;
    //   - anything starting with "http" is kept unchanged;
    //   - everything else (fragments, mailto:, relative paths, ...) is dropped.
    //
    // The result is nil when there are no usable links or parsing fails.
    func hrefs(r io.Reader, base string) []string {
        // Best effort: a document that cannot be parsed contributes no links.
        links, err := link.Parse(r)
        if err != nil {
            return nil
        }
        var ret []string
        for _, l := range links {
            switch {
            case strings.HasPrefix(l.Href, "//"):
                // Must be tested before the "/" case: a protocol-relative link
                // names its own host, so gluing it onto base would produce a
                // bogus URL that still looks like it is on this site.
                if i := strings.Index(base, "://"); i > 0 {
                    ret = append(ret, base[:i]+":"+l.Href)
                }
            case strings.HasPrefix(l.Href, "/"):
                ret = append(ret, base+l.Href)
            case strings.HasPrefix(l.Href, "http"):
                ret = append(ret, l.Href)
            }
        }
        return ret
    }
    
    // filter returns, in their original order, the elements of links for which
    // keepFn reports true. The result is nil when nothing is kept.
    func filter(links []string, keepFn func(string) bool) []string {
        var kept []string
        for i := range links {
            candidate := links[i]
            if !keepFn(candidate) {
                continue
            }
            kept = append(kept, candidate)
        }
        return kept
    }
    
    // withPrefix builds a predicate that reports whether its argument starts
    // with pfx.
    func withPrefix(pfx string) func(string) bool {
        startsWithPfx := func(candidate string) bool {
            return strings.HasPrefix(candidate, pfx)
        }
        return startsWithPfx
    }
    
    

    参考链接

    https://github.com/gophercises/sitemap/tree/solution

    相关文章

      网友评论

          本文标题:Go小项目-扫描子域名

          本文链接:https://www.haomeiwen.com/subject/bxxqqftx.html