Monday 16 September 2013

My Go Tour Web Crawler Solution

I recently worked through the Tour of Go. It's a really great introduction to the language, with some interesting exercises.

The final exercise is to rewrite a simple web crawler function so that it fetches URLs concurrently and doesn't fetch the same URL twice. It took me a couple of attempts to find a solution I was happy with.

The original crawl function
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // TODO: Fetch URLs in parallel.
    // TODO: Don't fetch the same URL twice.
    // This implementation doesn't do either:
    if depth <= 0 {
        return
    }
    body, urls, err := fetcher.Fetch(url)
    if err != nil {
        fmt.Println(err)
        return
    }
    fmt.Printf("found: %s %q\n", url, body)
    for _, u := range urls {
        Crawl(u, depth-1, fetcher)
    }
    return
}
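
Both versions rely on the Fetcher interface that the exercise provides. As far as I remember it looks like this (the comment wording may not be exact):

type Fetcher interface {
    // Fetch returns the body of URL and
    // a slice of URLs found on that page.
    Fetch(url string) (body string, urls []string, err error)
}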

My crawl function
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
    // the number of outstanding fetches
    count := 1
    // the set of URLs that have already been scheduled for fetching
    fetched := map[string]bool{url: true}
    // mu guards count and fetched, which are shared between the goroutines
    var mu sync.Mutex

    // a channel for the goroutines to report completion of a fetch
    complete := make(chan bool)

    // a function to recurse through the urls to the required depth
    var fetch func(url string, depth int, fetcher Fetcher)
    fetch = func(url string, depth int, fetcher Fetcher) {
        defer func() {
            complete <- true
        }()

        body, urls, err := fetcher.Fetch(url)
        if err != nil {
            fmt.Println(err)
            return
        }
        fmt.Printf("found: %s %q\n", url, body)

        for _, u := range urls {
            // take the lock before touching the shared map and counter,
            // since several goroutines can reach this loop at the same time
            mu.Lock()
            if !fetched[u] && depth-1 > 0 {
                fetched[u] = true
                count++
                go fetch(u, depth-1, fetcher)
            }
            mu.Unlock()
        }
    }
    // start the crawl
    go fetch(url, depth, fetcher)

    // wait for all outstanding fetches to complete
    for {
        <-complete
        mu.Lock()
        count--
        outstanding := count
        mu.Unlock()
        if outstanding == 0 {
            break
        }
    }
}
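
The Tour supplies its own fake fetcher to run the exercise against, so nothing more is needed there. To run the function as a standalone program you need a Fetcher of your own; the stand-in below is a minimal one I put together for illustration (the fakeFetcher type, the example.com URLs and the page bodies are all made up), and together with the Fetcher interface and Crawl function above it forms a complete program.

package main

import (
    "fmt"
    "sync"
)

// fakeFetcher returns canned results instead of doing real HTTP.
type fakeFetcher map[string]*fakeResult

type fakeResult struct {
    body string
    urls []string
}

func (f fakeFetcher) Fetch(url string) (string, []string, error) {
    if res, ok := f[url]; ok {
        return res.body, res.urls, nil
    }
    return "", nil, fmt.Errorf("not found: %s", url)
}

// fetcher is a tiny canned site: a home page linking to two other pages.
var fetcher = fakeFetcher{
    "http://example.com/": &fakeResult{
        "Example Home",
        []string{
            "http://example.com/a/",
            "http://example.com/b/",
        },
    },
    "http://example.com/a/": &fakeResult{
        "Page A",
        []string{"http://example.com/"},
    },
    "http://example.com/b/": &fakeResult{
        "Page B",
        []string{"http://example.com/a/"},
    },
}

func main() {
    Crawl("http://example.com/", 4, fetcher)
}

The count and complete pair is doing roughly the same job a sync.WaitGroup would, and running the program with go run -race is a quick way to check that the shared map and counter really are protected.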