The final exercise is to rewrite a simple web crawler function so that it fetches pages concurrently and never fetches the same URL twice. It took me a couple of attempts to find a solution I was happy with.
The original Crawl function
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// TODO: Fetch URLs in parallel.
	// TODO: Don't fetch the same URL twice.
	// This implementation doesn't do either:
	if depth <= 0 {
		return
	}
	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)
	for _, u := range urls {
		Crawl(u, depth-1, fetcher)
	}
	return
}
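For context, the tour supplies a Fetcher interface (along with a fake fetcher that returns canned results) that both versions rely on; it looks roughly like this:

type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}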
My Crawl function
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
	// the number of outstanding URLs being fetched
	count := 1
	// a set of URLs that have already been fetched
	fetched := map[string]bool{url: true}
	// a channel for the goroutines to report completion of a fetch
	complete := make(chan bool)
	// a function to recurse through the urls to the required depth
	var fetch func(url string, depth int, fetcher Fetcher)
	fetch = func(url string, depth int, fetcher Fetcher) {
		defer func() { complete <- true }()
		body, urls, err := fetcher.Fetch(url)
		if err != nil {
			fmt.Println(err)
			return
		}
		fmt.Printf("found: %s %q\n", url, body)
		for _, u := range urls {
			if fetched[u] != true && depth-1 > 0 {
				fetched[u] = true
				count++
				go fetch(u, depth-1, fetcher)
			}
		}
		return
	}
	// start the crawl
	go fetch(url, depth, fetcher)
	// wait for all outstanding fetches to complete
	for i := 0; i < count; i++ {
		<-complete
	}
}
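One thing worth noting about my version: fetched and count are updated from several goroutines without synchronisation, which is a data race under Go's memory model and something go run -race can flag. Below is a rough sketch, not part of my original solution, of the same idea with the shared set guarded by a sync.Mutex and completion tracked with a sync.WaitGroup; it assumes the Fetcher interface above plus the fmt and sync packages, and the crawlSafe name is just for illustration.

// crawlSafe crawls pages starting with url, to a maximum of depth,
// guarding the shared set of seen URLs with a mutex.
func crawlSafe(url string, depth int, fetcher Fetcher) {
	var mu sync.Mutex     // guards fetched
	var wg sync.WaitGroup // counts outstanding fetches
	fetched := map[string]bool{url: true}

	var fetch func(url string, depth int)
	fetch = func(url string, depth int) {
		defer wg.Done()
		body, urls, err := fetcher.Fetch(url)
		if err != nil {
			fmt.Println(err)
			return
		}
		fmt.Printf("found: %s %q\n", url, body)
		if depth <= 1 {
			return
		}
		for _, u := range urls {
			// claim u under the lock so only one goroutine fetches it
			mu.Lock()
			claimed := !fetched[u]
			if claimed {
				fetched[u] = true
			}
			mu.Unlock()
			if claimed {
				wg.Add(1)
				go fetch(u, depth-1)
			}
		}
	}

	wg.Add(1)
	go fetch(url, depth)
	wg.Wait()
}

The wg.Add(1) before each go fetch, paired with the deferred wg.Done, replaces the manual count and the completion channel from my version.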