-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkOrchestrator.go
51 lines (48 loc) · 1.33 KB
/
workOrchestrator.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
package main
import (
mapset "github.com/deckarep/golang-set"
"log"
)
// worker is a crawl-pool goroutine: it consumes URLs from the links
// channel, scrapes each one for same-domain links via the shared
// HttpClient, and sends the resulting *Page on results. It returns
// once the links channel has been closed and drained.
func worker(client HttpClient, links chan string, results chan *Page) {
	s := Scraper{client}
	for url := range links {
		page := s.GetSameDomainLinks(url)
		results <- page
	}
}
// getSiteMap crawls pages starting from startingUrl using a pool of
// maxThreads worker goroutines, following same-domain links until
// maxUrlsToCrawl pages have been dispatched or no uncrawled links remain.
// It returns a map from each crawled page's URL to the slice of
// same-domain links found on that page.
func getSiteMap(client HttpClient, maxThreads int, startingUrl string, maxUrlsToCrawl int) map[string][]string {
// jobs carries URLs to the workers; results carries scraped pages back.
// Both are unbuffered, so the dispatch/collect loop below paces the pool.
jobs := make(chan string)
results := make(chan *Page)
for i := 0; i < maxThreads; i++ {
go worker(client, jobs, results)
}
// Thread-unsafe sets are fine here: only this goroutine touches them.
toCrawl := mapset.NewThreadUnsafeSet()
toCrawl.Add(startingUrl)
alreadyCrawled := mapset.NewThreadUnsafeSet()
siteMap := make(map[string][]string)
// pendingResults counts URLs dispatched on jobs whose *Page has not yet
// been received from results (incremented on send, decremented on receive).
pendingResults := 0
for {
// Dispatch only while under the crawl budget, there is something to
// crawl, and pendingResults < maxThreads — the last condition means at
// least one worker is idle, so this unbuffered send cannot block forever.
if alreadyCrawled.Cardinality() < maxUrlsToCrawl && toCrawl.Cardinality() > 0 && pendingResults < maxThreads {
linkToGet := toCrawl.Pop().(string)
jobs <- linkToGet
pendingResults++
// Mark as crawled at dispatch time so the same URL is never queued twice.
alreadyCrawled.Add(linkToGet)
} else if pendingResults == 0 {
// Nothing dispatchable and nothing in flight: closing jobs lets the
// workers' range loops terminate. close(results) is safe because no
// worker can send again once all dispatched results were received.
close(jobs)
close(results)
break
} else {
// Cannot (or must not) dispatch: block until one in-flight result
// arrives, then fold its links into the frontier and the site map.
page := <-results
pendingResults--
links := page.sameDomainLinks
for _, link := range links {
if !alreadyCrawled.Contains(link) {
toCrawl.Add(link)
}
}
// NOTE(review): assumes page.link is unique per dispatched URL;
// a repeated link would overwrite the earlier entry — confirm Scraper.
siteMap[page.link] = links
}
}
log.Println("Number of pages crawled:", alreadyCrawled.Cardinality())
// toCrawl may be non-empty here if the maxUrlsToCrawl budget was reached.
log.Println("Number of pages left in queue:", toCrawl.Cardinality())
return siteMap
}