-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.go
71 lines (60 loc) · 1.63 KB
/
crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package crawler
import (
"github.com/Alexniver/logger4go"
"sync"
)
// SpiderFunc is the pluggable work-function type shared by the crawl and
// analyst stages: it processes the given spider and may emit follow-up
// spiders on the supplied channel, returning a non-nil error on failure.
type SpiderFunc func(spider *Spider, spiderChannel chan *Spider) error
// DoCrawl runs a crawl starting from seedSpider. It starts
// maxConcurrencyNum worker goroutines that pull spiders from an internal
// channel, skip already-visited URLs, and invoke spiderCrawlFunc; crawled
// results flow through a data channel to spiderAnlystFunc, which may feed
// newly discovered spiders back into the crawl channel.
//
// NOTE: DoCrawl never returns — the crawl/analyst pipeline has no
// termination signal, so the final receive loop blocks forever.
func DoCrawl(seedSpider *Spider, spiderCrawlFunc SpiderFunc, spiderAnlystFunc SpiderFunc, maxConcurrencyNum int) {
	logger := logger4go.GetDefaultLogger()

	// visited is shared by every crawl worker; the mutex fixes a data race
	// (unsynchronized concurrent map read/write panics at runtime).
	visited := map[string]bool{}
	var visitedMu sync.Mutex

	// Two unbuffered channels form the pipeline: spiders to crawl, and
	// crawled spiders awaiting analysis.
	spiderChannel := make(chan *Spider)
	dataChannel := make(chan *Spider)

	logger.Info("Crawl start!")

	// Seed asynchronously: the channel is unbuffered and no worker is
	// guaranteed to be receiving yet.
	go func() {
		spiderChannel <- seedSpider
	}()

	// Start maxConcurrencyNum crawl workers. A plain receive replaces the
	// original single-case select (identical semantics, less noise); the
	// previous WaitGroup was dropped because its Wait() was unreachable
	// dead code after the infinite analyst loop below.
	for i := 0; i < maxConcurrencyNum; i++ {
		go func() {
			for spider := range spiderChannel {
				// Check-and-mark under the lock so exactly one worker
				// claims each URL.
				visitedMu.Lock()
				seen := visited[spider.Url]
				if !seen {
					visited[spider.Url] = true
				}
				visitedMu.Unlock()
				if seen {
					continue
				}
				if err := spiderCrawlFunc(spider, dataChannel); err != nil {
					logger.Error(err)
				}
			}
		}()
	}

	// Analyse each crawled response in its own goroutine; the analyst may
	// push newly found spiders back into spiderChannel. The loop value is
	// passed as an argument to avoid pre-Go-1.22 loop-variable capture.
	for data := range dataChannel {
		go func(d *Spider) {
			if err := spiderAnlystFunc(d, spiderChannel); err != nil {
				logger.Error(err)
			}
		}(data)
	}
}