-
Notifications
You must be signed in to change notification settings - Fork 3
/
wcons.go
68 lines (54 loc) · 1.97 KB
/
wcons.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
package main
import (
	"flag"
	"fmt"
	"os"
	"strings"

	"github.com/gocolly/colly/v2"
)
// Documentation
// =================
// https://www.zenrows.com/blog/web-scraping-golang#prerequisites
// https://github.com/gocolly/colly
// These functions are executed in the following order:
// OnRequest(): Called before performing an HTTP request with Visit().
// OnError(): Called if an error occurred during the HTTP request.
// OnResponse(): Called after receiving a response from the server.
// OnHTML(): Called right after OnResponse() if the received content is HTML.
// OnScraped(): Called after all OnHTML() callback executions.
// Commands
// ====================
// go mod init worm.go
// go get github.com/gocolly/colly/v2
// Run from command line
// go build wcons.go
// ./wcons.exe -visit https://www.google.com/ -allow www.google.com,google.com
// main parses the -visit and -allow flags, builds a colly collector limited
// to the allowed domains, and crawls from the start URL, following every
// a[href] link found on each fetched page.
//
// When -allow is empty (the default) no domain restriction is applied.
// If the start URL itself cannot be visited, the error is written to stderr
// and the process exits with status 1.
func main() {
	// define parameters
	visitPtr := flag.String("visit", "https://www.google.com/", "Requested URL for processing")
	allowPtr := flag.String("allow", "", "Allow domains, white lists, e.g. 'www.google.com,google.com,...'")
	flag.Parse()

	// strings.Split("", ",") yields [""], which colly would treat as a
	// whitelist containing only the empty domain and therefore reject every
	// request. parseAllowList drops empty entries so "no -allow" means
	// "no restriction".
	allowList := parseAllowList(*allowPtr)

	fmt.Println("Run setting:")
	fmt.Println("Visit:", *visitPtr)
	fmt.Println("Allow domain(s):", allowList)
	fmt.Println("=====================================", *allowPtr)

	// Based on the sample at https://go-colly.org/docs/examples/basic/
	c := colly.NewCollector(
		// Restrict the crawl to the -allow domains; an empty list leaves the
		// collector unrestricted.
		colly.AllowedDomains(allowList...),
	)

	// For every <a> element that has an href attribute, report and follow it.
	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		fmt.Printf("New link : %q -> %s\n", e.Text, link)
		// Visit returns an error for links that were already visited or that
		// fall outside the allowed domains. Both are routine during a crawl,
		// so the result is deliberately discarded here.
		_ = c.Visit(e.Request.AbsoluteURL(link))
	})

	// Announce each request just before it is sent.
	c.OnRequest(func(r *colly.Request) {
		fmt.Println(" Visiting", r.URL.String())
	})

	// Start scraping. A failure on the very first request means nothing was
	// crawled at all, so surface it instead of exiting silently.
	if err := c.Visit(*visitPtr); err != nil {
		fmt.Fprintf(os.Stderr, "visiting %q: %v\n", *visitPtr, err)
		os.Exit(1)
	}
}

// parseAllowList splits a comma-separated domain list into its entries,
// trimming surrounding whitespace and dropping empty entries. It returns a
// nil slice when raw contains no domains.
func parseAllowList(raw string) []string {
	var domains []string
	for _, d := range strings.Split(raw, ",") {
		if d = strings.TrimSpace(d); d != "" {
			domains = append(domains, d)
		}
	}
	return domains
}