timeout option #57

Merged: 2 commits, Apr 13, 2023
12 changes: 8 additions & 4 deletions cmd/crawley/main.go
@@ -20,10 +20,11 @@ import (
)

const (
- appName = "Crawley"
- appHelp = "the unix-way web crawler"
- appSite = "https://github.com/s0rg/crawley"
- defaultDelay = 150 * time.Millisecond
+ appName        = "Crawley"
+ appHelp        = "the unix-way web crawler"
+ appSite        = "https://github.com/s0rg/crawley"
+ defaultDelay   = 150 * time.Millisecond
+ defaultTimeout = 5 * time.Second
)

// build-time values.
@@ -43,6 +44,7 @@ var (
fDirsPolicy, fProxyAuth string
fRobotsPolicy, fUA string
fDelay time.Duration
+ fTimeout time.Duration
cookies, headers values.Smart
tags, ignored values.List
)
@@ -164,6 +166,7 @@ func initOptions() (rv []crawler.Option, err error) {
crawler.WithTagsFilter(tags.Values),
crawler.WithIgnored(ignored.Values),
crawler.WithProxyAuth(fProxyAuth),
+ crawler.WithTimeout(fTimeout),
}

return rv, nil
@@ -198,6 +201,7 @@ func setupFlags() {
flag.StringVar(&fProxyAuth, "proxy-auth", "", "credentials for proxy: user:password")

flag.DurationVar(&fDelay, "delay", defaultDelay, "per-request delay (0 - disable)")
+ flag.DurationVar(&fTimeout, "timeout", defaultTimeout, "request timeout (min: 1 second, max: 10 minutes)")

flag.Usage = usage
}
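For reference, a minimal standalone sketch of the new flag wiring, using only the names introduced in this diff (the rest of main.go is omitted; the printout is purely illustrative):

package main

import (
	"flag"
	"fmt"
	"time"
)

const defaultTimeout = 5 * time.Second

var fTimeout time.Duration

func main() {
	flag.DurationVar(&fTimeout, "timeout", defaultTimeout, "request timeout (min: 1 second, max: 10 minutes)")
	flag.Parse()

	// e.g. `-timeout 30s` prints "timeout: 30s"; with no flag the 5s default applies.
	fmt.Println("timeout:", fTimeout)
}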
15 changes: 6 additions & 9 deletions pkg/client/http.go
@@ -9,11 +9,7 @@ import (
"time"
)

- const (
- idleTimeout = 5 * time.Second
- dialTimeout = 5 * time.Second
- reqTimeout = 10 * time.Second
- )
+ const transportTimeout = 10 * time.Second

// HTTP holds pre-configured http.Client.
type HTTP struct {
@@ -29,24 +25,25 @@ func New(
conns int,
skipSSL bool,
headers, cookies []string,
+ timeout time.Duration,
) (h *HTTP) {
transport := &http.Transport{
Proxy: http.ProxyFromEnvironment,
Dial: (&net.Dialer{
- Timeout: dialTimeout,
+ Timeout: transportTimeout,
}).Dial,
TLSClientConfig: &tls.Config{
InsecureSkipVerify: skipSSL,
},
- IdleConnTimeout: idleTimeout,
- TLSHandshakeTimeout: dialTimeout,
+ IdleConnTimeout: transportTimeout,
+ TLSHandshakeTimeout: transportTimeout,
MaxConnsPerHost: conns,
MaxIdleConns: conns,
MaxIdleConnsPerHost: conns,
}

client := &http.Client{
- Timeout: reqTimeout,
+ Timeout: timeout,
Transport: transport,
}
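For context, a hedged sketch of how the fixed transport-level timeout now differs from the caller-supplied per-request timeout (simplified from New(); Proxy and TLS settings omitted):

package client

import (
	"net"
	"net/http"
	"time"
)

const transportTimeout = 10 * time.Second

// newClient mirrors the shape of New() after this change: transportTimeout
// bounds dialing, TLS handshake and idle connections, while the
// caller-supplied timeout bounds each whole request (connect + headers +
// body) via http.Client.Timeout.
func newClient(conns int, timeout time.Duration) *http.Client {
	transport := &http.Transport{
		Dial:                (&net.Dialer{Timeout: transportTimeout}).Dial,
		IdleConnTimeout:     transportTimeout,
		TLSHandshakeTimeout: transportTimeout,
		MaxConnsPerHost:     conns,
		MaxIdleConns:        conns,
		MaxIdleConnsPerHost: conns,
	}

	return &http.Client{Timeout: timeout, Transport: transport}
}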

9 changes: 5 additions & 4 deletions pkg/client/http_test.go
@@ -6,6 +6,7 @@ import (
"net/http"
"net/http/httptest"
"testing"
"time"
)

const (
@@ -15,7 +16,7 @@ const (
func TestHTTPGetOK(t *testing.T) {
t.Parallel()

- c := New(ua, 1, false, []string{"FOO: BAR"}, []string{"NAME=VALUE"})
+ c := New(ua, 1, false, []string{"FOO: BAR"}, []string{"NAME=VALUE"}, time.Minute)

const (
body = "test-body"
@@ -71,7 +72,7 @@ func TestHTTPGetOK(t *testing.T) {
func TestHTTPGetERR(t *testing.T) {
t.Parallel()

c := New("", 1, false, []string{}, []string{})
c := New("", 1, false, []string{}, []string{}, time.Second)

ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusNotFound)
@@ -97,7 +98,7 @@ func TestHTTPGetERR(t *testing.T) {
func TestHTTPHeadOK(t *testing.T) {
t.Parallel()

- c := New(ua, 1, false, []string{}, []string{})
+ c := New(ua, 1, false, []string{}, []string{}, time.Second)

const (
key = "x-some-key"
@@ -132,7 +133,7 @@ func TestHTTPHeadOK(t *testing.T) {
func TestHTTPHeadERR(t *testing.T) {
t.Parallel()

c := New("", 1, false, []string{}, []string{})
c := New("", 1, false, []string{}, []string{}, time.Second)

ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
12 changes: 11 additions & 1 deletion pkg/crawler/config.go
@@ -10,6 +10,8 @@ const (
minWorkers = 1
maxWorkers = 64
minDelay = time.Duration(0)
+ minTimeout = time.Second
+ maxTimeout = time.Minute * 10
)

type config struct {
@@ -27,6 +29,7 @@ type config struct {
Brute bool
NoHEAD bool
ScanJS bool
+ Timeout time.Duration
}

func (c *config) validate() {
@@ -37,6 +40,13 @@
c.Workers = maxWorkers
}

+ switch {
+ case c.Timeout < minTimeout:
+ c.Timeout = minTimeout
+ case c.Timeout > maxTimeout:
+ c.Timeout = maxTimeout
+ }

if c.Delay < minDelay {
c.Delay = minDelay
}
@@ -49,7 +59,7 @@
func (c *config) String() (rv string) {
var sb strings.Builder

_, _ = sb.WriteString(fmt.Sprintf("workers: %d depth: %d", c.Workers, c.Depth))
_, _ = sb.WriteString(fmt.Sprintf("workers: %d depth: %d timeout: %s", c.Workers, c.Depth, c.Timeout))

if c.Brute {
_, _ = sb.WriteString(" brute: on")
11 changes: 11 additions & 0 deletions pkg/crawler/config_test.go
@@ -19,6 +19,7 @@ func TestValidate(t *testing.T) {

c.Workers = 1000000
c.Delay = time.Duration(-100)
+ c.Timeout = time.Hour
c.Depth = -5

c.validate()
@@ -34,6 +35,10 @@
if c.Depth != -1 {
t.Error("non empty - bad depth")
}

+ if c.Timeout != maxTimeout {
+ t.Error("non empty - bad timeout")
+ }
}

func TestOptions(t *testing.T) {
@@ -42,6 +47,7 @@
rp = RobotsRespect
dp = DirsOnly
delay = time.Hour
+ timeout = time.Minute * 5
workers = 13
depth = 666
fbool = true
@@ -69,6 +75,7 @@ func TestOptions(t *testing.T) {
WithTagsFilter([]string{"a", "form"}),
WithScanJS(fbool),
WithIgnored([]string{"logout"}),
+ WithTimeout(timeout),
}

c := &config{}
@@ -134,6 +141,10 @@ func TestOptions(t *testing.T) {
if len(c.Ignored) != 1 {
t.Error("unexpected ignored size")
}

+ if c.Timeout != timeout {
+ t.Error("bad timeout")
+ }
}

func TestString(t *testing.T) {
18 changes: 8 additions & 10 deletions pkg/crawler/crawler.go
@@ -26,11 +26,8 @@ type crawlClient interface {
}

const (
- chMult = 256
-
- chanTimeout = 100 * time.Millisecond
- crawlTimeout = 5 * time.Second
- robotsTimeout = 3 * time.Second
+ chMult = 256
+ chTimeout = 100 * time.Millisecond
)

type taskFlag byte
@@ -102,6 +99,7 @@ func (c *Crawler) Run(uri string, fn func(string)) (err error) {
c.cfg.SkipSSL,
c.cfg.Headers,
c.cfg.Cookies,
+ c.cfg.Timeout,
)
c.initRobots(base, web)

@@ -159,7 +157,7 @@ func (c *Crawler) emit(u string) {
return
}

- t := time.NewTimer(chanTimeout)
+ t := time.NewTimer(chTimeout)
defer t.Stop()

select {
@@ -180,7 +178,7 @@ func (c *Crawler) crawl(base *url.URL, r *crawlResult) (yes bool) {
return
}

- t := time.NewTimer(chanTimeout)
+ t := time.NewTimer(chTimeout)
defer t.Stop()

select {
@@ -208,7 +206,7 @@ func (c *Crawler) initRobots(host *url.URL, web crawlClient) {
return
}

- ctx, cancel := context.WithTimeout(context.Background(), robotsTimeout)
+ ctx, cancel := context.WithTimeout(context.Background(), c.cfg.Timeout)
defer cancel()

body, _, err := web.Get(ctx, robots.URL(host))
@@ -288,7 +286,7 @@ func (c *Crawler) linkHandler(a atom.Atom, s string) {
r.Flag = TaskCrawl
}

- t := time.NewTimer(chanTimeout)
+ t := time.NewTimer(chTimeout)
defer t.Stop()

select {
@@ -344,7 +342,7 @@ func (c *Crawler) crawler(web crawlClient) {
time.Sleep(c.cfg.Delay)
}

- ctx, cancel := context.WithTimeout(context.Background(), crawlTimeout)
+ ctx, cancel := context.WithTimeout(context.Background(), c.cfg.Timeout)
us := uri.String()

var parse bool
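A minimal sketch of the effect on per-request contexts: both the robots.txt fetch and page fetches now derive their deadline from the configured timeout instead of the old fixed constants. doFetch is a hypothetical stand-in for the web.Get / web.Head calls, for illustration only:

package main

import (
	"context"
	"fmt"
	"time"
)

// doFetch stands in for a context-aware web.Get / web.Head call.
func doFetch(ctx context.Context) error {
	select {
	case <-time.After(time.Second): // pretend the server is slow
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	cfgTimeout := 100 * time.Millisecond // would come from c.cfg.Timeout

	ctx, cancel := context.WithTimeout(context.Background(), cfgTimeout)
	defer cancel()

	fmt.Println(doFetch(ctx)) // prints "context deadline exceeded" after cfgTimeout
}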
7 changes: 7 additions & 0 deletions pkg/crawler/options.go
@@ -113,3 +113,10 @@ func WithProxyAuth(v string) Option {
}
}
}

+ // WithTimeout sets request timeout.
+ func WithTimeout(v time.Duration) Option {
+ return func(c *config) {
+ c.Timeout = v
+ }
+ }
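Finally, a hedged caller-side sketch: crawler.New and the import path are assumed from the rest of the repository and are not part of this diff; only WithTimeout and the Run signature appear above.

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/s0rg/crawley/pkg/crawler"
)

func main() {
	// Assumed constructor; WithTimeout is the option added by this PR.
	c := crawler.New(crawler.WithTimeout(30 * time.Second))

	if err := c.Run("https://example.com/", func(u string) {
		fmt.Println(u) // each discovered URL
	}); err != nil {
		log.Fatal(err)
	}
}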