Skip to content

Commit 98faae8

Browse files
authored
Merge pull request #2 from s0rg/feature/depth-calc-improved
path.Depth speed + memory optimization
2 parents 60d4781 + 71489b8 commit 98faae8

File tree

5 files changed

+46
-84
lines changed

5 files changed

+46
-84
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Crawls web pages and prints any link it can find.
2121
- scan depth (limited by starting host and path; 0 by default) can be configured
2222
- can crawl `robots.txt` rules and sitemaps
2323
- `brute` mode - scan html comments for urls (this can lead to bogus results)
24+
- make use of `HTTP_PROXY` / `HTTPS_PROXY` environment variables
2425

2526
# installation
2627

pkg/crawler/crawl.go

+11-1
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,17 @@ func canCrawl(a, b *url.URL, d int) (yes bool) {
286286
return
287287
}
288288

289-
depth, found := path.Depth(a.EscapedPath(), b.EscapedPath())
289+
var apath, bpath string
290+
291+
if apath = a.Path; apath == "" {
292+
apath = "/"
293+
}
294+
295+
if bpath = b.Path; bpath == "" {
296+
bpath = "/"
297+
}
298+
299+
depth, found := path.Depth(apath, bpath)
290300
if !found {
291301
return
292302
}

pkg/crawler/crawl_test.go

+3
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ func Test_canCrawl(t *testing.T) {
3030
url0, _ := url.Parse("http://test/some")
3131
url1, _ := url.Parse("http://test/some/path/even")
3232
url2, _ := url.Parse("http://test/some/path/even/more")
33+
url3, _ := url.Parse("http://test")
3334

3435
tests := []struct {
3536
name string
@@ -47,6 +48,7 @@ func Test_canCrawl(t *testing.T) {
4748
{"url2-0-1", args{b: base, u: url0, d: -1}, false},
4849
{"url2-1-1", args{b: base, u: url1, d: -1}, true},
4950
{"url2-2-1", args{b: base, u: url2, d: -1}, true},
51+
{"url3-3", args{b: base, u: url3, d: 0}, false},
5052
}
5153

5254
for _, tt := range tests {
@@ -276,6 +278,7 @@ sitemap: http://other.host/sitemap.xml`
276278
}
277279

278280
if len(resA) != 5 {
281+
t.Log(resA)
279282
t.Fatal("unexpected len for A")
280283
}
281284

pkg/path/depth.go

+17-20
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,38 @@
11
package path
22

33
import (
4+
"path"
45
"strings"
56
)
67

8+
const pathSep = '/'
9+
10+
func isPathSep(r rune) (yes bool) {
11+
return r == pathSep
12+
}
13+
714
// Depth calculates the depth of `sub` relative to the `base` resource path.
815
func Depth(base, sub string) (n int, ok bool) {
916
var (
10-
bp = splitPath(base)
11-
sp = splitPath(sub)
17+
bn = path.Clean(base)
18+
sn = path.Clean(sub)
1219
)
1320

14-
if len(sp) < len(bp) {
21+
if len(sn) <= len(bn) {
1522
return
1623
}
1724

18-
for i := 0; i < len(bp); i++ {
19-
if bp[i] != sp[i] {
20-
return
21-
}
25+
if !strings.HasPrefix(sn, bn) {
26+
return
2227
}
2328

24-
return len(sp) - len(bp), true
25-
}
26-
27-
func splitPath(p string) (o []string) {
28-
return dropSpaces(strings.Split(p, "/"))
29-
}
30-
31-
func dropSpaces(s []string) (o []string) {
32-
o = make([]string, 0, len(s))
29+
fields := strings.FieldsFunc(sn[len(bn):], isPathSep)
3330

34-
for _, v := range s {
35-
if v != "" {
36-
o = append(o, v)
31+
for i := 0; i < len(fields); i++ {
32+
if fields[i] != "" {
33+
n++
3734
}
3835
}
3936

40-
return o
37+
return n, true
4138
}

pkg/path/depth_test.go

+14-63
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,9 @@
11
package path
22

33
import (
4-
"reflect"
54
"testing"
65
)
76

8-
func Test_dropSpaces(t *testing.T) {
9-
type args struct {
10-
s []string
11-
}
12-
13-
tests := []struct {
14-
name string
15-
args args
16-
wantO []string
17-
}{
18-
{"ones", args{s: []string{"", "1", ""}}, []string{"1"}},
19-
{"twos", args{s: []string{"2", "", "2"}}, []string{"2", "2"}},
20-
{"threes", args{s: []string{"", "", "3"}}, []string{"3"}},
21-
{"empty", args{s: []string{"", "", ""}}, []string{}},
22-
}
23-
24-
t.Parallel()
25-
26-
for _, tt := range tests {
27-
tc := tt
28-
29-
t.Run(tc.name, func(t *testing.T) {
30-
t.Parallel()
31-
32-
if gotO := dropSpaces(tc.args.s); !reflect.DeepEqual(gotO, tc.wantO) {
33-
t.Errorf("dropSpaces() = %v, want %v", gotO, tc.wantO)
34-
}
35-
})
36-
}
37-
}
38-
39-
func Test_splitPath(t *testing.T) {
40-
type args struct {
41-
p string
42-
}
43-
44-
tests := []struct {
45-
name string
46-
args args
47-
wantO []string
48-
}{
49-
{"empty", args{p: "/"}, []string{}},
50-
{"foo", args{p: "/foo"}, []string{"foo"}},
51-
{"foo-bar", args{p: "/foo/bar"}, []string{"foo", "bar"}},
52-
{"foo-bar-baz", args{p: "/foo/bar//baz"}, []string{"foo", "bar", "baz"}},
53-
}
54-
55-
t.Parallel()
56-
57-
for _, tt := range tests {
58-
tc := tt
59-
60-
t.Run(tc.name, func(t *testing.T) {
61-
t.Parallel()
62-
63-
if gotO := splitPath(tc.args.p); !reflect.DeepEqual(gotO, tc.wantO) {
64-
t.Errorf("splitPath() = %v, want %v", gotO, tc.wantO)
65-
}
66-
})
67-
}
68-
}
69-
707
func Test_Depth(t *testing.T) {
718
type args struct {
729
base string
@@ -84,6 +21,7 @@ func Test_Depth(t *testing.T) {
8421
{"c-bad", args{base: "/a/b", sub: "/c"}, 0, false},
8522
{"b-ok", args{base: "/a", sub: "/a/b"}, 1, true},
8623
{"c-ok", args{base: "/a", sub: "/a/b/c"}, 2, true},
24+
{"d-bad", args{base: "/a/b/c", sub: "/d/b/c/a"}, 0, false},
8725
}
8826

8927
t.Parallel()
@@ -104,3 +42,16 @@ func Test_Depth(t *testing.T) {
10442
})
10543
}
10644
}
45+
46+
func Benchmark_Depth(b *testing.B) {
47+
const (
48+
x = "/some/rather/long/path"
49+
y = "/some/rather/long/path/but/longer"
50+
)
51+
52+
b.ResetTimer()
53+
54+
for i := 0; i < b.N; i++ {
55+
_, _ = Depth(x, y)
56+
}
57+
}

0 commit comments

Comments
 (0)