Skip to content

Commit 8e73c17

Browse files
committed
js parser up and running #2
1 parent 98d06d9 commit 8e73c17

File tree

8 files changed

+339
-362
lines changed

8 files changed

+339
-362
lines changed

pkg/crawler/config.go

+3 -1
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,9 @@ func (c *config) validate() {
4949
func (c *config) String() (rv string) {
5050
var sb strings.Builder
5151

52-
_, _ = sb.WriteString(fmt.Sprintf("workers: %d depth: %d brute: %t", c.Workers, c.Depth, c.Brute))
52+
_, _ = sb.WriteString(fmt.Sprintf(
53+
"workers: %d depth: %d brute: %t scan-js: %t", c.Workers, c.Depth, c.Brute, c.ScanJS,
54+
))
5355

5456
if c.Delay > 0 {
5557
_, _ = sb.WriteString(fmt.Sprintf(" delay: %s", c.Delay))

pkg/crawler/crawler.go

+7 -7
Original file line number | Diff line number | Diff line change
@@ -167,7 +167,7 @@ func (c *Crawler) crawl(base *url.URL, t *crawlResult) (yes bool) {
167167
}
168168

169169
switch {
170-
case !canCrawl(base, u, c.cfg.Depth), c.robots.Forbidden(u.Path), c.cfg.Dirs == DirsOnly && isResorce(u.Path):
170+
case !canCrawl(base, u, c.cfg.Depth), c.robots.Forbidden(u.Path), c.cfg.Dirs == DirsOnly:
171171
return
172172
default:
173173
go func(r *url.URL) { c.crawlCh <- r }(u)
@@ -272,11 +272,11 @@ func (c *Crawler) isIgnored(v string) (yes bool) {
272272
func (c *Crawler) linkHandler(a atom.Atom, s string) {
273273
t := crawlResult{URI: s}
274274

275-
switch a {
276-
case atom.A, atom.Iframe:
277-
if !c.isIgnored(s) {
278-
t.Flag = TaskCrawl
279-
}
275+
fetch := (a == atom.A || a == atom.Iframe) ||
276+
(c.cfg.ScanJS && a == atom.Script)
277+
278+
if fetch && !c.isIgnored(s) {
279+
t.Flag = TaskCrawl
280280
}
281281

282282
c.resultCh <- t
@@ -311,7 +311,7 @@ func (c *Crawler) fetch(
311311
case isSitemap(uri):
312312
links.ExtractSitemap(body, base, c.sitemapHandler)
313313
case c.cfg.ScanJS && isJS(content, uri):
314-
links.ExtractJS(body, base, c.jsHandler)
314+
links.ExtractJS(body, c.jsHandler)
315315
}
316316

317317
client.Discard(body)

pkg/crawler/urlpath.go

-121
This file was deleted.

pkg/crawler/urlpath_test.go

-171
This file was deleted.

0 commit comments

Comments
 (0)