Skip to content
This repository has been archived by the owner on May 11, 2022. It is now read-only.

Commit

Permalink
prepare for refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
kamilsk committed Apr 26, 2018
1 parent 4699056 commit a38248c
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 126 deletions.
12 changes: 11 additions & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
package cmd

import "github.com/spf13/cobra"
import (
"fmt"
"strconv"

"github.com/spf13/cobra"
)

// RootCmd is the entry point: the top-level cobra command of the program.
// Subcommands are registered on it in init.
var RootCmd = &cobra.Command{Short: "check"}

// init registers urlsCmd as a subcommand of RootCmd.
func init() {
RootCmd.AddCommand(urlsCmd)
}

// asBool interprets the textual form of value as a boolean.
//
// The string returned by value.String() is parsed with strconv.ParseBool, so
// the accepted spellings are those of ParseBool ("1", "t", "true", "0", "f",
// "false", ...). The conversion is deliberately lenient: a nil value or a
// string that is not a valid boolean yields false instead of an error.
func asBool(value fmt.Stringer) bool {
	// A nil interface has no String method to call; treat it as "not set".
	if value == nil {
		return false
	}
	is, err := strconv.ParseBool(value.String())
	// Unparsable input is reported as false on purpose (best-effort flag read).
	return err == nil && is
}
4 changes: 4 additions & 0 deletions cmd/urls.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,7 @@ var urlsCmd = &cobra.Command{
printer.Print(cmd.OutOrStdout())
},
}

// init declares the command-line flags accepted by urlsCmd.
func init() {
// --verbose / -v: boolean flag, false unless given.
urlsCmd.Flags().BoolP("verbose", "v", false, "turn on verbose mode")
}
138 changes: 138 additions & 0 deletions http/availability/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ package availability

import (
"net/http"
"net/url"

"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
)

func UserAgent() func(*colly.Collector) {
Expand All @@ -17,3 +19,139 @@ func NoRedirect() func(*colly.Collector) {
}
}
}

// ~

// TempOption returns a collector option that installs the crawl callbacks of
// the given Report on a colly.Collector.
//
// The installed callbacks are:
//   - OnRequest: registers the requested URL as a Link in the report and, when
//     that link is a page (Link.IsPage), creates a Page entry for it;
//   - OnError / OnResponse: record the outcome of the request via setStatus;
//   - OnHTML("a[href]"): for anchors found while the request URL is a page of
//     the report, resolves the href, attaches the resulting Link to the page
//     and schedules a visit of the target.
func TempOption(r *Report) func(*colly.Collector) {
	return func(c *colly.Collector) {
		c.OnRequest(func(req *colly.Request) {
			if link := r.createLink(req.URL); link.IsPage {
				r.createPage(link)
			}
		})

		c.OnError(func(resp *colly.Response, _ error) { r.setStatus(resp) })
		c.OnResponse(func(resp *colly.Response) { r.setStatus(resp) })

		c.OnHTML("a[href]", func(e *colly.HTMLElement) {
			if !r.isPage(e.Request.URL) {
				return
			}

			// AbsoluteURL returns "" for hrefs it cannot turn into a URL to
			// visit, which includes fragment-only links such as "#top". Those
			// are ordinary markup, not a programming error, so skip them
			// instead of aborting the whole crawl with a panic.
			href := e.Request.AbsoluteURL(e.Attr("href"))
			if href == "" {
				return
			}

			link := r.createLinkByHref(href)
			page := r.findPage(e.Request.URL)

			// page.Links is shared state of the report; guard the append with
			// the same mutex that protects the journal and the page list.
			r.mu.Lock()
			page.Links = append(page.Links, link)
			r.mu.Unlock()

			// TODO it can return error
			// &errors.errorString{s:""}
			// &errors.errorString{s:"URL already visited"}
			// The error is discarded explicitly: the link is already recorded
			// and a refused visit must not stop the crawl.
			_ = e.Request.Visit(href)
		})
	}
}

// createLink returns the journal entry for location, creating and storing a
// new Link when the URL has not been seen yet.
//
// The lookup is first attempted under the read lock. On a miss the write lock
// is taken and the journal is checked again before inserting, because another
// caller may have added the entry between the two lock acquisitions.
func (r *Report) createLink(location *url.URL) *Link {
	href := location.String()

	r.mu.RLock()
	known, found := r.journal[href]
	r.mu.RUnlock()
	if found {
		return known
	}

	r.mu.Lock()
	defer r.mu.Unlock()
	if known, found = r.journal[href]; found {
		return known
	}
	known = &Link{IsPage: r.isPage(location), Location: href}
	r.journal[href] = known
	return known
}

// createLinkByHref parses href and returns the journal entry for the
// resulting URL (see createLink). It panics with the parse error when href is
// not a valid URL.
func (r *Report) createLinkByHref(href string) *Link {
	parsed, parseErr := url.Parse(href)
	if parseErr == nil {
		return r.createLink(parsed)
	}

	panic(parseErr) // TODO set error instead of panic
}

// createPage builds a Page for link, with an empty Links slice of capacity 8,
// appends it to the report's page list under the write lock and returns it.
func (r *Report) createPage(link *Link) *Page {
	created := &Page{Link: link, Links: make([]*Link, 0, 8)}

	r.mu.Lock()
	defer r.mu.Unlock()

	r.pages = append(r.pages, created)
	return created
}

// findPage returns the Page whose Link is the journal entry for location.
//
// The search runs under the read lock. It panics when the URL is missing from
// the journal, or when no page refers to the journal entry.
func (r *Report) findPage(location *url.URL) *Page {
	href := location.String()

	r.mu.RLock()
	defer r.mu.RUnlock()

	target, known := r.journal[href]
	if !known {
		panic("can't find link with URL " + href) // TODO set error instead of panic
	}

	// Pages are matched by Link pointer identity, not by URL string.
	for i := range r.pages {
		if candidate := r.pages[i]; candidate.Link == target {
			return candidate
		}
	}

	panic("can't find page with URL " + href) // TODO set error instead of panic
}

// isPage reports whether location has the same hostname as the report's own
// location.
func (r *Report) isPage(location *url.URL) bool {
	candidate, base := location.Hostname(), r.location.Hostname()
	return candidate == base
}

// setStatus stores the outcome of a request on the journal entry of the
// requested URL: the response status code and, when that code is one of the
// package's redirects, the value of the response header named by location.
//
// It panics when the requested URL is not present in the journal.
func (r *Report) setStatus(resp *colly.Response) {
	href := resp.Request.URL.String()

	// The link is modified below, so the exclusive lock is required; a read
	// lock would allow several callers to write the same Link at once.
	r.mu.Lock()
	defer r.mu.Unlock()

	link, ok := r.journal[href]
	if !ok {
		panic("unexpected URL " + href) // TODO set error instead of panic
	}

	link.StatusCode = resp.StatusCode
	if _, is := redirects[link.StatusCode]; is {
		link.Redirect = resp.Headers.Get(location)
	}
}

// ~

// Debugger embeds colly's debug.Debugger without adding any methods, so every
// value that implements debug.Debugger also satisfies Debugger.
type Debugger interface {
debug.Debugger
}

// Option is a function that receives the *Report it is applied to.
// NOTE(review): presumably a functional option for configuring a Report;
// confirm against the Report constructor.
type Option func(*Report)

// WithDebugger returns an Option whose body is empty: applying it does not
// change the Report.
// NOTE(review): looks like a placeholder for attaching a Debugger; confirm.
func WithDebugger() Option {
return func(*Report) {
// No-op.
}
}
128 changes: 3 additions & 125 deletions http/availability/report.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,136 +73,14 @@ func (r *Report) Get() error {
}
c := colly.NewCollector(
UserAgent(), NoRedirect(), colly.IgnoreRobotsTxt(),
)

c.OnRequest(func(req *colly.Request) {
link := r.createLink(req.URL)
if link.IsPage {
r.createPage(link)
}
})

c.OnError(func(resp *colly.Response, err error) { r.setStatus(resp) })
c.OnResponse(func(resp *colly.Response) { r.setStatus(resp) })

c.OnHTML("a[href]", func(e *colly.HTMLElement) {
if r.isPage(e.Request.URL) {
attr := e.Attr("href")
href := e.Request.AbsoluteURL(attr)
if href == "" {

panic("invalid URL " + attr) // TODO set error instead of panic

}

// TODO make thread safe
link := r.createLinkByHref(href)
page := r.findPage(e.Request.URL)
page.Links = append(page.Links, link)

// TODO it can return error
// &errors.errorString{s:""}
// &errors.errorString{s:"URL already visited"}
e.Request.Visit(href)
}
})

TempOption(r),
)
return c.Visit(r.location.String())
}

func (r *Report) Pages() []*Page {
r.mu.RLock()

// TODO return []Page instead of []*Page
pages := make([]*Page, len(r.pages))
copy(pages, r.pages)

r.mu.RUnlock()
return pages
}

// createLink returns the journal entry for location, creating and storing a
// new Link when the URL has not been seen yet.
//
// The lookup is first attempted under the read lock. On a miss the write lock
// is taken and the journal is checked again before inserting, because another
// caller may have added the entry between the two lock acquisitions.
func (r *Report) createLink(location *url.URL) *Link {
	href := location.String()

	r.mu.RLock()
	known, found := r.journal[href]
	r.mu.RUnlock()
	if found {
		return known
	}

	r.mu.Lock()
	defer r.mu.Unlock()
	if known, found = r.journal[href]; found {
		return known
	}
	known = &Link{IsPage: r.isPage(location), Location: href}
	r.journal[href] = known
	return known
}

// createLinkByHref parses href and returns the journal entry for the
// resulting URL (see createLink). It panics with the parse error when href is
// not a valid URL.
func (r *Report) createLinkByHref(href string) *Link {
	parsed, parseErr := url.Parse(href)
	if parseErr == nil {
		return r.createLink(parsed)
	}

	panic(parseErr) // TODO set error instead of panic
}

// createPage builds a Page for link, with an empty Links slice of capacity 8,
// appends it to the report's page list under the write lock and returns it.
func (r *Report) createPage(link *Link) *Page {
	created := &Page{Link: link, Links: make([]*Link, 0, 8)}

	r.mu.Lock()
	defer r.mu.Unlock()

	r.pages = append(r.pages, created)
	return created
}

// findPage returns the Page whose Link is the journal entry for location.
//
// The search runs under the read lock. It panics when the URL is missing from
// the journal, or when no page refers to the journal entry.
func (r *Report) findPage(location *url.URL) *Page {
	href := location.String()

	r.mu.RLock()
	defer r.mu.RUnlock()

	target, known := r.journal[href]
	if !known {
		panic("can't find link with URL " + href) // TODO set error instead of panic
	}

	// Pages are matched by Link pointer identity, not by URL string.
	for i := range r.pages {
		if candidate := r.pages[i]; candidate.Link == target {
			return candidate
		}
	}

	panic("can't find page with URL " + href) // TODO set error instead of panic
}

// isPage reports whether location has the same hostname as the report's own
// location.
func (r *Report) isPage(location *url.URL) bool {
	candidate, base := location.Hostname(), r.location.Hostname()
	return candidate == base
}

func (r *Report) setStatus(resp *colly.Response) {
r.mu.RLock()
defer r.mu.RUnlock()
href := resp.Request.URL.String()
link, ok := r.journal[href]
if !ok {

panic("unexpected URL " + href) // TODO set error instead of panic

}
link.StatusCode = resp.StatusCode
if _, is := redirects[link.StatusCode]; is {
link.Redirect = resp.Headers.Get(location)
}
return r.pages
}

type Page struct {
Expand Down

0 comments on commit a38248c

Please sign in to comment.