From a38248cac068e96690bdd56d7ce348fce64a8772 Mon Sep 17 00:00:00 2001 From: Kamil Samigullin Date: Thu, 26 Apr 2018 10:57:53 +0300 Subject: [PATCH] prepare to refactoring --- cmd/root.go | 12 ++- cmd/urls.go | 4 + http/availability/crawler.go | 138 +++++++++++++++++++++++++++++++++++ http/availability/report.go | 128 +------------------------------- 4 files changed, 156 insertions(+), 126 deletions(-) diff --git a/cmd/root.go b/cmd/root.go index 34264ca..39e4c9a 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -1,6 +1,11 @@ package cmd -import "github.com/spf13/cobra" +import ( + "fmt" + "strconv" + + "github.com/spf13/cobra" +) // RootCmd is the entry point. var RootCmd = &cobra.Command{Short: "check"} @@ -8,3 +13,8 @@ var RootCmd = &cobra.Command{Short: "check"} func init() { RootCmd.AddCommand(urlsCmd) } + +func asBool(value fmt.Stringer) bool { + is, _ := strconv.ParseBool(value.String()) + return is +} diff --git a/cmd/urls.go b/cmd/urls.go index f56e994..45c51d1 100644 --- a/cmd/urls.go +++ b/cmd/urls.go @@ -16,3 +16,7 @@ var urlsCmd = &cobra.Command{ printer.Print(cmd.OutOrStdout()) }, } + +func init() { + urlsCmd.Flags().BoolP("verbose", "v", false, "turn on verbose mode") +} diff --git a/http/availability/crawler.go b/http/availability/crawler.go index 2542c0a..dcb380a 100644 --- a/http/availability/crawler.go +++ b/http/availability/crawler.go @@ -2,8 +2,10 @@ package availability import ( "net/http" + "net/url" "github.com/gocolly/colly" + "github.com/gocolly/colly/debug" ) func UserAgent() func(*colly.Collector) { @@ -17,3 +19,139 @@ func NoRedirect() func(*colly.Collector) { } } } + +// ~ + +func TempOption(r *Report) func(*colly.Collector) { + return func(c *colly.Collector) { + c.OnRequest(func(req *colly.Request) { + link := r.createLink(req.URL) + if link.IsPage { + r.createPage(link) + } + }) + + c.OnError(func(resp *colly.Response, err error) { r.setStatus(resp) }) + c.OnResponse(func(resp *colly.Response) { r.setStatus(resp) }) + + c.OnHTML("a[href]", func(e *colly.HTMLElement) { + if r.isPage(e.Request.URL) { + attr := e.Attr("href") + href := e.Request.AbsoluteURL(attr) + if href == "" { + + panic("invalid URL " + attr) // TODO set error instead of panic + + } + + // TODO make thread safe + link := r.createLinkByHref(href) + page := r.findPage(e.Request.URL) + page.Links = append(page.Links, link) + + // TODO it can return error + // &errors.errorString{s:""} + // &errors.errorString{s:"URL already visited"} + e.Request.Visit(href) + } + }) + } +} + +func (r *Report) createLink(location *url.URL) *Link { + href := location.String() + + { + r.mu.RLock() + link, ok := r.journal[href] + if ok { + r.mu.RUnlock() + return link + } + r.mu.RUnlock() + } + + { + r.mu.Lock() + link, ok := r.journal[href] + if ok { + r.mu.Unlock() + return link + } + link = &Link{IsPage: r.isPage(location), Location: href} + r.journal[href] = link + r.mu.Unlock() + return link + } +} + +func (r *Report) createLinkByHref(href string) *Link { + location, err := url.Parse(href) + if err != nil { + + panic(err) // TODO set error instead of panic + + } + return r.createLink(location) +} + +func (r *Report) createPage(link *Link) *Page { + r.mu.Lock() + page := &Page{Link: link, Links: make([]*Link, 0, 8)} + r.pages = append(r.pages, page) + r.mu.Unlock() + return page +} + +func (r *Report) findPage(location *url.URL) *Page { + href := location.String() + r.mu.RLock() + defer r.mu.RUnlock() + link, ok := r.journal[href] + if !ok { + + panic("can't find link with URL " + href) // TODO set error instead of panic + + } + for _, page := range r.pages { + if page.Link == link { + return page + } + } + + panic("can't find page with URL " + href) // TODO set error instead of panic +} + +func (r *Report) isPage(location *url.URL) bool { + return location.Hostname() == r.location.Hostname() +} + +func (r *Report) setStatus(resp *colly.Response) { + r.mu.RLock() + defer r.mu.RUnlock() + href := resp.Request.URL.String() + link, ok := r.journal[href] + if !ok { + + panic("unexpected URL " + href) // TODO set error instead of panic + + } + link.StatusCode = resp.StatusCode + if _, is := redirects[link.StatusCode]; is { + link.Redirect = resp.Headers.Get(location) + } +} + +// ~ + +type Debugger interface { + debug.Debugger +} + +type Option func(*Report) + +func WithDebugger() Option { + return func(*Report) { + // + } +} diff --git a/http/availability/report.go b/http/availability/report.go index 9146414..6484a32 100644 --- a/http/availability/report.go +++ b/http/availability/report.go @@ -73,136 +73,14 @@ func (r *Report) Get() error { } c := colly.NewCollector( UserAgent(), NoRedirect(), colly.IgnoreRobotsTxt(), - ) - - c.OnRequest(func(req *colly.Request) { - link := r.createLink(req.URL) - if link.IsPage { - r.createPage(link) - } - }) - - c.OnError(func(resp *colly.Response, err error) { r.setStatus(resp) }) - c.OnResponse(func(resp *colly.Response) { r.setStatus(resp) }) - - c.OnHTML("a[href]", func(e *colly.HTMLElement) { - if r.isPage(e.Request.URL) { - attr := e.Attr("href") - href := e.Request.AbsoluteURL(attr) - if href == "" { - - panic("invalid URL " + attr) // TODO set error instead of panic - - } - - // TODO make thread safe - link := r.createLinkByHref(href) - page := r.findPage(e.Request.URL) - page.Links = append(page.Links, link) - - // TODO it can return error - // &errors.errorString{s:""} - // &errors.errorString{s:"URL already visited"} - e.Request.Visit(href) - } - }) + TempOption(r), + ) return c.Visit(r.location.String()) } func (r *Report) Pages() []*Page { - r.mu.RLock() - - // TODO return []Page instead []*Page - pages := make([]*Page, len(r.pages)) - copy(pages, r.pages) - - r.mu.RUnlock() - return pages -} - -func (r *Report) createLink(location *url.URL) *Link { - href := location.String() - - { - r.mu.RLock() - link, ok := r.journal[href] - if ok { - r.mu.RUnlock() - return link - } - r.mu.RUnlock() - } - - { - r.mu.Lock() - link, ok := r.journal[href] - if ok { - r.mu.Unlock() - return link - } - link = &Link{IsPage: r.isPage(location), Location: href} - r.journal[href] = link - r.mu.Unlock() - return link - } -} - -func (r *Report) createLinkByHref(href string) *Link { - location, err := url.Parse(href) - if err != nil { - - panic(err) // TODO set error instead of panic - - } - return r.createLink(location) -} - -func (r *Report) createPage(link *Link) *Page { - r.mu.Lock() - page := &Page{Link: link, Links: make([]*Link, 0, 8)} - r.pages = append(r.pages, page) - r.mu.Unlock() - return page -} - -func (r *Report) findPage(location *url.URL) *Page { - href := location.String() - r.mu.RLock() - defer r.mu.RUnlock() - link, ok := r.journal[href] - if !ok { - - panic("can't find link with URL " + href) // TODO set error instead of panic - - } - for _, page := range r.pages { - if page.Link == link { - return page - } - } - - panic("can't find page with URL " + href) // TODO set error instead of panic -} - -func (r *Report) isPage(location *url.URL) bool { - return location.Hostname() == r.location.Hostname() -} - -func (r *Report) setStatus(resp *colly.Response) { - r.mu.RLock() - defer r.mu.RUnlock() - href := resp.Request.URL.String() - link, ok := r.journal[href] - if !ok { - - panic("unexpected URL " + href) // TODO set error instead of panic - - } - link.StatusCode = resp.StatusCode - if _, is := redirects[link.StatusCode]; is { - link.Redirect = resp.Headers.Get(location) - } + return r.pages } type Page struct {