-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathclient.go
105 lines (92 loc) · 2.59 KB
/
client.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
package url2epub
import (
"context"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// lastURLKeyType is the private type of the context key under which a request
// carries a **url.URL that should track the most recently followed URL.
type lastURLKeyType struct{}

// lastURLKey is the context key looked up by client's CheckRedirect hook.
var lastURLKey = lastURLKeyType{}

// client is the shared HTTP client used by this package.
//
// Its redirect policy stops after 10 redirects, and additionally records the
// URL of every followed redirect into the **url.URL stored in the request
// context under lastURLKey, when one is present.
var client = &http.Client{
	CheckRedirect: func(req *http.Request, via []*http.Request) error {
		// The limit and the error message are copied from:
		// https://go.googlesource.com/go/+/go1.15.6/src/net/http/client.go#805
		if len(via) >= 10 {
			return errors.New("stopped after 10 redirects")
		}
		// Requests without the tracking pointer in their context are
		// followed without any bookkeeping.
		if dst, ok := req.Context().Value(lastURLKey).(**url.URL); ok {
			*dst = req.URL
		}
		return nil
	},
}
// GetHTMLArgs defines the arguments used by the GetHTML function.
type GetHTMLArgs struct {
// The URL to send the HTTP GET request to, required.
URL string
// The User-Agent header to use, optional.
//
// When empty, this package does not set the header on the request.
UserAgent string
}
// GetHTML fetches args.URL with an HTTP GET request and parses the response
// body as HTML.
//
// Compared to the standard http.Get:
//
// - The returned URL is the URL of the last (final) request, which differs
// from args.URL when redirects were followed.
//
// - The response is returned as a parsed *Node instead of *http.Response. The
// node is the first one with DataAtom being Html found in the parsed
// document, rather than the document root (which is usually DoctypeNode).
//
// - The client used by GetHTML does not have a timeout set. It's expected
// that a deadline is set in the ctx passed in.
//
// On any failure the returned node and URL are both nil.
func GetHTML(ctx context.Context, args GetHTMLArgs) (*Node, *url.URL, error) {
	parsed, err := url.Parse(args.URL)
	if err != nil {
		return nil, nil, fmt.Errorf("unable to parse url %q: %w", args.URL, err)
	}

	rc, finalURL, err := get(ctx, parsed, args.UserAgent)
	if err != nil {
		return nil, nil, fmt.Errorf("unable to get %q: %w", args.URL, err)
	}
	defer DrainAndClose(rc)

	// From here on errors are reported against the final URL, not the
	// originally requested one.
	doc, err := html.Parse(rc)
	if err != nil {
		return nil, nil, fmt.Errorf("unable to parse %q: %w", finalURL, err)
	}

	htmlNode := FromNode(doc).FindFirstAtomNode(atom.Html)
	return htmlNode, finalURL, nil
}
// DrainAndClose reads r until EOF (or the first read error), discarding
// everything read, and then closes r.
//
// Errors hit while draining are ignored; the returned error is the result of
// r.Close.
func DrainAndClose(r io.ReadCloser) error {
	// Draining is best effort, so the copy result is deliberately dropped.
	_, _ = io.Copy(io.Discard, r)
	closeErr := r.Close()
	return closeErr
}
// get sends an HTTP GET request for src through the package-level client and
// returns the response body together with the URL of the last request made.
//
// ua is used as the User-Agent header when non-empty.
//
// Any status code other than 200 is turned into an error, in which case the
// response body is drained and closed before returning. On success the caller
// owns the returned body and is responsible for closing it.
func get(ctx context.Context, src *url.URL, ua string) (io.ReadCloser, *url.URL, error) {
	header := make(http.Header)
	if ua != "" {
		header.Set("user-agent", ua)
	}

	// final starts as the requested URL. Its address travels with the request
	// context so that client's CheckRedirect hook can overwrite it with the
	// URL of each redirect it follows.
	final := src
	req := (&http.Request{
		Method: http.MethodGet,
		URL:    src,
		Header: header,
	}).WithContext(context.WithValue(ctx, lastURLKey, &final))

	resp, err := client.Do(req)
	if err != nil {
		return nil, nil, err
	}
	if resp.StatusCode == http.StatusOK {
		return resp.Body, final, nil
	}

	DrainAndClose(resp.Body)
	return nil, nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
}