-
Notifications
You must be signed in to change notification settings - Fork 36
/
urlscanio.go
154 lines (136 loc) · 4.49 KB
/
urlscanio.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
package iok
import (
"bytes"
"context"
"fmt"
"golang.org/x/net/html"
"golang.org/x/sync/errgroup"
"io"
"net/http"
"net/url"
"phish.report/urlscanio-go"
"sort"
"sync"
)
// httpClient is the minimal HTTP capability InputFromURLScan needs: something that can execute
// a prepared request. *http.Client satisfies it, as does any wrapper exposing the same Do method.
type httpClient interface {
// Do sends req and returns the resulting response, or an error.
Do(req *http.Request) (*http.Response, error)
}
// InputFromURLScan takes a urlscan.io result ID and returns an Input suitable for calling GetMatches with.
// The provided http.Client should inject your API key if you have one.
//
// The scan result is retrieved through the urlscanio client. Cookies, request URLs and the
// response headers of the primary request are copied from it (headers are sorted). The rendered
// DOM and the saved Stylesheet, Script and primary Document responses are then downloaded from
// urlscan.io with at most 5 requests in flight, and embedded assets/title are extracted from the
// DOM and the primary document via extractHTML.
//
// A zero Input is returned if the result cannot be retrieved or its page URL cannot be parsed.
// If a download fails, or no primary document response could be fetched, the partially populated
// Input is returned together with a non-nil error.
func InputFromURLScan(ctx context.Context, urlscanUUID string, client httpClient) (Input, error) {
	urlscanClient := urlscanio.NewClient(urlscanio.HTTPClient(client))
	result, err := urlscanClient.RetrieveResult(ctx, urlscanUUID)
	if err != nil {
		return Input{}, err
	}

	u, err := url.Parse(result.Page.Url)
	if err != nil {
		return Input{}, fmt.Errorf("failed to parse result URL: %w", err)
	}
	input := Input{}
	input.Hostname = u.Hostname()

	// Everything that can be read straight from the result is collected here, before any goroutine
	// is started: no locking is needed and the order of input.Requests is deterministic.
	for _, cookie := range result.Data.Cookies {
		input.Cookies = append(input.Cookies, cookie.Name+"="+cookie.Value)
	}
	for _, request := range result.Data.Requests {
		input.Requests = append(input.Requests, request.Request.Request.Url)

		// TODO: how does this check behave in the case of redirects?
		if !request.Request.PrimaryRequest {
			continue
		}
		// this is the "primary" page load, so we need to extract the response headers
		for headerKey, headerValue := range request.Response.Response.Headers {
			input.Headers = append(input.Headers, http.CanonicalHeaderKey(headerKey)+": "+headerValue)
		}
	}
	// Map iteration order is random, so sort to keep the header list stable.
	sort.Strings(input.Headers)

	// Some sites have many resources (100+) so fetching each one sequentially takes too long.
	// This fetches up to 5 resources in parallel
	g, ctx := errgroup.WithContext(ctx)
	g.SetLimit(5)
	var mu sync.Mutex // guards input and foundHTML once goroutines are running

	// fetch GETs rawURL and returns the complete body of a 200 response with ok == true.
	// A 404 yields (nil, false, nil) when tolerateMissing is set; any other status is an error.
	// The response body is always drained and closed so the connection can be re-used.
	fetch := func(rawURL string, tolerateMissing bool) ([]byte, bool, error) {
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
		if err != nil {
			return nil, false, err
		}
		resp, err := client.Do(req)
		if err != nil {
			return nil, false, err
		}
		defer resp.Body.Close()

		if resp.StatusCode != http.StatusOK {
			// The content is not needed; draining is best-effort, purely for connection re-use.
			_, _ = io.Copy(io.Discard, resp.Body)
			if tolerateMissing && resp.StatusCode == http.StatusNotFound {
				return nil, false, nil
			}
			return nil, false, fmt.Errorf("%s", resp.Status)
		}

		body, err := io.ReadAll(resp.Body)
		if err != nil {
			// A truncated body must not be mistaken for the real resource.
			return nil, false, err
		}
		return body, true, nil
	}

	g.Go(func() error {
		resultHTML, _, err := fetch("https://urlscan.io/dom/"+result.Task.Uuid, false)
		if err != nil {
			return fmt.Errorf("failed to get result dom: %w", err)
		}
		// Parse outside the lock; only the writes to input need to be serialised.
		node, parseErr := html.Parse(bytes.NewReader(resultHTML))

		mu.Lock()
		defer mu.Unlock()
		input.DOM = string(resultHTML)
		// parse any JS/CSS from the dom
		if parseErr == nil {
			extractHTML(node, &input, extractEmbeddedAssets, extractTitle)
		}
		return nil
	})

	foundHTML := false
	for _, request := range result.Data.Requests {
		request := request // per-iteration copy for the closure (needed before Go 1.22)

		if request.Response.Hash == "" {
			// this isn't a response we can fetch
			continue
		}
		switch request.Request.Type {
		case "Stylesheet", "Script", "Document":
		default:
			continue
		}
		if request.Request.Type == "Document" && !request.Request.PrimaryRequest {
			// Only the primary document is ever used, so don't download the others.
			continue
		}

		g.Go(func() error {
			// Fetch the response in parallel with other threads, only lock the mutex once we're modifying the Input{}
			resource, ok, err := fetch("https://urlscan.io/responses/"+request.Response.Hash, true)
			if err != nil {
				return fmt.Errorf("failed to fetch resource %s %s: %w", request.Request.RequestId, request.Response.Hash, err)
			}
			if !ok {
				// not all resources are saved by urlscan.io e.g. stylesheets are frequently missing
				return nil
			}

			mu.Lock()
			defer mu.Unlock()
			switch request.Request.Type {
			case "Stylesheet":
				input.CSS = append(input.CSS, string(resource))
			case "Script":
				input.JS = append(input.JS, string(resource))
			case "Document":
				// this is the initial page load (non-primary documents were filtered out above)
				foundHTML = true
				if input.HTML != "" {
					fmt.Println("oops already have response html")
				}
				input.HTML = string(resource)

				// parse any JS/CSS from the html
				// This does result in duplicate values (for sites that don't have any dynamically inserted JS/CSS),
				// but that doesn't affect correctness
				node, err := html.Parse(bytes.NewReader(resource))
				if err == nil {
					extractHTML(node, &input, extractEmbeddedAssets, extractTitle)
				}
			}
			return nil
		})
	}

	if err := g.Wait(); err != nil {
		return input, err
	}
	if !foundHTML {
		return input, fmt.Errorf("failed to get response html")
	}
	return input, nil
}