-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractURL.go
95 lines (74 loc) · 2.65 KB
/
extractURL.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package URLextract
import (
"bytes"
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/asaskevich/govalidator"
"github.com/koffeinsource/go-URLextract/plugins"
"github.com/koffeinsource/go-URLextract/webpage"
"golang.org/x/net/html/charset"
)
// TODO: refactor this function into two functions:
// 1.) ExtractFast(): a function that only parses the webpage.
// 2.) CompleteExtract(): a function that queries external services.
// 3.) Extract := CompleteExtract(ExtractFast)
// Maybe support batching for CompleteExtract?
// Extract fetches sourceURL and collects information about it into a
// webpage.Info.
//
// The returned Info always starts out with Caption and URL set to sourceURL;
// the plugins then refine it depending on the content type reported by
// c.getURL:
//
//   - "image/...": handled by plugins.Imgurl when the URL contains
//     "i.imgur.com/", otherwise by plugins.Image.
//   - "text/html": the body is parsed into a goquery document and handed to
//     every HTML plugin, plugins.DefaultHTML first.
//   - anything else: the default Info is returned unchanged with a nil error.
//
// A non-nil error is returned when sourceURL fails validation, when c.getURL
// fails, or when the HTML body cannot be parsed. In those cases the Info
// still carries the default Caption and URL.
func (c *Client) Extract(sourceURL string) (webpage.Info, error) {
	// Create return value with default values
	returnee := webpage.Info{
		Caption: sourceURL,
		URL:     sourceURL,
	}

	// Check if the URL is valid
	if !govalidator.IsRequestURL(sourceURL) {
		errReturn := fmt.Errorf("Invalid URL: %v", sourceURL)
		// Never use the message itself as the format string: URLs routinely
		// contain percent-escapes that would be misread as formatting verbs.
		c.Log.Errorf("%v", errReturn)
		return returnee, errReturn
	}

	contentType, body, err := c.getURL(sourceURL)
	if err != nil {
		return returnee, err
	}

	switch {
	case strings.Contains(contentType, "image/"):
		// Image hosted at imgur?
		if strings.Contains(sourceURL, "i.imgur.com/") {
			plugins.Imgurl(&returnee, sourceURL, c.HTTPClient, c.Log, c.ImgurClientID)
		} else {
			plugins.Image(&returnee, sourceURL, contentType, c.Log)
		}

	case strings.Contains(contentType, "text/html"):
		var (
			doc      *goquery.Document
			parseErr error
		)

		// Prefer a charset-aware reader built from the content type; if that
		// cannot be created, fall back to parsing the raw bytes as they are.
		charsetReader, charsetErr := charset.NewReader(bytes.NewReader(body), contentType)
		if charsetErr == nil {
			doc, parseErr = goquery.NewDocumentFromReader(charsetReader)
		} else {
			doc, parseErr = goquery.NewDocumentFromReader(bytes.NewReader(body))
		}
		if parseErr != nil {
			// Constant format string for the same reason as above: sourceURL
			// may contain '%' characters.
			c.Log.Errorf("Problem parsing body. %v - %v", sourceURL, parseErr)
			return returnee, parseErr
		}

		// Make sure to call this one first
		plugins.DefaultHTML(&returnee, sourceURL, doc, c.Log)

		plugins.Amazon(&returnee, sourceURL, doc, c.Log, c.AmazonAdID)
		plugins.Imgurl(&returnee, sourceURL, c.HTTPClient, c.Log, c.ImgurClientID)
		plugins.Gfycat(&returnee, sourceURL, doc, c.Log)
		plugins.Fefe(&returnee, sourceURL, doc, c.Log)
		plugins.Youtube(&returnee, sourceURL, doc, c.Log)
		plugins.Vimeo(&returnee, sourceURL, doc, c.Log)
		plugins.Dilbert(&returnee, sourceURL, doc, c.Log)
		plugins.Garfield(&returnee, sourceURL, doc, c.Log)
		plugins.Xkcd(&returnee, sourceURL, doc, c.Log)
		plugins.Littlegamers(&returnee, sourceURL, doc, c.Log)
		plugins.IEEExplore(&returnee, sourceURL, doc, c.Log)
		plugins.Pastebin(&returnee, sourceURL, doc, c.Log)

	default:
		// Unsupported content type: return the defaults untouched.
	}

	return returnee, nil
}