Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experiment/better crawler #104

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
218 changes: 218 additions & 0 deletions app/crawlers/classic.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
require "tempfile"

module HTTPRequestInResponse
refine HTTP::Response do
attr_accessor :req
end

refine HTTP::Client do
alias_method :__perform__, :perform

def perform(req, options)
__perform__(req, options).tap { |res| res.req = req }
end
end
end

using HTTPRequestInResponse

Request = Struct.new(:uri, :method, :headers, keyword_init: true) do
end

Response = Struct.new(:status, :headers, :body, keyword_init: true) do
end

class Document
extend Forwardable

attr_accessor :uri, :request, :response, :content_type, :parent, :dependencies

def initialize uri
@uri = uri
end

def dependencies
@dependencies ||= {}
end

def add_dependency dep
binding.pry unless dep.is_a? Document
fail "dependency must be a Document, got `#{ dep.class }`" unless dep.is_a? Document

dep.parent = self
dependencies[dep.uri] = dep
end

def inspect
"#<Document:TODO content-type=#{ content_type } uri=#{ uri }>"
end
end

class DependencyResolver
def initialize doc
@doc = doc
end

# TODO: how
def resolve
return resolve_html if @doc.content_type == "text/html"

[]
end

protected

def resolve_html
root_uri = Addressable::URI.parse @doc.uri

nokogiri = Nokogiri::HTML(@doc.response.body)

[
"//link[@rel='stylesheet']/@href",
"//script/@src",
"//img/@src"
].flat_map(&nokogiri.method(:xpath))
.map(&:to_s)
.uniq
.compact
.reject { |src| src.start_with? "data:" } # TODO: should this reject all non http/https instead?
.map(&Addressable::URI.method(:parse))
.map { |link| (root_uri + link).to_s }
end
end

class Classic
class Client
extend Forwardable

def config
@config ||= ActiveSupport::OrderedOptions.new.tap do |opts|
opts.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0"

opts.headers = {
accept_language: "en-US,en;q=0.5",
accept: "text/html;q=0.9,*/*;q=0.8; charset=utf-8"
}

opts.follow_redirects = true
end
end

def_delegators :base, *HTTP::Request::METHODS

private

def base
@base ||= HTTP.headers({ user_agent: config.user_agent }.merge(config.headers))
.yield_self { |client| config.follow_redirects ? client.follow : client }
end
end

def crawl site
crawl_deps(site).tap(&method(:fetch_favicon_for))
end

private

def crawl_deps site
request(site).tap(&method(:fetch_assets_for))
end

def client
@client ||= Client.new
end

def fetch_favicon_for root
root_address = Addressable::URI.parse root.uri

res = root_address
.yield_self { |uri| uri.join "/favicon.ico" }
.yield_self(&:to_s)
.yield_self(&method(:request))

root.add_dependency res
return if (200...299).include? res.response.status

root.xpath('//link[@rel="shortcut icon"]').find do |possible_favicon|
res = possible_favicon.attr("href")
.yield_self { |href| root_address.join href } # Ensures that a relative URI is converted to absolute
.yield_self(&:to_s)
.yield_self(&method(:request))

root.add_dependency res

(200..299).include? res.response.status
end
end

def fetch_assets_for root
DependencyResolver.new(root).resolve.each do |link|
root.add_dependency crawl_deps link
end
end

def request uri
return if uri.start_with? "data:"

Document.new(uri).tap do |document|
puts "Fetching uri: '#{ uri }'"

res = client.get uri

body, content_type = with_temp_file do |temp_file|
write_body response: res, to: temp_file
content_type = get_content_type response: res, io: temp_file

[temp_file.read, content_type]
end

# TODO: There is a lot more info to capture from the request and
# response.
document.request = Request.new uri: uri, method: "GET", headers: {} #res.req.headers
document.response = Response.new status: res.code, headers: res.headers, body: body

document.content_type = content_type
rescue HTTP::ConnectionError => e
debugger
end
end

# TODO: Could these three methods live else where?
def with_temp_file
Tempfile.create do |temp_file|
temp_file.binmode

yield temp_file
end
end

# Streams the response body into the temp file to hopefully avoid some
# memory issues if this is a large document
def write_body response:, to:
io_handle = to

response.body.each do |partial|
io_handle.write partial
end

io_handle.rewind
end

def get_content_type response:, io:
return response.content_type.mime_type if response.content_type.mime_type

io_handle = io
io_handle.rewind

# some websites *cough*offline.pink*cough* don't return a content type
# header, so we'll first see if the html doctype string is present and
# guess this is text/html or we'll fallback to assuming its binary
# https://en.wikipedia.org/wiki/Content_sniffing
content_type ||= MimeMagic.by_magic(io_handle)
content_type ||= "application/octet-stream"

io_handle.rewind

content_type
end
end