-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwebtable_to_text.rb
executable file
·74 lines (58 loc) · 2.31 KB
/
webtable_to_text.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env ruby
require 'cgi'
require 'nokogiri'
require 'open-uri'
require 'optparse'
require 'reverse_markdown'
require 'reverse_adoc'
require_relative 'libhtmltable.rb'
# Command-line options collected from ARGV, keyed by symbol.
# Plain switches are stored as `true`; switches that take an argument store
# the argument string. Note that -r/--raw is stored under the :html key.
options = {}

# One row per switch, in the order they appear in the help output:
# [short form, long form (with argument placeholder if any), description, options key]
option_specs = [
  ["-A", "--all", "Print all tables found on the specified page", :all],
  ["-a", "--asciidoc", "Output in asciidoc/asciidoctor format", :asciidoc],
  ["-c", "--csv", "Output in CSV / comma separated values format", :csv],
  ["-f", "--file FILE", "Specify HTML input file as source for extracting tables", :file],
  ["-i", "--interactive", "Interactive mode", :interactive],
  ["-m", "--markdown", "Output in markdown format", :markdown],
  ["-n", "--number NUM", "Print specific table number only; separate multiple numbers with commas", :number],
  ["-o", "--output FILE", "Specify output file (default: output to STDOUT)", :output],
  ["-r", "--raw", "Output raw table HTML", :html],
  ["-t", "--tsv", "Output in TSV / tab separated values format (default)", :tsv],
  ["-u", "--url URL", "Specify URL as source for extracting tables", :url]
]

cli = OptionParser.new
cli.banner = " Usage: webtable_to_text.rb [options]"

option_specs.each do |short, long, description, key|
  # A long form such as "--file FILE" carries an argument placeholder after the space.
  takes_argument = long.include?(" ")
  cli.on(short, long, description) do |value|
    options[key] = takes_argument ? value : true
  end
end

# Parse ARGV in place, removing the recognised switches from it.
cli.parse!
# Load the HTML to scan. A URL (-u) takes precedence over a file (-f); with
# neither, the script aborts. Read failures abort with a short message, like
# the other error paths in this script, instead of an uncaught-exception
# backtrace (the exit status is 1 either way).
url = options[:url]
file = options[:file]

begin
  if url
    source_location = url
    # escape_url is not defined in this file (presumably provided by
    # libhtmltable.rb); its result is what is actually fetched.
    # The block form of URI.open closes the underlying IO once it has been read.
    source_content = URI.open(escape_url(url), &:read)
  elsif file
    source_location = file
    source_content = File.read(file)
  else
    abort(" Please provide a source file or URL as input")
  end
rescue SystemCallError, SocketError, OpenURI::HTTPError => e
  # abort raises SystemExit, which is not in the list above, so the
  # "please provide" exit path is unaffected by this rescue.
  abort(" Could not read #{source_location}: #{e.message}")
end

# Collect every <table> element in the document, at any depth.
doc = Nokogiri::HTML(source_content)
tables = doc.xpath('//table')
len = tables.length

abort(" No tables found in page at #{source_location}") if len < 1
# Decide which table(s) to convert.
# The selection comes from -n/--number when given, otherwise "all".
# In interactive mode the user's answer replaces that selection.
numstring = options[:number] || "all"

if options[:interactive]
  puts " There were #{len} tables found in the page at: #{source_location}"
  puts " Please enter the number of the table you would like to convert, or <Enter> for all:"
  # gets returns nil at end of input (e.g. stdin closed); to_s turns that into
  # an empty answer, which is handled like pressing <Enter>.
  numstring = $stdin.gets.to_s.chomp
end

# Exactly one handler runs per invocation:
#   contains a comma       -> multiple tables
#   consists only of digits -> a single table
#   anything else ("all", empty input, ...) -> all tables
# NOTE(review): the three handlers are not defined in this file (presumably
# libhtmltable.rb). If they already terminate the process, this is equivalent
# to calling them in sequence; if they return, this prevents a comma list from
# also falling through to all_tables and single_table.
if numstring.include?(",")
  multiple_tables(tables, options, numstring)
elsif numstring.match?(/\A\d+\z/)
  single_table(tables, options, numstring)
else
  all_tables(tables, options)
end