-
-
Notifications
You must be signed in to change notification settings - Fork 651
/
tabula_java_wrapper.rb
110 lines (87 loc) · 3.27 KB
/
tabula_java_wrapper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
java_import org.apache.pdfbox.pdmodel.PDDocument
java_import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial
# Reopens tabula-java's Table to add Ruby-side serialization helpers and a
# +spec_index+ attribute (assigned by Tabula.extract_tables to record which
# spec a table was extracted for).
class Java::TechnologyTabula::Table
  attr_accessor :spec_index

  # Returns this table as a String produced by tabula-java's CSVWriter.
  def to_csv
    java.lang.StringBuilder.new.tap { |out|
      Java::TechnologyTabulaWriters.CSVWriter.new.write(out, self)
    }.toString
  end

  # Returns this table as a String produced by tabula-java's TSVWriter.
  def to_tsv
    java.lang.StringBuilder.new.tap { |out|
      Java::TechnologyTabulaWriters.TSVWriter.new.write(out, self)
    }.toString
  end

  # Returns this table as a String produced by tabula-java's JSONWriter.
  # Any arguments are accepted and ignored.
  def to_json(*_args)
    java.lang.StringBuilder.new.tap { |out|
      Java::TechnologyTabulaWriters.JSONWriter.new.write(out, self)
    }.toString
  end
end
module Tabula
# Extracts tables from the regions of a PDF described by +specs+.
#
# pdf_path - path of the PDF file to read.
# specs    - Array of Hashes with String keys. This method reads 'page',
#            'x1', 'y1', 'x2', 'y2' and 'extraction_method' from each one.
#            NOTE: every spec is mutated in place: a "spec_index" key holding
#            its position in the Array is added.
# options  - Hash of Symbol-keyed options merged over the defaults below.
#            Only :password is read by this method.
#
# 'extraction_method' selects the algorithm per spec: "spreadsheet"/"lattice"
# use SpreadsheetExtractionAlgorithm, "original"/"basic"/"stream" use
# BasicExtractionAlgorithm, anything else lets
# SpreadsheetExtractionAlgorithm#isTabular decide for the page.
#
# Returns an Enumerator that yields each extracted table, with its
# +spec_index+ set to the "spec_index" of the originating spec. The PDF is
# opened as soon as this method is called; the extractor is closed when the
# enumeration block exits, whether normally or through an exception.
def self.extract_tables(pdf_path, specs, options={})
  options = {
    :password => '',
    :detect_ruling_lines => true,
    :vertical_rulings => [],
    :extraction_method => "guess",
  }.merge(options)

  specs.each_with_index { |spec, i| spec["spec_index"] = i }
  specs_by_page = specs.group_by { |s| s['page'] }
  pages = specs_by_page.keys.sort

  # ObjectExtractor#initialize takes (pdf_filename, pages, password, options).
  # The password has to go in the third slot; passing it second (as before)
  # bound it to +pages+ and left the real password at its '' default.
  extractor = Extraction::ObjectExtractor.new(pdf_path, pages, options[:password])

  sea = Java::TechnologyTabulaExtractors.SpreadsheetExtractionAlgorithm.new
  bea = Java::TechnologyTabulaExtractors.BasicExtractionAlgorithm.new

  Enumerator.new do |y|
    begin
      extractor.extract(pages.map { |p| p.to_java(:int) }).each do |page|
        specs_by_page[page.getPageNumber].each do |spec|
          use_spreadsheet_extraction_method =
            case spec['extraction_method']
            when "spreadsheet", "lattice" then true
            when "original", "basic", "stream" then false
            else sea.isTabular(page) # guess
            end

          area = page.getArea(spec['y1'], spec['x1'], spec['y2'], spec['x2'])
          table_extractor = use_spreadsheet_extraction_method ? sea : bea

          table_extractor.extract(area).each do |table|
            table.spec_index = spec["spec_index"]
            y.yield table
          end
        end
      end
    ensure
      # Release the document even if extraction raises or the consumer
      # breaks out of the enumeration early.
      extractor.close!
    end
  end
end
module Extraction
# Opens the PDF at +pdf_filename+ with PDFBox.
#
# pdf_filename - path of the PDF file.
# password     - password handed to PDFBox for decrypting the document
#                (defaults to '', i.e. no password).
#
# Returns the loaded PDDocument.
# Raises Errno::ENOENT (naming the path) if no file exists at +pdf_filename+.
def self.openPDF(pdf_filename, password='')
  # File.exists? is deprecated and was removed in Ruby 3.2; use File.exist?.
  raise Errno::ENOENT, pdf_filename unless File.exist?(pdf_filename)

  # Pass the password through instead of silently dropping it. +to_s+ turns
  # nil into '' so the (File, String) overload is always the one selected.
  # NOTE(review): relies on PDFBox 2.x's PDDocument.load(File, String).
  PDDocument.load(java.io.File.new(pdf_filename), password.to_s)
end
# Subclass of tabula-java's ObjectExtractor that is built from a file path:
# it opens the PDF itself and keeps the document around for +page_count+.
class ObjectExtractor < Java::TechnologyTabula.ObjectExtractor
  alias_method :close!, :close

  # TODO: the +pages+ constructor argument does not make sense
  # now that we have +extract_page+ and +extract_pages+
  #
  # pdf_filename - path of the PDF file to open.
  # pages        - accepted for backward compatibility; not used here.
  # password     - forwarded to Extraction.openPDF.
  # options      - accepted for backward compatibility; not used here.
  #
  # Raises Errno::ENOENT (naming the path) if the file does not exist.
  def initialize(pdf_filename, pages=[1], password='', options={})
    # File.exists? is deprecated and was removed in Ruby 3.2; use File.exist?.
    raise Errno::ENOENT, pdf_filename unless File.exist?(pdf_filename)
    @pdf_filename = pdf_filename
    @document = Extraction.openPDF(pdf_filename, password)
    super(@document)
  end

  # Returns the number of pages reported by the underlying document.
  def page_count
    @document.get_number_of_pages
  end
end
# ObjectExtractor variant that reports per-page metadata.
class PagesInfoExtractor < ObjectExtractor
  # Returns an Enumerator yielding one Hash per page returned by +extract+,
  # with the Symbol keys :width, :height, :number, :rotation and :hasText.
  def pages
    Enumerator.new do |yielder|
      extract.each do |pg|
        page_info = {
          width: pg.getWidth,
          height: pg.getHeight,
          number: pg.getPageNumber,
          rotation: pg.getRotation.to_i,
          hasText: pg.hasText
        }
        yielder.yield(page_info)
      end
    end
  end
end
end
end