Skip to content

Commit

Permalink
Merge pull request #711 from tabulapdf/feature/templates
Browse files Browse the repository at this point in the history
save and load templates, with tabula-java-1.0.0
  • Loading branch information
jazzido authored Aug 10, 2017
2 parents 2082ad2 + e49ac4f commit 68c83fe
Show file tree
Hide file tree
Showing 14 changed files with 627 additions and 396 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ language: ruby
before_install:
- gem update --system
- gem install bundler
- gem install jbundler
rvm:
- jruby-9.1.9.0
jdk:
Expand Down
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ platform :jruby do
gem "tilt", "~> 2.0.7"

group :development do
gem 'jar-dependencies', '0.3.11'
gem 'jar-dependencies', '0.3.10'
gem 'jbundler', '~> 0.9.3'
gem "rake"
gem "warbler", "~> 2.0.3"
Expand Down
4 changes: 2 additions & 2 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ GEM
equalizer (0.0.11)
ffi (1.9.18-java)
ice_nine (0.11.2)
jar-dependencies (0.3.11)
jar-dependencies (0.3.10)
jbundler (0.9.3)
bundler (~> 1.5)
jar-dependencies (~> 0.3)
Expand Down Expand Up @@ -70,7 +70,7 @@ DEPENDENCIES
bootstrap-sass (~> 3.2.0)
compass
cuba (~> 3.8.1)
jar-dependencies (= 0.3.11)
jar-dependencies (= 0.3.10)
jbundler (~> 0.9.3)
jruby-jars (= 9.1.12.0)
rack (~> 2.0.3)
Expand Down
3 changes: 0 additions & 3 deletions lib/tabula_job_executor/jobs/generate_document_data.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,9 @@ def perform
'size' => File.size(filepath),
'thumbnail_sizes' => options[:thumbnail_sizes]
}

at(5, 100, "analyzing PDF text...")

extractor = Tabula::Extraction::PagesInfoExtractor.new(filepath)

page_data = extractor.pages.to_a
doc['page_count'] = page_data.size
unless page_data.any? { |pd| pd[:hasText] }
Expand All @@ -36,7 +34,6 @@ def perform
end

Tabula::Workspace.instance.add_document(doc, page_data)

at(100, 100, "complete")
extractor.close!
return nil
Expand Down
103 changes: 92 additions & 11 deletions lib/tabula_workspace.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,32 @@ class Workspace
include JRuby::Synchronized
include Singleton

STARTING_VALUE = {"pdfs" => [], "templates" => [], "version" => 2}


def initialize(data_dir=TabulaSettings.getDataDir)
unless File.directory?(data_dir)
raise "DOCUMENTS_BASEPATH does not exist or is not a directory."
end

@data_dir = data_dir
@workspace_path = File.join(@data_dir, "pdfs", "workspace.json")
@workspace = []

@workspace = STARTING_VALUE
if !File.exists?(@workspace_path)
FileUtils.mkdir_p(File.join(@data_dir, "pdfs"))
end
end

def add_document(document, pages)
read_workspace!
@workspace.unshift(document)
@workspace["pdfs"].unshift(document)
add_file(pages.to_json, document['id'], 'pages.json')
flush_workspace!
end

def delete_document(document_id)
read_workspace!
@workspace.delete_if { |d| d['id'] == document_id }
@workspace["pdfs"].delete_if { |d| d['id'] == document_id }
flush_workspace!

FileUtils.rm_rf(get_document_dir(document_id))
Expand All @@ -42,7 +44,7 @@ def delete_page(document_id, page_number)

def get_document_metadata(document_id)
read_workspace!
@workspace.find { |d| d['id'] == document_id }
@workspace["pdfs"].find { |d| d['id'] == document_id }
end

def get_document_pages(document_id)
Expand All @@ -53,14 +55,13 @@ def get_document_path(document_id)
File.join(get_document_dir(document_id), 'document.pdf')
end

def get_document_dir(document_id)
p = File.join(@data_dir, 'pdfs', document_id)
if !File.directory?(p)
FileUtils.mkdir_p(p)
end
p
# Re-reads the workspace and returns the list stored under its "pdfs" key.
def list_documents
  read_workspace!
  pdf_entries = @workspace["pdfs"]
  pdf_entries
end



# Returns the data directory this workspace was initialized with.
def get_data_dir
  @data_dir
end
Expand All @@ -77,12 +78,92 @@ def move_file(path, document_id, filename)
FileUtils.mv(path, File.join(get_document_dir(document_id), filename))
end



# Re-reads the workspace and returns the list stored under its "templates" key.
def list_templates
  read_workspace!
  template_entries = @workspace["templates"]
  template_entries
end

# Re-reads the workspace and returns the first template entry whose 'id'
# equals +template_id+, or nil when no entry matches.
def get_template_metadata(template_id)
  read_workspace!
  templates = @workspace["templates"]
  templates.find { |template| template['id'] == template_id }
end
# Returns the raw contents of the template file stored for +template_id+
# ("<template_id>.tabula-template.json" inside the templates directory).
#
# Raises Errno::ENOENT when no such file exists.
def get_template_body(template_id)
  # The previous version printed this path to stdout on every call (debug
  # leftover); the path is now only used to read the file.
  template_path = File.join(get_templates_dir, "#{template_id}.tabula-template.json")
  File.read(template_path)
end

# Registers a template: prepends its summary entry to the workspace's
# "templates" list, writes the template file, then flushes the workspace.
#
# +template_metadata+ is a Hash read with the string keys "name",
# "selection_count", "page_count", "time" and "id"; the whole Hash is also
# handed to write_template_file.
def add_template(template_metadata)
  read_workspace!

  # The stored name drops the ".tabula-template.json" suffix.
  display_name = template_metadata["name"].gsub(".tabula-template.json", "")
  entry = {
    "name" => display_name,
    "selection_count" => template_metadata["selection_count"],
    "page_count" => template_metadata["page_count"],
    "time" => template_metadata["time"],
    "id" => template_metadata["id"]
  }
  @workspace["templates"].unshift(entry)

  # Persist the template body first, then the updated workspace index.
  write_template_file(template_metadata)
  flush_workspace!
end

# Overwrites the workspace entry whose "id" equals +template_id+ with the
# subset of +template_metadata+ limited to the keys "name",
# "selection_count", "page_count", "time" and "id", then flushes the workspace.
def replace_template_metadata(template_id, template_metadata)
  read_workspace!
  templates = @workspace["templates"]
  position = templates.index { |template| template["id"] == template_id }
  kept_keys = ["name", "selection_count", "page_count", "time", "id"]
  templates[position] = template_metadata.select { |key, _value| kept_keys.include?(key) }
  flush_workspace!
end



# Removes every workspace entry whose 'id' equals +template_id+, flushes the
# workspace, then deletes "<template_id>.tabula-template.json" from the
# templates directory (File.delete raises if that file is absent).
def delete_template(template_id)
  read_workspace!
  @workspace["templates"].delete_if { |template| template['id'] == template_id }
  flush_workspace!

  template_path = File.join(get_templates_dir, "#{template_id}.tabula-template.json")
  File.delete(template_path)
end


private

# Writes template_metadata["template"] as JSON to
# "<id>.tabula-template.json" inside the templates directory.
#
# The file is named after template_metadata["id"] when one is supplied, so it
# matches the id that add_template records in the workspace and that
# get_template_body / delete_template use to locate the file. Previously the
# id was always recomputed from the current time, which could disagree with
# the recorded id (e.g. when the clock ticks over between the two
# computations). When no id is supplied, the original SHA1-of-time-plus-name
# scheme is used as a fallback.
def write_template_file(template_metadata)
  template_name = template_metadata["name"]
  # just SHA1 of time isn't unique with multiple uploads, hence the name suffix
  template_id = template_metadata["id"] ||
                Digest::SHA1.hexdigest(Time.now.to_s + template_name)
  template_filename = "#{template_id}.tabula-template.json"
  File.open(File.join(get_templates_dir, template_filename), 'w') do |f|
    f << JSON.dump(template_metadata["template"])
  end
end

# Returns the path of the 'templates' directory under the data directory,
# creating it first when it is not already a directory.
def get_templates_dir
  templates_dir = File.join(@data_dir, 'templates')
  FileUtils.mkdir_p(templates_dir) unless File.directory?(templates_dir)
  templates_dir
end
# Returns the path of the directory for +document_id+ under
# "<data dir>/pdfs", creating it first when it is not already a directory.
def get_document_dir(document_id)
  document_dir = File.join(@data_dir, 'pdfs', document_id)
  FileUtils.mkdir_p(document_dir) unless File.directory?(document_dir)
  document_dir
end


# Loads the workspace JSON file into @workspace and returns it.
#
# When the workspace file does not exist, STARTING_VALUE is returned and
# @workspace is left untouched. A version-1 workspace (a bare JSON array of
# documents) is upgraded in place to the hash layout and written back via
# flush_workspace!.
def read_workspace!
  # File.exist? instead of File.exists?: the latter is deprecated and was
  # removed in Ruby 3.2.
  return STARTING_VALUE unless File.exist?(@workspace_path)
  File.open(@workspace_path) do |f|
    @workspace = JSON.parse(f.read)
  end
  # what if the already-existing workspace is v1? i.e. if it's just an array?
  # then we'll make it the new kind, seamlessly.
  if @workspace.is_a? Array
    @workspace = {"pdfs" => @workspace, "templates" => [], "version" => 2}
    flush_workspace!
  end
  @workspace
end

def flush_workspace!
Expand Down
89 changes: 88 additions & 1 deletion webapp/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
<li><a id="upload-nav" href="">My Files</a></li>
<li><a id="templates-nav" href="mytemplates">My Templates</a></li>
<li><a id="about-nav" href="about">About</a></li>
<li><a id="help-nav" href="help">Help</a></li>
<li><a href="https://github.com/tabulapdf/tabula">Source Code</a></li>
Expand Down Expand Up @@ -112,9 +113,21 @@
<script type="text/template" id="select-control-panel-template" >
<span class="filename"><%= original_filename %></span>

<a href="javascript:void(0)"><button id="restore-detected-tables" type="button" class="btn btn-default <%= restore_detected_tables %>" <%= disable_detected_tables %>><span class="glyphicon glyphicon-flash"></span><span class="glyphicon glyphicon-refresh"></span> Autodetect Tables</button></a>
<a href="javascript:void(0)"><button id="restore-detected-tables" type="button" class="btn btn-default <%= restore_detected_tables %>" <%= disabled_if_there_are_selections %>><span class="glyphicon glyphicon-flash"></span><span class="glyphicon glyphicon-refresh"></span> Autodetect Tables</button></a>
<a href="javascript:void(0)"><button type="button" id="clear-all-selections" class="btn btn-default" <%= disable_clear_all_selections %>><span class="glyphicon glyphicon-remove-circle"></span> Clear All Selections</button></a>
<a href="javascript:void(0)"><button type="button" id="all-data" class="btn btn-success" <%= disable_download_all %>><span class="glyphicon glyphicon-eye-open"></span> Preview & Export Extracted Data</button></a>

<span style="float: right;" class="template-menu">

<div class="dropdown" <%= disable_load_template %> style="display: inline;">
<button type="button" class="dropdown-toggle btn btn-link" data-toggle="dropdown" href="#" <%= disabled_if_there_are_selections %>><span class="glyphicon glyphicon-import" title="You may only load a template if there are no selections active in your PDF."></span> Load Template</button>
<div id="template-dropdown-container" class="dropdown-menu">
</div>
</div>
<a href="javascript:void(0)"><button type="button" id="save-template" class="btn btn-info" <%= disable_save_template %>><span class="glyphicon glyphicon-export"></span> Save as Template</button></a>

</span>
<span style="clear: both;">
</script>

<script type="text/template" id="export-page-sidebar-template">
Expand Down Expand Up @@ -300,6 +313,9 @@ <h4>Imported PDFs</h4>
</div>
</div>

<hr />
<p style="font-size: small;">If you have several PDFs with the same layout, you can select the appropriate regions once, then save the selections as a Tabula Template from the Select Tables page. If someone has shared a template with you, you can upload it to Tabula at the <a href="/mytemplates">My Templates page</a>.</p>

</div> <!-- /jumbotron -->
</div> <!-- /container -->

Expand All @@ -318,6 +334,22 @@ <h4>Imported PDFs</h4>
<td><a href="pdf/<%= id %>"><button type="button" class="btn btn-sm btn-success">Extract Data</button></a></td>
</script>

<script type="text/template" id="saved-template-library-item-template">
<td><span class="template-name"><%= name %></span> <a href="javascript:"><span data-name=<%= name %> data-templateid=<%= id %> class="glyphicon glyphicon-pencil edit-template-name"></span></a></td>
<td><%= selection_count %> selection<%= selection_count == "!" ? '' : 's' %></td>
<td><%= page_count || '??' %></td>
<td><%= new Date(parseInt(time) * 1000).toUTCString().slice(5, -7) %></td>
<td><a href="javascript:"><span data-name=<%= name %> data-templateid=<%= id %> class="glyphicon glyphicon-remove delete-template"></span></a></td>
<td>
<form class="template-download-form" action="templates/<%= id %>.json" method="get" style="margin-bottom: 0;">
<button type="submit" class="btn btn-default" data-action=>
<span class="glyphicon glyphicon-download-alt download-template"></span>
Download
</button>
</form>
</td>
</script>

<script type="text/template" id="upload-error-template" >
<div class="container-fluid">
<div class="row-fluid">
Expand Down Expand Up @@ -357,6 +389,60 @@ <h2>Credits</h2>
</div> <!-- /container -->
</script>

<script type="text/templates" id="templates-template">
<div class="container">
<div class="jumbotron help">
<h1>My Saved Templates</h1>
<p style="font-size: small;">If you have several PDFs with the same layout, you can select the appropriate regions once, then save the selections as a Tabula Template, and load it in subsequent PDFs. You can see your saved templates here; you can also rename and delete them. If someone has shared a template with you, you can import it to Tabula here.</p>
<p>To use a template, <a href="">upload a file</a> or select it from <a href="">My Files</a>.</p>
<div id="template-library-container">
<div id="file-list-container">
<table class="table file-list" id="templateTable">
<thead>
<tr>
<th>Template Name</th>
<th>Selection Count</th>
<th>Page Count</th>
<th>Date Added</th>
<th>Remove</th>
<th>Download</th>
</tr>
</thead>
<tbody id="saved-templates-container">
</tbody>
</table>
</div>
</div>

<div id="template-upload-form-container">
<h2>Import one or more Tabula Templates</h2>
<p style="font-size: small;">Once you save a Tabula Template, it'll appear here.</p>
<form id="uploadtemplate" action="templates/upload.json" method="post" enctype="multipart/form-data" class="form-inline">

<div class="input-group">
<!-- we are using this here: http://www.abeautifulsite.net/whipping-file-inputs-into-shape-with-bootstrap-3/ -->
<span class="input-group-btn">
<span class="btn btn-primary btn-file">
Browse&hellip; <input type="file" id="file" name="files[]" multiple accept="application/json">
</span>
</span>
<input type="text" class="form-control" readonly>
</div>

<!-- fix for the fact that IE11 is trash https://blog.yorkxin.org/posts/2014/02/06/ajax-with-formdata-is-broken-on-ie10-ie11/ -->
<input type="hidden" name="_dontcare">

<button type="submit" class="btn btn-default">Import</button>
</form>

</div>
</div> <!-- /jumbotron -->
</div> <!-- /container -->
<nestedscript type="text/javascript" src="js/vendor/upload-group.js"></nestedscript>
<nestedscript type="text/javascript" src="js/vendor/jquery.tablesorter.min.js"></nestedscript>
</script>


<script type="text/template" id="help-template">
<div class="container">
<div class="jumbotron help">
Expand Down Expand Up @@ -397,6 +483,7 @@ <h3 name="trouble">Having trouble with Tabula?</h3>
</script> <!-- TODO: move this to tabula.js, only run it if we're in upload page viewer -->

<script type="text/javascript" src="js/tabula.js?_cachebuster=201510300905"></script> <!-- actually starts Tabula -->
<script type="text/javascript" src="js/template_library.js?_cachebuster=201510300905"></script> <!-- needed on almost all pages -->

</body>
</html>
9 changes: 8 additions & 1 deletion webapp/static/css/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -7492,7 +7492,7 @@ form {
position: fixed;
top: 50px;
left: 195px;
width: 100%;
width: calc(100% - 200px);
z-index: 101;
/* gotta be >100, which is the max z-index for selections */
background: #d9edf7;
Expand All @@ -7514,6 +7514,13 @@ form {
#control-panel span.filename {
vertical-align: middle;
}
#control-panel .template-menu #template-dropdown-container ul {
padding-left: 20px;
}
#control-panel .template-menu #template-dropdown-container li {
cursor: pointer;
text-decoration: underline;
}

#main-pane {
background: #777777;
Expand Down
Loading

0 comments on commit 68c83fe

Please sign in to comment.