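# paginator.rb
# Usage (inferred from the argument handling below):
#   ruby paginator.rb "<path to input PDF>" "<path to source HTML>"
# Requires pdfminer's pdf2txt.py plus the local cleanup.js and cleanup2.js node scripts.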
# VARIABLES
unescapeargv = ARGV[0].chomp('"').reverse.chomp('"').reverse # strip any surrounding double quotes
input_file = File.expand_path(unescapeargv).split(Regexp.union(*[File::SEPARATOR, File::ALT_SEPARATOR].compact)).join(File::SEPARATOR) # normalize path separators
filename_split = input_file.split(Regexp.union(*[File::SEPARATOR, File::ALT_SEPARATOR].compact)).pop
filename = input_file.split(Regexp.union(*[File::SEPARATOR, File::ALT_SEPARATOR].compact)).pop.rpartition('.').first.gsub(/ /, "")
output_file = "#{filename}.html"
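# e.g. (hypothetical) passing "My Book.pdf" as ARGV[0] yields filename "MyBook" and output_file "MyBook.html"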
input_html = ARGV[1].chomp('"').reverse.chomp('"').reverse
input_html = File.expand_path(input_html).split(Regexp.union(*[File::SEPARATOR, File::ALT_SEPARATOR].compact)).join(File::SEPARATOR)
paginate_html = "paginate.html"
# Location: /Users/nellie.mckesson/tools/pdfminer-20140328
# Docs: file:///Users/nellie.mckesson/tools/pdfminer-20140328/docs/index.html
# Github: https://github.com/euske/pdfminer
pdfminer = "/Users/nellie.mckesson/tools/pdfminer-20140328/tools/pdf2txt.py"
# extract the html from the PDF
`python #{pdfminer} -o "#{output_file}" "#{input_file}"`
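# (pdf2txt.py infers the output format from the -o file extension, so the .html name above requests HTML output)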
# run a first round of js cleanup
cleanup = "cleanup.js"
`node #{cleanup} #{output_file} #{output_file}`
# get the cleaned-up extracted html and fix line wraps
rubycleanup = File.read(output_file).gsub(/(^)(<span class="line")(\/>)(.*)($)/, "\\1\\2>\\4</span>\\5")
  .gsub(/(^<span class="line">.*)($)(^$)/, "\\1</div>\\2\\3")
  .gsub(/(^)(<div><span>)(.*?)($)/, "\\1<span class=\"line\">\\3</span>\\4")
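# (the substitutions above: close self-closing <span class="line"/> tags around the text that follows them,
#  append a closing </div> after a line span that precedes a blank line, and rewrite <div><span> openers
#  as <span class="line"> wrappers)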
# then strip nested tags in lines
rubycleanup.scan(/^<span class="line">.*<\/span>$/).each do |s|
  t = s.gsub(/(<\/?[a-z]*\/?>)/, "")
  rubycleanup = rubycleanup.gsub(s, t)
end
# then add closing span tags, to create a clean and wrapped line
rubycleanup = rubycleanup.gsub(/(^<span class="line">.*)($)/, "\\1</span>\\2")
# write the cleaned-up html back to a file
File.open(output_file, 'w') do |output|
  output.puts rubycleanup
end
# run a second round of js cleanup on the cleaned up html
cleanuptwo = "cleanup2.js"
`node #{cleanuptwo} #{output_file} #{output_file}`
# get the final, cleaned-up and extracted PDF html, and prep for analysis
burstfile = File.read(output_file).gsub(/<body>/,"<div class='burstfile'>")
  .gsub(/<\/body>/,"</div>")
  .gsub(/<html><head>/, "")
  .gsub(/<meta http-equiv="Content-Type" content="text\/html; charset=utf-8">/, "")
  .gsub(/<\/head>/, "")
  .gsub(/<\/html>/, "")
  .gsub(/<\/meta>/, "")
# get the source html, and prep for analysis
srcfile = File.read(input_html).gsub(/<body data-type="book">/,"<div class='srcfile'>")
  .gsub(/<\?xml version="1.0" encoding="UTF-8"\?>/, "")
  .gsub(/<html xmlns="http:\/\/www.w3.org\/1999\/xhtml">/, "")
  .gsub(/<head>/, "")
  .gsub(/<title>.*<\/title><\/head>/, "")
  .gsub(/<\/body>/,"</div>")
  .gsub(/<\/html>/,"")
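# (the 'burstfile' and 'srcfile' wrapper classes let whatever consumes paginate.html tell the
#  PDF-extracted html apart from the source html)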
# write both the cleaned-up extracted PDF html AND the source html to a single file, for analysis and re-breaking
File.open(paginate_html, 'w') do |output|
  output.puts "<html>"
  output.puts "<head>"
  output.puts "<title>Repagination for #{filename}</title>"
  output.puts "</head>"
  output.puts "<body>"
  output.puts burstfile
  output.puts srcfile
  output.puts "</body>"
  output.puts "</html>"
end
# run the analysis and re-breaking
# analyze = "analyze.js"
# `node #{analyze} #{paginate_html}`