-
-
Notifications
You must be signed in to change notification settings - Fork 705
/
Copy pathanchors.py
342 lines (310 loc) · 13.2 KB
/
anchors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
"""Insert anchors, links, bookmarks and inputs in PDFs."""
import hashlib
import io
import zlib
from os.path import basename
from urllib.parse import unquote, urlsplit
import pydyf
from .. import Attachment
from ..logger import LOGGER
from ..text.ffi import ffi, gobject, pango
from ..text.fonts import get_font_description
from ..urls import URLFetchingError
def add_links(links_and_anchors, matrix, pdf, page, names, mark):
"""Include hyperlinks in given PDF page."""
links, anchors = links_and_anchors
for link_type, link_target, rectangle, box in links:
x1, y1 = matrix.transform_point(*rectangle[:2])
x2, y2 = matrix.transform_point(*rectangle[2:])
if link_type in ('internal', 'external'):
box.link_annotation = pydyf.Dictionary({
'Type': '/Annot',
'Subtype': '/Link',
'Rect': pydyf.Array([x1, y1, x2, y2]),
'BS': pydyf.Dictionary({'W': 0}),
})
if mark:
box.link_annotation['Contents'] = pydyf.String(link_target)
if link_type == 'internal':
box.link_annotation['Dest'] = pydyf.String(link_target)
else:
box.link_annotation['A'] = pydyf.Dictionary({
'Type': '/Action',
'S': '/URI',
'URI': pydyf.String(link_target),
})
pdf.add_object(box.link_annotation)
if 'Annots' not in page:
page['Annots'] = pydyf.Array()
page['Annots'].append(box.link_annotation.reference)
for anchor in anchors:
anchor_name, x, y = anchor
x, y = matrix.transform_point(x, y)
names.append([
anchor_name, pydyf.Array([page.reference, '/XYZ', x, y, 0])])
def add_outlines(pdf, bookmarks, parent=None):
"""Include bookmark outlines in PDF."""
count = len(bookmarks)
outlines = []
for title, (page, x, y), children, state in bookmarks:
destination = pydyf.Array((pdf.page_references[page], '/XYZ', x, y, 0))
outline = pydyf.Dictionary({
'Title': pydyf.String(title), 'Dest': destination})
pdf.add_object(outline)
children_outlines, children_count = add_outlines(
pdf, children, parent=outline)
outline['Count'] = children_count
if state == 'closed':
outline['Count'] *= -1
else:
count += children_count
if outlines:
outline['Prev'] = outlines[-1].reference
outlines[-1]['Next'] = outline.reference
if children_outlines:
outline['First'] = children_outlines[0].reference
outline['Last'] = children_outlines[-1].reference
if parent is not None:
outline['Parent'] = parent.reference
outlines.append(outline)
if parent is None and outlines:
outlines_dictionary = pydyf.Dictionary({
'Count': count,
'First': outlines[0].reference,
'Last': outlines[-1].reference,
})
pdf.add_object(outlines_dictionary)
for outline in outlines:
outline['Parent'] = outlines_dictionary.reference
pdf.catalog['Outlines'] = outlines_dictionary.reference
return outlines, count
def add_inputs(inputs, matrix, pdf, page, resources, stream, font_map,
compress):
"""Include form inputs in PDF."""
if not inputs:
return
if 'Annots' not in page:
page['Annots'] = pydyf.Array()
if 'AcroForm' not in pdf.catalog:
pdf.catalog['AcroForm'] = pydyf.Dictionary({
'Fields': pydyf.Array(),
'DR': resources.reference,
'NeedAppearances': 'true',
})
page_reference = page['Contents'].split()[0]
context = ffi.gc(
pango.pango_font_map_create_context(font_map),
gobject.g_object_unref)
for i, (element, style, rectangle) in enumerate(inputs):
rectangle = (
*matrix.transform_point(*rectangle[:2]),
*matrix.transform_point(*rectangle[2:]))
input_type = element.attrib.get('type')
default_name = f'unknown-{page_reference.decode()}-{i}'
input_name = pydyf.String(element.attrib.get('name', default_name))
# TODO: where does this 0.75 scale come from?
font_size = style['font_size'] * 0.75
field_stream = pydyf.Stream(compress=compress)
field_stream.set_color_rgb(*style['color'][:3])
if input_type == 'checkbox':
# Checkboxes
width = rectangle[2] - rectangle[0]
height = rectangle[1] - rectangle[3]
checked_stream = pydyf.Stream(extra={
'Resources': resources.reference,
'Type': '/XObject',
'Subtype': '/Form',
'BBox': pydyf.Array((0, 0, width, height)),
}, compress=compress)
checked_stream.push_state()
checked_stream.begin_text()
checked_stream.set_color_rgb(*style['color'][:3])
checked_stream.set_font_size('ZaDb', font_size)
# Center (let’s assume that Dingbat’s check has a 0.8em size)
x = (width - font_size * 0.8) / 2
y = (height - font_size * 0.8) / 2
checked_stream.move_text_to(x, y)
checked_stream.show_text_string('4')
checked_stream.end_text()
checked_stream.pop_state()
pdf.add_object(checked_stream)
checked = 'checked' in element.attrib
field_stream.set_font_size('ZaDb', font_size)
field = pydyf.Dictionary({
'Type': '/Annot',
'Subtype': '/Widget',
'Rect': pydyf.Array(rectangle),
'FT': '/Btn',
'P': page.reference,
'T': pydyf.String(input_name),
'V': '/Yes' if checked else '/Off',
'AP': pydyf.Dictionary({'N': pydyf.Dictionary({
'Yes': checked_stream.reference,
})}),
'AS': '/Yes' if checked else '/Off',
'DA': pydyf.String(b' '.join(field_stream.stream)),
})
else:
# Text, password, textarea, files, and unknown
font_description = get_font_description(style)
font = pango.pango_font_map_load_font(
font_map, context, font_description)
font = stream.add_font(font)
font.used_in_forms = True
field_stream.set_font_size(font.hash, font_size)
value = (
element.text if element.tag == 'textarea'
else element.attrib.get('value', ''))
field = pydyf.Dictionary({
'Type': '/Annot',
'Subtype': '/Widget',
'Rect': pydyf.Array(rectangle),
'FT': '/Tx',
'P': page.reference,
'T': pydyf.String(input_name),
'V': pydyf.String(value),
'DA': pydyf.String(b' '.join(field_stream.stream)),
})
if element.tag == 'textarea':
field['Ff'] = 2 ** (13 - 1)
elif input_type == 'password':
field['Ff'] = 2 ** (14 - 1)
elif input_type == 'file':
field['Ff'] = 2 ** (21 - 1)
pdf.add_object(field)
page['Annots'].append(field.reference)
pdf.catalog['AcroForm']['Fields'].append(field.reference)
def add_annotations(links, matrix, document, pdf, page, annot_files, compress):
"""Include annotations in PDF."""
# TODO: splitting a link into multiple independent rectangular
# annotations works well for pure links, but rather mediocre for
# other annotations and fails completely for transformed (CSS) or
# complex link shapes (area). It would be better to use /AP for all
# links and coalesce link shapes that originate from the same HTML
# link. This would give a feeling similiar to what browsers do with
# links that span multiple lines.
for link_type, annot_target, rectangle, _ in links:
if link_type != 'attachment':
continue
if annot_target not in annot_files:
# A single link can be split in multiple regions. We don't want
# to embed a file multiple times of course, so keep a reference
# to every embedded URL and reuse the object number.
# TODO: Use the title attribute as description. The comment
# above about multiple regions won't always be correct, because
# two links might have the same href, but different titles.
annot_files[annot_target] = write_pdf_attachment(
pdf, (annot_target, None), document.url_fetcher)
annot_file = annot_files[annot_target]
if annot_file is None:
continue
rectangle = (
*matrix.transform_point(*rectangle[:2]),
*matrix.transform_point(*rectangle[2:]))
stream = pydyf.Stream([], {
'Type': '/XObject',
'Subtype': '/Form',
'BBox': pydyf.Array(rectangle),
}, compress)
pdf.add_object(stream)
annot = pydyf.Dictionary({
'Type': '/Annot',
'Rect': pydyf.Array(rectangle),
'Subtype': '/FileAttachment',
'T': pydyf.String(),
'FS': annot_file.reference,
'AP': pydyf.Dictionary({'N': stream.reference}),
'AS': '/N',
})
pdf.add_object(annot)
if 'Annots' not in page:
page['Annots'] = pydyf.Array()
page['Annots'].append(annot.reference)
def write_pdf_attachment(pdf, attachment, url_fetcher):
"""Write an attachment to the PDF stream."""
# Attachments from document links like <link> or <a> can only be URLs.
# They're passed in as tuples
url = ''
if isinstance(attachment, tuple):
url, description = attachment
attachment = Attachment(
url=url, url_fetcher=url_fetcher, description=description)
elif not isinstance(attachment, Attachment):
attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)
try:
with attachment.source as (source_type, source, url, _):
if isinstance(source, bytes):
source = io.BytesIO(source)
uncompressed_length = 0
stream = b''
md5 = hashlib.md5()
compress = zlib.compressobj()
for data in iter(lambda: source.read(4096), b''):
uncompressed_length += len(data)
md5.update(data)
compressed = compress.compress(data)
stream += compressed
compressed = compress.flush(zlib.Z_FINISH)
stream += compressed
file_extra = pydyf.Dictionary({
'Type': '/EmbeddedFile',
'Filter': '/FlateDecode',
'Params': pydyf.Dictionary({
'CheckSum': f'<{md5.hexdigest()}>',
'Size': uncompressed_length,
})
})
file_stream = pydyf.Stream([stream], file_extra, compress)
pdf.add_object(file_stream)
except URLFetchingError as exception:
LOGGER.error('Failed to load attachment: %s', exception)
return
# TODO: Use the result object from a URL fetch operation to provide more
# details on the possible filename.
if url and urlsplit(url).path:
filename = basename(unquote(urlsplit(url).path))
else:
filename = 'attachment.bin'
attachment = pydyf.Dictionary({
'Type': '/Filespec',
'F': pydyf.String(),
'UF': pydyf.String(filename),
'EF': pydyf.Dictionary({'F': file_stream.reference}),
'Desc': pydyf.String(attachment.description or ''),
})
pdf.add_object(attachment)
return attachment
def resolve_links(pages):
"""Resolve internal hyperlinks.
Links to a missing anchor are removed with a warning.
If multiple anchors have the same name, the first one is used.
:returns:
A generator yielding lists (one per page) like :attr:`Page.links`,
except that ``target`` for internal hyperlinks is
``(page_number, x, y)`` instead of an anchor name.
The page number is a 0-based index into the :attr:`pages` list,
and ``x, y`` are in CSS pixels from the top-left of the page.
"""
anchors = set()
paged_anchors = []
for i, page in enumerate(pages):
paged_anchors.append([])
for anchor_name, (point_x, point_y) in page.anchors.items():
if anchor_name not in anchors:
paged_anchors[-1].append((anchor_name, point_x, point_y))
anchors.add(anchor_name)
for page in pages:
page_links = []
for link in page.links:
link_type, anchor_name, _, _ = link
if link_type == 'internal':
if anchor_name not in anchors:
LOGGER.error(
'No anchor #%s for internal URI reference',
anchor_name)
else:
page_links.append(link)
else:
# External link
page_links.append(link)
yield page_links, paged_anchors.pop(0)