Merge pull request #1796 from Kozea/forms

Support PDF forms
Kozea · Feb 3, 2023 · 4978837 · 4978837
2 parents b446f6e + e655a78
commit 4978837
Show file tree

Hide file tree

Showing 26 changed files with 627 additions and 400 deletions.
diff --git a/docs/api_reference.rst b/docs/api_reference.rst
@@ -180,6 +180,10 @@ check that they follow the rules listed by the related specifications. The main
 constraint is to use a correct HTML structure to avoid inconsistencies in the
 PDF structure.
 
+Generated PDFs can include forms, using the ``appearance: auto`` CSS
+declaration or the ``--pdf-forms`` CLI option. Text inputs, text areas and
+check boxes are supported.
+
 
 Fonts
 ~~~~~
@@ -743,3 +747,22 @@ All the ``flex-*``, ``align-*``, ``justify-*`` and ``order`` properties are
 supported. The ``flex`` and ``flex-flow`` shorthands are supported too.
 
 .. _CSS Flexible Box Layout Module Level 1: https://www.w3.org/TR/css-flexbox-1/
+
+CSS Basic User Interface Module Level 3/4
++++++++++++++++++++++++++++++++++++++++++
+
+The `CSS Basic User Interface Module Level 3/4`_ "enables authors to style user
+interface related properties and values."
+
+The ``outline-width``, ``outline-style``, ``outline-color`` properties and the
+``outline`` shorthand are supported. The ``outline-offset`` property is **not**
+supported.
+
+The ``resize``, ``cursor``, ``caret-*`` and ``nav-*`` properties are **not**
+supported.
+
+The ``appearance`` property is supported. When set to ``auto``, it renders
+HTML form fields as interactive PDF form fields (supported for text inputs,
+check boxes and text areas only).
+
+The ``accent-color`` property is **not** supported.
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -13,7 +13,7 @@
 import pytest
 from PIL import Image
 from weasyprint import CSS, HTML, __main__, default_url_fetcher
-from weasyprint.links import resolve_links
+from weasyprint.pdf.anchors import resolve_links
 from weasyprint.urls import path2url
 
 from .draw import parse_pixels
@@ -464,6 +464,30 @@ def test_partial_pdf_custom_metadata():
     assert b'value' in stdout
 
 
+@pytest.mark.parametrize('html, field', (
+    (b'<input>', b'/Tx'),
+    (b'<input type="checkbox">', b'/Btn'),
+    (b'<textarea></textarea>', b'/Tx'),
+))
+def test_pdf_inputs(html, field):
+    stdout = _run('--pdf-forms - -', html)
+    assert b'AcroForm' in stdout
+    assert field in stdout
+    stdout = _run('- -', html)
+    assert b'AcroForm' not in stdout
+
+
+@pytest.mark.parametrize('css, with_forms, without_forms', (
+    ('appearance: auto', True, True),
+    ('appearance: none', False, False),
+    ('', True, False),
+))
+def test_appearance(css, with_forms, without_forms):
+    html = f'<input style="{css}">'.encode()
+    assert (b'AcroForm' in _run('--pdf-forms - -', html)) is with_forms
+    assert (b'AcroForm' in _run('- -', html)) is without_forms
+
+
 def test_reproducible():
     os.environ['SOURCE_DATE_EPOCH'] = '0'
     stdout1 = _run('- -', b'<body>a<img src=pattern.png>')

diff --git a/tests/test_css.py b/tests/test_css.py
@@ -64,7 +64,8 @@ def test_expand_shorthands():
 @assert_no_logs
 def test_annotate_document():
     document = FakeHTML(resource_filename('doc1.html'))
-    document._ua_stylesheets = lambda: [CSS(resource_filename('mini_ua.css'))]
+    document._ua_stylesheets = (
+        lambda *_, **__: [CSS(resource_filename('mini_ua.css'))])
     style_for = get_all_computed_styles(
         document, user_stylesheets=[CSS(resource_filename('user.css'))])
 

diff --git a/tests/testing_utils.py b/tests/testing_utils.py
@@ -13,6 +13,7 @@
 from weasyprint.css.counters import CounterStyle
 from weasyprint.css.targets import TargetCollector
 from weasyprint.formatting_structure import boxes, build
+from weasyprint.html import HTML5_UA_STYLESHEET
 from weasyprint.logger import LOGGER
 from weasyprint.urls import path2url
 
@@ -47,8 +48,10 @@
 
 class FakeHTML(HTML):
     """Like weasyprint.HTML, but with a lighter UA stylesheet."""
-    def _ua_stylesheets(self):
-        return [TEST_UA_STYLESHEET]
+    def _ua_stylesheets(self, forms=False):
+        return [
+            TEST_UA_STYLESHEET if stylesheet == HTML5_UA_STYLESHEET
+            else stylesheet for stylesheet in super()._ua_stylesheets(forms)]
 
 
 def resource_filename(basename):

diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py
@@ -106,7 +106,9 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None,
             result, content_language=None)
         self.etree_element = self.wrapper_element.etree_element
 
-    def _ua_stylesheets(self):
+    def _ua_stylesheets(self, forms=False):
+        if forms:
+            return [HTML5_UA_STYLESHEET, HTML5_UA_FORM_STYLESHEET]
         return [HTML5_UA_STYLESHEET]
 
     def _ua_counter_style(self):
@@ -117,7 +119,7 @@ def _ph_stylesheets(self):
 
     def render(self, stylesheets=None, presentational_hints=False,
                optimize_size=('fonts',), font_config=None, counter_style=None,
-               image_cache=None):
+               image_cache=None, forms=False):
         """Lay out and paginate the document, but do not (yet) export it.
 
         This returns a :class:`document.Document` object which provides
@@ -137,18 +139,20 @@ def render(self, stylesheets=None, presentational_hints=False,
         :type counter_style: :class:`css.counters.CounterStyle`
         :param counter_style: A dictionary storing ``@counter-style`` rules.
         :param dict image_cache: A dictionary used to cache images.
+        :param bool forms: Whether PDF forms have to be included.
         :returns: A :class:`document.Document` object.
 
         """
         return Document._render(
-            self, stylesheets, presentational_hints,
-            optimize_size, font_config, counter_style, image_cache)
+            self, stylesheets, presentational_hints, optimize_size,
+            font_config, counter_style, image_cache, forms)
 
     def write_pdf(self, target=None, stylesheets=None, zoom=1,
                   attachments=None, finisher=None, presentational_hints=False,
                   optimize_size=('fonts',), font_config=None,
                   counter_style=None, image_cache=None, identifier=None,
-                  variant=None, version=None, custom_metadata=False):
+                  variant=None, version=None, forms=False,
+                  custom_metadata=False):
         """Render the document to a PDF file.
 
         This is a shortcut for calling :meth:`render`, then
@@ -186,8 +190,9 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1,
         :param bytes identifier: A bytestring used as PDF file identifier.
         :param str variant: A PDF variant name.
         :param str version: A PDF version number.
-        :param bool custom_metadata: A boolean defining whether custom HTML
-            metadata should be stored in the generated PDF.
+        :param bool forms: Whether PDF forms have to be included.
+        :param bool custom_metadata: Whether custom HTML metadata should be
+            stored in the generated PDF.
         :returns:
             The PDF as :obj:`bytes` if ``target`` is not provided or
             :obj:`None`, otherwise :obj:`None` (the PDF is written to
@@ -197,7 +202,7 @@ def write_pdf(self, target=None, stylesheets=None, zoom=1,
         return (
             self.render(
                 stylesheets, presentational_hints, optimize_size, font_config,
-                counter_style, image_cache)
+                counter_style, image_cache, forms)
             .write_pdf(
                 target, zoom, attachments, finisher, identifier, variant,
                 version, custom_metadata))
@@ -335,5 +340,6 @@ def _select_source(guess=None, filename=None, url=None, file_obj=None,
 # Work around circular imports.
 from .css import preprocess_stylesheet  # noqa isort:skip
 from .html import (  # noqa isort:skip
-    HTML5_UA_COUNTER_STYLE, HTML5_UA_STYLESHEET, HTML5_PH_STYLESHEET)
+    HTML5_UA_COUNTER_STYLE, HTML5_UA_STYLESHEET, HTML5_UA_FORM_STYLESHEET,
+    HTML5_PH_STYLESHEET)
 from .document import Document, Page  # noqa isort:skip
diff --git a/weasyprint/__main__.py b/weasyprint/__main__.py
@@ -144,6 +144,8 @@ def main(argv=None, stdout=None, stdin=None):
     parser.add_argument(
         '--pdf-variant', choices=VARIANTS, help='PDF variant to generate')
     parser.add_argument('--pdf-version', help='PDF version number')
+    parser.add_argument(
+        '--pdf-forms', action='store_true', help='Include PDF forms')
     parser.add_argument(
         '--custom-metadata', action='store_true',
         help='include custom HTML meta tags in PDF metadata')
@@ -199,6 +201,7 @@ def main(argv=None, stdout=None, stdin=None):
         'identifier': args.pdf_identifier,
         'variant': args.pdf_variant,
         'version': args.pdf_version,
+        'forms': args.pdf_forms,
         'custom_metadata': args.custom_metadata,
     }
 

diff --git a/weasyprint/links.py → weasyprint/anchors.py b/weasyprint/links.py → weasyprint/anchors.py
@@ -1,53 +1,12 @@
-"""PDF links and bookmarks management."""
+"""Find anchors, links, bookmarks and inputs in documents."""
 
 import math
 
 from .formatting_structure import boxes
 from .layout.percent import percentage
-from .logger import LOGGER
 from .matrix import Matrix
 
 
-def resolve_links(pages):
-    """Resolve internal hyperlinks.
-
-    Links to a missing anchor are removed with a warning.
-
-    If multiple anchors have the same name, the first one is used.
-
-    :returns:
-        A generator yielding lists (one per page) like :attr:`Page.links`,
-        except that ``target`` for internal hyperlinks is
-        ``(page_number, x, y)`` instead of an anchor name.
-        The page number is a 0-based index into the :attr:`pages` list,
-        and ``x, y`` are in CSS pixels from the top-left of the page.
-
-    """
-    anchors = set()
-    paged_anchors = []
-    for i, page in enumerate(pages):
-        paged_anchors.append([])
-        for anchor_name, (point_x, point_y) in page.anchors.items():
-            if anchor_name not in anchors:
-                paged_anchors[-1].append((anchor_name, point_x, point_y))
-                anchors.add(anchor_name)
-    for page in pages:
-        page_links = []
-        for link in page.links:
-            link_type, anchor_name, _, _ = link
-            if link_type == 'internal':
-                if anchor_name not in anchors:
-                    LOGGER.error(
-                        'No anchor #%s for internal URI reference',
-                        anchor_name)
-                else:
-                    page_links.append(link)
-            else:
-                # External link
-                page_links.append(link)
-        yield page_links, paged_anchors.pop(0)
-
-
 def rectangle_aabb(matrix, pos_x, pos_y, width, height):
     """Apply a transformation matrix to an axis-aligned rectangle.
 
@@ -68,8 +27,12 @@ def rectangle_aabb(matrix, pos_x, pos_y, width, height):
     return box_x1, box_y1, box_x2, box_y2
 
 
-def gather_links_and_bookmarks(box, anchors, links, bookmarks,
-                               parent_matrix=None):
+def gather_anchors(box, anchors, links, bookmarks, inputs, parent_matrix=None):
+    """Gather anchors and other data related to specific positions in PDF.
+
+    Currently finds anchors, links, bookmarks and inputs.
+
+    """
     # Get box transformation matrix.
     # "Transforms apply to block-level and atomic inline-level elements,
     #  but do not apply to elements which may be split into
@@ -124,19 +87,26 @@ def gather_links_and_bookmarks(box, anchors, links, bookmarks,
     has_link = link and not isinstance(box, (boxes.TextBox, boxes.LineBox))
     # In case of duplicate IDs, only the first is an anchor.
     has_anchor = anchor_name and anchor_name not in anchors
+    is_input = box.is_input()
 
-    if has_bookmark or has_link or has_anchor:
-        pos_x, pos_y, width, height = box.hit_area()
+    if has_bookmark or has_link or has_anchor or is_input:
+        if is_input:
+            pos_x, pos_y = box.content_box_x(), box.content_box_y()
+            width, height = box.width, box.height
+        else:
+            pos_x, pos_y, width, height = box.hit_area()
+        if has_link or is_input:
+            rectangle = rectangle_aabb(matrix, pos_x, pos_y, width, height)
         if has_link:
             token_type, link = link
             assert token_type == 'url'
             link_type, target = link
             assert isinstance(target, str)
             if link_type == 'external' and box.is_attachment():
                 link_type = 'attachment'
-            rectangle = rectangle_aabb(matrix, pos_x, pos_y, width, height)
-            link = (link_type, target, rectangle, box)
-            links.append(link)
+            links.append((link_type, target, rectangle, box))
+        if is_input:
+            inputs.append((box.element, box.style, rectangle))
         if matrix and (has_bookmark or has_anchor):
             pos_x, pos_y = matrix.transform_point(pos_x, pos_y)
         if has_bookmark:
@@ -146,7 +116,7 @@ def gather_links_and_bookmarks(box, anchors, links, bookmarks,
             anchors[anchor_name] = pos_x, pos_y
 
     for child in box.all_children():
-        gather_links_and_bookmarks(child, anchors, links, bookmarks, matrix)
+        gather_anchors(child, anchors, links, bookmarks, inputs, matrix)
 
 
 def make_page_bookmark_tree(page, skipped_levels, last_by_depth,

diff --git a/weasyprint/css/__init__.py b/weasyprint/css/__init__.py
@@ -1096,7 +1096,7 @@ def preprocess_stylesheet(device_media_type, base_url, stylesheet_rules,
 def get_all_computed_styles(html, user_stylesheets=None,
                             presentational_hints=False, font_config=None,
                             counter_style=None, page_rules=None,
-                            target_collector=None):
+                            target_collector=None, forms=False):
     """Compute all the computed styles of all elements in ``html`` document.
 
     Do everything from finding author stylesheets to parsing and applying them.
@@ -1112,7 +1112,7 @@ def get_all_computed_styles(html, user_stylesheets=None,
     for style in html._ua_counter_style():
         for key, value in style.items():
             counter_style[key] = value
-    for sheet in (html._ua_stylesheets() or []):
+    for sheet in (html._ua_stylesheets(forms) or []):
         sheets.append((sheet, 'user agent', None))
     if presentational_hints:
         for sheet in (html._ph_stylesheets() or []):

diff --git a/weasyprint/css/computed_values.py b/weasyprint/css/computed_values.py
@@ -748,7 +748,7 @@ def strut_layout(style, context=None):
         if key in context.strut_layouts:
             return context.strut_layouts[key]
 
-    layout = Layout(context, style['font_size'], style)
+    layout = Layout(context, style)
     layout.set_text(' ')
     line, _ = layout.get_first_line()
     _, _, _, _, text_height, baseline = first_line_metrics(
@@ -782,11 +782,10 @@ def character_ratio(style, character):
     style = style.copy()
     style['letter_spacing'] = 'normal'
     style['word_spacing'] = 0
-
     # Random big value
-    font_size = 1000
+    style['font_size'] = 1000
 
-    layout = Layout(context=None, font_size=font_size, style=style)
+    layout = Layout(context=None, style=style)
     layout.set_text(character)
     line, _ = layout.get_first_line()
 
@@ -802,6 +801,6 @@ def character_ratio(style, character):
 
     # Zero means some kind of failure, fallback is 0.5.
     # We round to try keeping exact values that were altered by Pango.
-    ratio = round(measure / font_size, 5) or 0.5
+    ratio = round(measure / style['font_size'], 5) or 0.5
     cache[cache_key] = ratio
     return ratio