Skip to content

Commit 87dc8d9

Browse files
committed
Add veraPDF validation of PDF/A files
- Add verapdf and the JRE to the worker Docker image - Add a validate_files activity to identify SIP file formats then validate the file formats for which we have a validator - Copy siegfried_embed and the format Identifier interface from https://github.com/artefactual-sdps/temporal-activities - Add the fvalidate package and Validator interface - Add a veraPDF implementation of the Validator interface - Run veraPDF in "batch" mode to minimize startup overheads - Add file validation configuration to config file - Add processing events for file validation success and failure - Add veraPDF binary path to kube dev overlay
1 parent 451bb29 commit 87dc8d9

23 files changed

+1640
-11
lines changed

Dockerfile

+13
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,29 @@ RUN --mount=type=cache,target=/go/pkg/mod \
2222
-o /out/preprocessing-worker \
2323
./cmd/worker
2424

25+
# Build worker image
2526
FROM alpine:3.20 AS preprocessing-worker
2627
RUN apk add --update --no-cache libxml2-utils
2728

29+
# Copy the JRE (Eclipse Temurin v11) from the verapdf/cli image
30+
ENV JAVA_HOME=/opt/java/openjdk
31+
ENV PATH="${JAVA_HOME}/bin:${PATH}"
32+
COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link $JAVA_HOME $JAVA_HOME
33+
2834
ARG USER_ID=1000
2935
ARG GROUP_ID=1000
3036
RUN addgroup -g ${GROUP_ID} -S preprocessing
3137
RUN adduser -u ${USER_ID} -S -D preprocessing preprocessing
3238

39+
# Make preprocessing the owner of the verapdf log dir
40+
RUN mkdir --parents /var/opt/verapdf/logs && chown -R preprocessing:preprocessing /var/opt/verapdf
41+
3342
USER preprocessing
43+
3444
COPY --from=build-preprocessing-worker --link /out/preprocessing-worker /home/preprocessing/bin/preprocessing-worker
3545
RUN mkdir /home/preprocessing/shared
3646

47+
# Copy the veraPDF application (v1.27.96) from the verapdf/cli image
48+
COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link /opt/verapdf/ /opt/verapdf/
49+
3750
CMD ["/home/preprocessing/bin/preprocessing-worker"]

Makefile

+9
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ include hack/make/dep_golines.mk
2121
include hack/make/dep_gomajor.mk
2222
include hack/make/dep_gosec.mk
2323
include hack/make/dep_gotestsum.mk
24+
include hack/make/dep_mockgen.mk
2425
include hack/make/dep_shfmt.mk
2526
include hack/make/dep_tparse.mk
2627
include hack/make/enums.mk
@@ -30,6 +31,7 @@ TOOLS = $(GOLANGCI_LINT) \
3031
$(GOMAJOR) \
3132
$(GOSEC) \
3233
$(GOTESTSUM) \
34+
$(MOCKGEN) \
3335
$(SHFMT) \
3436
$(TPARSE)
3537

@@ -40,7 +42,9 @@ endef
4042

4143
IGNORED_PACKAGES := \
4244
github.com/artefactual-sdps/preprocessing-sfa/hack/% \
45+
github.com/artefactual-sdps/preprocessing-sfa/internal/%/fake \
4346
github.com/artefactual-sdps/preprocessing-sfa/internal/enums
47+
4448
PACKAGES := $(shell go list ./...)
4549
TEST_PACKAGES := $(filter-out $(IGNORED_PACKAGES),$(PACKAGES))
4650
TEST_IGNORED_PACKAGES := $(filter $(IGNORED_PACKAGES),$(PACKAGES))
@@ -55,6 +59,11 @@ deps: # @HELP List available module dependency updates.
5559
deps: $(GOMAJOR)
5660
gomajor list
5761

62+
gen-mock: # @HELP Generate mocks.
63+
gen-mock: $(MOCKGEN)
64+
mockgen -typed -destination=./internal/fformat/fake/mock_identifier.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fformat Identifier
65+
mockgen -typed -destination=./internal/fvalidate/fake/mock_validator.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate Validator
66+
5867
golines: # @HELP Run the golines formatter to fix long lines.
5968
golines: GOLINES_OUT_MODE ?= write-output
6069
golines: $(GOLINES)

cmd/worker/workercmd/cmd.go

+9
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ import (
1717

1818
"github.com/artefactual-sdps/preprocessing-sfa/internal/activities"
1919
"github.com/artefactual-sdps/preprocessing-sfa/internal/config"
20+
"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
21+
"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate"
2022
"github.com/artefactual-sdps/preprocessing-sfa/internal/workflow"
2123
)
2224

@@ -78,6 +80,13 @@ func (m *Main) Run(ctx context.Context) error {
7880
ffvalidate.New(m.cfg.FileFormat).Execute,
7981
temporalsdk_activity.RegisterOptions{Name: ffvalidate.Name},
8082
)
83+
w.RegisterActivityWithOptions(
84+
activities.NewValidateFiles(
85+
fformat.NewSiegfriedEmbed(),
86+
fvalidate.NewVeraPDFValidator(m.cfg.FileValidate.VeraPDF.Path, m.logger),
87+
).Execute,
88+
temporalsdk_activity.RegisterOptions{Name: activities.ValidateFilesName},
89+
)
8190
w.RegisterActivityWithOptions(
8291
activities.NewAddPREMISObjects(rand.Reader).Execute,
8392
temporalsdk_activity.RegisterOptions{Name: activities.AddPREMISObjectsName},

go.mod

+2-1
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@ require (
1111
github.com/google/uuid v1.6.0
1212
github.com/hashicorp/go-cleanhttp v0.5.2
1313
github.com/oklog/run v1.1.0
14+
github.com/richardlehane/siegfried v1.11.1
1415
github.com/spf13/pflag v1.0.5
1516
github.com/spf13/viper v1.18.2
1617
github.com/stretchr/testify v1.9.0
1718
go.artefactual.dev/tools v0.14.0
1819
go.temporal.io/sdk v1.26.1
20+
go.uber.org/mock v0.4.0
1921
gocloud.dev v0.39.0
2022
gotest.tools/v3 v3.5.1
2123
)
@@ -67,7 +69,6 @@ require (
6769
github.com/richardlehane/match v1.0.5 // indirect
6870
github.com/richardlehane/mscfb v1.0.4 // indirect
6971
github.com/richardlehane/msoleps v1.0.3 // indirect
70-
github.com/richardlehane/siegfried v1.11.1 // indirect
7172
github.com/richardlehane/xmldetect v1.0.2 // indirect
7273
github.com/robfig/cron v1.2.0 // indirect
7374
github.com/ross-spencer/spargo v0.4.1 // indirect

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,8 @@ go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
266266
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
267267
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
268268
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
269+
go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU=
270+
go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc=
269271
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
270272
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
271273
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=

hack/kube/overlays/dev/preprocessing-secret.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ stringData:
5050
[fileformat]
5151
allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv"
5252
53+
[filevalidate.verapdf]
54+
path = "/opt/verapdf/verapdf"
55+
5356
allowed_file_formats.csv: |
5457
Format name,PRONOM PUID
5558
text,x-fmt/16

hack/make/dep_mockgen.mk

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
$(call _assert_var,MAKEDIR)
2+
$(call _conditional_include,$(MAKEDIR)/base.mk)
3+
$(call _assert_var,CACHE_VERSIONS)
4+
$(call _assert_var,CACHE_BIN)
5+
6+
MOCKGEN_VERSION ?= 0.4.0
7+
8+
MOCKGEN := $(CACHE_VERSIONS)/mockgen/$(MOCKGEN_VERSION)
9+
$(MOCKGEN):
10+
rm -f $(CACHE_BIN)/mockgen
11+
mkdir -p $(CACHE_BIN)
12+
env GOBIN=$(CACHE_BIN) go install go.uber.org/mock/mockgen@v$(MOCKGEN_VERSION)
13+
chmod +x $(CACHE_BIN)/mockgen
14+
rm -rf $(dir $(MOCKGEN))
15+
mkdir -p $(dir $(MOCKGEN))
16+
touch $(MOCKGEN)

internal/activities/validate_files.go

+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
package activities
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"io/fs"
8+
"path/filepath"
9+
"slices"
10+
11+
"go.artefactual.dev/tools/temporal"
12+
13+
"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
14+
"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate"
15+
"github.com/artefactual-sdps/preprocessing-sfa/internal/sip"
16+
)
17+
18+
// ValidateFilesName is the name under which the ValidateFiles activity is
// registered with the worker.
const ValidateFilesName = "validate-files"
19+
20+
type (
21+
ValidateFiles struct {
22+
identifier fformat.Identifier
23+
validators []fvalidate.Validator
24+
}
25+
ValidateFilesParams struct {
26+
SIP sip.SIP
27+
}
28+
ValidateFilesResult struct {
29+
Failures []string
30+
}
31+
)
32+
33+
// fileFormats maps a file path to the format identified for that file.
type fileFormats map[string]*fformat.FileFormat
34+
35+
func NewValidateFiles(idr fformat.Identifier, vdrs ...fvalidate.Validator) *ValidateFiles {
36+
return &ValidateFiles{
37+
identifier: idr,
38+
validators: vdrs,
39+
}
40+
}
41+
42+
// Execute validates SIP files against a file format specification. The
43+
// only format validator currently implemented verapdf for PDF/A.
44+
func (a *ValidateFiles) Execute(ctx context.Context, params *ValidateFilesParams) (*ValidateFilesResult, error) {
45+
formats, err := a.identifyFormats(ctx, params.SIP)
46+
if err != nil {
47+
return nil, fmt.Errorf("identifyFormats: %v", err)
48+
}
49+
50+
failures, err := a.validateFiles(params.SIP, formats)
51+
if err != nil {
52+
return nil, fmt.Errorf("validateFiles: %v", err)
53+
}
54+
55+
return &ValidateFilesResult{Failures: failures}, nil
56+
}
57+
58+
func (a *ValidateFiles) identifyFormats(ctx context.Context, sip sip.SIP) (fileFormats, error) {
59+
logger := temporal.GetLogger(ctx)
60+
formats := make(fileFormats)
61+
err := filepath.WalkDir(sip.ContentPath, func(path string, d fs.DirEntry, err error) error {
62+
if err != nil {
63+
return err
64+
}
65+
66+
if ctx.Err() != nil {
67+
return errors.New("context cancelled")
68+
}
69+
70+
if d.IsDir() {
71+
return nil
72+
}
73+
74+
ff, err := a.identifier.Identify(path)
75+
if err != nil {
76+
logger.Info("format identication failed", "path", path)
77+
} else {
78+
formats[path] = ff
79+
}
80+
81+
return nil
82+
})
83+
if err != nil {
84+
return nil, err
85+
}
86+
87+
return formats, nil
88+
}
89+
90+
func (a *ValidateFiles) validateFiles(
91+
sip sip.SIP,
92+
files fileFormats,
93+
) ([]string, error) {
94+
var failures []string
95+
for _, v := range a.validators {
96+
out, err := validate(v, sip.ContentPath, files)
97+
if err != nil {
98+
return nil, err
99+
}
100+
if out != "" {
101+
failures = append(failures, out)
102+
}
103+
}
104+
105+
return failures, nil
106+
}
107+
108+
func validate(v fvalidate.Validator, path string, files fileFormats) (string, error) {
109+
var canValidate bool
110+
allowedIds := v.FormatIDs()
111+
112+
for _, f := range files {
113+
if slices.Contains(allowedIds, f.ID) {
114+
canValidate = true
115+
break
116+
}
117+
}
118+
119+
if !canValidate {
120+
return "", nil
121+
}
122+
123+
out, err := v.Validate(path)
124+
if err != nil {
125+
return "", err
126+
}
127+
128+
return out, nil
129+
}

0 commit comments

Comments
 (0)