Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VeraPDF validation of PDF/A files #92

Merged
merged 1 commit into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,29 @@ RUN --mount=type=cache,target=/go/pkg/mod \
-o /out/preprocessing-worker \
./cmd/worker

# Build worker image
FROM alpine:3.20 AS preprocessing-worker
RUN apk add --update --no-cache libxml2-utils

# Copy the JRE (Eclipse Temurin v11) from the verapdf/cli image
ENV JAVA_HOME=/opt/java/openjdk
ENV PATH="${JAVA_HOME}/bin:${PATH}"
COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link $JAVA_HOME $JAVA_HOME

ARG USER_ID=1000
ARG GROUP_ID=1000
RUN addgroup -g ${GROUP_ID} -S preprocessing
RUN adduser -u ${USER_ID} -S -D preprocessing preprocessing

# Make preprocessing the owner of the verapdf log dir
RUN mkdir --parents /var/opt/verapdf/logs && chown -R preprocessing:preprocessing /var/opt/verapdf

USER preprocessing

COPY --from=build-preprocessing-worker --link /out/preprocessing-worker /home/preprocessing/bin/preprocessing-worker
RUN mkdir /home/preprocessing/shared

# Copy the veraPDF application from the verapdf/cli image (tag v1.27.96)
COPY --from=ghcr.io/verapdf/cli:v1.27.96 --link /opt/verapdf/ /opt/verapdf/

CMD ["/home/preprocessing/bin/preprocessing-worker"]
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ include hack/make/dep_golines.mk
include hack/make/dep_gomajor.mk
include hack/make/dep_gosec.mk
include hack/make/dep_gotestsum.mk
include hack/make/dep_mockgen.mk
include hack/make/dep_shfmt.mk
include hack/make/dep_tparse.mk
include hack/make/enums.mk
Expand All @@ -30,6 +31,7 @@ TOOLS = $(GOLANGCI_LINT) \
$(GOMAJOR) \
$(GOSEC) \
$(GOTESTSUM) \
$(MOCKGEN) \
$(SHFMT) \
$(TPARSE)

Expand All @@ -40,7 +42,9 @@ endef

IGNORED_PACKAGES := \
github.com/artefactual-sdps/preprocessing-sfa/hack/% \
github.com/artefactual-sdps/preprocessing-sfa/internal/%/fake \
github.com/artefactual-sdps/preprocessing-sfa/internal/enums

PACKAGES := $(shell go list ./...)
TEST_PACKAGES := $(filter-out $(IGNORED_PACKAGES),$(PACKAGES))
TEST_IGNORED_PACKAGES := $(filter $(IGNORED_PACKAGES),$(PACKAGES))
Expand All @@ -55,6 +59,11 @@ deps: # @HELP List available module dependency updates.
deps: $(GOMAJOR)
gomajor list

gen-mock: # @HELP Generate mocks.
gen-mock: $(MOCKGEN)
mockgen -typed -destination=./internal/fformat/fake/mock_identifier.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fformat Identifier
mockgen -typed -destination=./internal/fvalidate/fake/mock_validator.go -package=fake github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate Validator

golines: # @HELP Run the golines formatter to fix long lines.
golines: GOLINES_OUT_MODE ?= write-output
golines: $(GOLINES)
Expand Down
9 changes: 9 additions & 0 deletions cmd/worker/workercmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

"github.com/artefactual-sdps/preprocessing-sfa/internal/activities"
"github.com/artefactual-sdps/preprocessing-sfa/internal/config"
"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate"
"github.com/artefactual-sdps/preprocessing-sfa/internal/workflow"
)

Expand Down Expand Up @@ -78,6 +80,13 @@
ffvalidate.New(m.cfg.FileFormat).Execute,
temporalsdk_activity.RegisterOptions{Name: ffvalidate.Name},
)
w.RegisterActivityWithOptions(
activities.NewValidateFiles(
fformat.NewSiegfriedEmbed(),
fvalidate.NewVeraPDFValidator(m.cfg.FileValidate.VeraPDF.Path, m.logger),
).Execute,
temporalsdk_activity.RegisterOptions{Name: activities.ValidateFilesName},
)

Check warning on line 89 in cmd/worker/workercmd/cmd.go

View check run for this annotation

Codecov / codecov/patch

cmd/worker/workercmd/cmd.go#L83-L89

Added lines #L83 - L89 were not covered by tests
w.RegisterActivityWithOptions(
activities.NewAddPREMISObjects(rand.Reader).Execute,
temporalsdk_activity.RegisterOptions{Name: activities.AddPREMISObjectsName},
Expand Down
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@ require (
github.com/google/uuid v1.6.0
github.com/hashicorp/go-cleanhttp v0.5.2
github.com/oklog/run v1.1.0
github.com/richardlehane/siegfried v1.11.1
github.com/spf13/pflag v1.0.5
github.com/spf13/viper v1.18.2
github.com/stretchr/testify v1.9.0
go.artefactual.dev/tools v0.14.0
go.temporal.io/sdk v1.26.1
go.uber.org/mock v0.4.0
gocloud.dev v0.39.0
gotest.tools/v3 v3.5.1
)
Expand Down Expand Up @@ -67,7 +69,6 @@ require (
github.com/richardlehane/match v1.0.5 // indirect
github.com/richardlehane/mscfb v1.0.4 // indirect
github.com/richardlehane/msoleps v1.0.3 // indirect
github.com/richardlehane/siegfried v1.11.1 // indirect
github.com/richardlehane/xmldetect v1.0.2 // indirect
github.com/robfig/cron v1.2.0 // indirect
github.com/ross-spencer/spargo v0.4.1 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@ go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.uber.org/mock v0.4.0 h1:VcM4ZOtdbR4f6VXfiOpwpVJDL6lCReaZ6mw31wqh7KU=
go.uber.org/mock v0.4.0/go.mod h1:a6FSlNadKUHUa9IP5Vyt1zh4fC7uAwxMutEAscFbkZc=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
Expand Down
3 changes: 3 additions & 0 deletions hack/kube/overlays/dev/preprocessing-secret.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ stringData:
[fileformat]
allowlistPath = "/home/preprocessing/.config/allowed_file_formats.csv"
[filevalidate.verapdf]
path = "/opt/verapdf/verapdf"
allowed_file_formats.csv: |
Format name,PRONOM PUID
text,x-fmt/16
Expand Down
16 changes: 16 additions & 0 deletions hack/make/dep_mockgen.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
$(call _assert_var,MAKEDIR)
$(call _conditional_include,$(MAKEDIR)/base.mk)
$(call _assert_var,CACHE_VERSIONS)
$(call _assert_var,CACHE_BIN)

MOCKGEN_VERSION ?= 0.4.0

# MOCKGEN is a version-stamped marker file: its path embeds MOCKGEN_VERSION, so
# a different version resolves to a different (missing) target and the rule
# below runs again.
MOCKGEN := $(CACHE_VERSIONS)/mockgen/$(MOCKGEN_VERSION)
# Install mockgen at the pinned version into $(CACHE_BIN), then recreate the
# marker directory and touch the marker file to record the installed version.
$(MOCKGEN):
rm -f $(CACHE_BIN)/mockgen
mkdir -p $(CACHE_BIN)
env GOBIN=$(CACHE_BIN) go install go.uber.org/mock/mockgen@v$(MOCKGEN_VERSION)
chmod +x $(CACHE_BIN)/mockgen
rm -rf $(dir $(MOCKGEN))
mkdir -p $(dir $(MOCKGEN))
touch $(MOCKGEN)
129 changes: 129 additions & 0 deletions internal/activities/validate_files.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
package activities

import (
"context"
"errors"
"fmt"
"io/fs"
"path/filepath"
"slices"

"go.artefactual.dev/tools/temporal"

"github.com/artefactual-sdps/preprocessing-sfa/internal/fformat"
"github.com/artefactual-sdps/preprocessing-sfa/internal/fvalidate"
"github.com/artefactual-sdps/preprocessing-sfa/internal/sip"
)

// ValidateFilesName is the name used to identify the ValidateFiles activity.
const ValidateFilesName = "validate-files"

// ValidateFiles is an activity that identifies the format of the files in a
// SIP and runs the configured format validators against them.
type ValidateFiles struct {
	// identifier determines the format of each file.
	identifier fformat.Identifier
	// validators are tried in order against the SIP content.
	validators []fvalidate.Validator
}

// ValidateFilesParams is the input of ValidateFiles.Execute.
type ValidateFilesParams struct {
	// SIP is the package whose content is validated.
	SIP sip.SIP
}

// ValidateFilesResult is the output of ValidateFiles.Execute.
type ValidateFilesResult struct {
	// Failures holds the non-empty output of each validator that ran.
	Failures []string
}

// fileFormats maps a file path to the format identified for that file.
type fileFormats map[string]*fformat.FileFormat

// NewValidateFiles returns a ValidateFiles activity that uses idr to identify
// file formats and vdrs, in the order given, to validate the files.
func NewValidateFiles(idr fformat.Identifier, vdrs ...fvalidate.Validator) *ValidateFiles {
	activity := new(ValidateFiles)
	activity.identifier = idr
	activity.validators = vdrs

	return activity
}

// Execute validates SIP files against a file format specification. The
// only format validator currently implemented verapdf for PDF/A.
func (a *ValidateFiles) Execute(ctx context.Context, params *ValidateFilesParams) (*ValidateFilesResult, error) {
formats, err := a.identifyFormats(ctx, params.SIP)
if err != nil {
return nil, fmt.Errorf("identifyFormats: %v", err)
}

Check warning on line 48 in internal/activities/validate_files.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/validate_files.go#L47-L48

Added lines #L47 - L48 were not covered by tests

failures, err := a.validateFiles(params.SIP, formats)
if err != nil {
return nil, fmt.Errorf("validateFiles: %v", err)
}

return &ValidateFilesResult{Failures: failures}, nil
}

func (a *ValidateFiles) identifyFormats(ctx context.Context, sip sip.SIP) (fileFormats, error) {
logger := temporal.GetLogger(ctx)
formats := make(fileFormats)
err := filepath.WalkDir(sip.ContentPath, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}

Check warning on line 64 in internal/activities/validate_files.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/validate_files.go#L63-L64

Added lines #L63 - L64 were not covered by tests

if ctx.Err() != nil {
return errors.New("context cancelled")
}

Check warning on line 68 in internal/activities/validate_files.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/validate_files.go#L67-L68

Added lines #L67 - L68 were not covered by tests

if d.IsDir() {
return nil
}

ff, err := a.identifier.Identify(path)
if err != nil {
logger.Info("format identication failed", "path", path)
} else {
formats[path] = ff
}

return nil
})
if err != nil {
return nil, err
}

Check warning on line 85 in internal/activities/validate_files.go

View check run for this annotation

Codecov / codecov/patch

internal/activities/validate_files.go#L84-L85

Added lines #L84 - L85 were not covered by tests

return formats, nil
}

// validateFiles runs every configured validator against the SIP content path
// and collects the non-empty outputs. The first validator error stops the
// loop and is returned with a nil slice.
func (a *ValidateFiles) validateFiles(
	sip sip.SIP,
	files fileFormats,
) ([]string, error) {
	var failures []string

	for i := range a.validators {
		report, err := validate(a.validators[i], sip.ContentPath, files)
		switch {
		case err != nil:
			return nil, err
		case report != "":
			failures = append(failures, report)
		}
	}

	return failures, nil
}

// validate runs v against path when at least one entry in files has a format
// ID listed by v.FormatIDs. It returns an empty string and no error when no
// file matches, and an empty string with the error when validation fails.
func validate(v fvalidate.Validator, path string, files fileFormats) (string, error) {
	supportedIDs := v.FormatIDs()

	// Look for any file this validator knows how to handle.
	matched := false
	for _, ff := range files {
		if matched = slices.Contains(supportedIDs, ff.ID); matched {
			break
		}
	}
	if !matched {
		return "", nil
	}

	report, err := v.Validate(path)
	if err == nil {
		return report, nil
	}

	return "", err
}
Loading
Loading