diff --git a/.dockerignore b/.dockerignore index 205d13799..12cf92034 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,3 +5,4 @@ !go.mod !go.sum !main.go +!hack/xsd/premis.xsd diff --git a/Dockerfile b/Dockerfile index fb09f4372..0fd8207b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,6 +3,9 @@ ARG TARGET=enduro ARG GO_VERSION +FROM golang:${GO_VERSION}-bookworm AS build-libxml +RUN apt-get update && apt-get install -y --no-install-recommends libxml2-utils + FROM golang:${GO_VERSION}-alpine AS build-go WORKDIR /src ENV CGO_ENABLED=0 @@ -61,10 +64,34 @@ FROM base AS enduro-a3m-worker COPY --from=build-enduro-a3m-worker --link /out/enduro-a3m-worker /home/enduro/bin/enduro-a3m-worker COPY --from=build-enduro-a3m-worker --link /src/enduro.toml /home/enduro/.config/enduro.toml CMD ["/home/enduro/bin/enduro-a3m-worker", "--config", "/home/enduro/.config/enduro.toml"] +COPY hack/xsd/premis.xsd /home/enduro/premis.xsd +COPY --from=build-libxml /usr/bin/xmllint /usr/bin/xmllint +COPY --from=build-libxml /lib/x86_64-linux-gnu/libxml2.so.2 /lib/x86_64-linux-gnu/libxml2.so.2 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libc.so.6 /lib/x86_64-linux-gnu/libc.so.6 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libicuuc.so.72 /lib/x86_64-linux-gnu/libicuuc.so.72 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libz.so.1 /lib/x86_64-linux-gnu/libz.so.1 +COPY --from=build-libxml /lib/x86_64-linux-gnu/liblzma.so.5 /lib/x86_64-linux-gnu/liblzma.so.5 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libm.so.6 /lib/x86_64-linux-gnu/libm.so.6 +COPY --from=build-libxml /lib64/ld-linux-x86-64.so.2 /lib64/ld-linux-x86-64.so.2 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libicudata.so.72 /lib/x86_64-linux-gnu/libicudata.so.72 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libstdc++.so.6 /lib/x86_64-linux-gnu/libstdc++.so.6 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libgcc_s.so.1 /lib/x86_64-linux-gnu/libgcc_s.so.1 FROM base AS enduro-am-worker COPY --from=build-enduro-am-worker --link /out/enduro-am-worker /home/enduro/bin/enduro-am-worker COPY --from=build-enduro-am-worker --link /src/enduro.toml /home/enduro/.config/enduro.toml CMD ["/home/enduro/bin/enduro-am-worker", "--config", "/home/enduro/.config/enduro.toml"] +COPY hack/xsd/premis.xsd /home/enduro/premis.xsd +COPY --from=build-libxml /usr/bin/xmllint /usr/bin/xmllint +COPY --from=build-libxml /lib/x86_64-linux-gnu/libxml2.so.2 /lib/x86_64-linux-gnu/libxml2.so.2 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libc.so.6 /lib/x86_64-linux-gnu/libc.so.6 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libicuuc.so.72 /lib/x86_64-linux-gnu/libicuuc.so.72 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libz.so.1 /lib/x86_64-linux-gnu/libz.so.1 +COPY --from=build-libxml /lib/x86_64-linux-gnu/liblzma.so.5 /lib/x86_64-linux-gnu/liblzma.so.5 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libm.so.6 /lib/x86_64-linux-gnu/libm.so.6 +COPY --from=build-libxml /lib64/ld-linux-x86-64.so.2 /lib64/ld-linux-x86-64.so.2 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libicudata.so.72 /lib/x86_64-linux-gnu/libicudata.so.72 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libstdc++.so.6 /lib/x86_64-linux-gnu/libstdc++.so.6 +COPY --from=build-libxml /lib/x86_64-linux-gnu/libgcc_s.so.1 /lib/x86_64-linux-gnu/libgcc_s.so.1 FROM ${TARGET} diff --git a/cmd/enduro-a3m-worker/main.go b/cmd/enduro-a3m-worker/main.go index 3bd0134af..2a4a63efd 100644 --- a/cmd/enduro-a3m-worker/main.go +++ b/cmd/enduro-a3m-worker/main.go @@ -19,6 +19,7 @@ import ( "github.com/artefactual-sdps/temporal-activities/bagvalidate" "github.com/artefactual-sdps/temporal-activities/bucketupload" "github.com/artefactual-sdps/temporal-activities/removepaths" + "github.com/artefactual-sdps/temporal-activities/xmlvalidate" "github.com/hashicorp/go-cleanhttp" "github.com/oklog/run" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -252,6 +253,10 @@ func main() { archiveextract.New(cfg.ExtractActivity).Execute, temporalsdk_activity.RegisterOptions{Name: archiveextract.Name}, ) + w.RegisterActivityWithOptions( + xmlvalidate.New(xmlvalidate.NewXMLLintValidator()).Execute, + temporalsdk_activity.RegisterOptions{Name: xmlvalidate.Name}, + ) w.RegisterActivityWithOptions( activities.NewClassifyPackageActivity().Execute, temporalsdk_activity.RegisterOptions{Name: activities.ClassifyPackageActivityName}, diff --git a/cmd/enduro-am-worker/main.go b/cmd/enduro-am-worker/main.go index ae325bdb9..b8e48a736 100644 --- a/cmd/enduro-am-worker/main.go +++ b/cmd/enduro-am-worker/main.go @@ -20,6 +20,7 @@ import ( "github.com/artefactual-sdps/temporal-activities/bagvalidate" "github.com/artefactual-sdps/temporal-activities/bucketupload" "github.com/artefactual-sdps/temporal-activities/removepaths" + "github.com/artefactual-sdps/temporal-activities/xmlvalidate" "github.com/hashicorp/go-cleanhttp" "github.com/jonboulle/clockwork" "github.com/oklog/run" @@ -329,6 +330,10 @@ func main() { bucketupload.New(failedPIPs).Execute, temporalsdk_activity.RegisterOptions{Name: activities.SendToFailedPIPsName}, ) + w.RegisterActivityWithOptions( + xmlvalidate.New(xmlvalidate.NewXMLLintValidator()).Execute, + temporalsdk_activity.RegisterOptions{Name: xmlvalidate.Name}, + ) g.Add( func() error { diff --git a/go.mod b/go.mod index f352e0ca4..55a67746b 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/XSAM/otelsql v0.29.0 github.com/alicebob/miniredis/v2 v2.32.1 github.com/artefactual-labs/bagit-gython v0.2.0 - github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b + github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 github.com/coreos/go-oidc/v3 v3.10.0 github.com/cyphar/filepath-securejoin v0.2.4 github.com/dolmen-go/contextio v1.0.0 diff --git a/go.sum b/go.sum index 48ccbb6b1..072bcca2b 100644 --- a/go.sum +++ b/go.sum @@ -442,8 +442,8 @@ github.com/apparentlymart/go-textseg/v15 v15.0.0 h1:uYvfpb3DyLSCGWnctWKGj857c6ew github.com/apparentlymart/go-textseg/v15 v15.0.0/go.mod h1:K8XmNZdhEBkdlyDdvbmmsvpAG721bKi0joRfFdHIWJ4= github.com/artefactual-labs/bagit-gython v0.2.0 h1:Zje4Lb1goZVUPoxpc/k65sWtYpNgK9Rvphvaok5cYzE= github.com/artefactual-labs/bagit-gython v0.2.0/go.mod h1:C+hFZQMDnji1hjGt3nrlMK3BahaBhvo/hU2uqd+Q9Z4= -github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b h1:kTOc2pbkdII6/Z84Bus1q52z5KAOaT8vLpfRoOs1l1I= -github.com/artefactual-sdps/temporal-activities v0.0.0-20240821162351-47302711bc7b/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug= +github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4 h1:WF95IOkZRVSCST/26SAqPYsUrtUuJpavBht6lvdeKl0= +github.com/artefactual-sdps/temporal-activities v0.0.0-20241018212855-8ea34d29bdf4/go.mod h1:FVh79rCGNlUU1QnioAU+lrSjLqrA1PJFYKIhWPsmyug= github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU= github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/aws/aws-sdk-go-v2 v1.30.3 h1:jUeBtG0Ih+ZIFH0F4UkmL9w3cSpaMv9tYYDbzILP8dY= diff --git a/hack/xsd/premis.xsd b/hack/xsd/premis.xsd new file mode 100644 index 000000000..be6122b66 --- /dev/null +++ b/hack/xsd/premis.xsd @@ -0,0 +1,1223 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/internal/workflow/processing.go b/internal/workflow/processing.go index c3d9dd742..4da084251 100644 --- a/internal/workflow/processing.go +++ b/internal/workflow/processing.go @@ -20,6 +20,7 @@ import ( "github.com/artefactual-sdps/temporal-activities/bagvalidate" "github.com/artefactual-sdps/temporal-activities/bucketupload" "github.com/artefactual-sdps/temporal-activities/removepaths" + "github.com/artefactual-sdps/temporal-activities/xmlvalidate" "github.com/google/uuid" "go.artefactual.dev/tools/ref" temporal_tools "go.artefactual.dev/tools/temporal" @@ -439,6 +440,16 @@ func (w *ProcessingWorkflow) SessionHandler( return err } + // Validate PREMIS. + activityOpts := withActivityOptsForRequest(sessCtx) + err := temporalsdk_workflow.ExecuteActivity(activityOpts, xmlvalidate.Name, xmlvalidate.Params{ + XMLPath: filepath.Join(tinfo.TempPath, "data", "metadata", "premis.xml"), + XSDPath: "/home/enduro/premis.xsd", + }).Get(activityOpts, nil) + if err != nil { + return err + } + // Classify the SIP. { activityOpts := withActivityOptsForLocalAction(sessCtx) diff --git a/internal/workflow/processing_test.go b/internal/workflow/processing_test.go index 9c61fac33..ac7b3208d 100644 --- a/internal/workflow/processing_test.go +++ b/internal/workflow/processing_test.go @@ -4,6 +4,7 @@ import ( "database/sql" "fmt" "math/rand" + "path/filepath" "strings" "testing" "time" @@ -14,6 +15,7 @@ import ( "github.com/artefactual-sdps/temporal-activities/bagvalidate" "github.com/artefactual-sdps/temporal-activities/bucketupload" "github.com/artefactual-sdps/temporal-activities/removepaths" + "github.com/artefactual-sdps/temporal-activities/xmlvalidate" "github.com/google/uuid" "github.com/jonboulle/clockwork" "github.com/stretchr/testify/mock" @@ -115,6 +117,10 @@ func (s *ProcessingWorkflowTestSuite) SetupWorkflowTest(cfg config.Configuration archiveextract.New(cfg.ExtractActivity).Execute, temporalsdk_activity.RegisterOptions{Name: archiveextract.Name}, ) + s.env.RegisterActivityWithOptions( + xmlvalidate.New(xmlvalidate.NewXMLLintValidator()).Execute, + temporalsdk_activity.RegisterOptions{Name: xmlvalidate.Name}, + ) s.env.RegisterActivityWithOptions( bagvalidate.New(bagvalidate.NewNoopValidator()).Execute, temporalsdk_activity.RegisterOptions{Name: bagvalidate.Name}, @@ -310,6 +316,12 @@ func (s *ProcessingWorkflowTestSuite) TestPackageConfirmation() { &archiveextract.Result{ExtractPath: extractPath}, nil, ) + s.env.OnActivity(xmlvalidate.Name, sessionCtx, + &xmlvalidate.Params{XMLPath: filepath.Join(extractPath, "data/metadata/premis.xml"), XSDPath: "/home/enduro/premis.xsd"}, + ).Return( + &xmlvalidate.Result{Failures: []string{}}, nil, + ) + s.env.OnActivity(activities.BundleActivityName, sessionCtx, &activities.BundleActivityParams{ SourcePath: extractPath, @@ -424,6 +436,12 @@ func (s *ProcessingWorkflowTestSuite) TestAutoApprovedAIP() { &archiveextract.Result{ExtractPath: extractPath}, nil, ) + s.env.OnActivity(xmlvalidate.Name, sessionCtx, + &xmlvalidate.Params{XMLPath: filepath.Join(extractPath, "data/metadata/premis.xml"), XSDPath: "/home/enduro/premis.xsd"}, + ).Return( + &xmlvalidate.Result{Failures: []string{}}, nil, + ) + s.env.OnActivity( activities.ClassifyPackageActivityName, sessionCtx, @@ -601,6 +619,12 @@ func (s *ProcessingWorkflowTestSuite) TestAMWorkflow() { &archiveextract.Result{ExtractPath: extractPath}, nil, ) + s.env.OnActivity(xmlvalidate.Name, sessionCtx, + &xmlvalidate.Params{XMLPath: filepath.Join(extractPath, "data/metadata/premis.xml"), XSDPath: "/home/enduro/premis.xsd"}, + ).Return( + &xmlvalidate.Result{Failures: []string{}}, nil, + ) + s.env.OnActivity( activities.ClassifyPackageActivityName, sessionCtx, @@ -757,6 +781,12 @@ func (s *ProcessingWorkflowTestSuite) TestPackageRejection() { &archiveextract.Result{ExtractPath: extractPath}, nil, ) + s.env.OnActivity(xmlvalidate.Name, sessionCtx, + &xmlvalidate.Params{XMLPath: filepath.Join(extractPath, "data/metadata/premis.xml"), XSDPath: "/home/enduro/premis.xsd"}, + ).Return( + &xmlvalidate.Result{Failures: []string{}}, nil, + ) + s.env.OnActivity( activities.ClassifyPackageActivityName, sessionCtx, @@ -938,6 +968,14 @@ func (s *ProcessingWorkflowTestSuite) TestPreprocessingChildWorkflow() { &activities.ClassifyPackageActivityResult{Type: enums.PackageTypeBagIt}, nil, ) + s.env.OnActivity( + xmlvalidate.Name, + sessionCtx, + &xmlvalidate.Params{XMLPath: filepath.Join(prepDest, "data/metadata/premis.xml"), XSDPath: "/home/enduro/premis.xsd"}, + ).Return( + &xmlvalidate.Result{Failures: []string{}}, nil, + ) + s.env.OnActivity( createPreservationTaskLocalActivity, ctx, @@ -1259,6 +1297,12 @@ func (s *ProcessingWorkflowTestSuite) TestFailedPIPA3m() { &archiveextract.Params{SourcePath: downloadDir + "/" + key}, ).Return(&archiveextract.Result{ExtractPath: extractPath}, nil) + s.env.OnActivity( + xmlvalidate.Name, + sessionCtx, + &xmlvalidate.Params{XMLPath: filepath.Join(extractPath, "data/metadata/premis.xml"), XSDPath: "/home/enduro/premis.xsd"}, + ).Return(&xmlvalidate.Result{Failures: []string{}}, nil) + s.env.OnActivity( activities.ClassifyPackageActivityName, sessionCtx, @@ -1406,6 +1450,12 @@ func (s *ProcessingWorkflowTestSuite) TestFailedPIPAM() { &archiveextract.Params{SourcePath: tempPath + "/" + key}, ).Return(&archiveextract.Result{ExtractPath: extractPath}, nil) + s.env.OnActivity( + xmlvalidate.Name, + sessionCtx, + &xmlvalidate.Params{XMLPath: filepath.Join(extractPath, "data/metadata/premis.xml"), XSDPath: "/home/enduro/premis.xsd"}, + ).Return(&xmlvalidate.Result{Failures: []string{}}, nil) + s.env.OnActivity( activities.ClassifyPackageActivityName, sessionCtx,