From ee121cff215a223bf4c6186e101f40f75914a5a8 Mon Sep 17 00:00:00 2001 From: Christopher Angelo Phillips <32073428+spiffcs@users.noreply.github.com> Date: Tue, 22 Aug 2023 10:47:07 -0400 Subject: [PATCH] feat: 1944 - update purl generation to use a consistent groupID (#2033) Separate the logic for CPE and PURL generation. PURL generation needs a single answer for groupID based on a priority of discovering the field. CPE generation still uses multiple potential groupID to populate the candidate cpe. Improve GroupID detection. Currently syft does not use any hierarchy for GroupID detection and treats all sources as equal. It treats fields from the manifest file with priority. This change adds a hierarchy to the fields and returns a single answer based on that hierarchy. --------- Signed-off-by: Christopher Phillips Signed-off-by: Keith Zantow Co-authored-by: Keith Zantow --- syft/pkg/cataloger/common/cpe/java.go | 23 ++-- .../cataloger/common/cpe/java_groupid_map.go | 2 +- syft/pkg/cataloger/java/archive_parser.go | 63 ++++++--- .../pkg/cataloger/java/archive_parser_test.go | 50 +------ syft/pkg/cataloger/java/package_url.go | 126 +++++++++++++++++- syft/pkg/cataloger/java/package_url_test.go | 56 ++++++++ syft/pkg/cataloger/java/parse_pom_xml.go | 4 +- 7 files changed, 247 insertions(+), 77 deletions(-) diff --git a/syft/pkg/cataloger/common/cpe/java.go b/syft/pkg/cataloger/common/cpe/java.go index 2e838de3010..7a1a8f7b8c3 100644 --- a/syft/pkg/cataloger/common/cpe/java.go +++ b/syft/pkg/cataloger/common/cpe/java.go @@ -21,16 +21,16 @@ var ( "be", } - primaryJavaManifestGroupIDFields = []string{ + PrimaryJavaManifestGroupIDFields = []string{ + "Bundle-SymbolicName", "Extension-Name", "Specification-Vendor", "Implementation-Vendor", - "Bundle-SymbolicName", "Implementation-Vendor-Id", "Implementation-Title", "Bundle-Activator", } - secondaryJavaManifestGroupIDFields = []string{ + SecondaryJavaManifestGroupIDFields = []string{ "Automatic-Module-Name", "Main-Class", "Package", @@ -168,7 +168,7 @@ func artifactIDFromJavaPackage(p pkg.Package) string { } artifactID := strings.TrimSpace(metadata.PomProperties.ArtifactID) - if startsWithTopLevelDomain(artifactID) && len(strings.Split(artifactID, ".")) > 1 { + if looksLikeGroupID(artifactID) && len(strings.Split(artifactID, ".")) > 1 { // there is a strong indication that the artifact ID is really a group ID, don't use it return "" } @@ -184,6 +184,9 @@ func GroupIDsFromJavaPackage(p pkg.Package) (groupIDs []string) { return GroupIDsFromJavaMetadata(p.Name, metadata) } +// GroupIDsFromJavaMetadata returns the possible group IDs for a Java package +// This function is similar to GroupIDFromJavaPackage, but returns all possible group IDs and is less strict +// It is used as a way to generate possible candidates for CPE matching. func GroupIDsFromJavaMetadata(pkgName string, metadata pkg.JavaMetadata) (groupIDs []string) { groupIDs = append(groupIDs, groupIDsFromPomProperties(metadata.PomProperties)...) groupIDs = append(groupIDs, groupIDsFromPomProject(metadata.PomProject)...) @@ -242,7 +245,7 @@ func addGroupIDsFromGroupIDsAndArtifactID(groupID, artifactID string) (groupIDs } func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []string { - if groupID, ok := defaultArtifactIDToGroupID[pkgName]; ok { + if groupID, ok := DefaultArtifactIDToGroupID[pkgName]; ok { return []string{groupID} } @@ -251,7 +254,7 @@ func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []stri } // try the common manifest fields first for a set of candidates - groupIDs := getManifestFieldGroupIDs(manifest, primaryJavaManifestGroupIDFields) + groupIDs := GetManifestFieldGroupIDs(manifest, PrimaryJavaManifestGroupIDFields) if len(groupIDs) != 0 { return groupIDs @@ -262,10 +265,10 @@ func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []stri // for more info see pkg:maven/commons-io/commons-io@2.8.0 within cloudbees/cloudbees-core-mm:2.263.4.2 // at /usr/share/jenkins/jenkins.war:WEB-INF/plugins/analysis-model-api.hpi:WEB-INF/lib/commons-io-2.8.0.jar // as well as the ant package from cloudbees/cloudbees-core-mm:2.277.2.4-ra. - return getManifestFieldGroupIDs(manifest, secondaryJavaManifestGroupIDFields) + return GetManifestFieldGroupIDs(manifest, SecondaryJavaManifestGroupIDFields) } -func getManifestFieldGroupIDs(manifest *pkg.JavaManifest, fields []string) (groupIDs []string) { +func GetManifestFieldGroupIDs(manifest *pkg.JavaManifest, fields []string) (groupIDs []string) { if manifest == nil { return nil } @@ -302,3 +305,7 @@ func removeOSCIDirectives(groupID string) string { func startsWithTopLevelDomain(value string) bool { return internal.HasAnyOfPrefixes(value, domains...) } + +func looksLikeGroupID(value string) bool { + return strings.Contains(value, ".") +} diff --git a/syft/pkg/cataloger/common/cpe/java_groupid_map.go b/syft/pkg/cataloger/common/cpe/java_groupid_map.go index 9e36db185f3..5205c81e9c3 100644 --- a/syft/pkg/cataloger/common/cpe/java_groupid_map.go +++ b/syft/pkg/cataloger/common/cpe/java_groupid_map.go @@ -1,6 +1,6 @@ package cpe -var defaultArtifactIDToGroupID = map[string]string{ +var DefaultArtifactIDToGroupID = map[string]string{ "ant": "org.apache.ant", "ant-antlr": "org.apache.ant", "ant-antunit": "org.apache.ant", diff --git a/syft/pkg/cataloger/java/archive_parser.go b/syft/pkg/cataloger/java/archive_parser.go index ea216e9062f..a9662be65b1 100644 --- a/syft/pkg/cataloger/java/archive_parser.go +++ b/syft/pkg/cataloger/java/archive_parser.go @@ -62,11 +62,11 @@ func parseJavaArchive(_ file.Resolver, _ *generic.Environment, reader file.Locat } // uniquePkgKey creates a unique string to identify the given package. -func uniquePkgKey(p *pkg.Package) string { +func uniquePkgKey(groupID string, p *pkg.Package) string { if p == nil { return "" } - return fmt.Sprintf("%s|%s", p.Name, p.Version) + return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version) } // newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover @@ -371,13 +371,27 @@ func pomProjectByParentPath(archivePath string, location file.Location, extractP return projectByParentPath, nil } -// packagesFromPomProperties processes a single Maven POM properties for a given parent package, returning all listed Java packages found and +// newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and // associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not. func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.PomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package { // keep the artifact name within the virtual path if this package does not match the parent package vPathSuffix := "" - if !strings.HasPrefix(pomProperties.ArtifactID, parentPkg.Name) { - vPathSuffix += ":" + pomProperties.ArtifactID + groupID := "" + if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata); ok { + groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) + } + + parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version) + // Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package + // See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical + // packages and create equal virtual paths so they are de duped in the future + pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version) + if parentKey != pomProjectKey { + // build a new virtual path suffix for the package that is different from the parent package + // we want to use the GroupID and ArtifactID here to preserve uniqueness + // Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core") + // https://github.com/anchore/syft/issues/1944 + vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID } virtualPath := location.AccessPath() + vPathSuffix @@ -408,21 +422,26 @@ func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.Po } func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool { - // the name/version pair matches... - if uniquePkgKey(&p) == uniquePkgKey(parentPkg) { - return true - } - metadata, ok := p.Metadata.(pkg.JavaMetadata) - if !ok { - log.WithFields("package", p.String()).Warn("unable to extract java metadata to check for matching package identity") - return false + parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaMetadata) + if !ok || !parentOk { + switch { + case !ok: + log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name) + case !parentOk: + log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name) + } + // if we can't extract metadata, we can check for matching identities via the package name + // this is not ideal, but it's better than nothing - this should not be used if we have Metadata + + return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg) } - parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata) - if !ok { - log.WithFields("package", p.String()).Warn("unable to extract java metadata from parent for verifying virtual path") - return false + // try to determine identity with the metadata + groupID := groupIDFromJavaMetadata(p.Name, metadata) + parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata) + if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) { + return true } // the virtual path matches... @@ -434,10 +453,14 @@ func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool { // note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good // example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but // they do not indicate the same package. - if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID { - return true + // NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name + // from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core") + // we will use this check as a last resort + if metadata.PomProperties != nil { + if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID { + return true + } } - return false } diff --git a/syft/pkg/cataloger/java/archive_parser_test.go b/syft/pkg/cataloger/java/archive_parser_test.go index 422de7d480f..4635a9c9887 100644 --- a/syft/pkg/cataloger/java/archive_parser_test.go +++ b/syft/pkg/cataloger/java/archive_parser_test.go @@ -174,8 +174,8 @@ func TestParseJar(t *testing.T) { MetadataType: pkg.JavaMetadataType, Metadata: pkg.JavaMetadata{ // ensure that nested packages with different names than that of the parent are appended as - // a suffix on the virtual path - VirtualPath: "test-fixtures/java-builds/packages/example-java-app-gradle-0.1.0.jar:joda-time", + // a suffix on the virtual path with a colon separator between group name and artifact name + VirtualPath: "test-fixtures/java-builds/packages/example-java-app-gradle-0.1.0.jar:joda-time:joda-time", PomProperties: &pkg.PomProperties{ Path: "META-INF/maven/joda-time/joda-time/pom.properties", GroupID: "joda-time", @@ -240,7 +240,7 @@ func TestParseJar(t *testing.T) { Metadata: pkg.JavaMetadata{ // ensure that nested packages with different names than that of the parent are appended as // a suffix on the virtual path - VirtualPath: "test-fixtures/java-builds/packages/example-java-app-maven-0.1.0.jar:joda-time", + VirtualPath: "test-fixtures/java-builds/packages/example-java-app-maven-0.1.0.jar:joda-time:joda-time", PomProperties: &pkg.PomProperties{ Path: "META-INF/maven/joda-time/joda-time/pom.properties", GroupID: "joda-time", @@ -659,7 +659,7 @@ func Test_newPackageFromMavenData(t *testing.T) { Type: pkg.JavaPkg, MetadataType: pkg.JavaMetadataType, Metadata: pkg.JavaMetadata{ - VirtualPath: virtualPath + ":" + "some-artifact-id", + VirtualPath: virtualPath + ":" + "some-group-id" + ":" + "some-artifact-id", PomProperties: &pkg.PomProperties{ Name: "some-name", GroupID: "some-group-id", @@ -728,7 +728,7 @@ func Test_newPackageFromMavenData(t *testing.T) { Type: pkg.JavaPkg, MetadataType: pkg.JavaMetadataType, Metadata: pkg.JavaMetadata{ - VirtualPath: virtualPath + ":" + "some-artifact-id", + VirtualPath: virtualPath + ":" + "some-group-id" + ":" + "some-artifact-id", PomProperties: &pkg.PomProperties{ Name: "some-name", GroupID: "some-group-id", @@ -797,7 +797,7 @@ func Test_newPackageFromMavenData(t *testing.T) { Type: pkg.JenkinsPluginPkg, MetadataType: pkg.JavaMetadataType, Metadata: pkg.JavaMetadata{ - VirtualPath: virtualPath + ":" + "some-artifact-id", + VirtualPath: virtualPath + ":" + "com.cloudbees.jenkins.plugins" + ":" + "some-artifact-id", PomProperties: &pkg.PomProperties{ Name: "some-name", GroupID: "com.cloudbees.jenkins.plugins", @@ -894,44 +894,6 @@ func Test_newPackageFromMavenData(t *testing.T) { }, expectedPackage: nil, }, - { - name: "child matches parent by virtual path -- override name and version", - props: pkg.PomProperties{ - Name: "some-name", - GroupID: "some-group-id", - ArtifactID: "some-parent-name", // note: DOES NOT match parent package - Version: "3.0", // note: DOES NOT match parent package - }, - parent: &pkg.Package{ - Name: "", // note: empty, so should not be matched on - Version: "", // note: empty, so should not be matched on - Type: pkg.JavaPkg, - Metadata: pkg.JavaMetadata{ - VirtualPath: virtualPath, // note: matching virtual path - Manifest: nil, - PomProperties: nil, - Parent: nil, - }, - }, - expectedParent: pkg.Package{ - Name: "some-parent-name", - Version: "3.0", - Type: pkg.JavaPkg, - Metadata: pkg.JavaMetadata{ - VirtualPath: virtualPath, - Manifest: nil, - // note: we attach the discovered pom properties data - PomProperties: &pkg.PomProperties{ - Name: "some-name", - GroupID: "some-group-id", - ArtifactID: "some-parent-name", - Version: "3.0", - }, - Parent: nil, - }, - }, - expectedPackage: nil, - }, { name: "child matches parent by artifact id", props: pkg.PomProperties{ diff --git a/syft/pkg/cataloger/java/package_url.go b/syft/pkg/cataloger/java/package_url.go index adf05ff74e8..d29f1ae3c06 100644 --- a/syft/pkg/cataloger/java/package_url.go +++ b/syft/pkg/cataloger/java/package_url.go @@ -1,6 +1,8 @@ package java import ( + "strings" + "github.com/anchore/packageurl-go" "github.com/anchore/syft/syft/pkg" "github.com/anchore/syft/syft/pkg/cataloger/common/cpe" @@ -9,9 +11,9 @@ import ( // PackageURL returns the PURL for the specific java package (see https://github.com/package-url/purl-spec) func packageURL(name, version string, metadata pkg.JavaMetadata) string { var groupID = name - groupIDs := cpe.GroupIDsFromJavaMetadata(name, metadata) - if len(groupIDs) > 0 { - groupID = groupIDs[0] + + if gID := groupIDFromJavaMetadata(name, metadata); gID != "" { + groupID = gID } pURL := packageurl.NewPackageURL( @@ -23,3 +25,121 @@ func packageURL(name, version string, metadata pkg.JavaMetadata) string { "") return pURL.ToString() } + +// GroupIDFromJavaPackage returns the authoritative group ID for a Java package. +// The order of precedence is: +// 1. The group ID from the POM properties +// 2. The group ID from the POM project +// 3. The group ID from a select map of known group IDs +// 4. The group ID from the Java manifest +func groupIDFromJavaMetadata(pkgName string, metadata pkg.JavaMetadata) (groupID string) { + if groupID = groupIDFromPomProperties(metadata.PomProperties); groupID != "" { + return groupID + } + + if groupID = groupIDFromPomProject(metadata.PomProject); groupID != "" { + return groupID + } + + if groupID = groupIDFromKnownPackageList(pkgName); groupID != "" { + return groupID + } + + if groupID = groupIDFromJavaManifest(metadata.Manifest); groupID != "" { + return groupID + } + + return groupID +} + +func groupIDFromKnownPackageList(pkgName string) (groupID string) { + if groupID, ok := cpe.DefaultArtifactIDToGroupID[pkgName]; ok { + return groupID + } + return groupID +} + +func groupIDFromJavaManifest(manifest *pkg.JavaManifest) (groupID string) { + if manifest == nil { + return groupID + } + + groupIDS := cpe.GetManifestFieldGroupIDs(manifest, cpe.PrimaryJavaManifestGroupIDFields) + // assumes that primaryJavaManifestNameFields are ordered by priority + if len(groupIDS) != 0 { + return groupIDS[0] + } + + groupIDS = cpe.GetManifestFieldGroupIDs(manifest, cpe.SecondaryJavaManifestGroupIDFields) + + if len(groupIDS) != 0 { + return groupIDS[0] + } + + return groupID +} + +func groupIDFromPomProperties(properties *pkg.PomProperties) (groupID string) { + if properties == nil { + return groupID + } + + if looksLikeGroupID(properties.GroupID) { + return cleanGroupID(properties.GroupID) + } + + // sometimes the publisher puts the group ID in the artifact ID field unintentionally + if looksLikeGroupID(properties.ArtifactID) { + // there is a strong indication that the artifact ID is really a group ID + return cleanGroupID(properties.ArtifactID) + } + + return groupID +} + +func groupIDFromPomProject(project *pkg.PomProject) (groupID string) { + if project == nil { + return groupID + } + + // check the project details + if looksLikeGroupID(project.GroupID) { + return cleanGroupID(project.GroupID) + } + + // sometimes the publisher puts the group ID in the artifact ID field unintentionally + if looksLikeGroupID(project.ArtifactID) { + // there is a strong indication that the artifact ID is really a group ID + return cleanGroupID(project.ArtifactID) + } + + // let's check the parent details + // if the current project does not have a group ID, but the parent does, we'll use the parent's group ID + if project.Parent != nil { + if looksLikeGroupID(project.Parent.GroupID) { + return cleanGroupID(project.Parent.GroupID) + } + + // sometimes the publisher puts the group ID in the artifact ID field unintentionally + if looksLikeGroupID(project.Parent.ArtifactID) { + // there is a strong indication that the artifact ID is really a group ID + return cleanGroupID(project.Parent.ArtifactID) + } + } + + return groupID +} +func looksLikeGroupID(value string) bool { + return strings.Contains(value, ".") +} + +func cleanGroupID(groupID string) string { + return strings.TrimSpace(removeOSCIDirectives(groupID)) +} + +func removeOSCIDirectives(groupID string) string { + // for example: + // org.bar;uses:=“org.foo” -> org.bar + // more about OSGI directives see https://spring.io/blog/2008/10/20/understanding-the-osgi-uses-directive/ + return strings.Split(groupID, ";")[0] +} diff --git a/syft/pkg/cataloger/java/package_url_test.go b/syft/pkg/cataloger/java/package_url_test.go index ac5eef6a06f..665827a2dcc 100644 --- a/syft/pkg/cataloger/java/package_url_test.go +++ b/syft/pkg/cataloger/java/package_url_test.go @@ -45,3 +45,59 @@ func Test_packageURL(t *testing.T) { }) } } + +func Test_groupIDFromJavaMetadata(t *testing.T) { + tests := []struct { + name string + pkgName string + metadata pkg.JavaMetadata + expect string + }{ + { + name: "pom properties", + metadata: pkg.JavaMetadata{ + PomProperties: &pkg.PomProperties{ + GroupID: "org.anchore", + }, + }, + expect: "org.anchore", + }, + { + name: "pom project", + metadata: pkg.JavaMetadata{ + PomProject: &pkg.PomProject{ + GroupID: "org.anchore", + }, + }, + expect: "org.anchore", + }, + { + name: "known package list", + pkgName: "ant-antlr", + metadata: pkg.JavaMetadata{}, + expect: "org.apache.ant", + }, + { + name: "java manifest", + metadata: pkg.JavaMetadata{ + Manifest: &pkg.JavaManifest{ + Main: map[string]string{ + "Implementation-Vendor": "org.anchore", + }, + }, + }, + expect: "org.anchore", + }, + { + name: "no group id", + metadata: pkg.JavaMetadata{}, + expect: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.expect, groupIDFromJavaMetadata(tt.pkgName, tt.metadata)) + }) + } +} diff --git a/syft/pkg/cataloger/java/parse_pom_xml.go b/syft/pkg/cataloger/java/parse_pom_xml.go index 704cb559808..c59bfbf4057 100644 --- a/syft/pkg/cataloger/java/parse_pom_xml.go +++ b/syft/pkg/cataloger/java/parse_pom_xml.go @@ -178,7 +178,9 @@ func resolveProperty(pom gopom.Project, property *string) string { pomValueType = pomValue.Type() if pomValueType.Kind() == reflect.Ptr { pomValue = pomValue.Elem() - pomValueType = pomValue.Type() + if !pomValue.IsZero() { + pomValueType = pomValue.Type() + } } if partNum == numParts-1 { return fmt.Sprintf("%v", pomValue.Interface())