Skip to content

Commit

Permalink
feat: 1944 - update purl generation to use a consistent groupID (#2033)
Browse files Browse the repository at this point in the history
Separate the logic for CPE and PURL generation. 

PURL generation needs a single groupID, chosen by a priority order over the fields in which a groupID can be discovered.
CPE generation still uses multiple potential groupIDs to populate the candidate CPEs.

Improve GroupID detection. 

Currently syft does not use any hierarchy across the sources used for GroupID detection and treats them all as equal.
Only the fields from the manifest file are given a priority order. This change adds a hierarchy to the fields and returns a single answer based on that hierarchy.
---------
Signed-off-by: Christopher Phillips <christopher.phillips@anchore.com>
Signed-off-by: Keith Zantow <kzantow@gmail.com>
Co-authored-by: Keith Zantow <kzantow@gmail.com>
  • Loading branch information
spiffcs authored Aug 22, 2023
1 parent cf37b17 commit ee121cf
Show file tree
Hide file tree
Showing 7 changed files with 247 additions and 77 deletions.
23 changes: 15 additions & 8 deletions syft/pkg/cataloger/common/cpe/java.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,16 @@ var (
"be",
}

primaryJavaManifestGroupIDFields = []string{
PrimaryJavaManifestGroupIDFields = []string{
"Bundle-SymbolicName",
"Extension-Name",
"Specification-Vendor",
"Implementation-Vendor",
"Bundle-SymbolicName",
"Implementation-Vendor-Id",
"Implementation-Title",
"Bundle-Activator",
}
secondaryJavaManifestGroupIDFields = []string{
SecondaryJavaManifestGroupIDFields = []string{
"Automatic-Module-Name",
"Main-Class",
"Package",
Expand Down Expand Up @@ -168,7 +168,7 @@ func artifactIDFromJavaPackage(p pkg.Package) string {
}

artifactID := strings.TrimSpace(metadata.PomProperties.ArtifactID)
if startsWithTopLevelDomain(artifactID) && len(strings.Split(artifactID, ".")) > 1 {
if looksLikeGroupID(artifactID) && len(strings.Split(artifactID, ".")) > 1 {
// there is a strong indication that the artifact ID is really a group ID, don't use it
return ""
}
Expand All @@ -184,6 +184,9 @@ func GroupIDsFromJavaPackage(p pkg.Package) (groupIDs []string) {
return GroupIDsFromJavaMetadata(p.Name, metadata)
}

// GroupIDsFromJavaMetadata returns the possible group IDs for a Java package
// This function is similar to GroupIDFromJavaPackage, but returns all possible group IDs and is less strict
// It is used as a way to generate possible candidates for CPE matching.
func GroupIDsFromJavaMetadata(pkgName string, metadata pkg.JavaMetadata) (groupIDs []string) {
groupIDs = append(groupIDs, groupIDsFromPomProperties(metadata.PomProperties)...)
groupIDs = append(groupIDs, groupIDsFromPomProject(metadata.PomProject)...)
Expand Down Expand Up @@ -242,7 +245,7 @@ func addGroupIDsFromGroupIDsAndArtifactID(groupID, artifactID string) (groupIDs
}

func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []string {
if groupID, ok := defaultArtifactIDToGroupID[pkgName]; ok {
if groupID, ok := DefaultArtifactIDToGroupID[pkgName]; ok {
return []string{groupID}
}

Expand All @@ -251,7 +254,7 @@ func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []stri
}

// try the common manifest fields first for a set of candidates
groupIDs := getManifestFieldGroupIDs(manifest, primaryJavaManifestGroupIDFields)
groupIDs := GetManifestFieldGroupIDs(manifest, PrimaryJavaManifestGroupIDFields)

if len(groupIDs) != 0 {
return groupIDs
Expand All @@ -262,10 +265,10 @@ func groupIDsFromJavaManifest(pkgName string, manifest *pkg.JavaManifest) []stri
// for more info see pkg:maven/commons-io/commons-io@2.8.0 within cloudbees/cloudbees-core-mm:2.263.4.2
// at /usr/share/jenkins/jenkins.war:WEB-INF/plugins/analysis-model-api.hpi:WEB-INF/lib/commons-io-2.8.0.jar
// as well as the ant package from cloudbees/cloudbees-core-mm:2.277.2.4-ra.
return getManifestFieldGroupIDs(manifest, secondaryJavaManifestGroupIDFields)
return GetManifestFieldGroupIDs(manifest, SecondaryJavaManifestGroupIDFields)
}

func getManifestFieldGroupIDs(manifest *pkg.JavaManifest, fields []string) (groupIDs []string) {
func GetManifestFieldGroupIDs(manifest *pkg.JavaManifest, fields []string) (groupIDs []string) {
if manifest == nil {
return nil
}
Expand Down Expand Up @@ -302,3 +305,7 @@ func removeOSCIDirectives(groupID string) string {
// startsWithTopLevelDomain returns the result of internal.HasAnyOfPrefixes
// for value against every entry of the domains list declared elsewhere in
// this package.
func startsWithTopLevelDomain(value string) bool {
	hasDomainPrefix := internal.HasAnyOfPrefixes(value, domains...)
	return hasDomainPrefix
}

// looksLikeGroupID reports whether value contains at least one "." character.
// The presence of a dot is the only signal used here to decide that a string
// resembles a dotted group ID.
func looksLikeGroupID(value string) bool {
	return strings.IndexByte(value, '.') >= 0
}
2 changes: 1 addition & 1 deletion syft/pkg/cataloger/common/cpe/java_groupid_map.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package cpe

var defaultArtifactIDToGroupID = map[string]string{
var DefaultArtifactIDToGroupID = map[string]string{
"ant": "org.apache.ant",
"ant-antlr": "org.apache.ant",
"ant-antunit": "org.apache.ant",
Expand Down
63 changes: 43 additions & 20 deletions syft/pkg/cataloger/java/archive_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ func parseJavaArchive(_ file.Resolver, _ *generic.Environment, reader file.Locat
}

// uniquePkgKey creates a unique string to identify the given package.
func uniquePkgKey(p *pkg.Package) string {
func uniquePkgKey(groupID string, p *pkg.Package) string {
if p == nil {
return ""
}
return fmt.Sprintf("%s|%s", p.Name, p.Version)
return fmt.Sprintf("%s|%s|%s", groupID, p.Name, p.Version)
}

// newJavaArchiveParser returns a new java archive parser object for the given archive. Can be configured to discover
Expand Down Expand Up @@ -371,13 +371,27 @@ func pomProjectByParentPath(archivePath string, location file.Location, extractP
return projectByParentPath, nil
}

// packagesFromPomProperties processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
// newPackageFromMavenData processes a single Maven POM properties for a given parent package, returning all listed Java packages found and
// associating each discovered package to the given parent package. Note the pom.xml is optional, the pom.properties is not.
func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.PomProject, parentPkg *pkg.Package, location file.Location) *pkg.Package {
// keep the artifact name within the virtual path if this package does not match the parent package
vPathSuffix := ""
if !strings.HasPrefix(pomProperties.ArtifactID, parentPkg.Name) {
vPathSuffix += ":" + pomProperties.ArtifactID
groupID := ""
if parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata); ok {
groupID = groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
}

parentKey := fmt.Sprintf("%s:%s:%s", groupID, parentPkg.Name, parentPkg.Version)
// Since we don't have a package yet, it's important to use the same `field: value` association that we used when creating the parent package
// See below where Name => pomProperties.ArtifactID and Version => pomProperties.Version. We want to check for potentially nested identical
// packages and create equal virtual paths so they are de duped in the future
pomProjectKey := fmt.Sprintf("%s:%s:%s", pomProperties.GroupID, pomProperties.ArtifactID, pomProperties.Version)
if parentKey != pomProjectKey {
// build a new virtual path suffix for the package that is different from the parent package
// we want to use the GroupID and ArtifactID here to preserve uniqueness
// Some packages have the same name but different group IDs (e.g. "org.glassfish.jaxb/jaxb-core", "com.sun.xml.bind/jaxb-core")
// https://github.com/anchore/syft/issues/1944
vPathSuffix += ":" + pomProperties.GroupID + ":" + pomProperties.ArtifactID
}
virtualPath := location.AccessPath() + vPathSuffix

Expand Down Expand Up @@ -408,21 +422,26 @@ func newPackageFromMavenData(pomProperties pkg.PomProperties, pomProject *pkg.Po
}

func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
// the name/version pair matches...
if uniquePkgKey(&p) == uniquePkgKey(parentPkg) {
return true
}

metadata, ok := p.Metadata.(pkg.JavaMetadata)
if !ok {
log.WithFields("package", p.String()).Warn("unable to extract java metadata to check for matching package identity")
return false
parentMetadata, parentOk := parentPkg.Metadata.(pkg.JavaMetadata)
if !ok || !parentOk {
switch {
case !ok:
log.WithFields("package", p.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", p.Name)
case !parentOk:
log.WithFields("package", parentPkg.String()).Trace("unable to extract java metadata to check for matching package identity for package: %s", parentPkg.Name)
}
// if we can't extract metadata, we can check for matching identities via the package name
// this is not ideal, but it's better than nothing - this should not be used if we have Metadata

return uniquePkgKey("", &p) == uniquePkgKey("", parentPkg)
}

parentMetadata, ok := parentPkg.Metadata.(pkg.JavaMetadata)
if !ok {
log.WithFields("package", p.String()).Warn("unable to extract java metadata from parent for verifying virtual path")
return false
// try to determine identity with the metadata
groupID := groupIDFromJavaMetadata(p.Name, metadata)
parentGroupID := groupIDFromJavaMetadata(parentPkg.Name, parentMetadata)
if uniquePkgKey(groupID, &p) == uniquePkgKey(parentGroupID, parentPkg) {
return true
}

// the virtual path matches...
Expand All @@ -434,10 +453,14 @@ func packageIdentitiesMatch(p pkg.Package, parentPkg *pkg.Package) bool {
// note: you CANNOT use name-is-subset-of-artifact-id or vice versa --this is too generic. Shaded jars are a good
// example of this: where the package name is "cloudbees-analytics-segment-driver" and a child is "analytics", but
// they do not indicate the same package.
if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID {
return true
// NOTE: artifactId might not be a good indicator of uniqueness since archives can contain forks with the same name
// from different groups (e.g. "org.glassfish.jaxb.jaxb-core" and "com.sun.xml.bind.jaxb-core")
// we will use this check as a last resort
if metadata.PomProperties != nil {
if metadata.PomProperties.ArtifactID != "" && parentPkg.Name == metadata.PomProperties.ArtifactID {
return true
}
}

return false
}

Expand Down
50 changes: 6 additions & 44 deletions syft/pkg/cataloger/java/archive_parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ func TestParseJar(t *testing.T) {
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
// ensure that nested packages with different names than that of the parent are appended as
// a suffix on the virtual path
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-gradle-0.1.0.jar:joda-time",
// a suffix on the virtual path with a colon separator between group name and artifact name
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-gradle-0.1.0.jar:joda-time:joda-time",
PomProperties: &pkg.PomProperties{
Path: "META-INF/maven/joda-time/joda-time/pom.properties",
GroupID: "joda-time",
Expand Down Expand Up @@ -240,7 +240,7 @@ func TestParseJar(t *testing.T) {
Metadata: pkg.JavaMetadata{
// ensure that nested packages with different names than that of the parent are appended as
// a suffix on the virtual path
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-maven-0.1.0.jar:joda-time",
VirtualPath: "test-fixtures/java-builds/packages/example-java-app-maven-0.1.0.jar:joda-time:joda-time",
PomProperties: &pkg.PomProperties{
Path: "META-INF/maven/joda-time/joda-time/pom.properties",
GroupID: "joda-time",
Expand Down Expand Up @@ -659,7 +659,7 @@ func Test_newPackageFromMavenData(t *testing.T) {
Type: pkg.JavaPkg,
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath + ":" + "some-artifact-id",
VirtualPath: virtualPath + ":" + "some-group-id" + ":" + "some-artifact-id",
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
Expand Down Expand Up @@ -728,7 +728,7 @@ func Test_newPackageFromMavenData(t *testing.T) {
Type: pkg.JavaPkg,
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath + ":" + "some-artifact-id",
VirtualPath: virtualPath + ":" + "some-group-id" + ":" + "some-artifact-id",
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
Expand Down Expand Up @@ -797,7 +797,7 @@ func Test_newPackageFromMavenData(t *testing.T) {
Type: pkg.JenkinsPluginPkg,
MetadataType: pkg.JavaMetadataType,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath + ":" + "some-artifact-id",
VirtualPath: virtualPath + ":" + "com.cloudbees.jenkins.plugins" + ":" + "some-artifact-id",
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "com.cloudbees.jenkins.plugins",
Expand Down Expand Up @@ -894,44 +894,6 @@ func Test_newPackageFromMavenData(t *testing.T) {
},
expectedPackage: nil,
},
{
name: "child matches parent by virtual path -- override name and version",
props: pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
ArtifactID: "some-parent-name", // note: DOES NOT match parent package
Version: "3.0", // note: DOES NOT match parent package
},
parent: &pkg.Package{
Name: "", // note: empty, so should not be matched on
Version: "", // note: empty, so should not be matched on
Type: pkg.JavaPkg,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath, // note: matching virtual path
Manifest: nil,
PomProperties: nil,
Parent: nil,
},
},
expectedParent: pkg.Package{
Name: "some-parent-name",
Version: "3.0",
Type: pkg.JavaPkg,
Metadata: pkg.JavaMetadata{
VirtualPath: virtualPath,
Manifest: nil,
// note: we attach the discovered pom properties data
PomProperties: &pkg.PomProperties{
Name: "some-name",
GroupID: "some-group-id",
ArtifactID: "some-parent-name",
Version: "3.0",
},
Parent: nil,
},
},
expectedPackage: nil,
},
{
name: "child matches parent by artifact id",
props: pkg.PomProperties{
Expand Down
Loading

0 comments on commit ee121cf

Please sign in to comment.