Skip to content

Commit

Permalink
fix reading non utf8 encodings
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Goodman <wagoodman@users.noreply.github.com>
  • Loading branch information
wagoodman committed Aug 22, 2023
1 parent f58425a commit 7566b29
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 2 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ require (
github.com/knqyf263/go-rpmdb v0.0.0-20230301153543-ba94b245509b
github.com/opencontainers/go-digest v1.0.0
github.com/saferwall/pe v1.4.4
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d
github.com/sassoftware/go-rpmutils v0.2.0
github.com/vbatts/go-mtree v0.5.3
github.com/zyedidia/generic v1.2.2-0.20230320175451-4410d2372cb1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,8 @@ github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb
github.com/saferwall/pe v1.4.4 h1:Ml++7/2/Z1iKwV4zCsd1nIqTEAdUQKAetwbbcCarhOg=
github.com/saferwall/pe v1.4.4/go.mod h1:SNzv3cdgk8SBI0UwHfyTcdjawfdnN+nbydnEL7GZ25s=
github.com/sagikazarmark/crypt v0.3.0/go.mod h1:uD/D+6UF4SrIR1uGEv7bBNkNqLGqUr43MRiaGWX1Nig=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/sassoftware/go-rpmutils v0.2.0 h1:pKW0HDYMFWQ5b4JQPiI3WI12hGsVoW0V8+GMoZiI/JE=
github.com/sassoftware/go-rpmutils v0.2.0/go.mod h1:TJJQYtLe/BeEmEjelI3b7xNZjzAukEkeWKmoakvaOoI=
github.com/scylladb/go-set v1.0.3-0.20200225121959-cc7b2070d91e h1:7q6NSFZDeGfvvtIRwBrU/aegEYJYmvev0cHAwo17zZQ=
Expand Down
25 changes: 23 additions & 2 deletions syft/pkg/cataloger/java/parse_pom_xml.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package java

import (
"bytes"
"encoding/xml"
"fmt"
"io"
"reflect"
"regexp"
"strings"

"github.com/saintfish/chardet"
"github.com/vifraa/gopom"
"golang.org/x/net/html/charset"

Expand Down Expand Up @@ -99,9 +101,28 @@ func newPackageFromPom(pom gopom.Project, dep gopom.Dependency, locations ...fil
}

func decodePomXML(content io.Reader) (project gopom.Project, err error) {
decoder := xml.NewDecoder(content)
// prevent against warnings for "xml: encoding "iso-8859-1" declared but Decoder.CharsetReader is nil"
pomContents, err := io.ReadAll(content)
if err != nil {
return project, fmt.Errorf("unable to read pom.xml: %w", err)
}

detector := chardet.NewTextDetector()
detection, err := detector.DetectBest(pomContents)

var inputReader io.Reader
if err == nil && detection != nil && detection.Charset != "UTF-8" {
inputReader, err = charset.NewReaderLabel(detection.Charset, bytes.NewReader(pomContents))
if err != nil {
return project, fmt.Errorf("unable to get encoding: %w", err)
}
} else {
inputReader = bytes.NewReader(pomContents)
}

decoder := xml.NewDecoder(inputReader)
// when an xml file has a character set declaration (e.g. '<?xml version="1.0" encoding="ISO-8859-1"?>') read that and use the correct decoder
decoder.CharsetReader = charset.NewReaderLabel

if err := decoder.Decode(&project); err != nil {
return project, fmt.Errorf("unable to unmarshal pom.xml: %w", err)
}
Expand Down
56 changes: 56 additions & 0 deletions syft/pkg/cataloger/java/parse_pom_xml_test.go
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package java

import (
"encoding/base64"
"os"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/vifraa/gopom"

"github.com/anchore/syft/syft/file"
Expand Down Expand Up @@ -63,6 +65,60 @@ func Test_parserPomXML(t *testing.T) {
}
}

func Test_decodePomXML_surviveNonUtf8Encoding(t *testing.T) {
// regression for https://github.com/anchore/syft/issues/2044

// we are storing the base64 contents of the pom.xml file. We are doing this to prevent accidental changes to the
// file, which is extremely important for this test.

// for instance, even changing a single character in the file and saving in an IntelliJ IDE will automatically
// convert the file to UTF-8, which will break this test:

// xxd with the original pom.xml
// 00000780: 6964 3e0d 0a20 2020 2020 2020 2020 2020 id>..
// 00000790: 203c 6e61 6d65 3e4a e972 f46d 6520 4d69 <name>J.r.me Mi
// 000007a0: 7263 3c2f 6e61 6d65 3e0d 0a20 2020 2020 rc</name>..

// xxd with the pom.xml converted to UTF-8 (from a simple change with IntelliJ)
// 00000780: 6964 3e0d 0a20 2020 2020 2020 2020 2020 id>..
// 00000790: 203c 6e61 6d65 3e4a efbf bd72 efbf bd6d <name>J...r...m
// 000007a0: 6520 4d69 7263 3c2f 6e61 6d65 3e0d 0a20 e Mirc</name>..

// Note that the name "Jérôme Mirc" was originally interpreted as "J.r.me Mi" and after the save
// is now encoded as "J...r...m" which is not what we want (note the extra bytes for each non UTF-8 character.
// The original 0xe9 byte (é) was converted to 0xefbfbd (�) which is the UTF-8 replacement character.
// This is quite silly on the part of IntelliJ, but it is what it is.

cases := []struct {
name string
fixture string
}{
{
name: "undeclared encoding",
fixture: "test-fixtures/pom/undeclared-iso-8859-encoded-pom.xml.base64",
},
{
name: "declared encoding",
fixture: "test-fixtures/pom/declared-iso-8859-encoded-pom.xml.base64",
},
}

for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
fh, err := os.Open(c.fixture)
require.NoError(t, err)

decoder := base64.NewDecoder(base64.StdEncoding, fh)

proj, err := decodePomXML(decoder)

require.NoError(t, err)
require.NotEmpty(t, proj.Developers)
})
}

}

func Test_parseCommonsTextPomXMLProject(t *testing.T) {
tests := []struct {
input string
Expand Down
Loading

0 comments on commit 7566b29

Please sign in to comment.