From 84f99e6e8752e5e422e298b0e894930c275023ee Mon Sep 17 00:00:00 2001 From: Zach Bloomquist Date: Mon, 15 Jul 2019 12:58:56 -0400 Subject: [PATCH] Fix a variety of character encoding issues (#4698) * add e2e test that demonstrates encoding issue * fix all sorts of content-type wackiness, infer content-type from html * update snapshot * add kr, jp, cn tests * update snapshot * intercept any valid JS content-type * PR review changes --- .../1_interception_spec.coffee.js | 79 +++++++++++++++++++ packages/server/lib/controllers/proxy.coffee | 38 ++++++++- packages/server/package.json | 2 + .../test/e2e/1_interception_spec.coffee | 63 +++++++++++++++ packages/server/test/e2e/6_visit_spec.coffee | 1 - .../integration/character_encoding_spec.js | 44 +++++++++++ .../projects/e2e/static/charsets/euc-kr.html | 13 +++ .../projects/e2e/static/charsets/gb2312.html | 11 +++ .../e2e/static/charsets/iso-8859-1.html | 36 +++++++++ .../e2e/static/charsets/shift-jis.html | 12 +++ .../support/fixtures/server/iso-8859-1.html | 36 +++++++++ 11 files changed, 332 insertions(+), 3 deletions(-) create mode 100644 packages/server/__snapshots__/1_interception_spec.coffee.js create mode 100644 packages/server/test/e2e/1_interception_spec.coffee create mode 100644 packages/server/test/support/fixtures/projects/e2e/cypress/integration/character_encoding_spec.js create mode 100644 packages/server/test/support/fixtures/projects/e2e/static/charsets/euc-kr.html create mode 100644 packages/server/test/support/fixtures/projects/e2e/static/charsets/gb2312.html create mode 100644 packages/server/test/support/fixtures/projects/e2e/static/charsets/iso-8859-1.html create mode 100644 packages/server/test/support/fixtures/projects/e2e/static/charsets/shift-jis.html create mode 100644 packages/server/test/support/fixtures/server/iso-8859-1.html diff --git a/packages/server/__snapshots__/1_interception_spec.coffee.js b/packages/server/__snapshots__/1_interception_spec.coffee.js new file mode 100644 index 000000000000..210034a534d3 --- /dev/null +++ b/packages/server/__snapshots__/1_interception_spec.coffee.js @@ -0,0 +1,79 @@ +exports['e2e interception spec character encodings does not mangle non-UTF-8 text 1'] = ` + +==================================================================================================== + + (Run Starting) + + ┌────────────────────────────────────────────────────────────────────────────────────────────────┐ + │ Cypress: 1.2.3 │ + │ Browser: FooBrowser 88 │ + │ Specs: 1 found (character_encoding_spec.js) │ + │ Searched: cypress/integration/character_encoding_spec.js │ + └────────────────────────────────────────────────────────────────────────────────────────────────┘ + + +──────────────────────────────────────────────────────────────────────────────────────────────────── + + Running: character_encoding_spec.js... (1 of 1) + + + character encoding tests + without gzip + ✓ iso-8859-1 works + ✓ euc-kr works + ✓ shift-jis works + ✓ gb2312 works + with gzip + ✓ iso-8859-1 works + ✓ euc-kr works + ✓ shift-jis works + ✓ gb2312 works + without gzip (no content-type charset) + ✓ iso-8859-1 works + ✓ euc-kr works + ✓ shift-jis works + ✓ gb2312 works + with gzip (no content-type charset) + ✓ iso-8859-1 works + ✓ euc-kr works + ✓ shift-jis works + ✓ gb2312 works + + + 16 passing + + + (Results) + + ┌──────────────────────────────────────────┐ + │ Tests: 16 │ + │ Passing: 16 │ + │ Failing: 0 │ + │ Pending: 0 │ + │ Skipped: 0 │ + │ Screenshots: 0 │ + │ Video: true │ + │ Duration: X seconds │ + │ Spec Ran: character_encoding_spec.js │ + └──────────────────────────────────────────┘ + + + (Video) + + - Started processing: Compressing to 32 CRF + - Finished processing: /foo/bar/.projects/e2e/cypress/videos/abc123.mp4 (X seconds) + + +==================================================================================================== + + (Run Finished) + + + Spec Tests Passing Failing Pending Skipped + ┌────────────────────────────────────────────────────────────────────────────────────────────────┐ + │ ✔ character_encoding_spec.js XX:XX 16 16 - - - │ + └────────────────────────────────────────────────────────────────────────────────────────────────┘ + All specs passed! XX:XX 16 16 - - - + + +` diff --git a/packages/server/lib/controllers/proxy.coffee b/packages/server/lib/controllers/proxy.coffee index ec3c925db0d8..e7e5e86c23e8 100644 --- a/packages/server/lib/controllers/proxy.coffee +++ b/packages/server/lib/controllers/proxy.coffee @@ -1,6 +1,8 @@ _ = require("lodash") zlib = require("zlib") +charset = require("charset") concat = require("concat-stream") +iconv = require("iconv-lite") Promise = require("bluebird") accept = require("http-accept") debug = require("debug")("cypress:server:proxy") @@ -22,6 +24,18 @@ zlibOptions = { finishFlush: zlib.Z_SYNC_FLUSH } +## https://github.com/cypress-io/cypress/issues/1543 +getNodeCharsetFromResponse = (headers, body) -> + httpCharset = (charset(headers, body, 1024) || '').toLowerCase() + + debug("inferred charset from response %o", { httpCharset }) + + if iconv.encodingExists(httpCharset) + return httpCharset + + ## browsers default to latin1 + return "latin1" + isGzipError = (err) -> Object.prototype.hasOwnProperty.call(zlib.constants, err.code) @@ -119,6 +133,13 @@ module.exports = { ## make sure the response includes string type contentType and contentType.includes(str) + resContentTypeIsJavaScript = (respHeaders) -> + _.some [ + 'application/javascript', + 'application/x-javascript', + 'text/javascript' + ].map(_.partial(resContentTypeIs, respHeaders)) + reqAcceptsHtml = -> ## don't inject if this is an XHR from jquery return if req.headers["x-requested-with"] @@ -176,7 +197,10 @@ module.exports = { ## bypass the stream buffer and pipe this back if wantsInjection rewrite = (body) -> - rewriter.html(body.toString("utf8"), remoteState.domainName, wantsInjection, wantsSecurityRemoved) + ## transparently decode their body to a node string and then re-encode + nodeCharset = getNodeCharsetFromResponse(headers, body) + body = rewriter.html(iconv.decode(body, nodeCharset), remoteState.domainName, wantsInjection, wantsSecurityRemoved) + iconv.encode(body, nodeCharset) ## TODO: we can probably move this to the new ## replacestream rewriter instead of using @@ -247,6 +271,16 @@ module.exports = { onResponse = (str, incomingRes) => {headers, statusCode} = incomingRes + originalSetHeader = res.setHeader + + ## express does all kinds of silly/nasty stuff to the content-type... + ## but we don't want to change it at all! + res.setHeader = (k, v) -> + if k == 'content-type' + v = incomingRes.headers['content-type'] + + originalSetHeader.call(res, k, v) + wantsInjection ?= do -> return false if not resContentTypeIs(headers, "text/html") @@ -263,7 +297,7 @@ module.exports = { ## on the response or its a request for any javascript script tag config.modifyObstructiveCode and ( (wantsInjection is "full") or - resContentTypeIs(headers, "application/javascript") + resContentTypeIsJavaScript(headers) ) @setResHeaders(req, res, incomingRes, wantsInjection) diff --git a/packages/server/package.json b/packages/server/package.json index 95668b08ef82..173472357435 100644 --- a/packages/server/package.json +++ b/packages/server/package.json @@ -55,6 +55,7 @@ "browserify": "13.3.0", "chai": "1.10.0", "chalk": "2.4.2", + "charset": "1.0.1", "check-more-types": "2.24.0", "chokidar": "3.0.1", "cjsxify": "0.3.0", @@ -87,6 +88,7 @@ "http-proxy": "1.17.0", "http-status-codes": "1.3.2", "human-interval": "0.1.6", + "iconv-lite": "0.5.0", "image-size": "0.7.4", "is-fork-pr": "2.3.0", "is-html": "2.0.0", diff --git a/packages/server/test/e2e/1_interception_spec.coffee b/packages/server/test/e2e/1_interception_spec.coffee new file mode 100644 index 000000000000..f60c008d4040 --- /dev/null +++ b/packages/server/test/e2e/1_interception_spec.coffee @@ -0,0 +1,63 @@ +compression = require("compression") +e2e = require("../support/helpers/e2e") +Fixtures = require("../support/helpers/fixtures") +path = require("path") + +PORT = 9876 + +## based off of the most common encodings used on the Internet: +## https://w3techs.com/technologies/overview/character_encoding/all +## as of this writing, these tests will cover ~99% of websites +TEST_ENCODINGS = [ + "iso-8859-1" + "euc-kr" + "shift-jis" + "gb2312" +] + +e2ePath = Fixtures.projectPath("e2e") + +fullController = (charset) -> + (req, res) -> + res.set({ 'content-type': "text/html;charset=#{charset}" }); + res.sendFile(path.join(e2ePath, "static/charsets/#{charset}.html")) + +pageOnlyController = (charset) -> + (req, res) -> + res.set() + res.sendFile(path.join(e2ePath, "static/charsets/#{charset}.html"), { + headers: { 'content-type': "text/html" } + }) + +describe "e2e interception spec", -> + e2e.setup + servers: [ + { + onServer: (app) -> + TEST_ENCODINGS.forEach (enc) -> + app.get "/#{enc}.html", fullController(enc) + + app.use "/#{enc}.html.gz", compression() + app.get "/#{enc}.html.gz", fullController(enc) + + app.get "/#{enc}.html.pageonly", pageOnlyController(enc) + + app.use "/#{enc}.html.gz.pageonly", compression() + app.get "/#{enc}.html.gz.pageonly", pageOnlyController(enc) + + port: PORT + } + ] + + context "character encodings", -> + ## https://github.com/cypress-io/cypress/issues/1543 + it "does not mangle non-UTF-8 text", -> + e2e.exec(@, { + spec: "character_encoding_spec.js" + config: { + defaultCommandTimeout: 100 + baseUrl: "http://localhost:9876" + } + snapshot: true + expectedExitCode: 0 + }) diff --git a/packages/server/test/e2e/6_visit_spec.coffee b/packages/server/test/e2e/6_visit_spec.coffee index aff967bd04fd..9de1490fbe1e 100644 --- a/packages/server/test/e2e/6_visit_spec.coffee +++ b/packages/server/test/e2e/6_visit_spec.coffee @@ -1,5 +1,4 @@ useragent = require("express-useragent") -Fixtures = require("../support/helpers/fixtures") e2e = require("../support/helpers/e2e") onServer = (app) -> diff --git a/packages/server/test/support/fixtures/projects/e2e/cypress/integration/character_encoding_spec.js b/packages/server/test/support/fixtures/projects/e2e/cypress/integration/character_encoding_spec.js new file mode 100644 index 000000000000..6b734bbdf151 --- /dev/null +++ b/packages/server/test/support/fixtures/projects/e2e/cypress/integration/character_encoding_spec.js @@ -0,0 +1,44 @@ +describe('character encoding tests', () => { + [ + { + title: 'without gzip', + extension: '.html', + }, + { + title: 'with gzip', + extension: '.html.gz', + }, + { + title: 'without gzip (no content-type charset)', + extension: '.html.pageonly', + }, + { + title: 'with gzip (no content-type charset)', + extension: '.html.gz.pageonly', + }, + ].forEach(({ title, extension }) => { + context(title, () => { + it('iso-8859-1 works', () => { + cy.visit(`/iso-8859-1${extension}`) + cy.get('#t1').should('have.html', 'Olá Mundo') + cy.get('#t2').should('have.html', 'Ç') + cy.get('#t3').should('have.html', 'Pêssego') + }) + + it('euc-kr works', () => { + cy.visit(`/euc-kr${extension}`) + cy.get('.text').should('contain.html', '서울 남산케이블카 운행') + }) + + it('shift-jis works', () => { + cy.visit(`/shift-jis${extension}`) + cy.get('body').should('contain.html', '総合サポート・お問い合わせ') + }) + + it('gb2312 works', () => { + cy.visit(`/gb2312${extension}`) + cy.get('h3').should('contain.html', '雨果主题展8月启幕') + }) + }) + }) +}) diff --git a/packages/server/test/support/fixtures/projects/e2e/static/charsets/euc-kr.html b/packages/server/test/support/fixtures/projects/e2e/static/charsets/euc-kr.html new file mode 100644 index 000000000000..d9679d2d86fa --- /dev/null +++ b/packages/server/test/support/fixtures/projects/e2e/static/charsets/euc-kr.html @@ -0,0 +1,13 @@ + + + + + +̹ + + +
+ ̺ī 潺 ε7 λ() +
+ + diff --git a/packages/server/test/support/fixtures/projects/e2e/static/charsets/gb2312.html b/packages/server/test/support/fixtures/projects/e2e/static/charsets/gb2312.html new file mode 100644 index 000000000000..504b8e07bb07 --- /dev/null +++ b/packages/server/test/support/fixtures/projects/e2e/static/charsets/gb2312.html @@ -0,0 +1,11 @@ + + + + + + + + +

չ8Ļ

+ + diff --git a/packages/server/test/support/fixtures/projects/e2e/static/charsets/iso-8859-1.html b/packages/server/test/support/fixtures/projects/e2e/static/charsets/iso-8859-1.html new file mode 100644 index 000000000000..d14a7455046c --- /dev/null +++ b/packages/server/test/support/fixtures/projects/e2e/static/charsets/iso-8859-1.html @@ -0,0 +1,36 @@ + + + + + + + + + + Cypress ISO 8859-1 Characters Test + + + + + + + +

+Cypress ISO 8859-1 Characters Test +

+ + + + + + + diff --git a/packages/server/test/support/fixtures/projects/e2e/static/charsets/shift-jis.html b/packages/server/test/support/fixtures/projects/e2e/static/charsets/shift-jis.html new file mode 100644 index 000000000000..0ce60fb3aaee --- /dev/null +++ b/packages/server/test/support/fixtures/projects/e2e/static/charsets/shift-jis.html @@ -0,0 +1,12 @@ + + + + + + + \j[iE\j[XgA - \j[ + + + T|[gE₢킹 + + diff --git a/packages/server/test/support/fixtures/server/iso-8859-1.html b/packages/server/test/support/fixtures/server/iso-8859-1.html new file mode 100644 index 000000000000..7bd5de3b3821 --- /dev/null +++ b/packages/server/test/support/fixtures/server/iso-8859-1.html @@ -0,0 +1,36 @@ + + + + + + + + + + Cypress ISO 8859-1 Characters Test + + + + + + + +

+Cypress ISO 8859-1 Characters Test +

+ + + + + + +