Skip to content
This repository has been archived by the owner on Nov 16, 2020. It is now read-only.

Blazing fast HTML escaping. #56

Merged
merged 3 commits into from
Jun 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 92 additions & 7 deletions Sources/TemplateKit/Utilities/HTMLEscape.swift
Original file line number Diff line number Diff line change
@@ -1,13 +1,98 @@
import Foundation

extension String {
    /// Lookup table mapping the ASCII bytes that need escaping to their HTML entity.
    private static let htmlEscapeMap: [UInt8: String] = [
        UInt8(ascii: "&"): "&amp;",
        UInt8(ascii: "\""): "&quot;",
        UInt8(ascii: "'"): "&#39;",
        UInt8(ascii: "<"): "&lt;",
        UInt8(ascii: ">"): "&gt;",
    ]

    /// Stores up to eight bytes inline to avoid the memory overhead of using `[UInt8]`.
    private struct InlineByteArray {
        /// Backing storage; only the first `count` bytes (in memory order) are meaningful.
        private(set) var eightBytes: Int64 = 0
        /// Number of meaningful bytes stored in `eightBytes`.
        private(set) var count: Int

        /// Creates an inline copy of `bytes`.
        /// - Parameter bytes: At most eight bytes; anything longer is a programmer error.
        init(bytes: [UInt8]) {
            precondition(bytes.count <= MemoryLayout<Int64>.size, "InlineByteArray holds at most 8 bytes")
            self.count = bytes.count
            // The pointer is only valid inside the closure, so all writes happen there.
            Swift.withUnsafeMutableBytes(of: &eightBytes) { storage in
                storage.copyBytes(from: bytes)
            }
        }

        /// Copies the `count` stored bytes to `destination`.
        /// - Parameter destination: Must point to at least `count` writable bytes.
        func write(to destination: UnsafeMutableRawPointer) {
            var storage = eightBytes
            let byteCount = count
            Swift.withUnsafeBytes(of: &storage) { source in
                // `baseAddress` is non-nil: the buffer covers an 8-byte value.
                destination.copyMemory(from: source.baseAddress!, byteCount: byteCount)
            }
        }
    }

    /// Stack buffer used to build short results without a heap allocation.
    private struct SixteenBytes {
        var firstEight: Int64 = 0
        var secondEight: Int64 = 0

        init() { }
    }

    /// Same as `htmlEscapeMap`, but stored as an array indexed from 0 to 255 to avoid dictionary lookups.
    /// In addition, we store `InlineByteArray`s instead of `String`s in order to avoid memory management overhead.
    /// If no escaping is required for a byte, the byte itself is stored.
    private static let htmlEscapeMapASCIIByteArray: [InlineByteArray] = (UInt8(0)...UInt8(255)).map { byte in
        if let escaped = String.htmlEscapeMap[byte] {
            return InlineByteArray(bytes: Array(escaped.utf8))
        } else {
            return InlineByteArray(bytes: [byte])
        }
    }

    /// Escapes HTML entities in a `String`.
    ///
    /// Replaces `&`, `"`, `'`, `<` and `>` with `&amp;`, `&quot;`, `&#39;`, `&lt;` and `&gt;`.
    /// All other UTF-8 code units (including NUL and non-ASCII sequences) are copied unchanged.
    ///
    /// - Returns: The escaped string, or `self` when nothing needs escaping.
    public func htmlEscaped() -> String {
        // Read the lazily initialised static once instead of once per code unit.
        let table = String.htmlEscapeMapASCIIByteArray

        // First pass: compute the input and output lengths in UTF-8 code units.
        var sourceLength = 0
        var expectedLength = 0
        // Using `withUnsafeBufferPointer` is minimally faster than subscripting the array for each code unit.
        table.withUnsafeBufferPointer { entries in
            for codeUnit in self.utf8 {
                sourceLength += 1
                expectedLength += entries[Int(codeUnit)].count
            }
        }

        guard expectedLength != sourceLength else {
            // Shortcut: every replacement is longer than its source byte, so equal
            // lengths mean no replacements are necessary.
            return self
        }

        /// Second pass: writes exactly `expectedLength` escaped bytes starting at `resultBytes`.
        func writeEscapedString(_ resultBytes: UnsafeMutableRawPointer) {
            var cursor = resultBytes
            let end = resultBytes + expectedLength
            table.withUnsafeBufferPointer { entries in
                for codeUnit in self.utf8 {
                    let escaped = entries[Int(codeUnit)]
                    assert(cursor + escaped.count <= end)
                    escaped.write(to: cursor)
                    cursor += escaped.count
                }
            }
        }

        if expectedLength <= 15 {
            // Avoid the `[UInt8]` heap allocation for results of at most 15 UTF-8
            // code units, where `String`'s small string optimization can also
            // avoid a memory allocation.
            var resultData = SixteenBytes()
            return Swift.withUnsafeMutableBytes(of: &resultData) { buffer -> String in
                writeEscapedString(buffer.baseAddress!)
                // Decode by length rather than as a C string so that embedded
                // NUL characters do not truncate the result.
                let written = UnsafeRawBufferPointer(rebasing: buffer[..<expectedLength])
                return String(decoding: written, as: UTF8.self)
            }
        } else {
            var resultData = [UInt8](repeating: 0, count: expectedLength)
            resultData.withUnsafeMutableBytes { buffer in
                // `baseAddress` is non-nil: the array holds more than 15 bytes.
                writeEscapedString(buffer.baseAddress!)
            }

            // TODO: It might be possible to gain further improvements
            // by re-using the byte array allocated by `resultData`
            // to avoid copying the string's bytes here.
            return String(decoding: resultData, as: UTF8.self)
        }
    }
}
10 changes: 6 additions & 4 deletions Tests/LinuxMain.swift
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import XCTest

import TemplateKitTests

// Entry point for `swift test` on platforms without the Objective-C runtime.
// Test cases are collected from the generated manifest (`__allTests()`), so
// newly added test classes are picked up after regenerating the manifest
// instead of having to be listed here by hand.
// `XCTMain` does not return, so it must be the last statement in this file.
var tests = [XCTestCaseEntry]()
tests += TemplateKitTests.__allTests()

XCTMain(tests)
91 changes: 91 additions & 0 deletions Tests/TemplateKitTests/HTMLEscapeTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
@testable import TemplateKit
import XCTest

/// Correctness checks and micro-benchmarks for `String.htmlEscaped()`.
class HTMLEscapeTests: XCTestCase {
    /// Verifies the escaped output for empty, untouched and fully escaped inputs.
    func testCorrectness() {
        let expectations: [(input: String, escaped: String)] = [
            ("", ""),
            ("abcdef", "abcdef"),
            ("abc&<>\"'", "abc&amp;&lt;&gt;&quot;&#39;"),
            ("abc&", "abc&amp;"),
        ]
        for expectation in expectations {
            XCTAssertEqual(expectation.input.htmlEscaped(), expectation.escaped)
        }
    }

    /// Benchmarks a short input that needs no escaping.
    func testShortStringNoReplacements() {
        let input = "abcde12345"
        measure {
            for _ in 0..<10_000_000 {
                _ = input.htmlEscaped()
            }
        }
    }

    /// Benchmarks a short input whose escaped form still fits into 15 bytes,
    /// so that the in-place `String` storage optimization is hit.
    func testShortStringWithReplacements() {
        let input = "<abcdef>"
        measure {
            for _ in 0..<1_000_000 {
                _ = input.htmlEscaped()
            }
        }
    }

    /// `a...z`, `0...9`, `A...Z`: 62 ASCII characters, none of which need escaping.
    static let mediumStringNoReplacements: String = {
        var bytes = [UInt8]()
        bytes += UInt8(ascii: "a")...UInt8(ascii: "z")
        bytes += UInt8(ascii: "0")...UInt8(ascii: "9")
        bytes += UInt8(ascii: "A")...UInt8(ascii: "Z")

        return String(bytes: bytes, encoding: .utf8)!
    }()

    /// Benchmarks a medium-length input that needs no escaping.
    func testMediumStringNoReplacements() {
        measure {
            for _ in 0..<2_000_000 {
                _ = HTMLEscapeTests.mediumStringNoReplacements.htmlEscaped()
            }
        }
    }

    /// The same alphanumeric runs as `mediumStringNoReplacements`, interleaved
    /// with each of the five characters that need escaping.
    static let mediumStringWithReplacements: String = {
        var bytes = [UInt8]()
        bytes.append(UInt8(ascii: "&"))
        bytes += UInt8(ascii: "a")...UInt8(ascii: "z")
        bytes.append(UInt8(ascii: "\""))
        bytes += UInt8(ascii: "0")...UInt8(ascii: "9")
        bytes.append(UInt8(ascii: "'"))
        bytes += UInt8(ascii: "A")...UInt8(ascii: "Z")
        bytes.append(UInt8(ascii: "<"))
        bytes.append(UInt8(ascii: ">"))

        return String(bytes: bytes, encoding: .utf8)!
    }()

    /// Benchmarks a medium-length input with a few characters to escape.
    func testMediumStringWithReplacements() {
        measure {
            for _ in 0..<500_000 {
                _ = HTMLEscapeTests.mediumStringWithReplacements.htmlEscaped()
            }
        }
    }

    /// Benchmarks an input in which every character must be escaped.
    func testMediumStringWithOnlyReplacements() {
        let input = String(repeating: "&<>\"'", count: 10)
        measure {
            for _ in 0..<500_000 {
                _ = input.htmlEscaped()
            }
        }
    }

    /// Benchmarks a long input (20 medium strings) that needs no escaping.
    func testLongStringNoReplacements() {
        let input = String(repeating: HTMLEscapeTests.mediumStringNoReplacements, count: 20)
        measure {
            for _ in 0..<200_000 {
                _ = input.htmlEscaped()
            }
        }
    }

    /// Benchmarks a long input (20 medium strings) with characters to escape.
    func testLongStringWithReplacements() {
        let input = String(repeating: HTMLEscapeTests.mediumStringWithReplacements, count: 20)
        measure {
            for _ in 0..<50_000 {
                _ = input.htmlEscaped()
            }
        }
    }
}
52 changes: 52 additions & 0 deletions Tests/TemplateKitTests/XCTestManifests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#if !canImport(ObjectiveC)
import XCTest

// Test manifest for `HTMLEscapeTests`, compiled only when ObjectiveC cannot be
// imported (see the enclosing `#if !canImport(ObjectiveC)`).
extension HTMLEscapeTests {
// DO NOT MODIFY: This is autogenerated, use:
// `swift test --generate-linuxmain`
// to regenerate.
// Each entry pairs a test's name with the corresponding test method; the
// array is handed to `testCase(_:)` in `__allTests()`.
static let __allTests__HTMLEscapeTests = [
("testCorrectness", testCorrectness),
("testLongStringNoReplacements", testLongStringNoReplacements),
("testLongStringWithReplacements", testLongStringWithReplacements),
("testMediumStringNoReplacements", testMediumStringNoReplacements),
("testMediumStringWithOnlyReplacements", testMediumStringWithOnlyReplacements),
("testMediumStringWithReplacements", testMediumStringWithReplacements),
("testShortStringNoReplacements", testShortStringNoReplacements),
("testShortStringWithReplacements", testShortStringWithReplacements),
]
}

// Test manifest for `TemplateDataEncoderTests`, compiled only when ObjectiveC
// cannot be imported (see the enclosing `#if !canImport(ObjectiveC)`).
extension TemplateDataEncoderTests {
// DO NOT MODIFY: This is autogenerated, use:
// `swift test --generate-linuxmain`
// to regenerate.
// Each entry pairs a test's name with the corresponding test method; the
// array is handed to `testCase(_:)` in `__allTests()`.
// NOTE(review): "testTemplabeByteScannerPeak" looks misspelled, but it has to
// match the method declared in `TemplateDataEncoderTests`, which is not
// visible here; rename it there and regenerate rather than editing this list.
static let __allTests__TemplateDataEncoderTests = [
("testArray", testArray),
("testComplexEncodable", testComplexEncodable),
("testDictionary", testDictionary),
("testDouble", testDouble),
("testEncodable", testEncodable),
("testEncodeSuperCustomImplementation", testEncodeSuperCustomImplementation),
("testEncodeSuperCustomImplementationWithSuperEncoder1", testEncodeSuperCustomImplementationWithSuperEncoder1),
("testEncodeSuperCustomImplementationWithSuperEncoder2", testEncodeSuperCustomImplementationWithSuperEncoder2),
("testEncodeSuperDefaultImplementation", testEncodeSuperDefaultImplementation),
("testEncodingPerformanceExampleModel", testEncodingPerformanceExampleModel),
("testEncodingPerformanceExampleModelJSONBaseline", testEncodingPerformanceExampleModelJSONBaseline),
("testGH10", testGH10),
("testGH20", testGH20),
("testNestedArray", testNestedArray),
("testNestedDictionary", testNestedDictionary),
("testNestedEncodable", testNestedEncodable),
("testString", testString),
("testTemplabeByteScannerPeak", testTemplabeByteScannerPeak),
]
}

/// Collects the test case entries of every test class listed in this manifest.
/// - Returns: One `XCTestCaseEntry` per test class, in manifest order.
public func __allTests() -> [XCTestCaseEntry] {
    let htmlEscapeEntry = testCase(HTMLEscapeTests.__allTests__HTMLEscapeTests)
    let templateDataEncoderEntry = testCase(TemplateDataEncoderTests.__allTests__TemplateDataEncoderTests)
    return [htmlEscapeEntry, templateDataEncoderEntry]
}
#endif