Skip to content
This repository has been archived by the owner on Nov 16, 2020. It is now read-only.

Commit

Permalink
Blazing fast HTML escaping.
Browse files Browse the repository at this point in the history
  • Loading branch information
MrMage committed May 2, 2019
1 parent 121ae51 commit 667862a
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 11 deletions.
99 changes: 92 additions & 7 deletions Sources/TemplateKit/Utilities/HTMLEscape.swift
Original file line number Diff line number Diff line change
@@ -1,13 +1,98 @@
import Foundation

extension String {
/// Lookup table mapping the ASCII code units that need escaping to their escaped representation.
///
/// Each replacement must be at most eight UTF-8 bytes long, because the replacements are later stored in
/// `InlineByteArray` values.
private static let htmlEscapeMap: [UInt8: String] = [
    UInt8(ascii: "&"): "&amp;",
    UInt8(ascii: "\""): "&quot;",
    UInt8(ascii: "'"): "&#39;",
    UInt8(ascii: "<"): "&lt;",
    UInt8(ascii: ">"): "&gt;",
]

/// Stores up to eight bytes inline to avoid the memory overhead of using `[UInt8]`.
///
/// The stored bytes occupy the leading `count` bytes (in memory order) of `eightBytes`.
/// NOTE(review): the reader in `htmlEscaped()` copies from the start of the whole value, which relies on
/// `eightBytes` being laid out first — keep it the first stored property.
private struct InlineByteArray {
    /// Inline storage; only the first `count` bytes are meaningful.
    private(set) var eightBytes: Int64 = 0
    /// Number of valid bytes held in `eightBytes`.
    private(set) var count: Int

    /// Creates an inline copy of `bytes`.
    ///
    /// - Parameter bytes: The bytes to store.
    /// - Precondition: `bytes.count <= 8`. Checked in release builds as well, because a longer input
    ///   would otherwise be written past the end of the inline storage.
    init(bytes: [UInt8]) {
        precondition(bytes.count <= 8, "InlineByteArray can hold at most 8 bytes")
        self.count = bytes.count
        // Scoped access to the storage: a pointer created via `UnsafeMutableRawPointer(&self)` is only
        // valid for the duration of that initializer call and must not be used afterwards.
        Swift.withUnsafeMutableBytes(of: &eightBytes) { storage in
            storage.copyBytes(from: bytes)
        }
    }
}

/// Sixteen zero-initialized bytes of inline storage, used by `htmlEscaped()` as a scratch buffer for
/// short results.
///
/// The fields are `var` because the buffer is filled in place through a raw pointer; writing through a
/// pointer into `let` properties is not a mutation the compiler is required to honour.
private struct SixteenBytes {
    var firstEight: Int64 = 0
    var secondEight: Int64 = 0

    init() { }
}

/// Dense, 256-entry counterpart of `htmlEscapeMap`, indexed by UTF-8 code unit so that a lookup is a plain
/// array subscript instead of a dictionary access.
/// Entries are `InlineByteArray`s rather than `String`s in order to avoid memory management overhead.
/// A code unit without an entry in `htmlEscapeMap` maps to itself.
/// (Original author's note: an array-typed table is much faster than a dictionary-typed one or `if`-based
/// branching.)
private static let htmlEscapeMapASCIIByteArray: [InlineByteArray] = (UInt8.min...UInt8.max).map { codeUnit in
    let replacement: [UInt8]
    if let entity = String.htmlEscapeMap[codeUnit] {
        replacement = Array(entity.utf8)
    } else {
        replacement = [codeUnit]
    }
    return InlineByteArray(bytes: replacement)
}

/// Escapes HTML entities in a `String`.
internal func htmlEscaped() -> String {
/// FIXME: performance
return replacingOccurrences(of: "&", with: "&amp;")
.replacingOccurrences(of: "\"", with: "&quot;")
.replacingOccurrences(of: "'", with: "&#39;")
.replacingOccurrences(of: "<", with: "&lt;")
.replacingOccurrences(of: ">", with: "&gt;")
/// Escapes HTML entities in a `String`.
///
/// Every UTF-8 code unit is looked up in `htmlEscapeMapASCIIByteArray`: code units that have an entry in
/// `htmlEscapeMap` (`&`, `"`, `'`, `<`, `>`) are replaced by their entity, all other code units are copied
/// through unchanged.
///
/// - Returns: The escaped string, or `self` when no code unit needs escaping.
public func htmlEscaped() -> String {
    // First pass: compute the exact byte length of the escaped output.
    var expectedLength = 0
    // Original author's note: going through `withUnsafeBufferPointer` is minimally faster than
    // subscripting the array for each code unit.
    String.htmlEscapeMapASCIIByteArray.withUnsafeBufferPointer { table in
        for codeUnit in self.utf8 {
            expectedLength += table[Int(codeUnit)].count
        }
    }

    guard expectedLength != self.utf8.count else {
        // Shortcut: no replacements necessary; skip them altogether.
        return self
    }

    /// Second pass: writes the escaped bytes to `resultBytes`, which must have room for
    /// `expectedLength` bytes.
    func writeEscapedString(_ resultBytes: UnsafeMutableRawPointer) {
        var cursor = resultBytes
        let end = resultBytes + expectedLength
        for codeUnit in self.utf8 {
            var escaped = String.htmlEscapeMapASCIIByteArray[Int(codeUnit)]
            let escapedCount = escaped.count
            // `<=`, not `<`: the final write ends exactly at `end`.
            assert(cursor + escapedCount <= end)
            // The replacement bytes are stored at the start of `escaped`; the pointer produced by
            // `&escaped` is only used for the duration of this call.
            cursor.copyMemory(from: &escaped, byteCount: escapedCount)
            cursor += escapedCount
        }
    }

    if expectedLength <= 15 {
        // Results of at most 15 UTF-8 code units are assembled in a stack buffer to avoid the
        // `[UInt8]` heap allocation; `String` can store such short strings inline.
        // NOTE(review): the original author measured this path as considerably faster than the
        // array-backed path below — re-measure after the switch to length-based decoding.
        var resultData = SixteenBytes()
        // The buffer pointer is only valid inside the closure, so the string is built there too.
        return Swift.withUnsafeMutableBytes(of: &resultData) { buffer -> String in
            // Non-nil: the buffer covers the 16 bytes of `resultData`.
            writeEscapedString(buffer.baseAddress!)
            // Decode exactly `expectedLength` bytes instead of scanning for a NUL terminator, so that
            // input containing U+0000 is not truncated.
            let written = UnsafeRawBufferPointer(start: buffer.baseAddress, count: expectedLength)
            return String(decoding: written, as: UTF8.self)
        }
    } else {
        var resultData = [UInt8](repeating: 0, count: expectedLength)
        resultData.withUnsafeMutableBytes { buffer in
            // Non-nil: `expectedLength` is at least 16 on this path.
            writeEscapedString(buffer.baseAddress!)
        }

        // TODO: It might be possible to gain further improvements
        // by re-using the byte array allocated by `resultData`
        // to avoid copying the string's bytes here.
        return String(decoding: resultData, as: UTF8.self)
    }
}
}
10 changes: 6 additions & 4 deletions Tests/LinuxMain.swift
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import XCTest
@testable import TemplateKitTests

XCTMain([
testCase(TemplateDataEncoderTests.allTests),
])
import TemplateKitTests

// Collect every test case exported by the TemplateKitTests module and hand them to the XCTest runner.
let tests: [XCTestCaseEntry] = TemplateKitTests.__allTests()

XCTMain(tests)
91 changes: 91 additions & 0 deletions Tests/TemplateKitTests/HTMLEscapeTests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
@testable import TemplateKit
import XCTest

/// Correctness checks and micro-benchmarks for `String.htmlEscaped()`.
class HTMLEscapeTests: XCTestCase {
    // MARK: - Fixtures

    private static let lowercase = Array(UInt8(ascii: "a")...UInt8(ascii: "z"))
    private static let digits = Array(UInt8(ascii: "0")...UInt8(ascii: "9"))
    private static let uppercase = Array(UInt8(ascii: "A")...UInt8(ascii: "Z"))

    /// `a-z0-9A-Z`: 62 ASCII characters, none of which needs escaping.
    static let mediumStringNoReplacements: String = {
        let bytes = HTMLEscapeTests.lowercase + HTMLEscapeTests.digits + HTMLEscapeTests.uppercase
        return String(decoding: bytes, as: UTF8.self)
    }()

    /// The same alphabet with `&`, `"`, `'`, `<` and `>` placed around and between the three groups.
    static let mediumStringWithReplacements: String = {
        var bytes: [UInt8] = [UInt8(ascii: "&")]
        bytes += HTMLEscapeTests.lowercase
        bytes.append(UInt8(ascii: "\""))
        bytes += HTMLEscapeTests.digits
        bytes.append(UInt8(ascii: "'"))
        bytes += HTMLEscapeTests.uppercase
        bytes += [UInt8(ascii: "<"), UInt8(ascii: ">")]
        return String(decoding: bytes, as: UTF8.self)
    }()

    /// Measures `iterations` calls of `htmlEscaped()` on `string`.
    private func measureEscaping(of string: String, iterations: Int) {
        measure {
            for _ in 0..<iterations {
                _ = string.htmlEscaped()
            }
        }
    }

    // MARK: - Correctness

    func testCorrectness() {
        XCTAssertEqual("".htmlEscaped(), "")
        XCTAssertEqual("abcdef".htmlEscaped(), "abcdef")
        XCTAssertEqual("abc&<>\"'".htmlEscaped(), "abc&amp;&lt;&gt;&quot;&#39;")
        XCTAssertEqual("abc&".htmlEscaped(), "abc&amp;")

        // Non-ASCII text must pass through unchanged next to an escaped character.
        XCTAssertEqual("äöü<".htmlEscaped(), "äöü&lt;")

        // A 15-byte result is the largest one built in the in-place buffer;
        // a 16-byte result is the smallest one built in the array-backed buffer.
        XCTAssertEqual("abcdefghijk<".htmlEscaped(), "abcdefghijk&lt;")
        XCTAssertEqual("abcdefghijkl<".htmlEscaped(), "abcdefghijkl&lt;")
    }

    // MARK: - Benchmarks

    func testShortStringNoReplacements() {
        measureEscaping(of: "abcde12345", iterations: 10_000_000)
    }

    func testShortStringWithReplacements() {
        // The result should still fit into 15 bytes to hit the in-place String storage optimization.
        measureEscaping(of: "<abcdef>", iterations: 1_000_000)
    }

    func testMediumStringNoReplacements() {
        measureEscaping(of: HTMLEscapeTests.mediumStringNoReplacements, iterations: 2_000_000)
    }

    func testMediumStringWithReplacements() {
        measureEscaping(of: HTMLEscapeTests.mediumStringWithReplacements, iterations: 500_000)
    }

    func testMediumStringWithOnlyReplacements() {
        measureEscaping(of: String(repeating: "&<>\"'", count: 10), iterations: 500_000)
    }

    func testLongStringNoReplacements() {
        let longString = String(repeating: HTMLEscapeTests.mediumStringNoReplacements, count: 20)
        measureEscaping(of: longString, iterations: 200_000)
    }

    func testLongStringWithReplacements() {
        let longString = String(repeating: HTMLEscapeTests.mediumStringWithReplacements, count: 20)
        measureEscaping(of: longString, iterations: 50_000)
    }
}
52 changes: 52 additions & 0 deletions Tests/TemplateKitTests/XCTestManifests.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#if !canImport(ObjectiveC)
import XCTest

/// Test manifest for `HTMLEscapeTests`, compiled only where ObjectiveC cannot be imported.
extension HTMLEscapeTests {
// DO NOT MODIFY: This is autogenerated, use:
// `swift test --generate-linuxmain`
// to regenerate.
// Name/method pairs for the tests of `HTMLEscapeTests`; `__allTests()` wraps this list in a `testCase`.
static let __allTests__HTMLEscapeTests = [
("testCorrectness", testCorrectness),
("testLongStringNoReplacements", testLongStringNoReplacements),
("testLongStringWithReplacements", testLongStringWithReplacements),
("testMediumStringNoReplacements", testMediumStringNoReplacements),
("testMediumStringWithOnlyReplacements", testMediumStringWithOnlyReplacements),
("testMediumStringWithReplacements", testMediumStringWithReplacements),
("testShortStringNoReplacements", testShortStringNoReplacements),
("testShortStringWithReplacements", testShortStringWithReplacements),
]
}

/// Test manifest for `TemplateDataEncoderTests`, compiled only where ObjectiveC cannot be imported.
extension TemplateDataEncoderTests {
// DO NOT MODIFY: This is autogenerated, use:
// `swift test --generate-linuxmain`
// to regenerate.
// Name/method pairs for the tests of `TemplateDataEncoderTests`; `__allTests()` wraps this list in a `testCase`.
static let __allTests__TemplateDataEncoderTests = [
("testArray", testArray),
("testComplexEncodable", testComplexEncodable),
("testDictionary", testDictionary),
("testDouble", testDouble),
("testEncodable", testEncodable),
("testEncodeSuperCustomImplementation", testEncodeSuperCustomImplementation),
("testEncodeSuperCustomImplementationWithSuperEncoder1", testEncodeSuperCustomImplementationWithSuperEncoder1),
("testEncodeSuperCustomImplementationWithSuperEncoder2", testEncodeSuperCustomImplementationWithSuperEncoder2),
("testEncodeSuperDefaultImplementation", testEncodeSuperDefaultImplementation),
("testEncodingPerformanceExampleModel", testEncodingPerformanceExampleModel),
("testEncodingPerformanceExampleModelJSONBaseline", testEncodingPerformanceExampleModelJSONBaseline),
("testGH10", testGH10),
("testGH20", testGH20),
("testNestedArray", testNestedArray),
("testNestedDictionary", testNestedDictionary),
("testNestedEncodable", testNestedEncodable),
("testString", testString),
("testTemplabeByteScannerPeak", testTemplabeByteScannerPeak),
]
}

/// Returns one `XCTestCaseEntry` per test class listed in this file, built from the classes'
/// `__allTests__…` manifests.
/// Only compiled where ObjectiveC cannot be imported (see the enclosing `#if`).
public func __allTests() -> [XCTestCaseEntry] {
return [
testCase(HTMLEscapeTests.__allTests__HTMLEscapeTests),
testCase(TemplateDataEncoderTests.__allTests__TemplateDataEncoderTests),
]
}
#endif

0 comments on commit 667862a

Please sign in to comment.