Create new snakeCase / word splitting algorithm

This commit introduces a word splitting algorithm used for snake casing that has much better results than the current Smithy algorithm. Specifically, it has better behavior for acronyms ending in `s`, version numbers, and compound acronyms like `MiB` and `GiB`.
smithy-lang · Oct 9, 2023 · 7154c84 · 7154c84
1 parent 26a914e
commit 7154c84
Show file tree

Hide file tree

Showing 4 changed files with 126,157 additions and 4 deletions.
diff --git a/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt b/codegen-core/src/main/kotlin/software/amazon/smithy/rust/codegen/core/util/Strings.kt
@@ -8,18 +8,114 @@ package software.amazon.smithy.rust.codegen.core.util
 import software.amazon.smithy.utils.CaseUtils
 import software.amazon.smithy.utils.StringUtils
 
-fun String.doubleQuote(): String = StringUtils.escapeJavaString(this, "").replace(Regex("""\\u([0-9a-f]{4})""")) { matchResult: MatchResult ->
-    "\\u{" + matchResult.groupValues[1] + "}" as CharSequence
-}
+/**
+ * Escapes this string with [StringUtils.escapeJavaString] (empty indent), then rewrites every `\uXXXX`
+ * escape (exactly four hex digits, lowercase letters only) into the braced form `\u{XXXX}`.
+ *
+ * NOTE(review): the braced form is presumably intended for Rust string literals — confirm against callers.
+ */
+fun String.doubleQuote(): String =
+    StringUtils.escapeJavaString(this, "").replace(Regex("""\\u([0-9a-f]{4})""")) { matchResult: MatchResult ->
+        "\\u{" + matchResult.groupValues[1] + "}" as CharSequence
+    }
 
 /**
  * Double quote a string, e.g. "abc" -> "\"abc\""
  */
 fun String.dq(): String = this.doubleQuote()
 
+/**
+ * Splits this identifier into words for snake casing.
+ *
+ * Characters that are neither letters nor digits act as separators and are dropped. Other boundaries are:
+ * - a lowercase letter or digit followed by an uppercase letter (`Abc[D]ef`, `Ab2[D]ef`),
+ * - a digit followed by a lowercase letter, but only when the whole input is lowercase (`s3[k]ey`),
+ * - the end of an acronym, as decided by [endOfAcronym] (`DB[P]roxy`).
+ *
+ * The first two rules are suppressed while one of the `completeWords` (e.g. `ipv4`, `mib`) is still being
+ * assembled, so those words stay in one piece.
+ *
+ * @return the words in order of appearance, each lowercased
+ */
+private fun String.splitOnWordBoundaries(): List<String> {
+    val out = mutableListOf<String>()
+    // These are whole words but cased differently, e.g. `IPv4`, `MiB`, `GiB`, `TtL`
+    val completeWords = listOf("ipv4", "ipv6", "sigv4", "mib", "gib", "ttl")
+    var currentWord = ""
+
+    // emit the current word and update from the next character
+    val emit = { next: Char ->
+        if (currentWord.isNotEmpty()) {
+            out += currentWord.lowercase()
+        }
+        // A separator character is dropped; a letter or digit starts the next word
+        currentWord = if (next.isLetterOrDigit()) {
+            next.toString()
+        } else {
+            ""
+        }
+    }
+    val allLowerCase = this.lowercase() == this
+    this.forEachIndexed { index, nextCharacter ->
+        val peek = this.getOrNull(index + 1)
+        val doublePeek = this.getOrNull(index + 2)
+        // True while the current word plus the remaining input still spells out one of the complete words
+        // and the current word is not already one of them.
+        val completeWordInProgress = completeWords.any {
+            (currentWord + this.substring(index)).lowercase().startsWith(
+                it,
+            )
+        } && !completeWords.contains(currentWord.lowercase())
+        when {
+            // [C] in these docs indicates the value of nextCharacter
+            // A[_]B
+            !nextCharacter.isLetterOrDigit() -> emit(nextCharacter)
+
+            // If we have no letters so far, push the next letter (we already know it's a letter or digit)
+            currentWord.isEmpty() -> currentWord += nextCharacter.toString()
+
+            // Abc[D]ef or Ab2[D]ef
+            !completeWordInProgress && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)
+
+            // s3[k]ey
+            !completeWordInProgress && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) -> emit(
+                nextCharacter,
+            )
+
+            // DB[P]roxy, or `IAM[U]ser` but not AC[L]s
+            endOfAcronym(currentWord, nextCharacter, peek, doublePeek) -> emit(nextCharacter)
+
+            // If we haven't found a word boundary, push it and keep going
+            else -> currentWord += nextCharacter.toString()
+        }
+    }
+    // Flush the trailing word, lowercased like every word flushed by `emit`
+    if (currentWord.isNotEmpty()) {
+        out += currentWord.lowercase()
+    }
+    return out
+}
+
+/**
+ * Decides whether [nextChar] starts a new word right after an acronym, e.g. `DB[P]roxy` or `ARN[S]upport`.
+ *
+ * [current] is the word built so far and must not be empty; [peek] and [doublePeek] are the one and two
+ * characters following [nextChar], or `null` past the end of the input.
+ */
+private fun endOfAcronym(current: String, nextChar: Char, peek: Char?, doublePeek: Char?): Boolean =
+    when {
+        // No acronym in progress
+        !current.last().isUpperCase() -> false
+
+        // Still inside the current word
+        !nextChar.isUpperCase() -> false
+
+        // A new word needs a lowercase letter right after its capital
+        peek?.isLowerCase() != true -> false
+
+        // Plural acronyms such as `AR[N]s` and `AC[L]s` stay together, unlike `IAM[U]ser`
+        peek == 's' && doublePeek?.isLowerCase() != true -> false
+
+        // Versioned acronyms such as `DynamoD[B]v2` stay together
+        peek == 'v' && doublePeek?.isDigit() == true -> false
+
+        else -> true
+    }
+
+/**
+ * True when [nextChar] is uppercase and the last character of [current] is a lowercase letter or a digit,
+ * e.g. `Abc[D]ef` or `Ab2[D]ef`. [current] is only inspected when [nextChar] is uppercase.
+ */
+private fun loweredFollowedByUpper(current: String, nextChar: Char): Boolean =
+    nextChar.isUpperCase() && current.last().let { previous -> previous.isLowerCase() || previous.isDigit() }
+
+/**
+ * True when the last character of [current] is a digit and [nextChar] is a lowercase letter, e.g. `s3[k]ey`.
+ */
+private fun digitFollowedByLower(current: String, nextChar: Char): Boolean =
+    current.last().isDigit() && nextChar.isLowerCase()
+
 // String extensions
 fun String.toSnakeCase(): String {
-    return CaseUtils.toSnakeCase(this)
+    // Split with this file's `splitOnWordBoundaries`, lowercase every word, and join the words with `_`
+    return this.splitOnWordBoundaries().joinToString("_") { it.lowercase() }
 }
 
 fun String.toPascalCase(): String {

diff --git a/codegen-core/src/test/kotlin/software/amazon/smithy/rust/codegen/core/util/StringsTest.kt b/codegen-core/src/test/kotlin/software/amazon/smithy/rust/codegen/core/util/StringsTest.kt
@@ -7,6 +7,14 @@ package software.amazon.smithy.rust.codegen.core.util
 
 import io.kotest.matchers.shouldBe
 import org.junit.jupiter.api.Test
+import org.junit.jupiter.api.extension.ExtensionContext
+import org.junit.jupiter.api.fail
+import org.junit.jupiter.params.ParameterizedTest
+import org.junit.jupiter.params.provider.Arguments
+import org.junit.jupiter.params.provider.ArgumentsProvider
+import org.junit.jupiter.params.provider.ArgumentsSource
+import java.io.File
+import java.util.stream.Stream
 
 internal class StringsTest {
 
@@ -18,4 +26,77 @@ internal class StringsTest {
         "{\"nested\": \"{\\\"nested\\\": 5}\"}\"}"
         """.trimIndent().trim()
     }
+
+    @Test
+    fun correctlyConvertToSnakeCase() {
+        // The pluralized acronym `ARNs` is expected to come out as the single word `arns`
+        val snakeCased = "NotificationARNs".toSnakeCase()
+        snakeCased shouldBe "notification_arns"
+    }
+
+    /**
+     * Checks every `input,expectation` line of the `/testOutput.txt` classpath resource against
+     * `toSnakeCase` and reports all mismatches together in a single failure.
+     */
+    @Test
+    fun testAllNames() {
+        // Set this to true to write a new test expectation file
+        val publishUpdate = false
+        val allNames = this::class.java.getResource("/testOutput.txt")?.readText()
+            ?: fail("test resource /testOutput.txt was not found on the classpath")
+        val errors = mutableListOf<String>()
+        val output = StringBuilder()
+        allNames.lines().filter { it.isNotBlank() }.forEach { line ->
+            val fields = line.split(',')
+            val input = fields[0]
+            // A line without a comma has no expectation; report it instead of crashing on the index
+            val expectation = fields.getOrNull(1)
+            val actual = input.toSnakeCase()
+            when {
+                expectation == null -> errors += "$line => $actual (malformed line, expected `input,expectation`)"
+                actual != expectation -> errors += "$line => $actual (expected $expectation)"
+            }
+            output.appendLine("$input,$actual")
+        }
+        if (publishUpdate) {
+            // Written relative to the working directory of the test run
+            File("testOutput.txt").writeText(output.toString())
+        }
+        if (errors.isNotEmpty()) {
+            fail(errors.joinToString("\n"))
+        }
+    }
+
+    @ParameterizedTest
+    @ArgumentsSource(TestCasesProvider::class)
+    fun testSnakeCase(input: String, output: String) {
+        // `output` is the expected snake_case form of `input`
+        val snakeCased = input.toSnakeCase()
+        snakeCased shouldBe output
+    }
+}
+
+/**
+ * Supplies `(input, expected snake_case)` argument pairs for parameterized snake-casing tests.
+ */
+class TestCasesProvider : ArgumentsProvider {
+    override fun provideArguments(context: ExtensionContext?): Stream<out Arguments> {
+        // Each pair is `identifier to expected snake_case`
+        val cases = listOf(
+            "ACLs" to "acls",
+            "ACLsUpdateStatus" to "acls_update_status",
+            "AllowedAllVPCs" to "allowed_all_vpcs",
+            "BluePrimaryX" to "blue_primary_x",
+            "CIDRs" to "cidrs",
+            "AuthTtL" to "auth_ttl",
+            "CNAMEPrefix" to "cname_prefix",
+            "S3Location" to "s3_location",
+            "signatureS" to "signature_s",
+            "signatureR" to "signature_r",
+            "M3u8Settings" to "m3u8_settings",
+            "IAMUser" to "iam_user",
+            "OtaaV1_0_x" to "otaa_v1_0_x",
+            "DynamoDBv2Action" to "dynamo_dbv2_action",
+            "SessionKeyEmv2000" to "session_key_emv2000",
+            "SupportsClassB" to "supports_class_b",
+            "UnassignIpv6AddressesRequest" to "unassign_ipv6_addresses_request",
+            "TotalGpuMemoryInMiB" to "total_gpu_memory_in_mib",
+            "WriteIOs" to "write_ios",
+            "dynamoDBv2" to "dynamo_dbv2",
+            "ipv4Address" to "ipv4_address",
+            "sigv4" to "sigv4",
+            "s3key" to "s3_key",
+            "sha256sum" to "sha256_sum",
+            "Av1QvbrSettings" to "av1_qvbr_settings",
+            "Av1Settings" to "av1_settings",
+            "AwsElbv2LoadBalancer" to "aws_elbv2_load_balancer",
+            "SigV4Authorization" to "sigv4_authorization",
+            "IpV6Address" to "ipv6_address",
+            "IpV6Cidr" to "ipv6_cidr",
+            "IpV4Addresses" to "ipv4_addresses",
+        )
+        return cases.map { (input, expected) -> Arguments.of(input, expected) }.stream()
+    }
 }