Skip to content

Commit

Permalink
Create new snakeCase / word splitting algorithm
Browse files Browse the repository at this point in the history
This commit introduces a word splitting algorithm used for snake casing that has much better results than the current Smithy algorithm. Specifically, it has better behavior for
acronyms ending in `s`, version numbers, and compound acronyms like `MiB` and `GiB`.
  • Loading branch information
rcoh committed Oct 9, 2023
1 parent 26a914e commit 7154c84
Show file tree
Hide file tree
Showing 4 changed files with 126,157 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,114 @@ package software.amazon.smithy.rust.codegen.core.util
import software.amazon.smithy.utils.CaseUtils
import software.amazon.smithy.utils.StringUtils

/**
 * Escapes this string with [StringUtils.escapeJavaString] (using an empty indent) and then rewrites every
 * `\uXXXX` escape in the result, where `XXXX` is four lowercase hex digits, into the brace-delimited
 * form `\u{XXXX}`.
 *
 * @return the escaped string with its unicode escapes rewritten
 */
fun String.doubleQuote(): String =
    StringUtils.escapeJavaString(this, "").replace(Regex("""\\u([0-9a-f]{4})""")) { matchResult: MatchResult ->
        // groupValues[1] is the four hex digits captured by the pattern above
        "\\u{" + matchResult.groupValues[1] + "}"
    }

/**
 * Shorthand for [doubleQuote]: double quotes a string, e.g. "abc" -> "\"abc\""
 */
fun String.dq(): String = doubleQuote()

/**
 * Splits this identifier into lowercase words.
 *
 * Any character that is not a letter or digit acts as a separator and is dropped. Further boundaries are
 * inserted at lower/digit-to-upper transitions (`Abc[D]ef`), at digit-to-lower transitions when the whole
 * input is lowercase (`s3[k]ey`), and at the end of an acronym (`DB[P]roxy`). The first two rules are
 * suppressed while one of the known complete words (for example `ipv4` or `mib`) is still being spelled.
 *
 * @return the words in order of appearance, each lowercased; empty when the receiver contains no letter or digit
 */
private fun String.splitOnWordBoundaries(): List<String> {
    val out = mutableListOf<String>()
    // These are whole words but cased differently, e.g. `IPv4`, `MiB`, `GiB`, `TtL`
    val completeWords = setOf("ipv4", "ipv6", "sigv4", "mib", "gib", "ttl")
    var currentWord = ""

    // Emit the current word (if any) and restart from `next`; separators are not carried over.
    fun emit(next: Char) {
        if (currentWord.isNotEmpty()) {
            out += currentWord.lowercase()
        }
        currentWord = if (next.isLetterOrDigit()) {
            next.toString()
        } else {
            ""
        }
    }

    val allLowerCase = this.lowercase() == this
    this.forEachIndexed { index, nextCharacter ->
        val peek = this.getOrNull(index + 1)
        val doublePeek = this.getOrNull(index + 2)
        // The word built so far plus everything not yet consumed, lowercased once per character
        // rather than once per candidate word.
        val upcoming = (currentWord + this.substring(index)).lowercase()
        // True while a known complete word is being spelled but has not been fully consumed yet.
        val completeWordInProgress =
            completeWords.any { upcoming.startsWith(it) } && currentWord.lowercase() !in completeWords
        when {
            // [C] in these docs indicates the value of nextCharacter
            // A[_]B
            !nextCharacter.isLetterOrDigit() -> emit(nextCharacter)

            // If we have no letters so far, push the next letter (we already know it's a letter or digit)
            currentWord.isEmpty() -> currentWord += nextCharacter.toString()

            // Abc[D]ef or Ab2[D]ef
            !completeWordInProgress && loweredFollowedByUpper(currentWord, nextCharacter) -> emit(nextCharacter)

            // s3[k]ey
            !completeWordInProgress && allLowerCase && digitFollowedByLower(currentWord, nextCharacter) ->
                emit(nextCharacter)

            // DB[P]roxy, or `IAM[U]ser` but not AC[L]s
            endOfAcronym(currentWord, nextCharacter, peek, doublePeek) -> emit(nextCharacter)

            // If we haven't found a word boundary, push it and keep going
            else -> currentWord += nextCharacter.toString()
        }
    }
    // Flush the trailing word, lowercased like every word emitted above.
    if (currentWord.isNotEmpty()) {
        out += currentWord.lowercase()
    }
    return out
}

/**
 * Decides whether [nextChar] starts a new word right after an acronym, as in `DB[P]roxy` or `ARN[S]upport`.
 *
 * A boundary requires the word so far to end in an upper-case letter, [nextChar] to be upper case and
 * [peek] to be lower case. Two shapes are then excluded:
 * - a pluralised acronym such as `AC[L]s`, where the `s` is not followed by another lower-case letter
 *   (`IAM[U]ser` still splits);
 * - a versioned acronym such as `DynamoD[B]v2`, where a `v` is followed by a digit.
 *
 * @param current the word accumulated so far; must not be empty
 * @param nextChar the character under consideration
 * @param peek the character after [nextChar], or null at the end of the input
 * @param doublePeek the character after [peek], or null at the end of the input
 */
private fun endOfAcronym(current: String, nextChar: Char, peek: Char?, doublePeek: Char?): Boolean {
    val acronymInProgress = current.last().isUpperCase()
    val looksLikeNewWord = acronymInProgress && nextChar.isUpperCase() && peek?.isLowerCase() == true
    return when {
        !looksLikeNewWord -> false
        // `AR[N]s`, `AC[L]s` stay together; `IAM[U]ser` splits
        peek == 's' -> doublePeek?.isLowerCase() == true
        // `DynamoD[B]v2` stays together
        peek == 'v' -> doublePeek?.isDigit() != true
        else -> true
    }
}

/**
 * True when [nextChar] is upper case and the last character of [current] is a lower-case letter or a
 * digit, i.e. the `Abc[D]ef` / `Ab2[D]ef` boundary. [current] is only inspected when [nextChar] is upper case.
 */
private fun loweredFollowedByUpper(current: String, nextChar: Char): Boolean =
    nextChar.isUpperCase() && current.last().let { previous -> previous.isLowerCase() || previous.isDigit() }

/**
 * True when the last character of [current] is a digit and [nextChar] is a lower-case letter,
 * i.e. the `s3[k]ey` boundary. [current] must not be empty.
 */
private fun digitFollowedByLower(current: String, nextChar: Char): Boolean {
    val previous = current.last()
    return previous.isDigit() && nextChar.isLowerCase()
}

// String extensions
/**
 * Converts this identifier to snake_case by splitting it with [splitOnWordBoundaries] and joining the
 * lowercased words with `_`, e.g. `NotificationARNs` -> `notification_arns`.
 */
fun String.toSnakeCase(): String {
    // The earlier `return CaseUtils.toSnakeCase(this)` made this algorithm unreachable; it is the
    // word-boundary splitter that must produce the result.
    return this.splitOnWordBoundaries().joinToString("_") { it.lowercase() }
}

fun String.toPascalCase(): String {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ package software.amazon.smithy.rust.codegen.core.util

import io.kotest.matchers.shouldBe
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.extension.ExtensionContext
import org.junit.jupiter.api.fail
import org.junit.jupiter.params.ParameterizedTest
import org.junit.jupiter.params.provider.Arguments
import org.junit.jupiter.params.provider.ArgumentsProvider
import org.junit.jupiter.params.provider.ArgumentsSource
import java.io.File
import java.util.stream.Stream

internal class StringsTest {

Expand All @@ -18,4 +26,77 @@ internal class StringsTest {
"{\"nested\": \"{\\\"nested\\\": 5}\"}\"}"
""".trimIndent().trim()
}

// A pluralised acronym at the end of a name must stay a single word.
@Test
fun correctlyConvertToSnakeCase() {
    val converted = "NotificationARNs".toSnakeCase()
    converted shouldBe "notification_arns"
}

/**
 * Checks `toSnakeCase` against every `input,expected` line of the `/testOutput.txt` classpath resource and
 * reports all mismatches in a single failure.
 *
 * When `publishUpdate` is switched on, the freshly computed `input,actual` lines are also written to
 * `testOutput.txt` relative to the working directory.
 */
@Test
fun testAllNames() {
    // Set this to true to write a new test expectation file
    val publishUpdate = false
    // Fail with a descriptive message rather than a bare NullPointerException when the resource is absent.
    val allNames = checkNotNull(this::class.java.getResource("/testOutput.txt")) {
        "test resource /testOutput.txt was not found on the classpath"
    }.readText()
    val errors = mutableListOf<String>()
    val output = StringBuilder()
    allNames.lines().filter { it.isNotBlank() }.forEach { line ->
        val split = line.split(',')
        val input = split[0]
        val expectation = split[1]
        // Convert once and reuse the result for both the comparison and the regenerated output.
        val actual = input.toSnakeCase()
        if (actual != expectation) {
            errors += "$line => $actual (expected $expectation)"
        }
        output.appendLine("$input,$actual")
    }
    if (publishUpdate) {
        File("testOutput.txt").writeText(output.toString())
    }
    if (errors.isNotEmpty()) {
        fail(errors.joinToString("\n"))
    }
}

// Runs once per (input, expected output) pair supplied by TestCasesProvider.
@ParameterizedTest
@ArgumentsSource(TestCasesProvider::class)
fun testSnakeCase(input: String, output: String) {
    val actual = input.toSnakeCase()
    actual shouldBe output
}
}

/**
 * Argument source for parameterized snake-case tests: each entry pairs an input identifier with the
 * snake_case string it is expected to convert to.
 */
class TestCasesProvider : ArgumentsProvider {
    // input identifier -> expected snake_case form, in the order the cases are supplied
    private val expectations: List<Pair<String, String>> = listOf(
        "ACLs" to "acls",
        "ACLsUpdateStatus" to "acls_update_status",
        "AllowedAllVPCs" to "allowed_all_vpcs",
        "BluePrimaryX" to "blue_primary_x",
        "CIDRs" to "cidrs",
        "AuthTtL" to "auth_ttl",
        "CNAMEPrefix" to "cname_prefix",
        "S3Location" to "s3_location",
        "signatureS" to "signature_s",
        "signatureR" to "signature_r",
        "M3u8Settings" to "m3u8_settings",
        "IAMUser" to "iam_user",
        "OtaaV1_0_x" to "otaa_v1_0_x",
        "DynamoDBv2Action" to "dynamo_dbv2_action",
        "SessionKeyEmv2000" to "session_key_emv2000",
        "SupportsClassB" to "supports_class_b",
        "UnassignIpv6AddressesRequest" to "unassign_ipv6_addresses_request",
        "TotalGpuMemoryInMiB" to "total_gpu_memory_in_mib",
        "WriteIOs" to "write_ios",
        "dynamoDBv2" to "dynamo_dbv2",
        "ipv4Address" to "ipv4_address",
        "sigv4" to "sigv4",
        "s3key" to "s3_key",
        "sha256sum" to "sha256_sum",
        "Av1QvbrSettings" to "av1_qvbr_settings",
        "Av1Settings" to "av1_settings",
        "AwsElbv2LoadBalancer" to "aws_elbv2_load_balancer",
        "SigV4Authorization" to "sigv4_authorization",
        "IpV6Address" to "ipv6_address",
        "IpV6Cidr" to "ipv6_cidr",
        "IpV4Addresses" to "ipv4_addresses",
    )

    override fun provideArguments(context: ExtensionContext?): Stream<out Arguments> =
        expectations.map { (input, expected) -> Arguments.of(input, expected) }.stream()
}
Loading

0 comments on commit 7154c84

Please sign in to comment.