From 60e9eba2631bee72ec70fbd2c316fddcf911af4e Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Thu, 3 May 2018 12:02:39 -0700 Subject: [PATCH 1/3] Fix System.String over-allocation BaseSize for System.String was not set correctly. It caused unnecessary extra 8 bytes to be allocated at the end of strings that had `Length % 4 < 2` on 64-bit platforms. This change makes affected strings proportionally cheaper. For example, `new string('a', 1)` in a long-running loop is 7% faster. --- src/debug/daccess/request.cpp | 2 +- src/vm/amd64/JitHelpers_InlineGetThread.asm | 4 +- src/vm/amd64/JitHelpers_Slow.asm | 4 +- src/vm/amd64/asmconstants.h | 3 ++ src/vm/appdomain.cpp | 2 - src/vm/arm/asmconstants.h | 45 --------------------- src/vm/i386/jitinterfacex86.cpp | 7 +--- src/vm/methodtablebuilder.cpp | 4 +- src/vm/object.h | 20 +++------ src/vm/object.inl | 10 ++++- 10 files changed, 24 insertions(+), 77 deletions(-) diff --git a/src/debug/daccess/request.cpp b/src/debug/daccess/request.cpp index 377e008fd470..12864e7b0d39 100644 --- a/src/debug/daccess/request.cpp +++ b/src/debug/daccess/request.cpp @@ -1475,7 +1475,7 @@ ClrDataAccess::GetObjectStringData(CLRDATA_ADDRESS obj, unsigned int count, __ou if (count > needed) count = needed; - TADDR pszStr = TO_TADDR(obj)+offsetof(StringObject, m_Characters); + TADDR pszStr = TO_TADDR(obj)+offsetof(StringObject, m_FirstChar); hr = m_pTarget->ReadVirtual(pszStr, (PBYTE)stringData, count * sizeof(wchar_t), &needed); if (SUCCEEDED(hr)) diff --git a/src/vm/amd64/JitHelpers_InlineGetThread.asm b/src/vm/amd64/JitHelpers_InlineGetThread.asm index d9f58cc30fc7..8ebffeb38e5a 100644 --- a/src/vm/amd64/JitHelpers_InlineGetThread.asm +++ b/src/vm/amd64/JitHelpers_InlineGetThread.asm @@ -134,12 +134,10 @@ LEAF_ENTRY AllocateStringFastMP_InlineGetThread, _TEXT cmp ecx, (ASM_LARGE_OBJECT_SIZE - 256)/2 jae OversizedString - mov edx, [r9 + OFFSET__MethodTable__m_BaseSize] - ; Calculate the final size to allocate. ; We need to calculate baseSize + cnt*2, then round that up by adding 7 and anding ~7. - lea edx, [edx + ecx*2 + 7] + lea edx, [STRING_BASE_SIZE + ecx*2 + 7] and edx, -8 INLINE_GETTHREAD r11 diff --git a/src/vm/amd64/JitHelpers_Slow.asm b/src/vm/amd64/JitHelpers_Slow.asm index 0e26ae6dfd96..cd3aa14a9216 100644 --- a/src/vm/amd64/JitHelpers_Slow.asm +++ b/src/vm/amd64/JitHelpers_Slow.asm @@ -276,12 +276,10 @@ LEAF_ENTRY AllocateStringFastUP, _TEXT cmp ecx, (ASM_LARGE_OBJECT_SIZE - 256)/2 jae FramedAllocateString - mov r8d, [r11 + OFFSET__MethodTable__m_BaseSize] - ; Calculate the final size to allocate. ; We need to calculate baseSize + cnt*2, then round that up by adding 7 and anding ~7. - lea r8d, [r8d + ecx*2 + 7] + lea r8d, [STRING_BASE_SIZE + ecx*2 + 7] and r8d, -8 inc [g_global_alloc_lock] diff --git a/src/vm/amd64/asmconstants.h b/src/vm/amd64/asmconstants.h index f526e22f2b97..cadc10ac56b2 100644 --- a/src/vm/amd64/asmconstants.h +++ b/src/vm/amd64/asmconstants.h @@ -545,6 +545,9 @@ ASMCONSTANTS_C_ASSERT(ASM_LARGE_OBJECT_SIZE == LARGE_OBJECT_SIZE); ASMCONSTANTS_C_ASSERT(OFFSETOF__ArrayBase__m_NumComponents == offsetof(ArrayBase, m_NumComponents)); +#define STRING_BASE_SIZE 0x16 +ASMCONSTANTS_RUNTIME_ASSERT(STRING_BASE_SIZE == StringObject::GetBaseSize()); + #define OFFSETOF__StringObject__m_StringLength 0x8 ASMCONSTANTS_C_ASSERT(OFFSETOF__StringObject__m_StringLength == offsetof(StringObject, m_StringLength)); diff --git a/src/vm/appdomain.cpp b/src/vm/appdomain.cpp index 67b79ec19d2c..292ac13d71d0 100644 --- a/src/vm/appdomain.cpp +++ b/src/vm/appdomain.cpp @@ -2759,8 +2759,6 @@ void SystemDomain::LoadBaseSystemClasses() // Load String g_pStringClass = MscorlibBinder::LoadPrimitiveType(ELEMENT_TYPE_STRING); - _ASSERTE(g_pStringClass->GetBaseSize() == ObjSizeOf(StringObject)+sizeof(WCHAR)); - _ASSERTE(g_pStringClass->GetComponentSize() == 2); // Used by Buffer::BlockCopy g_pByteArrayMT = ClassLoader::LoadArrayTypeThrowing( diff --git a/src/vm/arm/asmconstants.h b/src/vm/arm/asmconstants.h index ab3d16b9a9f4..5af778ba3655 100644 --- a/src/vm/arm/asmconstants.h +++ b/src/vm/arm/asmconstants.h @@ -95,18 +95,6 @@ ASMCONSTANTS_C_ASSERT(MethodTableWriteableData__m_dwFlags == offsetof(MethodTabl #define MethodTableWriteableData__enum_flag_Unrestored 0x04 ASMCONSTANTS_C_ASSERT(MethodTableWriteableData__enum_flag_Unrestored == MethodTableWriteableData::enum_flag_Unrestored); -#define StringObject__m_StringLength 0x04 -ASMCONSTANTS_C_ASSERT(StringObject__m_StringLength == offsetof(StringObject, m_StringLength)); - -#define SIZEOF__BaseStringObject 0xe -ASMCONSTANTS_C_ASSERT(SIZEOF__BaseStringObject == (ObjSizeOf(StringObject) + sizeof(WCHAR))); - -#define SIZEOF__ArrayOfObjectRef 0xc -ASMCONSTANTS_C_ASSERT(SIZEOF__ArrayOfObjectRef == ObjSizeOf(ArrayBase)); - -#define SIZEOF__ArrayOfValueType 0xc -ASMCONSTANTS_C_ASSERT(SIZEOF__ArrayOfValueType == ObjSizeOf(ArrayBase)); - #define ArrayBase__m_NumComponents 0x4 ASMCONSTANTS_C_ASSERT(ArrayBase__m_NumComponents == offsetof(ArrayBase, m_NumComponents)); @@ -116,33 +104,8 @@ ASMCONSTANTS_C_ASSERT(ArrayTypeDesc__m_Arg == offsetof(ArrayTypeDesc, m_Arg)); #define PtrArray__m_Array 0x8 ASMCONSTANTS_C_ASSERT(PtrArray__m_Array == offsetof(PtrArray, m_Array)); -#define SYSTEM_INFO__dwNumberOfProcessors 0x14 -ASMCONSTANTS_C_ASSERT(SYSTEM_INFO__dwNumberOfProcessors == offsetof(SYSTEM_INFO, dwNumberOfProcessors)); - #define TypeHandle_CanCast 0x1 // TypeHandle::CanCast -// Maximum number of characters to be allocated for a string in AllocateStringFast*. Chosen so that we'll -// never have to check for overflow and will never try to allocate a string on regular heap that should have -// gone on the large object heap. Additionally the constant has been chosen such that it can be encoded in a -// single Thumb2 CMP instruction. -#define MAX_FAST_ALLOCATE_STRING_SIZE 42240 -ASMCONSTANTS_C_ASSERT(MAX_FAST_ALLOCATE_STRING_SIZE < ((LARGE_OBJECT_SIZE - SIZEOF__BaseStringObject) / 2)); - - -// Array of objectRef of this Maximum number of elements can be allocated in JIT_NewArr1OBJ_MP*. Chosen so that we'll -// never have to check for overflow and will never try to allocate the array on regular heap that should have -// gone on the large object heap. Additionally the constant has been chosen such that it can be encoded in a -// single Thumb2 CMP instruction. -#define MAX_FAST_ALLOCATE_ARRAY_OBJECTREF_SIZE 21120 -ASMCONSTANTS_C_ASSERT(MAX_FAST_ALLOCATE_ARRAY_OBJECTREF_SIZE < ((LARGE_OBJECT_SIZE - SIZEOF__ArrayOfObjectRef) / sizeof(void*))); - -// Array of valueClass of this Maximum number of characters can be allocated JIT_NewArr1VC_MP*. Chosen so that we'll -// never have to check for overflow and will never try to allocate the array on regular heap that should have -// gone on the large object heap. Additionally the constant has been chosen such that it can be encoded in a -// single Thumb2 CMP instruction. -#define MAX_FAST_ALLOCATE_ARRAY_VC_SIZE 65280 -ASMCONSTANTS_C_ASSERT(MAX_FAST_ALLOCATE_ARRAY_VC_SIZE < ((4294967296 - 1 - SIZEOF__ArrayOfValueType) / 65536)); - #define SIZEOF__GSCookie 0x4 ASMCONSTANTS_C_ASSERT(SIZEOF__GSCookie == sizeof(GSCookie)); @@ -203,14 +166,6 @@ ASMCONSTANTS_C_ASSERT(UnmanagedToManagedFrame__m_pvDatum == offsetof(UnmanagedTo #endif // FEATURE_COMINTEROP -#ifndef CROSSGEN_COMPILE -#define Thread__m_alloc_context__alloc_limit 0x44 -ASMCONSTANTS_C_ASSERT(Thread__m_alloc_context__alloc_limit == offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_limit)); - -#define Thread__m_alloc_context__alloc_ptr 0x40 -ASMCONSTANTS_C_ASSERT(Thread__m_alloc_context__alloc_ptr == offsetof(Thread, m_alloc_context) + offsetof(gc_alloc_context, alloc_ptr)); -#endif // CROSSGEN_COMPILE - #define Thread__m_fPreemptiveGCDisabled 0x08 #ifndef CROSSGEN_COMPILE ASMCONSTANTS_C_ASSERT(Thread__m_fPreemptiveGCDisabled == offsetof(Thread, m_fPreemptiveGCDisabled)); diff --git a/src/vm/i386/jitinterfacex86.cpp b/src/vm/i386/jitinterfacex86.cpp index 642c41759890..cc558e85296f 100644 --- a/src/vm/i386/jitinterfacex86.cpp +++ b/src/vm/i386/jitinterfacex86.cpp @@ -1063,14 +1063,11 @@ void *JIT_TrialAlloc::GenAllocString(Flags flags) // jae noLock - seems tempting to jump to noAlloc, but we haven't taken the lock yet sl.X86EmitCondJump(noLock, X86CondCode::kJAE); - // mov edx, [ecx]MethodTable.m_BaseSize - sl.X86EmitIndexRegLoad(kEDX, kECX, offsetof(MethodTable,m_BaseSize)); - // Calculate the final size to allocate. // We need to calculate baseSize + cnt*2, then round that up by adding 3 and anding ~3. - // lea eax, [edx+eax*2+5] - sl.X86EmitOp(0x8d, kEAX, kEDX, (DATA_ALIGNMENT-1), kEAX, 2); + // lea eax, [basesize+(alignment-1)+eax*2] + sl.X86EmitOp(0x8d, kEAX, StringObject::GetBaseSize() + (DATA_ALIGNMENT-1), kEAX, 2); // and eax, ~3 sl.Emit16(0xe083); diff --git a/src/vm/methodtablebuilder.cpp b/src/vm/methodtablebuilder.cpp index 0dc226e33377..5daadddf35c7 100644 --- a/src/vm/methodtablebuilder.cpp +++ b/src/vm/methodtablebuilder.cpp @@ -9737,8 +9737,8 @@ void MethodTableBuilder::CheckForSystemTypes() { // Strings are not "normal" objects, so we need to mess with their method table a bit // so that the GC can figure out how big each string is... - DWORD baseSize = ObjSizeOf(StringObject) + sizeof(WCHAR); - pMT->SetBaseSize(baseSize); // NULL character included + DWORD baseSize = StringObject::GetBaseSize(); + pMT->SetBaseSize(baseSize); GetHalfBakedClass()->SetBaseSizePadding(baseSize - bmtFP->NumInstanceFieldBytes); diff --git a/src/vm/object.h b/src/vm/object.h index e3e6c12399c1..3dd1e26d6394 100644 --- a/src/vm/object.h +++ b/src/vm/object.h @@ -914,7 +914,7 @@ typedef PTR_StringObject STRINGREF; * Special String implementation for performance. * * m_StringLength - Length of string in number of WCHARs - * m_Characters - The string buffer + * m_FirstChar - The string buffer * */ @@ -932,9 +932,6 @@ typedef PTR_StringObject STRINGREF; #define STRING_STATE_FAST_OPS 0x80000000 #define STRING_STATE_SPECIAL_SORT 0xC0000000 -#ifdef _MSC_VER -#pragma warning(disable : 4200) // disable zero-sized array warning -#endif class StringObject : public Object { #ifdef DACCESS_COMPILE @@ -947,11 +944,7 @@ class StringObject : public Object private: DWORD m_StringLength; - WCHAR m_Characters[0]; - // GC will see a StringObject like this: - // DWORD m_StringLength - // WCHAR m_Characters[0] - // DWORD m_OptionalPadding (this is an optional field and will appear based on need) + WCHAR m_FirstChar; public: VOID SetStringLength(DWORD len) { LIMITED_METHOD_CONTRACT; _ASSERTE(len >= 0); m_StringLength = len; } @@ -961,12 +954,11 @@ class StringObject : public Object ~StringObject() {LIMITED_METHOD_CONTRACT; } public: + static DWORD GetBaseSize(); static SIZE_T GetSize(DWORD stringLength); DWORD GetStringLength() { LIMITED_METHOD_DAC_CONTRACT; return( m_StringLength );} - WCHAR* GetBuffer() { LIMITED_METHOD_CONTRACT; _ASSERTE(this != nullptr); return (WCHAR*)( dac_cast(this) + offsetof(StringObject, m_Characters) ); } - WCHAR* GetBuffer(DWORD *pdwSize) { LIMITED_METHOD_CONTRACT; _ASSERTE((this != nullptr) && pdwSize); *pdwSize = GetStringLength(); return GetBuffer(); } - WCHAR* GetBufferNullable() { LIMITED_METHOD_CONTRACT; return( (this == nullptr) ? nullptr : (WCHAR*)( dac_cast(this) + offsetof(StringObject, m_Characters) ) ); } + WCHAR* GetBuffer() { LIMITED_METHOD_CONTRACT; _ASSERTE(this != nullptr); return (WCHAR*)( dac_cast(this) + offsetof(StringObject, m_FirstChar) ); } DWORD GetHighCharState() { WRAPPER_NO_CONTRACT; @@ -992,7 +984,7 @@ class StringObject : public Object static UINT GetBufferOffset() { LIMITED_METHOD_DAC_CONTRACT; - return (UINT)(offsetof(StringObject, m_Characters)); + return (UINT)(offsetof(StringObject, m_FirstChar)); } static UINT GetStringLengthOffset() { @@ -3329,7 +3321,7 @@ class Nullable { static inline void *Value(void *src, MethodTable *nullableMT) { Nullable *nullable = (Nullable *)src; - return nullable->ValueAddr(nullableMT); + return nullable->ValueAddr(nullableMT); } private: diff --git a/src/vm/object.inl b/src/vm/object.inl index 82e55f784490..aff857c90134 100644 --- a/src/vm/object.inl +++ b/src/vm/object.inl @@ -58,12 +58,18 @@ inline SIZE_T Object::GetSize() return s; } +__forceinline /*static*/ DWORD StringObject::GetBaseSize() +{ + LIMITED_METHOD_DAC_CONTRACT; + + return ObjSizeOf(Object) + sizeof(DWORD) + sizeof(WCHAR); +} + __forceinline /*static*/ SIZE_T StringObject::GetSize(DWORD strLen) { LIMITED_METHOD_DAC_CONTRACT; - // Extra WCHAR for null terminator - return ObjSizeOf(StringObject) + sizeof(WCHAR) + strLen * sizeof(WCHAR); + return GetBaseSize() + strLen * sizeof(WCHAR); } #ifdef DACCESS_COMPILE From 6df405f07248328032f873cda37dd56e96bee54a Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Thu, 3 May 2018 16:02:46 -0700 Subject: [PATCH 2/3] Fix x86 build break --- src/vm/i386/jitinterfacex86.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/vm/i386/jitinterfacex86.cpp b/src/vm/i386/jitinterfacex86.cpp index cc558e85296f..4eccd35a7652 100644 --- a/src/vm/i386/jitinterfacex86.cpp +++ b/src/vm/i386/jitinterfacex86.cpp @@ -1067,7 +1067,9 @@ void *JIT_TrialAlloc::GenAllocString(Flags flags) // We need to calculate baseSize + cnt*2, then round that up by adding 3 and anding ~3. // lea eax, [basesize+(alignment-1)+eax*2] - sl.X86EmitOp(0x8d, kEAX, StringObject::GetBaseSize() + (DATA_ALIGNMENT-1), kEAX, 2); + sl.Emit16(0x048d); + sl.Emit8(0x45); + sl.Emit32(StringObject::GetBaseSize() + (DATA_ALIGNMENT-1)); // and eax, ~3 sl.Emit16(0xe083); From 2b65b0bb603c5fb2c35686af3e0a076961923f2e Mon Sep 17 00:00:00 2001 From: Jan Kotas Date: Fri, 4 May 2018 07:46:29 -0700 Subject: [PATCH 3/3] CR feedback --- src/vm/object.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vm/object.inl b/src/vm/object.inl index aff857c90134..53ffb89cfc73 100644 --- a/src/vm/object.inl +++ b/src/vm/object.inl @@ -62,7 +62,7 @@ __forceinline /*static*/ DWORD StringObject::GetBaseSize() { LIMITED_METHOD_DAC_CONTRACT; - return ObjSizeOf(Object) + sizeof(DWORD) + sizeof(WCHAR); + return ObjSizeOf(Object) + sizeof(DWORD) /* length */ + sizeof(WCHAR) /* null terminator */; } __forceinline /*static*/ SIZE_T StringObject::GetSize(DWORD strLen)