diff --git a/scintilla/scripts/GenerateCaseConvert.py b/scintilla/scripts/GenerateCaseConvert.py index 5af4b1da8f..9c41b79284 100644 --- a/scintilla/scripts/GenerateCaseConvert.py +++ b/scintilla/scripts/GenerateCaseConvert.py @@ -424,18 +424,23 @@ def getBitCount(value): caseTable[ch] = '1' maskTable[ch >> 5] |= (1 << (ch & 31)) - blockSize = 4 + blockSizeBit = 2 + blockSize = 1 << blockSizeBit firstCount = first >> 5 maskCount = 1 + (maxCh >> 5) maskCount = blockSize * ((maskCount + blockSize - 1) // blockSize) maskList = maskTable[:firstCount] + blockIndexValueBit = 7 + blockIndexCount = 1 << blockIndexValueBit blockList = [] - blockData = [(0, 0)] * 256 - blockIndex = [0] * 256 - maxBlockId = (maskCount // blockSize - 1) >> 8 + blockData = [(0, 0)] * blockIndexCount + blockIndex = [0] * blockIndexCount + maxBlockId = (maskCount // blockSize - 1) >> blockIndexValueBit blockBitCount = getBitCount(maxBlockId) indexBitCount = 8 - blockBitCount + maxIndex = 1 << indexBitCount + overlapped = False for i in range(firstCount, maskCount, blockSize): block = tuple(maskTable[i:i+blockSize]) @@ -449,13 +454,15 @@ def getBitCount(value): index += 1 blockId = i // blockSize - blockSlot = blockId & 0xff + blockSlot = blockId & (blockIndexCount - 1) if blockData[blockSlot][1]: print('multi block', blockId, blockSlot, blockData[blockSlot], index) + if index > maxIndex: + overlapped = True + print('overlapped block', blockId, blockSlot, index) - blockId = blockId >> 8 + blockId = blockId >> blockIndexValueBit blockData[blockSlot] = (blockId, index) - assert getBitCount(blockId) + getBitCount(index) <= 8 blockIndex[blockSlot] = index | (blockId << indexBitCount) #lines = [] @@ -463,6 +470,8 @@ def getBitCount(value): # line = ', '.join('(%d,%2d)' % item for item in blockData[i:i+8]) # lines.append(line) #print('\n'.join(lines)) + if overlapped: + return indexTable = [] for block in blockList: @@ -502,33 +511,19 @@ def getBitCount(value): output.append('};') indexMask = (1 << indexBitCount) - 1 - indexOffset = 256 - blockSize - if False and blockSize == 8: - function = f""" -// case sensitivity for ch in [kUnicodeCaseSensitiveFirst, kUnicodeCaseSensitiveMax) -static inline BOOL IsCharacterCaseSensitiveSecond(uint32_t ch) {{ - const uint32_t block = ch / {blockSize*32}; - uint32_t index = UnicodeCaseSensitivityIndex[block & 0xff]; - index &= ((index >> {indexBitCount}) ^ (block >> 8)) - 1; - if (index) {{ - ch = ch & {hex(blockSize*32 - 1)}; - index = {indexOffset} + (index & {hex(indexMask)})*{blockSize}; - index = UnicodeCaseSensitivityIndex[index + (ch >> 5)]; - return (UnicodeCaseSensitivityMask[index] >> (ch & 31)) & 1; - }} - return 0; -}} -""" - else: - function = f""" + indexOffset = blockIndexCount - blockSize + #index &= ((index >> {indexBitCount}) ^ (block >> {blockIndexValueBit})) ? 0 : {hex(indexMask)}; + #index &= ((index ^ (block >> {blockIndexValueBit - indexBitCount})) >> {indexBitCount}) ? 0 : {hex(indexMask)}; + #index &= {hex(indexMask)}ULL >> (((index ^ (block >> {blockIndexValueBit - indexBitCount})) >> {indexBitCount}) * {indexBitCount}); + function = f""" // case sensitivity for ch in [kUnicodeCaseSensitiveFirst, kUnicodeCaseSensitiveMax) static inline BOOL IsCharacterCaseSensitiveSecond(uint32_t ch) {{ - const uint32_t block = ch / {blockSize*32}; - uint32_t index = UnicodeCaseSensitivityIndex[block & 0xff]; - index &= ((index >> {indexBitCount}) == (block >> 8))? {hex(indexMask)} : 0; + const uint32_t block = ch >> {blockSizeBit + 5}; + uint32_t index = UnicodeCaseSensitivityIndex[block & {hex(blockIndexCount - 1)}]; + index &= 0 - (({maxBlockId + 1} - ((index ^ (block >> {blockIndexValueBit - indexBitCount})) >> {indexBitCount})) >> {blockBitCount}); if (index) {{ ch = ch & {hex(blockSize*32 - 1)}; - index = {indexOffset} + index*{blockSize}; + index = {indexOffset} + ((index & {hex(indexMask)}) << {blockSizeBit}); index = UnicodeCaseSensitivityIndex[index + (ch >> 5)]; return (UnicodeCaseSensitivityMask[index] >> (ch & 31)) & 1; }} @@ -565,5 +560,6 @@ def getBitCount(value): updateCaseConvert() #checkUnicodeCaseSensitivity('caseList.cpp') #updateCaseSensitivity('CaseSensitivity.cpp', True) -updateCaseSensitivity('../../src/EditEncoding.c') +#updateCaseSensitivity('../../src/EditEncoding.c') #updateCaseSensitivityBlock('caseBlock.cpp', True) +updateCaseSensitivityBlock('../../src/EditEncoding.c') diff --git a/src/EditEncoding.c b/src/EditEncoding.c index 810303784a..9e7a6cfaae 100644 --- a/src/EditEncoding.c +++ b/src/EditEncoding.c @@ -1929,28 +1929,17 @@ INT UTF8_mbslen(LPCSTR source, INT byte_length) { #define kUnicodeCaseSensitiveFirst 0x0600U #define kUnicodeCaseSensitiveMax 0x1e943U -static const unsigned char UnicodeCaseSensitivityIndex[] = { -// UnicodeCaseSensitivityIndex1 -0, 16, 32, 48, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 112, 128, 0, 144, 0, 0, 0, 0, 0, -0, 0, 0, 0, 0, 160, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -176, -// UnicodeCaseSensitivityIndex2 -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 16, 0, 0, 0, 0, 0, 0, -0, 0, 24, 32, 40, 48, 0, 56, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 72, 80, -88, 96, 0, 0, 0, 104, 0, 0, 0, 0, 0, 0, 0, 112, 0, 0, 0, 120, 0, 0, -0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 144, 0, 0, 0, 0, 0, -152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 160, -// UnicodeCaseSensitivityIndex -0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 48, 49, 0, 0, 0, 0, -0, 8, 8, 50, 0, 0, 0, 0, 51, 49, 0, 0, 0, 0, 0, 52, 53, 0, 0, 0, -8, 8, 8, 8, 54, 8, 8, 8, 50, 8, 55, 56, 8, 57, 58, 59, 0, 60, 53, 8, -61, 0, 0, 0, 0, 0, 0, 0, 0, 62, 8, 63, 8, 64, 65, 66, 8, 8, 8, 67, -8, 68, 0, 0, 0, 0, 0, 0, 0, 0, 8, 69, 70, 0, 0, 0, 0, 71, 8, 72, -73, 64, 74, 75, 0, 0, 76, 77, 8, 8, 0, 0, 78, 0, 0, 0, 0, 0, 0, 0, -0, 2, 2, 0, 0, 0, 0, 0, 8, 8, 79, 0, 0, 77, 80, 70, 0, 0, 0, 0, -8, 81, 8, 81, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 8, 8, 0, 0, 0, 0, -8, 8, 82, +static const uint8_t UnicodeCaseSensitivityIndex[] = { +// block index +0, 0, 0, 0, 0, 0, 0, 0, 152, 153, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 154, 0, 0, 0, 0, 0, 0, +0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 155, 0, 0, 0, 0, 0, 0, 0, 3, 4, 5, 6, 7, 8, 9, +0, 0, 10, 11, 0, 0, 0, 0, 0, 12, 0, 0, 80, 81, 82, 83, 0, 0, 253, 0, 0, 0, 84, 85, 13, 14, 15, 0, 188, 0, 0, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118, 0, 0, 0, 0, 0, 0, 0, 119, 0, +// mask index +0, 8, 48, 49, 0, 8, 8, 50, 51, 49, 0, 0, 0, 0, 0, 52, 53, 0, 0, 0, 8, 8, 8, 8, 54, 8, 8, 8, 50, 8, 55, 56, +8, 57, 58, 59, 0, 60, 53, 8, 61, 0, 0, 0, 0, 62, 8, 63, 8, 64, 65, 66, 8, 8, 8, 67, 8, 68, 0, 0, 0, 0, 8, 69, +70, 0, 0, 0, 0, 71, 8, 72, 73, 64, 74, 75, 0, 0, 76, 77, 8, 8, 0, 0, 78, 0, 0, 0, 0, 2, 2, 0, 8, 8, 79, 0, +0, 77, 80, 70, 8, 81, 8, 81, 0, 8, 8, 0, 0, 0, 8, 8, 8, 8, 82, 0, }; static const uint32_t UnicodeCaseSensitivityMask[] = { @@ -1969,12 +1958,16 @@ static const uint32_t UnicodeCaseSensitivityMask[] = { // case sensitivity for ch in [kUnicodeCaseSensitiveFirst, kUnicodeCaseSensitiveMax) static inline BOOL IsCharacterCaseSensitiveSecond(uint32_t ch) { - const uint32_t lower = ch & 31; - ch = (ch - kUnicodeCaseSensitiveFirst) >> 5; - ch = (UnicodeCaseSensitivityIndex[ch >> 6] << 2) | (ch & 63); - ch = (UnicodeCaseSensitivityIndex[61 + (ch >> 3)] << 0) | (ch & 7); - ch = UnicodeCaseSensitivityIndex[153 + ch]; - return (UnicodeCaseSensitivityMask[ch] >> lower) & 1; + const uint32_t block = ch >> 7; + uint32_t index = UnicodeCaseSensitivityIndex[block & 0x7f]; + index &= 0 - ((8 - ((index ^ (block >> 2)) >> 5)) >> 3); + if (index) { + ch = ch & 0x7f; + index = 124 + ((index & 0x1f) << 2); + index = UnicodeCaseSensitivityIndex[index + (ch >> 5)]; + return (UnicodeCaseSensitivityMask[index] >> (ch & 31)) & 1; + } + return 0; } //case--Autogenerated -- end of section automatically generated