Skip to content

Commit

Permalink
Use new function for character case sensitivity check, issue #249.
Browse files (browse the repository at this point in the history)
  • Loading branch information
zufuliu committed Nov 9, 2020
1 parent 2720b4a commit d5287a3
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 59 deletions.
58 changes: 27 additions & 31 deletions scintilla/scripts/GenerateCaseConvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,18 +424,23 @@ def getBitCount(value):
caseTable[ch] = '1'
maskTable[ch >> 5] |= (1 << (ch & 31))

blockSize = 4
blockSizeBit = 2
blockSize = 1 << blockSizeBit
firstCount = first >> 5
maskCount = 1 + (maxCh >> 5)
maskCount = blockSize * ((maskCount + blockSize - 1) // blockSize)
maskList = maskTable[:firstCount]

blockIndexValueBit = 7
blockIndexCount = 1 << blockIndexValueBit
blockList = []
blockData = [(0, 0)] * 256
blockIndex = [0] * 256
maxBlockId = (maskCount // blockSize - 1) >> 8
blockData = [(0, 0)] * blockIndexCount
blockIndex = [0] * blockIndexCount
maxBlockId = (maskCount // blockSize - 1) >> blockIndexValueBit
blockBitCount = getBitCount(maxBlockId)
indexBitCount = 8 - blockBitCount
maxIndex = 1 << indexBitCount
overlapped = False

for i in range(firstCount, maskCount, blockSize):
block = tuple(maskTable[i:i+blockSize])
Expand All @@ -449,20 +454,24 @@ def getBitCount(value):

index += 1
blockId = i // blockSize
blockSlot = blockId & 0xff
blockSlot = blockId & (blockIndexCount - 1)
if blockData[blockSlot][1]:
print('multi block', blockId, blockSlot, blockData[blockSlot], index)
if index > maxIndex:
overlapped = True
print('overlapped block', blockId, blockSlot, index)

blockId = blockId >> 8
blockId = blockId >> blockIndexValueBit
blockData[blockSlot] = (blockId, index)
assert getBitCount(blockId) + getBitCount(index) <= 8
blockIndex[blockSlot] = index | (blockId << indexBitCount)

#lines = []
#for i in range(0, len(blockData), 8):
# line = ', '.join('(%d,%2d)' % item for item in blockData[i:i+8])
# lines.append(line)
#print('\n'.join(lines))
if overlapped:
return

indexTable = []
for block in blockList:
Expand Down Expand Up @@ -502,33 +511,19 @@ def getBitCount(value):
output.append('};')

indexMask = (1 << indexBitCount) - 1
indexOffset = 256 - blockSize
if False and blockSize == 8:
function = f"""
// case sensitivity for ch in [kUnicodeCaseSensitiveFirst, kUnicodeCaseSensitiveMax)
static inline BOOL IsCharacterCaseSensitiveSecond(uint32_t ch) {{
const uint32_t block = ch / {blockSize*32};
uint32_t index = UnicodeCaseSensitivityIndex[block & 0xff];
index &= ((index >> {indexBitCount}) ^ (block >> 8)) - 1;
if (index) {{
ch = ch & {hex(blockSize*32 - 1)};
index = {indexOffset} + (index & {hex(indexMask)})*{blockSize};
index = UnicodeCaseSensitivityIndex[index + (ch >> 5)];
return (UnicodeCaseSensitivityMask[index] >> (ch & 31)) & 1;
}}
return 0;
}}
"""
else:
function = f"""
indexOffset = blockIndexCount - blockSize
#index &= ((index >> {indexBitCount}) ^ (block >> {blockIndexValueBit})) ? 0 : {hex(indexMask)};
#index &= ((index ^ (block >> {blockIndexValueBit - indexBitCount})) >> {indexBitCount}) ? 0 : {hex(indexMask)};
#index &= {hex(indexMask)}ULL >> (((index ^ (block >> {blockIndexValueBit - indexBitCount})) >> {indexBitCount}) * {indexBitCount});
function = f"""
// case sensitivity for ch in [kUnicodeCaseSensitiveFirst, kUnicodeCaseSensitiveMax)
static inline BOOL IsCharacterCaseSensitiveSecond(uint32_t ch) {{
const uint32_t block = ch / {blockSize*32};
uint32_t index = UnicodeCaseSensitivityIndex[block & 0xff];
index &= ((index >> {indexBitCount}) == (block >> 8))? {hex(indexMask)} : 0;
const uint32_t block = ch >> {blockSizeBit + 5};
uint32_t index = UnicodeCaseSensitivityIndex[block & {hex(blockIndexCount - 1)}];
index &= 0 - (({maxBlockId + 1} - ((index ^ (block >> {blockIndexValueBit - indexBitCount})) >> {indexBitCount})) >> {blockBitCount});
if (index) {{
ch = ch & {hex(blockSize*32 - 1)};
index = {indexOffset} + index*{blockSize};
index = {indexOffset} + ((index & {hex(indexMask)}) << {blockSizeBit});
index = UnicodeCaseSensitivityIndex[index + (ch >> 5)];
return (UnicodeCaseSensitivityMask[index] >> (ch & 31)) & 1;
}}
Expand Down Expand Up @@ -565,5 +560,6 @@ def getBitCount(value):
updateCaseConvert()
#checkUnicodeCaseSensitivity('caseList.cpp')
#updateCaseSensitivity('CaseSensitivity.cpp', True)
updateCaseSensitivity('../../src/EditEncoding.c')
#updateCaseSensitivity('../../src/EditEncoding.c')
#updateCaseSensitivityBlock('caseBlock.cpp', True)
updateCaseSensitivityBlock('../../src/EditEncoding.c')
49 changes: 21 additions & 28 deletions src/EditEncoding.c
Original file line number Diff line number Diff line change
Expand Up @@ -1929,28 +1929,17 @@ INT UTF8_mbslen(LPCSTR source, INT byte_length) {
#define kUnicodeCaseSensitiveFirst 0x0600U
#define kUnicodeCaseSensitiveMax 0x1e943U

static const unsigned char UnicodeCaseSensitivityIndex[] = {
// UnicodeCaseSensitivityIndex1
0, 16, 32, 48, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 96, 112, 128, 0, 144, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 160, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176,
// UnicodeCaseSensitivityIndex2
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 16, 0, 0, 0, 0, 0, 0,
0, 0, 24, 32, 40, 48, 0, 56, 0, 0, 64, 0, 0, 0, 0, 0, 0, 0, 72, 80,
88, 96, 0, 0, 0, 104, 0, 0, 0, 0, 0, 0, 0, 112, 0, 0, 0, 120, 0, 0,
0, 0, 128, 0, 0, 0, 0, 0, 0, 0, 136, 0, 0, 0, 144, 0, 0, 0, 0, 0,
152, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 160,
// UnicodeCaseSensitivityIndex
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 48, 49, 0, 0, 0, 0,
0, 8, 8, 50, 0, 0, 0, 0, 51, 49, 0, 0, 0, 0, 0, 52, 53, 0, 0, 0,
8, 8, 8, 8, 54, 8, 8, 8, 50, 8, 55, 56, 8, 57, 58, 59, 0, 60, 53, 8,
61, 0, 0, 0, 0, 0, 0, 0, 0, 62, 8, 63, 8, 64, 65, 66, 8, 8, 8, 67,
8, 68, 0, 0, 0, 0, 0, 0, 0, 0, 8, 69, 70, 0, 0, 0, 0, 71, 8, 72,
73, 64, 74, 75, 0, 0, 76, 77, 8, 8, 0, 0, 78, 0, 0, 0, 0, 0, 0, 0,
0, 2, 2, 0, 0, 0, 0, 0, 8, 8, 79, 0, 0, 77, 80, 70, 0, 0, 0, 0,
8, 81, 8, 81, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 8, 8, 0, 0, 0, 0,
8, 8, 82,
static const uint8_t UnicodeCaseSensitivityIndex[] = {
// block index
0, 0, 0, 0, 0, 0, 0, 0, 152, 153, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 154, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 155, 0, 0, 0, 0, 0, 0, 0, 3, 4, 5, 6, 7, 8, 9,
0, 0, 10, 11, 0, 0, 0, 0, 0, 12, 0, 0, 80, 81, 82, 83, 0, 0, 253, 0, 0, 0, 84, 85, 13, 14, 15, 0, 188, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 118, 0, 0, 0, 0, 0, 0, 0, 119, 0,
// mask index
0, 8, 48, 49, 0, 8, 8, 50, 51, 49, 0, 0, 0, 0, 0, 52, 53, 0, 0, 0, 8, 8, 8, 8, 54, 8, 8, 8, 50, 8, 55, 56,
8, 57, 58, 59, 0, 60, 53, 8, 61, 0, 0, 0, 0, 62, 8, 63, 8, 64, 65, 66, 8, 8, 8, 67, 8, 68, 0, 0, 0, 0, 8, 69,
70, 0, 0, 0, 0, 71, 8, 72, 73, 64, 74, 75, 0, 0, 76, 77, 8, 8, 0, 0, 78, 0, 0, 0, 0, 2, 2, 0, 8, 8, 79, 0,
0, 77, 80, 70, 8, 81, 8, 81, 0, 8, 8, 0, 0, 0, 8, 8, 8, 8, 82, 0,
};

static const uint32_t UnicodeCaseSensitivityMask[] = {
Expand All @@ -1969,12 +1958,16 @@ static const uint32_t UnicodeCaseSensitivityMask[] = {

// case sensitivity for ch in [kUnicodeCaseSensitiveFirst, kUnicodeCaseSensitiveMax)
static inline BOOL IsCharacterCaseSensitiveSecond(uint32_t ch) {
const uint32_t lower = ch & 31;
ch = (ch - kUnicodeCaseSensitiveFirst) >> 5;
ch = (UnicodeCaseSensitivityIndex[ch >> 6] << 2) | (ch & 63);
ch = (UnicodeCaseSensitivityIndex[61 + (ch >> 3)] << 0) | (ch & 7);
ch = UnicodeCaseSensitivityIndex[153 + ch];
return (UnicodeCaseSensitivityMask[ch] >> lower) & 1;
const uint32_t block = ch >> 7;
uint32_t index = UnicodeCaseSensitivityIndex[block & 0x7f];
index &= 0 - ((8 - ((index ^ (block >> 2)) >> 5)) >> 3);
if (index) {
ch = ch & 0x7f;
index = 124 + ((index & 0x1f) << 2);
index = UnicodeCaseSensitivityIndex[index + (ch >> 5)];
return (UnicodeCaseSensitivityMask[index] >> (ch & 31)) & 1;
}
return 0;
}
//case--Autogenerated -- end of section automatically generated

Expand Down

0 comments on commit d5287a3

Please sign in to comment.