From 0ef3a0e70965b3a686364a753d4832c0978dc241 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 27 Jul 2016 14:00:40 -0400 Subject: [PATCH] fix Julia 0.5 compatibility (mainly due to Char/Integer comparisons no longer being allowed) --- src/TinySegmenter.jl | 85 ++++++++++++++++++++++++-------------------- test/runtests.jl | 14 ++++---- 2 files changed, 53 insertions(+), 46 deletions(-) diff --git a/src/TinySegmenter.jl b/src/TinySegmenter.jl index 7fcb27a..04f1f1b 100644 --- a/src/TinySegmenter.jl +++ b/src/TinySegmenter.jl @@ -3,12 +3,19 @@ module TinySegmenter export tokenize -typealias US UTF8String - -macro t_str(s) +# make a tuple of Char from a string +macro c_str(s) tuple(s...) end +# make a tuple of UInt8s from an ASCII string +macro i_str(s) + tuple(UInt8[UInt8(c) for c in s]...) +end + +# make a Dict{UInt8,Int} from Char=>Int pairs +dict_c2i(p::Pair{Char,Int}...) = Dict{UInt8,Int}(map(p -> Pair(UInt8(p[1]),p[2]), p)) + # Use out of range of Unicode code point. See also: https://en.wikipedia.org/wiki/Code_point const B1 = Char(0x110001) const B2 = Char(0x110002) @@ -18,42 +25,42 @@ const E2 = Char(0x110005) const BIAS = -332 -const BC1 = Dict{Tuple{UInt8,UInt8},Int}(t"HH" => 6, t"II" => 2461, t"KH" => 406, t"OH" => -1378) -const BC2 = Dict{Tuple{UInt8,UInt8},Int}(t"AA" => -3267, t"AI" => 2744, t"AN" => -878, t"HH" => -4070, t"HM" => -1711, t"HN" => 4012, t"HO" => 3761, t"IA" => 1327, t"IH" => -1184, t"II" => -1332, t"IK" => 1721, t"IO" => 5492, t"KI" => 3831, t"KK" => -8741, t"MH" => -3132, t"MK" => 3334, t"OO" => -2920) -const BC3 = Dict{Tuple{UInt8,UInt8},Int}(t"HH" => 996, t"HI" => 626, t"HK" => -721, t"HN" => -1307, t"HO" => -836, t"IH" => -301, t"KK" => 2762, t"MK" => 1079, t"MM" => 4034, t"OA" => -1652, t"OH" => 266) -const BP1 = Dict{Tuple{UInt8,UInt8},Int}(t"BB" => 295, t"OB" => 304, t"OO" => -125, t"UB" => 352) -const BP2 = Dict{Tuple{UInt8,UInt8},Int}(t"BO" => 60, t"OO" => -1762) -const BQ1 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"BHH" => 1150, t"BHM" => 1521, t"BII" => -1158, t"BIM" => 886, t"BMH" => 1208, t"BNH" => 449, t"BOH" => -91, t"BOO" => -2597, t"OHI" => 451, t"OIH" => -296, t"OKA" => 1851, t"OKH" => -1020, t"OKK" => 904, t"OOO" => 2965) -const BQ2 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"BHH" => 118, t"BHI" => -1159, t"BHM" => 466, t"BIH" => -919, t"BKK" => -1720, t"BKO" => 864, t"OHH" => -1139, t"OHM" => -181, t"OIH" => 153, t"UHI" => -1146) -const BQ3 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"BHH" => -792, t"BHI" => 2664, t"BII" => -299, t"BKI" => 419, t"BMH" => 937, t"BMM" => 8335, t"BNN" => 998, t"BOH" => 775, t"OHH" => 2174, t"OHM" => 439, t"OII" => 280, t"OKH" => 1798, t"OKI" => -793, t"OKO" => -2242, t"OMH" => -2402, t"OOO" => 11699) -const BQ4 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"BHH" => -3895, t"BIH" => 3761, t"BII" => -4654, t"BIK" => 1348, t"BKK" => -1806, t"BMI" => -3385, t"BOO" => -12396, t"OAH" => 926, t"OHH" => 266, t"OHK" => -2036, t"ONN" => -973) -const BW1 = Dict{Tuple{Char,Char},Int}(t",と" => 660, t",同" => 727, (B1,'あ') => 1404, (B1,'同') => 542, t"、と" => 660, t"、同" => 727, t"」と" => 1682, t"あっ" => 1505, t"いう" => 1743, t"いっ" => -2055, t"いる" => 672, t"うし" => -4817, t"うん" => 665, t"から" => 3472, t"がら" => 600, t"こう" => -790, t"こと" => 2083, t"こん" => -1262, t"さら" => -4143, t"さん" => 4573, t"した" => 2641, t"して" => 1104, t"すで" => -3399, t"そこ" => 1977, t"それ" => -871, t"たち" => 1122, t"ため" => 601, t"った" => 3463, t"つい" => -802, t"てい" => 805, t"てき" => 1249, t"でき" => 1127, t"です" => 3445, t"では" => 844, t"とい" => -4915, t"とみ" => 1922, t"どこ" => 3887, t"ない" => 5713, t"なっ" => 3015, t"など" => 7379, t"なん" => -1113, t"にし" => 2468, t"には" => 1498, t"にも" => 1671, t"に対" => -912, t"の一" => -501, t"の中" => 741, t"ませ" => 2448, t"まで" => 1711, t"まま" => 2600, t"まる" => -2155, t"やむ" => -1947, t"よっ" => -2565, t"れた" => 2369, t"れで" => -913, t"をし" => 1860, t"を見" => 731, t"亡く" => -1886, t"京都" => 2558, t"取り" => -2784, t"大き" => -2604, t"大阪" => 1497, t"平方" => -2314, t"引き" => -1336, t"日本" => -195, t"本当" => -2423, t"毎日" => -2113, t"目指" => -724, (B1,'あ') => 1404, (B1,'同') => 542, t"」と" => 1682) -const BW2 = Dict{Tuple{Char,Char},Int}(t".." => -11822, t"11" => -669, t"――" => -5730, t"−−" => -13175, t"いう" => -1609, t"うか" => 2490, t"かし" => -1350, t"かも" => -602, t"から" => -7194, t"かれ" => 4612, t"がい" => 853, t"がら" => -3198, t"きた" => 1941, t"くな" => -1597, t"こと" => -8392, t"この" => -4193, t"させ" => 4533, t"され" => 13168, t"さん" => -3977, t"しい" => -1819, t"しか" => -545, t"した" => 5078, t"して" => 972, t"しな" => 939, t"その" => -3744, t"たい" => -1253, t"たた" => -662, t"ただ" => -3857, t"たち" => -786, t"たと" => 1224, t"たは" => -939, t"った" => 4589, t"って" => 1647, t"っと" => -2094, t"てい" => 6144, t"てき" => 3640, t"てく" => 2551, t"ては" => -3110, t"ても" => -3065, t"でい" => 2666, t"でき" => -1528, t"でし" => -3828, t"です" => -4761, t"でも" => -4203, t"とい" => 1890, t"とこ" => -1746, t"とと" => -2279, t"との" => 720, t"とみ" => 5168, t"とも" => -3941, t"ない" => -2488, t"なが" => -1313, t"など" => -6509, t"なの" => 2614, t"なん" => 3099, t"にお" => -1615, t"にし" => 2748, t"にな" => 2454, t"によ" => -7236, t"に対" => -14943, t"に従" => -4688, t"に関" => -11388, t"のか" => 2093, t"ので" => -7059, t"のに" => -6041, t"のの" => -6125, t"はい" => 1073, t"はが" => -1033, t"はず" => -2532, t"ばれ" => 1813, t"まし" => -1316, t"まで" => -6621, t"まれ" => 5409, t"めて" => -3153, t"もい" => 2230, t"もの" => -10713, t"らか" => -944, t"らし" => -1611, t"らに" => -1897, t"りし" => 651, t"りま" => 1620, t"れた" => 4270, t"れて" => 849, t"れば" => 4114, t"ろう" => 6067, t"われ" => 7901, t"を通" => -11877, t"んだ" => 728, t"んな" => -4115, t"一人" => 602, t"一方" => -1375, t"一日" => 970, t"一部" => -1051, t"上が" => -4479, t"会社" => -1116, t"出て" => 2163, t"分の" => -7758, t"同党" => 970, t"同日" => -913, t"大阪" => -2471, t"委員" => -1250, t"少な" => -1050, t"年度" => -8669, t"年間" => -1626, t"府県" => -2363, t"手権" => -1982, t"新聞" => -4066, t"日新" => -722, t"日本" => -7068, t"日米" => 3372, t"曜日" => -601, t"朝鮮" => -2355, t"本人" => -2697, t"東京" => -1543, t"然と" => -1384, t"社会" => -1276, t"立て" => -990, t"第に" => -1612, t"米国" => -4268, t"11" => -669, t"グ" => 1319) -const BW3 = Dict{Tuple{Char,Char},Int}(t"あた" => -2194, t"あり" => 719, t"ある" => 3846, t"い." => -1185, t"い。" => -1185, t"いい" => 5308, t"いえ" => 2079, t"いく" => 3029, t"いた" => 2056, t"いっ" => 1883, t"いる" => 5600, t"いわ" => 1527, t"うち" => 1117, t"うと" => 4798, t"えと" => 1454, t"か." => 2857, t"か。" => 2857, t"かけ" => -743, t"かっ" => -4098, t"かに" => -669, t"から" => 6520, t"かり" => -2670, t"が," => 1816, t"が、" => 1816, t"がき" => -4855, t"がけ" => -1127, t"がっ" => -913, t"がら" => -4977, t"がり" => -2064, t"きた" => 1645, t"けど" => 1374, t"こと" => 7397, t"この" => 1542, t"ころ" => -2757, t"さい" => -714, t"さを" => 976, t"し," => 1557, t"し、" => 1557, t"しい" => -3714, t"した" => 3562, t"して" => 1449, t"しな" => 2608, t"しま" => 1200, t"す." => -1310, t"す。" => -1310, t"する" => 6521, t"ず," => 3426, t"ず、" => 3426, t"ずに" => 841, t"そう" => 428, t"た." => 8875, t"た。" => 8875, t"たい" => -594, t"たの" => 812, t"たり" => -1183, t"たる" => -853, t"だ." => 4098, t"だ。" => 4098, t"だっ" => 1004, t"った" => -4748, t"って" => 300, t"てい" => 6240, t"てお" => 855, t"ても" => 302, t"です" => 1437, t"でに" => -1482, t"では" => 2295, t"とう" => -1387, t"とし" => 2266, t"との" => 541, t"とも" => -3543, t"どう" => 4664, t"ない" => 1796, t"なく" => -903, t"など" => 2135, t"に," => -1021, t"に、" => -1021, t"にし" => 1771, t"にな" => 1906, t"には" => 2644, t"の," => -724, t"の、" => -724, t"の子" => -1000, t"は," => 1337, t"は、" => 1337, t"べき" => 2181, t"まし" => 1113, t"ます" => 6943, t"まっ" => -1549, t"まで" => 6154, t"まれ" => -793, t"らし" => 1479, t"られ" => 6820, t"るる" => 3818, t"れ," => 854, t"れ、" => 854, t"れた" => 1850, t"れて" => 1375, t"れば" => -3246, t"れる" => 1091, t"われ" => -605, t"んだ" => 606, t"んで" => 798, t"カ月" => 990, t"会議" => 860, t"入り" => 1232, t"大会" => 2217, t"始め" => 1681, t"市 " => 965, t"新聞" => -5055, t"日," => 974, t"日、" => 974, t"社会" => 2024, t"カ月" => 990) -const TC1 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"AAA" => 1093, t"HHH" => 1029, t"HHM" => 580, t"HII" => 998, t"HOH" => -390, t"HOM" => -331, t"IHI" => 1169, t"IOH" => -142, t"IOI" => -1015, t"IOM" => 467, t"MMH" => 187, t"OOI" => -1832) -const TC2 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"HHO" => 2088, t"HII" => -1023, t"HMM" => -1154, t"IHI" => -1965, t"KKH" => 703, t"OII" => -2649) -const TC3 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"AAA" => -294, t"HHH" => 346, t"HHI" => -341, t"HII" => -1088, t"HIK" => 731, t"HOH" => -1486, t"IHH" => 128, t"IHI" => -3041, t"IHO" => -1935, t"IIH" => -825, t"IIM" => -1035, t"IOI" => -542, t"KHH" => -1216, t"KKA" => 491, t"KKH" => -1217, t"KOK" => -1009, t"MHH" => -2694, t"MHM" => -457, t"MHO" => 123, t"MMH" => -471, t"NNH" => -1689, t"NNO" => 662, t"OHO" => -3393) -const TC4 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(t"HHH" => -203, t"HHI" => 1344, t"HHK" => 365, t"HHM" => -122, t"HHN" => 182, t"HHO" => 669, t"HIH" => 804, t"HII" => 679, t"HOH" => 446, t"IHH" => 695, t"IHO" => -2324, t"IIH" => 321, t"III" => 1497, t"IIO" => 656, t"IOO" => 54, t"KAK" => 4845, t"KKA" => 3386, t"KKK" => 3065, t"MHH" => -405, t"MHI" => 201, t"MMH" => -241, t"MMM" => 661, t"MOM" => 841) -const TQ1 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(t"BHHH" => -227, t"BHHI" => 316, t"BHIH" => -132, t"BIHH" => 60, t"BIII" => 1595, t"BNHH" => -744, t"BOHH" => 225, t"BOOO" => -908, t"OAKK" => 482, t"OHHH" => 281, t"OHIH" => 249, t"OIHI" => 200, t"OIIH" => -68) -const TQ2 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(t"BIHH" => -1401, t"BIII" => -1033, t"BKAK" => -543, t"BOOO" => -5591) -const TQ3 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(t"BHHH" => 478, t"BHHM" => -1073, t"BHIH" => 222, t"BHII" => -504, t"BIIH" => -116, t"BIII" => -105, t"BMHI" => -863, t"BMHM" => -464, t"BOMH" => 620, t"OHHH" => 346, t"OHHI" => 1729, t"OHII" => 997, t"OHMH" => 481, t"OIHH" => 623, t"OIIH" => 1344, t"OKAK" => 2792, t"OKHH" => 587, t"OKKA" => 679, t"OOHH" => 110, t"OOII" => -685) -const TQ4 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(t"BHHH" => -721, t"BHHM" => -3604, t"BHII" => -966, t"BIIH" => -607, t"BIII" => -2181, t"OAAA" => -2763, t"OAKK" => 180, t"OHHH" => -294, t"OHHI" => 2446, t"OHHO" => 480, t"OHIH" => -1573, t"OIHH" => 1935, t"OIHI" => -493, t"OIIH" => 626, t"OIII" => -4007, t"OKAK" => -8156) -const TW1 = Dict{Tuple{Char,Char,Char},Int}(t"につい" => -4681, t"東京都" => 2026) -const TW2 = Dict{Tuple{Char,Char,Char},Int}(t"ある程" => -2049, t"いった" => -1256, t"ころが" => -2434, t"しょう" => 3873, t"その後" => -4430, t"だって" => -1049, t"ていた" => 1833, t"として" => -4657, t"ともに" => -4517, t"もので" => 1882, t"一気に" => -792, t"初めて" => -1512, t"同時に" => -8097, t"大きな" => -1255, t"対して" => -2721, t"社会党" => -3216) -const TW3 = Dict{Tuple{Char,Char,Char},Int}(t"いただ" => -1734, t"してい" => 1314, t"として" => -4314, t"につい" => -5483, t"にとっ" => -5989, t"に当た" => -6247, t"ので," => -727, t"ので、" => -727, t"のもの" => -600, t"れから" => -3752, t"十二月" => -2287) -const TW4 = Dict{Tuple{Char,Char,Char},Int}(t"いう." => 8576, t"いう。" => 8576, t"からな" => -2348, t"してい" => 2958, t"たが," => 1516, t"たが、" => 1516, t"ている" => 1538, t"という" => 1349, t"ました" => 5543, t"ません" => 1097, t"ようと" => -4258, t"よると" => 5865) -const UC1 = Dict{UInt8,Int}('A' => 484, 'K' => 93, 'M' => 645, 'O' => -505) -const UC2 = Dict{UInt8,Int}('A' => 819, 'H' => 1059, 'I' => 409, 'M' => 3987, 'N' => 5775, 'O' => 646) -const UC3 = Dict{UInt8,Int}('A' => -1370, 'I' => 2311) -const UC4 = Dict{UInt8,Int}('A' => -2643, 'H' => 1809, 'I' => -1032, 'K' => -3450, 'M' => 3565, 'N' => 3876, 'O' => 6646) -const UC5 = Dict{UInt8,Int}('H' => 313, 'I' => -1238, 'K' => -799, 'M' => 539, 'O' => -831) -const UC6 = Dict{UInt8,Int}('H' => -506, 'I' => -253, 'K' => 87, 'M' => 247, 'O' => -387) -const UP1 = Dict{UInt8,Int}('O' => -214) -const UP2 = Dict{UInt8,Int}('B' => 69, 'O' => 935) -const UP3 = Dict{UInt8,Int}('B' => 189) -const UQ1 = Dict{Tuple{UInt8,UInt8},Int}(t"BH" => 21, t"BI" => -12, t"BK" => -99, t"BN" => 142, t"BO" => -56, t"OH" => -95, t"OI" => 477, t"OK" => 410, t"OO" => -2422) -const UQ2 = Dict{Tuple{UInt8,UInt8},Int}(t"BH" => 216, t"BI" => 113, t"OK" => 1759) -const UQ3 = Dict{Tuple{UInt8,UInt8},Int}(t"BA" => -479, t"BH" => 42, t"BI" => 1913, t"BK" => -7198, t"BM" => 3160, t"BN" => 6427, t"BO" => 14761, t"OI" => -827, t"ON" => -3212) +const BC1 = Dict{Tuple{UInt8,UInt8},Int}(i"HH" => 6, i"II" => 2461, i"KH" => 406, i"OH" => -1378) +const BC2 = Dict{Tuple{UInt8,UInt8},Int}(i"AA" => -3267, i"AI" => 2744, i"AN" => -878, i"HH" => -4070, i"HM" => -1711, i"HN" => 4012, i"HO" => 3761, i"IA" => 1327, i"IH" => -1184, i"II" => -1332, i"IK" => 1721, i"IO" => 5492, i"KI" => 3831, i"KK" => -8741, i"MH" => -3132, i"MK" => 3334, i"OO" => -2920) +const BC3 = Dict{Tuple{UInt8,UInt8},Int}(i"HH" => 996, i"HI" => 626, i"HK" => -721, i"HN" => -1307, i"HO" => -836, i"IH" => -301, i"KK" => 2762, i"MK" => 1079, i"MM" => 4034, i"OA" => -1652, i"OH" => 266) +const BP1 = Dict{Tuple{UInt8,UInt8},Int}(i"BB" => 295, i"OB" => 304, i"OO" => -125, i"UB" => 352) +const BP2 = Dict{Tuple{UInt8,UInt8},Int}(i"BO" => 60, i"OO" => -1762) +const BQ1 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"BHH" => 1150, i"BHM" => 1521, i"BII" => -1158, i"BIM" => 886, i"BMH" => 1208, i"BNH" => 449, i"BOH" => -91, i"BOO" => -2597, i"OHI" => 451, i"OIH" => -296, i"OKA" => 1851, i"OKH" => -1020, i"OKK" => 904, i"OOO" => 2965) +const BQ2 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"BHH" => 118, i"BHI" => -1159, i"BHM" => 466, i"BIH" => -919, i"BKK" => -1720, i"BKO" => 864, i"OHH" => -1139, i"OHM" => -181, i"OIH" => 153, i"UHI" => -1146) +const BQ3 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"BHH" => -792, i"BHI" => 2664, i"BII" => -299, i"BKI" => 419, i"BMH" => 937, i"BMM" => 8335, i"BNN" => 998, i"BOH" => 775, i"OHH" => 2174, i"OHM" => 439, i"OII" => 280, i"OKH" => 1798, i"OKI" => -793, i"OKO" => -2242, i"OMH" => -2402, i"OOO" => 11699) +const BQ4 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"BHH" => -3895, i"BIH" => 3761, i"BII" => -4654, i"BIK" => 1348, i"BKK" => -1806, i"BMI" => -3385, i"BOO" => -12396, i"OAH" => 926, i"OHH" => 266, i"OHK" => -2036, i"ONN" => -973) +const BW1 = Dict{Tuple{Char,Char},Int}(c",と" => 660, c",同" => 727, (B1,'あ') => 1404, (B1,'同') => 542, c"、と" => 660, c"、同" => 727, c"」と" => 1682, c"あっ" => 1505, c"いう" => 1743, c"いっ" => -2055, c"いる" => 672, c"うし" => -4817, c"うん" => 665, c"から" => 3472, c"がら" => 600, c"こう" => -790, c"こと" => 2083, c"こん" => -1262, c"さら" => -4143, c"さん" => 4573, c"した" => 2641, c"して" => 1104, c"すで" => -3399, c"そこ" => 1977, c"それ" => -871, c"たち" => 1122, c"ため" => 601, c"った" => 3463, c"つい" => -802, c"てい" => 805, c"てき" => 1249, c"でき" => 1127, c"です" => 3445, c"では" => 844, c"とい" => -4915, c"とみ" => 1922, c"どこ" => 3887, c"ない" => 5713, c"なっ" => 3015, c"など" => 7379, c"なん" => -1113, c"にし" => 2468, c"には" => 1498, c"にも" => 1671, c"に対" => -912, c"の一" => -501, c"の中" => 741, c"ませ" => 2448, c"まで" => 1711, c"まま" => 2600, c"まる" => -2155, c"やむ" => -1947, c"よっ" => -2565, c"れた" => 2369, c"れで" => -913, c"をし" => 1860, c"を見" => 731, c"亡く" => -1886, c"京都" => 2558, c"取り" => -2784, c"大き" => -2604, c"大阪" => 1497, c"平方" => -2314, c"引き" => -1336, c"日本" => -195, c"本当" => -2423, c"毎日" => -2113, c"目指" => -724, (B1,'あ') => 1404, (B1,'同') => 542, c"」と" => 1682) +const BW2 = Dict{Tuple{Char,Char},Int}(c".." => -11822, c"11" => -669, c"――" => -5730, c"−−" => -13175, c"いう" => -1609, c"うか" => 2490, c"かし" => -1350, c"かも" => -602, c"から" => -7194, c"かれ" => 4612, c"がい" => 853, c"がら" => -3198, c"きた" => 1941, c"くな" => -1597, c"こと" => -8392, c"この" => -4193, c"させ" => 4533, c"され" => 13168, c"さん" => -3977, c"しい" => -1819, c"しか" => -545, c"した" => 5078, c"して" => 972, c"しな" => 939, c"その" => -3744, c"たい" => -1253, c"たた" => -662, c"ただ" => -3857, c"たち" => -786, c"たと" => 1224, c"たは" => -939, c"った" => 4589, c"って" => 1647, c"っと" => -2094, c"てい" => 6144, c"てき" => 3640, c"てく" => 2551, c"ては" => -3110, c"ても" => -3065, c"でい" => 2666, c"でき" => -1528, c"でし" => -3828, c"です" => -4761, c"でも" => -4203, c"とい" => 1890, c"とこ" => -1746, c"とと" => -2279, c"との" => 720, c"とみ" => 5168, c"とも" => -3941, c"ない" => -2488, c"なが" => -1313, c"など" => -6509, c"なの" => 2614, c"なん" => 3099, c"にお" => -1615, c"にし" => 2748, c"にな" => 2454, c"によ" => -7236, c"に対" => -14943, c"に従" => -4688, c"に関" => -11388, c"のか" => 2093, c"ので" => -7059, c"のに" => -6041, c"のの" => -6125, c"はい" => 1073, c"はが" => -1033, c"はず" => -2532, c"ばれ" => 1813, c"まし" => -1316, c"まで" => -6621, c"まれ" => 5409, c"めて" => -3153, c"もい" => 2230, c"もの" => -10713, c"らか" => -944, c"らし" => -1611, c"らに" => -1897, c"りし" => 651, c"りま" => 1620, c"れた" => 4270, c"れて" => 849, c"れば" => 4114, c"ろう" => 6067, c"われ" => 7901, c"を通" => -11877, c"んだ" => 728, c"んな" => -4115, c"一人" => 602, c"一方" => -1375, c"一日" => 970, c"一部" => -1051, c"上が" => -4479, c"会社" => -1116, c"出て" => 2163, c"分の" => -7758, c"同党" => 970, c"同日" => -913, c"大阪" => -2471, c"委員" => -1250, c"少な" => -1050, c"年度" => -8669, c"年間" => -1626, c"府県" => -2363, c"手権" => -1982, c"新聞" => -4066, c"日新" => -722, c"日本" => -7068, c"日米" => 3372, c"曜日" => -601, c"朝鮮" => -2355, c"本人" => -2697, c"東京" => -1543, c"然と" => -1384, c"社会" => -1276, c"立て" => -990, c"第に" => -1612, c"米国" => -4268, c"11" => -669, c"グ" => 1319) +const BW3 = Dict{Tuple{Char,Char},Int}(c"あた" => -2194, c"あり" => 719, c"ある" => 3846, c"い." => -1185, c"い。" => -1185, c"いい" => 5308, c"いえ" => 2079, c"いく" => 3029, c"いた" => 2056, c"いっ" => 1883, c"いる" => 5600, c"いわ" => 1527, c"うち" => 1117, c"うと" => 4798, c"えと" => 1454, c"か." => 2857, c"か。" => 2857, c"かけ" => -743, c"かっ" => -4098, c"かに" => -669, c"から" => 6520, c"かり" => -2670, c"が," => 1816, c"が、" => 1816, c"がき" => -4855, c"がけ" => -1127, c"がっ" => -913, c"がら" => -4977, c"がり" => -2064, c"きた" => 1645, c"けど" => 1374, c"こと" => 7397, c"この" => 1542, c"ころ" => -2757, c"さい" => -714, c"さを" => 976, c"し," => 1557, c"し、" => 1557, c"しい" => -3714, c"した" => 3562, c"して" => 1449, c"しな" => 2608, c"しま" => 1200, c"す." => -1310, c"す。" => -1310, c"する" => 6521, c"ず," => 3426, c"ず、" => 3426, c"ずに" => 841, c"そう" => 428, c"た." => 8875, c"た。" => 8875, c"たい" => -594, c"たの" => 812, c"たり" => -1183, c"たる" => -853, c"だ." => 4098, c"だ。" => 4098, c"だっ" => 1004, c"った" => -4748, c"って" => 300, c"てい" => 6240, c"てお" => 855, c"ても" => 302, c"です" => 1437, c"でに" => -1482, c"では" => 2295, c"とう" => -1387, c"とし" => 2266, c"との" => 541, c"とも" => -3543, c"どう" => 4664, c"ない" => 1796, c"なく" => -903, c"など" => 2135, c"に," => -1021, c"に、" => -1021, c"にし" => 1771, c"にな" => 1906, c"には" => 2644, c"の," => -724, c"の、" => -724, c"の子" => -1000, c"は," => 1337, c"は、" => 1337, c"べき" => 2181, c"まし" => 1113, c"ます" => 6943, c"まっ" => -1549, c"まで" => 6154, c"まれ" => -793, c"らし" => 1479, c"られ" => 6820, c"るる" => 3818, c"れ," => 854, c"れ、" => 854, c"れた" => 1850, c"れて" => 1375, c"れば" => -3246, c"れる" => 1091, c"われ" => -605, c"んだ" => 606, c"んで" => 798, c"カ月" => 990, c"会議" => 860, c"入り" => 1232, c"大会" => 2217, c"始め" => 1681, c"市 " => 965, c"新聞" => -5055, c"日," => 974, c"日、" => 974, c"社会" => 2024, c"カ月" => 990) +const TC1 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"AAA" => 1093, i"HHH" => 1029, i"HHM" => 580, i"HII" => 998, i"HOH" => -390, i"HOM" => -331, i"IHI" => 1169, i"IOH" => -142, i"IOI" => -1015, i"IOM" => 467, i"MMH" => 187, i"OOI" => -1832) +const TC2 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"HHO" => 2088, i"HII" => -1023, i"HMM" => -1154, i"IHI" => -1965, i"KKH" => 703, i"OII" => -2649) +const TC3 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"AAA" => -294, i"HHH" => 346, i"HHI" => -341, i"HII" => -1088, i"HIK" => 731, i"HOH" => -1486, i"IHH" => 128, i"IHI" => -3041, i"IHO" => -1935, i"IIH" => -825, i"IIM" => -1035, i"IOI" => -542, i"KHH" => -1216, i"KKA" => 491, i"KKH" => -1217, i"KOK" => -1009, i"MHH" => -2694, i"MHM" => -457, i"MHO" => 123, i"MMH" => -471, i"NNH" => -1689, i"NNO" => 662, i"OHO" => -3393) +const TC4 = Dict{Tuple{UInt8,UInt8,UInt8},Int}(i"HHH" => -203, i"HHI" => 1344, i"HHK" => 365, i"HHM" => -122, i"HHN" => 182, i"HHO" => 669, i"HIH" => 804, i"HII" => 679, i"HOH" => 446, i"IHH" => 695, i"IHO" => -2324, i"IIH" => 321, i"III" => 1497, i"IIO" => 656, i"IOO" => 54, i"KAK" => 4845, i"KKA" => 3386, i"KKK" => 3065, i"MHH" => -405, i"MHI" => 201, i"MMH" => -241, i"MMM" => 661, i"MOM" => 841) +const TQ1 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(i"BHHH" => -227, i"BHHI" => 316, i"BHIH" => -132, i"BIHH" => 60, i"BIII" => 1595, i"BNHH" => -744, i"BOHH" => 225, i"BOOO" => -908, i"OAKK" => 482, i"OHHH" => 281, i"OHIH" => 249, i"OIHI" => 200, i"OIIH" => -68) +const TQ2 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(i"BIHH" => -1401, i"BIII" => -1033, i"BKAK" => -543, i"BOOO" => -5591) +const TQ3 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(i"BHHH" => 478, i"BHHM" => -1073, i"BHIH" => 222, i"BHII" => -504, i"BIIH" => -116, i"BIII" => -105, i"BMHI" => -863, i"BMHM" => -464, i"BOMH" => 620, i"OHHH" => 346, i"OHHI" => 1729, i"OHII" => 997, i"OHMH" => 481, i"OIHH" => 623, i"OIIH" => 1344, i"OKAK" => 2792, i"OKHH" => 587, i"OKKA" => 679, i"OOHH" => 110, i"OOII" => -685) +const TQ4 = Dict{Tuple{UInt8,UInt8,UInt8,UInt8},Int}(i"BHHH" => -721, i"BHHM" => -3604, i"BHII" => -966, i"BIIH" => -607, i"BIII" => -2181, i"OAAA" => -2763, i"OAKK" => 180, i"OHHH" => -294, i"OHHI" => 2446, i"OHHO" => 480, i"OHIH" => -1573, i"OIHH" => 1935, i"OIHI" => -493, i"OIIH" => 626, i"OIII" => -4007, i"OKAK" => -8156) +const TW1 = Dict{Tuple{Char,Char,Char},Int}(c"につい" => -4681, c"東京都" => 2026) +const TW2 = Dict{Tuple{Char,Char,Char},Int}(c"ある程" => -2049, c"いった" => -1256, c"ころが" => -2434, c"しょう" => 3873, c"その後" => -4430, c"だって" => -1049, c"ていた" => 1833, c"として" => -4657, c"ともに" => -4517, c"もので" => 1882, c"一気に" => -792, c"初めて" => -1512, c"同時に" => -8097, c"大きな" => -1255, c"対して" => -2721, c"社会党" => -3216) +const TW3 = Dict{Tuple{Char,Char,Char},Int}(c"いただ" => -1734, c"してい" => 1314, c"として" => -4314, c"につい" => -5483, c"にとっ" => -5989, c"に当た" => -6247, c"ので," => -727, c"ので、" => -727, c"のもの" => -600, c"れから" => -3752, c"十二月" => -2287) +const TW4 = Dict{Tuple{Char,Char,Char},Int}(c"いう." => 8576, c"いう。" => 8576, c"からな" => -2348, c"してい" => 2958, c"たが," => 1516, c"たが、" => 1516, c"ている" => 1538, c"という" => 1349, c"ました" => 5543, c"ません" => 1097, c"ようと" => -4258, c"よると" => 5865) +const UC1 = dict_c2i('A' => 484, 'K' => 93, 'M' => 645, 'O' => -505) +const UC2 = dict_c2i('A' => 819, 'H' => 1059, 'I' => 409, 'M' => 3987, 'N' => 5775, 'O' => 646) +const UC3 = dict_c2i('A' => -1370, 'I' => 2311) +const UC4 = dict_c2i('A' => -2643, 'H' => 1809, 'I' => -1032, 'K' => -3450, 'M' => 3565, 'N' => 3876, 'O' => 6646) +const UC5 = dict_c2i('H' => 313, 'I' => -1238, 'K' => -799, 'M' => 539, 'O' => -831) +const UC6 = dict_c2i('H' => -506, 'I' => -253, 'K' => 87, 'M' => 247, 'O' => -387) +const UP1 = dict_c2i('O' => -214) +const UP2 = dict_c2i('B' => 69, 'O' => 935) +const UP3 = dict_c2i('B' => 189) +const UQ1 = Dict{Tuple{UInt8,UInt8},Int}(i"BH" => 21, i"BI" => -12, i"BK" => -99, i"BN" => 142, i"BO" => -56, i"OH" => -95, i"OI" => 477, i"OK" => 410, i"OO" => -2422) +const UQ2 = Dict{Tuple{UInt8,UInt8},Int}(i"BH" => 216, i"BI" => 113, i"OK" => 1759) +const UQ3 = Dict{Tuple{UInt8,UInt8},Int}(i"BA" => -479, i"BH" => 42, i"BI" => 1913, i"BK" => -7198, i"BM" => 3160, i"BN" => 6427, i"BO" => 14761, i"OI" => -827, i"ON" => -3212) const UW1 = Dict{Char,Int}(',' => 156, '、' => 156, '「' => -463, 'あ' => -941, 'う' => -127, 'が' => -553, 'き' => 121, 'こ' => 505, 'で' => -201, 'と' => -547, 'ど' => -123, 'に' => -789, 'の' => -185, 'は' => -847, 'も' => -466, 'や' => -470, 'よ' => 182, 'ら' => -292, 'り' => 208, 'れ' => 169, 'を' => -446, 'ん' => -137, '・' => -135, '主' => -402, '京' => -268, '区' => -912, '午' => 871, '国' => -460, '大' => 561, '委' => 729, '市' => -411, '日' => -141, '理' => 361, '生' => -408, '県' => -386, '都' => -718, '「' => -463, '・' => -135) const UW2 = Dict{Char,Int}(',' => -829, '、' => -829, '〇' => 892, '「' => -645, '」' => 3145, 'あ' => -538, 'い' => 505, 'う' => 134, 'お' => -502, 'か' => 1454, 'が' => -856, 'く' => -412, 'こ' => 1141, 'さ' => 878, 'ざ' => 540, 'し' => 1529, 'す' => -675, 'せ' => 300, 'そ' => -1011, 'た' => 188, 'だ' => 1837, 'つ' => -949, 'て' => -291, 'で' => -268, 'と' => -981, 'ど' => 1273, 'な' => 1063, 'に' => -1764, 'の' => 130, 'は' => -409, 'ひ' => -1273, 'べ' => 1261, 'ま' => 600, 'も' => -1263, 'や' => -402, 'よ' => 1639, 'り' => -579, 'る' => -694, 'れ' => 571, 'を' => -2516, 'ん' => 2095, 'ア' => -587, 'カ' => 306, 'キ' => 568, 'ッ' => 831, '三' => -758, '不' => -2150, '世' => -302, '中' => -968, '主' => -861, '事' => 492, '人' => -123, '会' => 978, '保' => 362, '入' => 548, '初' => -3025, '副' => -1566, '北' => -3414, '区' => -422, '大' => -1769, '天' => -865, '太' => -483, '子' => -1519, '学' => 760, '実' => 1023, '小' => -2009, '市' => -813, '年' => -1060, '強' => 1067, '手' => -1519, '揺' => -1033, '政' => 1522, '文' => -1355, '新' => -1682, '日' => -1815, '明' => -1462, '最' => -630, '朝' => -1843, '本' => -1650, '東' => -931, '果' => -665, '次' => -2378, '民' => -180, '気' => -1740, '理' => 752, '発' => 529, '目' => -1584, '相' => -242, '県' => -1165, '立' => -763, '第' => 810, '米' => 509, '自' => -1353, '行' => 838, '西' => -744, '見' => -3874, '調' => 1010, '議' => 1198, '込' => 3041, '開' => 1758, '間' => -1257, '「' => -645, '」' => 3145, 'ッ' => 831, 'ア' => -587, 'カ' => 306, 'キ' => 568) const UW3 = Dict{Char,Int}(',' => 4889, '1' => -800, '−' => -1723, '、' => 4889, '々' => -2311, '〇' => 5827, '」' => 2670, '〓' => -3573, 'あ' => -2696, 'い' => 1006, 'う' => 2342, 'え' => 1983, 'お' => -4864, 'か' => -1163, 'が' => 3271, 'く' => 1004, 'け' => 388, 'げ' => 401, 'こ' => -3552, 'ご' => -3116, 'さ' => -1058, 'し' => -395, 'す' => 584, 'せ' => 3685, 'そ' => -5228, 'た' => 842, 'ち' => -521, 'っ' => -1444, 'つ' => -1081, 'て' => 6167, 'で' => 2318, 'と' => 1691, 'ど' => -899, 'な' => -2788, 'に' => 2745, 'の' => 4056, 'は' => 4555, 'ひ' => -2171, 'ふ' => -1798, 'へ' => 1199, 'ほ' => -5516, 'ま' => -4384, 'み' => -120, 'め' => 1205, 'も' => 2323, 'や' => -788, 'よ' => -202, 'ら' => 727, 'り' => 649, 'る' => 5905, 'れ' => 2773, 'わ' => -1207, 'を' => 6620, 'ん' => -518, 'ア' => 551, 'グ' => 1319, 'ス' => 874, 'ッ' => -1350, 'ト' => 521, 'ム' => 1109, 'ル' => 1591, 'ロ' => 2201, 'ン' => 278, '・' => -3794, '一' => -1619, '下' => -1759, '世' => -2087, '両' => 3815, '中' => 653, '主' => -758, '予' => -1193, '二' => 974, '人' => 2742, '今' => 792, '他' => 1889, '以' => -1368, '低' => 811, '何' => 4265, '作' => -361, '保' => -2439, '元' => 4858, '党' => 3593, '全' => 1574, '公' => -3030, '六' => 755, '共' => -1880, '円' => 5807, '再' => 3095, '分' => 457, '初' => 2475, '別' => 1129, '前' => 2286, '副' => 4437, '力' => 365, '動' => -949, '務' => -1872, '化' => 1327, '北' => -1038, '区' => 4646, '千' => -2309, '午' => -783, '協' => -1006, '口' => 483, '右' => 1233, '各' => 3588, '合' => -241, '同' => 3906, '和' => -837, '員' => 4513, '国' => 642, '型' => 1389, '場' => 1219, '外' => -241, '妻' => 2016, '学' => -1356, '安' => -423, '実' => -1008, '家' => 1078, '小' => -513, '少' => -3102, '州' => 1155, '市' => 3197, '平' => -1804, '年' => 2416, '広' => -1030, '府' => 1605, '度' => 1452, '建' => -2352, '当' => -3885, '得' => 1905, '思' => -1291, '性' => 1822, '戸' => -488, '指' => -3973, '政' => -2013, '教' => -1479, '数' => 3222, '文' => -1489, '新' => 1764, '日' => 2099, '旧' => 5792, '昨' => -661, '時' => -1248, '曜' => -951, '最' => -937, '月' => 4125, '期' => 360, '李' => 3094, '村' => 364, '東' => -805, '核' => 5156, '森' => 2438, '業' => 484, '氏' => 2613, '民' => -1694, '決' => -1073, '法' => 1868, '海' => -495, '無' => 979, '物' => 461, '特' => -3850, '生' => -273, '用' => 914, '町' => 1215, '的' => 7313, '直' => -1835, '省' => 792, '県' => 6293, '知' => -1528, '私' => 4231, '税' => 401, '立' => -960, '第' => 1201, '米' => 7767, '系' => 3066, '約' => 3663, '級' => 1384, '統' => -4229, '総' => 1163, '線' => 1255, '者' => 6457, '能' => 725, '自' => -2869, '英' => 785, '見' => 1044, '調' => -562, '財' => -733, '費' => 1777, '車' => 1835, '軍' => 1375, '込' => -1504, '通' => -1136, '選' => -681, '郎' => 1026, '郡' => 4404, '部' => 1200, '金' => 2163, '長' => 421, '開' => -1432, '間' => 1302, '関' => -1282, '雨' => 2009, '電' => -1045, '非' => 2066, '駅' => 1620, '1' => -800, '」' => 2670, '・' => -3794, 'ッ' => -1350, 'ア' => 551, 'ス' => 874, 'ト' => 521, 'ム' => 1109, 'ル' => 1591, 'ロ' => 2201, 'ン' => 278) diff --git a/test/runtests.jl b/test/runtests.jl index 2cc2785..73d2f86 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -13,10 +13,10 @@ js_result_text = join(readlines(f2)); @test result_text == js_result_text import TinySegmenter.ctype -@test ctype('一') == 'M' -@test ctype('〆') == 'H' -@test ctype('名') == 'H' -@test ctype('あ') == 'I' -@test ctype('ア') == 'K' -@test ctype('Z') == 'A' -@test ctype('9') == 'N' +@test ctype('一') == UInt8('M') +@test ctype('〆') == UInt8('H') +@test ctype('名') == UInt8('H') +@test ctype('あ') == UInt8('I') +@test ctype('ア') == UInt8('K') +@test ctype('Z') == UInt8('A') +@test ctype('9') == UInt8('N')