The eKorpkit Corpus is a large, diverse, multilingual (ko/en) language modelling dataset.

Name | Language | Size | Weight | # Docs | # Sents | # Words |
---|---|---|---|---|---|---|
mc4_ko | ko | 90.76 GiB | 20.22% | 15,618,718 | 665,858,888 | 8,007,674,274 |
courtlistener | en | 47.92 GiB | 10.68% | 3,489,298 | 335,079,871 | 8,324,277,457 |
pmc_comm | en | 45.26 GiB | 10.08% | 51,276,102 | 297,884,818 | 7,365,607,900 |
edgar | en | 36.94 GiB | 8.23% | 213,376 | 177,270,203 | 6,053,677,897 |
c4_realnewslike | en | 33.79 GiB | 7.53% | 13,813,090 | 155,883,681 | 6,040,207,703 |
pubmed | en | 27.51 GiB | 6.13% | 22,498,747 | 190,907,356 | 4,281,121,705 |
bigpatent | en | 22.46 GiB | 5.00% | 1,244,053 | 2,488,106 | 4,613,882,925 |
aihub_formal1 | ko | 19.16 GiB | 4.27% | 1,073,944 | 93,148,022 | 1,993,574,713 |
enwiki | en | 13.85 GiB | 3.09% | 6,200,658 | 129,066,417 | 2,400,717,561 |
pmc_noncomm | en | 11.88 GiB | 2.65% | 14,142,294 | 79,748,279 | 1,923,415,913 |
kcbert | ko | 11.45 GiB | 2.55% | 82,990,213 | 82,990,213 | 1,088,177,367 |
nikl_news | ko | 11.19 GiB | 2.49% | 4,104,534 | 42,527,395 | 1,138,897,337 |
oscar_ko | ko | 11.05 GiB | 2.46% | 3,673,262 | 61,833,262 | 1,122,638,494 |
aida_paper | ko | 8.77 GiB | 1.95% | 481,389 | 38,808,105 | 1,025,422,060 |
kcc | ko | 6.80 GiB | 1.51% | 46,529,987 | 46,529,987 | 703,222,627 |
nikl_written | ko | 6.45 GiB | 1.44% | 20,128 | 27,231,846 | 679,547,033 |
namuwiki | ko | 6.43 GiB | 1.43% | 571,026 | 67,315,244 | 691,537,393 |
aihub_patent1 | ko | 6.40 GiB | 1.42% | 155,939 | 29,206,198 | 673,134,598 |
earnings_call | en | 6.30 GiB | 1.40% | 159,380 | 32,391,491 | 1,160,525,933 |
sec_report | ko | 4.70 GiB | 1.05% | 817,040 | 32,644,657 | 495,245,547 |
hacker_news | en | 3.80 GiB | 0.85% | 818,299 | 41,573,998 | 662,524,112 |
philpapers | en | 2.19 GiB | 0.49% | 31,016 | 139,518 | 365,576,851 |
nih_exporter | en | 2.10 GiB | 0.47% | 1,017,230 | 13,540,126 | 326,974,102 |
bigkinds | ko | 1.99 GiB | 0.44% | 871,304 | 7,759,115 | 197,746,184 |
youtube_subtitles | en | 1.61 GiB | 0.36% | 150,749 | 16,074,289 | 303,286,377 |
respec | en | 1.08 GiB | 0.24% | 1,119,640 | 7,083,257 | 169,590,880 |
nikl_spoken | ko | 1002.49 MiB | 0.22% | 25,614 | 19,042,013 | 116,067,432 |
kowiki | ko | 715.39 MiB | 0.16% | 563,959 | 5,671,388 | 70,263,451 |
us_equities_news | en | 714.16 MiB | 0.16% | 220,976 | 1,834,664 | 131,179,752 |
aihub_law_case | ko | 689.96 MiB | 0.15% | 77,202 | 1,095,140 | 66,686,761 |
aihub_formal2 | ko | 650.03 MiB | 0.14% | 95,990 | 1,650,141 | 64,523,191 |
gd_review | en | 642.76 MiB | 0.14% | 1,929,910 | 6,733,680 | 112,977,678 |
aihub_patent2 | ko | 457.18 MiB | 0.10% | 147,674 | 1,879,909 | 46,045,036 |
enron_mail | en | 428.36 MiB | 0.09% | 247,586 | 7,908,959 | 65,258,456 |
aihub_paper | ko | 370.11 MiB | 0.08% | 98,344 | 1,802,883 | 35,556,261 |
kaist | ko | 304.92 MiB | 0.07% | 11,157 | 1,926,901 | 30,929,508 |
reuters_financial | en | 288.63 MiB | 0.06% | 101,055 | 1,983,069 | 49,495,061 |
aihub_book | ko | 236.66 MiB | 0.05% | 180,001 | 1,201,956 | 23,052,720 |
aihub_koen_formal | ko | 206.37 MiB | 0.04% | 1,350,000 | 1,350,000 | 20,659,619 |
aihub_koen_ssci | ko | 186.49 MiB | 0.04% | 1,361,845 | 1,361,845 | 19,104,237 |
aihub_koen_sci | ko | 164.42 MiB | 0.04% | 1,344,631 | 1,344,631 | 17,720,448 |
fomc | en | 112.66 MiB | 0.02% | 2,822 | 950,620 | 18,640,148 |
esg_report | ko | 24.17 MiB | 0.01% | 15,561 | 119,031 | 2,488,545 |
aihub_law_kb | ko | 9.99 MiB | 0.00% | 17,373 | 46,140 | 934,632 |
bok_minutes | ko | 9.54 MiB | 0.00% | 163 | 33,027 | 918,203 |
pathobook | en | 4.28 MiB | 0.00% | 28 | 33,603 | 648,221 |
English | en | 258.83 GiB | 57.66% | |||
Korean | ko | 190.04 GiB | 42.34% | |||
Total | | 448.87 GiB | 100.00% | |||