Skip to content
This repository has been archived by the owner on May 9, 2024. It is now read-only.

Latest commit

 

History

History
49 lines (47 loc) · 1.21 KB

nikl_news.md

File metadata and controls

49 lines (47 loc) · 1.21 KB

# NIKL Newspaper Corpus

## Sample

# Metadata record for the `nikl_news` corpus: identity and licensing,
# size statistics, modification timestamps, file names, and the field
# schema of the data and meta files.

# --- Identity -------------------------------------------------------------
# Short identifier; it also appears as the stem of the parquet file names
# listed under data_files / meta_files.
name: nikl_news
fullname: NIKL Newspaper Corpus
lang: ko
# Corpus-level category. Not a duplicate of features.meta.category, which
# sits at a different nesting level and names a per-record field.
category: formal
description: National Institute of the Korean Language Corpus - Newspaper
license: Korea Open Government License, Category 4
homepage: https://corpus.korean.go.kr
# A plain scalar with two dots is not a valid YAML number, so this loads
# as the string "2.0.0" without needing quotes.
version: 2.0.0

# --- Size statistics ------------------------------------------------------
# The *_before_processing values are larger than their current
# counterparts (4116643 > 4104534 docs, 12130403694 > 12017799919 bytes).
num_docs: 4104534
num_docs_before_processing: 4116643
# NOTE(review): differs from num_docs only by the last two digits being
# swapped (…543 vs …534). May be genuine, may be a transposition —
# confirm against the generator output.
num_segments: 4104543
num_sents: 42527395
num_words: 1138897337
size_in_bytes: 12017799919
# NOTE(review): presumably the pre-processing counterpart of
# size_in_bytes despite the different key prefix (num_bytes_ vs size_in_)
# — confirm before relying on the pairing.
num_bytes_before_processing: 12130403694
# size_in_bytes expressed in GiB: 12017799919 / 2^30 ≈ 11.19.
size_in_human_bytes: 11.19 GiB

# --- Timestamps -----------------------------------------------------------
# Quoted so they load as strings; unquoted, YAML 1.1 loaders would resolve
# them to timestamp objects. No timezone is given — TODO confirm which
# zone these are in.
data_files_modified: '2022-02-25 01:50:56'
meta_files_modified: '2022-02-22 11:08:17'
info_updated: '2022-02-26 03:06:09'

# --- Files ----------------------------------------------------------------
# Each mapping has a single `train` entry (presumably the split name)
# pointing at a parquet file name.
data_files:
  train: nikl_news-train.parquet
meta_files:
  train: meta-nikl_news-train.parquet

# --- Field schema ---------------------------------------------------------
features:
  # Identity mapping here (id -> id, text -> text). Presumably maps
  # logical column roles to actual column names — verify against the
  # consumer.
  columns:
    id: id
    text: text
  # Field name -> type name for the data file.
  data:
    id: int
    text: str
  # Field name -> type name for the meta file. `id: int` appears in both
  # `data` and `meta`; presumably the key linking the two — TODO confirm.
  meta:
    id: int
    doc_id: str
    title: str
    author: str
    publisher: str
    date: str
    topic: str
    original_topic: str
    category: str