# convert_to_xml.py

import os
from lxml import etree as ET
from importlib.resources import open_text
import json
import csv
import argparse


def get_label_map(transform):
    """Return the original-label -> harmonized-label mapping for a transform.

    Loads ``label.json`` from the ``transformer_deid`` package. The top-level
    keys of that file are label transform names; the entry for a transform
    has items of the form ``"harmonized_label": ["label 1", "label 2", ...]``.

    Args:
        transform: name of the label transform to look up in ``label.json``.

    Returns:
        A dict in which every original label is a key and the harmonized
        label it belongs to is the value.

    Raises:
        KeyError: if ``transform`` is not a top-level key of ``label.json``.
    """
    with open_text('transformer_deid', 'label.json') as handle:
        transforms = json.load(handle)

    if transform not in transforms:
        raise KeyError(
            'Unable to find label transform %s in label.json' % transform
        )

    # flip "harmonized -> [originals]" into "original -> harmonized"
    inverted = {}
    for harmonized, originals in transforms[transform].items():
        for original in originals:
            inverted[original] = harmonized
    return inverted


def text_ann_to_xml(txt_path, ann_path, label_map):
    """Build a ``deIdi2b2`` XML tree from a text file and its annotations.

    The root element ``deIdi2b2`` gets a ``TEXT`` child holding the full
    contents of the text file and a ``TAGS`` child holding one element per
    annotation row.

    Args:
        txt_path: path of the plain-text document.
        ann_path: path of the comma-separated annotation file. Its header
            row must contain the columns ``entity_type``, ``start``,
            ``stop``, ``entity``, ``comment`` and ``annotation_id``.
        label_map: mapping from a row's ``entity_type`` value to the tag
            name used for that annotation's element.

    Returns:
        An ``ElementTree`` wrapping the ``deIdi2b2`` root element.

    Raises:
        StopIteration: if the annotation file has no header row.
        ValueError: if a required column is missing from the header.
        KeyError: if a row's ``entity_type`` is not a key of ``label_map``.
    """
    root = ET.Element('deIdi2b2')

    with open(txt_path, 'r') as fp:
        ET.SubElement(root, 'TEXT').text = fp.read()

    tags = ET.SubElement(root, 'TAGS')
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are not interpreted correctly
    with open(ann_path, 'r', newline='') as fp:
        csvreader = csv.reader(fp, delimiter=',', quotechar='"')
        header = next(csvreader)
        # locate the columns we want by name
        col = {
            name: header.index(name)
            for name in (
                'entity_type',
                'start',
                'stop',
                'entity',
                'comment',
                'annotation_id',
            )
        }

        for row in csvreader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            entity_type = row[col['entity_type']]
            ET.SubElement(
                tags,
                label_map[entity_type],
                TYPE=entity_type,
                comment=row[col['comment']],
                end=row[col['stop']],
                id=row[col['annotation_id']],
                start=row[col['start']],
                text=row[col['entity']],
            )

    return ET.ElementTree(root)


def parse_args():
    """Parse the command-line arguments of the conversion script.

    Returns:
        The ``argparse.Namespace`` produced by the parser; its ``path``
        attribute holds the value given to ``-p``/``--path``.

    Raises:
        SystemExit: raised by argparse when ``--path`` is missing or the
            arguments are otherwise invalid.
    """
    parser = argparse.ArgumentParser(
        description=
        'Convert annotations and text files to .xml readable by pydeid.'
    )

    # required: without a path there is nothing to convert, and a None
    # value would only fail later when paths are built from it
    parser.add_argument(
        '-p',
        '--path',
        type=str,
        required=True,
        help='directory containing the txt and ann folders to be converted '
        'to XML'
    )

    return parser.parse_args()


def main():
    """Convert every text/annotation pair under the given directory to XML.

    Reads the root directory from the command line, then for each ``.txt``
    file in ``<root>/txt`` combines it with the annotation file of the same
    base name in ``<root>/ann`` (extension ``.gs``) and writes the result to
    ``<root>/xml/<name>.xml``. Labels are mapped with the ``'base'`` label
    transform.
    """
    args = parse_args()
    rootdir = args.path

    label_map = get_label_map('base')

    # os.path.join works whether or not rootdir ends with a separator
    txt_dir = os.path.join(rootdir, 'txt')
    ann_dir = os.path.join(rootdir, 'ann')
    xml_dir = os.path.join(rootdir, 'xml')

    filenames = sorted(os.listdir(txt_dir))

    # exist_ok so the script can be re-run on the same directory
    os.makedirs(xml_dir, exist_ok=True)

    for filename in filenames:
        # splitext keeps dots inside the base name (e.g. "note.v2.txt")
        doc_id, ext = os.path.splitext(filename)
        if ext != '.txt':
            # ignore anything in the txt folder that is not a text document
            continue

        txt_path = os.path.join(txt_dir, f'{doc_id}.txt')
        ann_path = os.path.join(ann_dir, f'{doc_id}.gs')
        tree = text_ann_to_xml(txt_path, ann_path, label_map)

        outpath = os.path.join(xml_dir, f'{doc_id}.xml')
        tree.write(
            outpath, encoding='utf8', xml_declaration=True, pretty_print=True
        )


# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()