Skip to content

Commit

Permalink
Add util for mixed korean and english to ipa
Browse files Browse the repository at this point in the history
This will default to korean methods for numbers, though that won't be
the focus, and might get some numbers wrong for phone numbers and so
forth.

Might have to add these into a future version.
  • Loading branch information
gkielian committed Oct 16, 2024
1 parent 42df697 commit 3f8c0b6
Showing 1 changed file with 114 additions and 0 deletions.
114 changes: 114 additions & 0 deletions data/template/utils/ko_en_to_ipa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import subprocess
from konlpy.tag import Okt
import inflect
import argparse
import re

# English number conversion engine
inflect_engine = inflect.engine()

# Korean number dictionary
korean_numbers = {
0: "영",
1: "일",
2: "이",
3: "삼",
4: "사",
5: "오",
6: "육",
7: "칠",
8: "팔",
9: "구",
10: "십"
}

def number_to_korean(num):
"""Convert numbers to Korean words."""
if num <= 10:
return korean_numbers[num]
else:
result = []
if num >= 1000:
result.append(korean_numbers[num // 1000] + "천")
num %= 1000
if num >= 100:
result.append(korean_numbers[num // 100] + "백")
num %= 100
if num >= 10:
result.append(korean_numbers[num // 10] + "십")
num %= 10
if num > 0:
result.append(korean_numbers[num])
return "".join(result)

def transcribe_english(sentence):
"""Transcribe an English sentence to phonemes using espeak-ng."""
try:
result = subprocess.run(
["espeak-ng", "-q", "--ipa", sentence],
capture_output=True,
text=True
)
return result.stdout.strip()
except Exception as e:
return f"Error in transcribing English: {str(e)}"

def transcribe_korean(sentence):
"""Transcribe a Korean sentence into its phonemes using KoNLPy (Okt)."""
okt = Okt()
tokens = okt.morphs(sentence)
tokenized_sentence = ' '.join(tokens)

try:
result = subprocess.run(
["espeak-ng", "-q", "-v", "ko", "--ipa", tokenized_sentence],
capture_output=True,
text=True
)
return result.stdout.strip()
except Exception as e:
return f"Error in transcribing Korean: {str(e)}"

def handle_mixed_language(word):
"""Handle a word with potential Korean, English, or number content."""
if word.isdigit(): # Detect numbers
number_in_korean = number_to_korean(int(word))
return transcribe_korean(number_in_korean)
elif any('가' <= char <= '힣' for char in word): # Detect Korean
return transcribe_korean(word)
else: # English word
return transcribe_english(word)

def transcribe_multilingual(sentences, output_file):
"""Transcribe multilingual sentences (English and Korean, with numbers) and save to a file."""
with open(output_file, 'w', encoding='utf-8') as f:
for sentence in sentences:
result = []
# Split sentence but keep punctuation (preserve spaces, commas, etc.)
words = re.findall(r'\w+|[^\w\s]', sentence, re.UNICODE)
for word in words:
if re.match(r'\w+', word): # Only process words (skip punctuation)
result.append(handle_mixed_language(word))
else:
result.append(word) # Preserve punctuation as is
transcription_result = " ".join(result)
f.write(transcription_result + "\n")
print(transcription_result) # Print to console for reference

def main():
parser = argparse.ArgumentParser(description='Transcribe multilingual sentences into IPA phonemes.')
parser.add_argument('input_file', type=str, help='Path to the input file containing sentences.')
parser.add_argument('output_file', type=str, help='Path to the output file for IPA transcription.')

args = parser.parse_args()

# Read input sentences
with open(args.input_file, 'r', encoding='utf-8') as f:
sentences = f.readlines()

# Transcribe and save to the output file
transcribe_multilingual(sentences, args.output_file)

if __name__ == '__main__':
main()

0 comments on commit 3f8c0b6

Please sign in to comment.