-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
113 lines (88 loc) · 3.78 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import html
import os
from datetime import datetime
import pandas as pd
from bs4 import BeautifulSoup
from gtts import gTTS
def extract_vocabulary(filename):
    """
    Parse an HTML file and collect the vocabulary pairs it contains.

    Every ``<div>`` after the first three is inspected. A div contributes a
    pair when it contains both an ``<h3>`` (first word) and a ``<p>``
    (second word). Both texts are stripped and HTML-unescaped, and commas in
    the second word are replaced with slashes. The first five collected
    pairs are then discarded and every third remaining pair is kept.

    Args:
        filename (str): Path to the HTML file (read as UTF-8).

    Returns:
        tuple: Two lists of equal length, the first words and the second
            words, in document order.
    """
    with open(filename, "r", encoding="utf-8") as source:
        markup = source.read()

    pairs = []
    for container in BeautifulSoup(markup, "html.parser").find_all("div")[3:]:
        heading = container.find("h3")
        paragraph = container.find("p")
        if not (heading and paragraph):
            continue
        term = html.unescape(heading.text.strip())
        meaning = html.unescape(paragraph.text.strip()).replace(",", "/")
        pairs.append((term, meaning))

    # Drop the first five matches, then keep every third one.
    # NOTE(review): presumably nested divs report the same pair several
    # times in the source page -- verify against the actual markup.
    selected = pairs[5::3]
    return (
        [term for term, _ in selected],
        [meaning for _, meaning in selected],
    )
def merge_vocabulary(first_vocabulary, second_vocabulary, output_file):
    """
    Write paired vocabulary entries to a tab-separated text file.

    The first two entries of each list are skipped. The remaining entries
    are paired positionally (stopping at the shorter list) and written one
    pair per line as ``first<TAB>second``. The output file is created or
    overwritten, encoded as UTF-8.

    Args:
        first_vocabulary (list): List of first vocabulary words.
        second_vocabulary (list): List of second vocabulary words.
        output_file (str): Path to the output file.
    """
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.writelines(
            f"{left}\t{right}\n"
            for left, right in zip(first_vocabulary[2:], second_vocabulary[2:])
        )
def generate_and_save_pronunciation(word, lang_code, output_folder):
    """
    Synthesize speech for a word with gTTS and store it as an MP3 file.

    The audio is saved as ``<lang_code>_<word>.mp3`` inside
    ``output_folder``, and a confirmation line is printed afterwards.

    Args:
        word (str): The word to generate pronunciation for.
        lang_code (str): The language code passed to gTTS.
        output_folder (str): The folder to save the audio file in.
    """
    speech = gTTS(word, lang=lang_code)
    destination = os.path.join(output_folder, f"{lang_code}_{word}.mp3")
    speech.save(destination)
    print(f"📥 Saved audio for '{word}' to '{destination}'")
def main():
    """
    Run the interactive vocabulary export.

    Prompts for the input HTML file, a language-pair code and an audio
    output folder (each with a default), then:

    1. extracts the vocabulary pairs and prints them,
    2. writes the merged tab-separated text file,
    3. saves one pronunciation MP3 per first-language word,
    4. writes a CSV (first language, second language, audio reference)
       for import into Anki.

    Raises:
        ValueError: If the language code is not two non-empty codes joined
            by ``-`` (for example ``fr-en``).
    """
    # User input with default values
    filename = input("Enter the filename (default 'duolingo.txt'): ") or "duolingo.txt"
    langcode = input("Enter the language code (default 'fr-en'): ") or "fr-en"
    output_folder = input("Enter the output folder (default 'audio'): ") or "audio"

    # Validate the language pair before doing any work, so a typo fails with
    # a clear message instead of an IndexError further down.
    languages = langcode.split("-")
    if len(languages) < 2 or not languages[0] or not languages[1]:
        raise ValueError(
            f"Invalid language code '{langcode}': expected two codes joined "
            "by '-', e.g. 'fr-en'."
        )
    first_code, second_code = languages[0], languages[1]

    current_date = str(datetime.now().date())
    output_file = f"{current_date}_merged_vocabulary_for_{langcode}.txt"
    output_filename = f"{current_date}_vocabulary_list_for_anki_{langcode}.csv"

    print("===== EXTRACT WORDS FROM HTML =====")
    first_vocab, second_vocab = extract_vocabulary(filename)
    for first_word, second_word in zip(first_vocab, second_vocab):
        print(f"{first_word} \t {second_word}")

    print("===== SAVING VOCABULARY TEXT FILE =====")
    merge_vocabulary(first_vocab, second_vocab, output_file)
    print(f"📄 Vocabulary text file saved as '{output_file}'")

    print("===== SAVING VOCABULARY AUDIO FILES =====")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)
    for word in first_vocab:
        generate_and_save_pronunciation(word, first_code, output_folder)

    print("===== SAVING ANKI FILE =====")
    # The sound reference must match the file name used when saving audio.
    audio_column = [f"{word} [sound:{first_code}_{word}.mp3]" for word in first_vocab]
    df = pd.DataFrame(
        {first_code: first_vocab, second_code: second_vocab, "Audio": audio_column}
    )
    df.to_csv(output_filename, index=False)
    print(f"📝 DataFrame saved as '{output_filename}'")
# Run the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()