-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_exploration.py
133 lines (113 loc) · 4.97 KB
/
data_exploration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""
Usage:
data_handling.py [--words_list [--eng | --ita | --cat]]
[--plot [--ita | --eng | --cat]]
data_handling.py (-h | --help)
Options:
--words_list Output a file with words from the specified semantic space
--eng English
--ita Italian
--cat Catalan
--plot Output 2D and 3D visualizations of semantic space subsets for translations
--ita_cat Italian to/from Catalan
--eng_cat English to/from Catalan
--eng_ita English to/from Italian
-h, --help Show this help message
Examples:
data_handling.py --words_list --eng
data_handling.py --plot --ita
data_handling.py --words_list --plot --eng
"""
import re
import bz2
from docopt import docopt
import numpy as np
import utils
########### Functions
########### Words lists
def extract_words_from_dm(filename):
    """
    List the keys of the mapping loaded from a .dm file.

    Args:
        filename (str): Path handed to ``utils.read_dm``.

    Returns:
        list: Keys of the object returned by ``utils.read_dm`` (presumably
        the words of the semantic space -- confirm against ``utils``).
    """
    return list(utils.read_dm(filename).keys())
def extract_words_from_bz2(filename):
    """
    Collect every purely alphabetic token from a bz2-compressed text file.

    Tokens are runs of letters delimited by word boundaries; anything that
    contains digits or underscores (e.g. vector components) is skipped.
    Letters are matched Unicode-aware so that accented words such as
    "città" are kept instead of being silently dropped.

    Args:
        filename (str): Path to the bz2-compressed, UTF-8 encoded text file.

    Returns:
        list: Matched words in file order (duplicates are preserved).
    """
    # [^\W\d_] is "a word character that is neither a digit nor an underscore",
    # i.e. any Unicode letter. The previous ASCII-only class [A-Za-z] could not
    # match words containing accented letters at all.
    word_pattern = re.compile(r'\b[^\W\d_]+\b')
    words = []
    # Decode explicitly as UTF-8 rather than relying on the platform locale.
    with bz2.open(filename, 'rt', encoding='utf-8') as file:
        # Words never span a line break, so scanning line by line yields the
        # same matches without holding the whole decompressed text in memory.
        for line in file:
            words.extend(word_pattern.findall(line))
    return words
def write_words_to_file(words, output_file):
    """
    Write the given words to a text file, one word per line.

    Args:
        words (iterable): Strings to write, in order.
        output_file (str): Path of the file to create or overwrite.
    """
    with open(output_file, 'w') as out:
        out.writelines(word + '\n' for word in words)
########### Plots
def extract_first_words(input_file):
    """
    Return the first whitespace-separated token of every line in a file.

    Lines that contain no tokens (empty or whitespace-only) are reported
    with a printed warning and skipped.

    Args:
        input_file (str): Path to the text file to scan.

    Returns:
        list: First token of each non-blank line, in file order.
    """
    collected = []
    with open(input_file, 'r') as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                print(f"Warning: Line '{line.strip()}' does not contain any words.")
                continue
            collected.append(tokens[0])
    return collected
def extract_word_vectors(semantic_space, words_of_interest):
    """
    Pick out the vectors of selected words from a semantic space.

    Words missing from the space are reported with a printed warning and
    left out of the result.

    Args:
        semantic_space (dict): Mapping from word to its vector.
        words_of_interest (list): Words whose vectors should be collected.

    Returns:
        dict: Mapping from each found word to its vector as a NumPy array.
    """
    selected = {}
    for token in words_of_interest:
        if token not in semantic_space:
            print(f"Warning: '{token}' not found in the semantic space.")
            continue
        selected[token] = np.array(semantic_space[token])
    return selected
############# Main
if __name__ == '__main__':
    args = docopt(__doc__, version='PLSR regression for word translation')

    # Write the word list of every selected semantic space.
    if args['--words_list']:
        # (language flag, semantic space file, extractor, output word list),
        # kept in the order the flags have always been checked.
        word_list_jobs = (
            ('--eng', './spaces/english_space.dm',
             extract_words_from_dm, './pairs/sets/eng_words.txt'),
            ('--ita', './spaces/ita_space.bz2',
             extract_words_from_bz2, './pairs/sets/ita_words.txt'),
            ('--cat', './spaces/catalan.subset.dm',
             extract_words_from_dm, './pairs/sets/cat_words.txt'),
        )
        selected_jobs = [job for job in word_list_jobs if args[job[0]]]
        if not selected_jobs:
            # Previously this path crashed with a NameError because no
            # language branch had defined the words or the output file.
            raise SystemExit('--words_list requires one of --eng, --ita or --cat')
        # Each selected language gets its own output file. The language flags
        # are shared with --plot, so more than one can be set in a single run;
        # before, only the last selected language was actually written.
        for _flag, filename, extract_words, output_filename in selected_jobs:
            write_words_to_file(extract_words(filename), output_filename)

    # Plot 2D and 3D images of the selected subset semantic spaces
    # (PCA-based according to the original author's note; see utils).
    if args['--plot']:
        # (language flag, pairs file, space reader, space file, figure prefix)
        plot_jobs = (
            ('--ita', './pairs/ita_cat_pairs.txt',
             utils.read_bz2, './spaces/ita_space.bz2', 'ita'),
            ('--eng', './pairs/eng_cat_pairs.txt',
             utils.read_dm, './spaces/english_space.dm', 'eng'),
            ('--cat', './pairs/cat_ita_pairs.txt',
             utils.read_dm, './spaces/catalan.subset.dm', 'cat'),
        )
        for flag, pairs_file, read_space, space_file, prefix in plot_jobs:
            if not args[flag]:
                continue
            # The first word of each pair line selects which vectors to plot.
            first_words = extract_first_words(pairs_file)
            space_subset = extract_word_vectors(read_space(space_file), first_words)
            utils.plot_2d_semantic_space(space_subset, save_path='./spaces/figures/',
                                         file_name=f'{prefix}_space_subset_2D')
            utils.plot_3d_semantic_space(space_subset, save_path='./spaces/figures/',
                                         file_name=f'{prefix}_space_subset_3D')