From 78881fb502fe7300e6abde9c41d264625336a6c2 Mon Sep 17 00:00:00 2001 From: Johann Aschenloher Date: Fri, 11 Feb 2022 14:39:47 +0100 Subject: [PATCH] Remove repeated DF joins. Repeatedly joining single columns onto a pandas DataFrame produces a highly fragmented DataFrame, which leads to poor performance and causes warnings to be printed. I replaced the repeated joins, as recommended, with a single join. --- loglizer/preprocessing.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/loglizer/preprocessing.py b/loglizer/preprocessing.py index 33d3c0e..c0a0252 100644 --- a/loglizer/preprocessing.py +++ b/loglizer/preprocessing.py @@ -43,7 +43,7 @@ def transform(self, x, window_y, y): y = y data_dict = {"SessionId": x["SessionId"].values, "window_y": window_y.values, "y": y.values, "x": np.array(x["EventSequence"].tolist())} return data_dict - + class FeatureExtractor(object): @@ -91,12 +91,12 @@ def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=Fals X = X[:, idx] self.events = np.array(X_df.columns)[idx].tolist() X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)]) - + num_instance, num_event = X.shape if self.term_weighting == 'tf-idf': df_vec = np.sum(X > 0, axis=0) self.idf_vec = np.log(num_instance / (df_vec + 1e-8)) - idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1)) + idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1)) X = idf_matrix if self.normalization == 'zero-mean': mean_vec = X.mean(axis=0) @@ -105,8 +105,8 @@ def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=Fals elif self.normalization == 'sigmoid': X[X != 0] = expit(X[X != 0]) X_new = X - - print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1])) + + print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1])) return X_new def transform(self, X_seq): @@ -129,16 +129,15 @@ def transform(self, X_seq): X_df = pd.DataFrame(X_counts) X_df = X_df.fillna(0) empty_events = set(self.events) - set(X_df.columns) - for 
event in empty_events: - X_df[event] = [0] * len(X_df) + X_df = pd.concat([X_df, pd.DataFrame(np.zeros((len(X_df), len(empty_events))), index=X_df.index, columns=list(empty_events))], axis=1) X = X_df[self.events].values if self.oov: oov_vec = np.sum(X_df[X_df.columns.difference(self.events)].values > 0, axis=1) X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)]) - + num_instance, num_event = X.shape if self.term_weighting == 'tf-idf': - idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1)) + idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1)) X = idf_matrix if self.normalization == 'zero-mean': X = X - np.tile(self.mean_vec, (num_instance, 1)) @@ -146,6 +145,6 @@ def transform(self, X_seq): X[X != 0] = expit(X[X != 0]) X_new = X - print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1])) + print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1])) return X_new