From 78881fb502fe7300e6abde9c41d264625336a6c2 Mon Sep 17 00:00:00 2001
From: Johann Aschenloher <ja@safri.net>
Date: Fri, 11 Feb 2022 14:39:47 +0100
Subject: [PATCH] Remove repeated DF joins.

Repeatedly adding columns to a pandas DataFrame one at a time produces a
highly fragmented DataFrame, which leads to poor performance and causes
pandas to print warnings. As recommended, I replaced the repeated column
insertions with a single join.
---
 loglizer/preprocessing.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/loglizer/preprocessing.py b/loglizer/preprocessing.py
index 33d3c0e..c0a0252 100644
--- a/loglizer/preprocessing.py
+++ b/loglizer/preprocessing.py
@@ -43,7 +43,7 @@ def transform(self, x, window_y, y):
         y = y
         data_dict = {"SessionId": x["SessionId"].values, "window_y": window_y.values, "y": y.values, "x": np.array(x["EventSequence"].tolist())}
         return data_dict
-        
+
 
 class FeatureExtractor(object):
 
@@ -91,12 +91,12 @@ def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=Fals
                 X = X[:, idx]
                 self.events = np.array(X_df.columns)[idx].tolist()
             X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])
-        
+
         num_instance, num_event = X.shape
         if self.term_weighting == 'tf-idf':
             df_vec = np.sum(X > 0, axis=0)
             self.idf_vec = np.log(num_instance / (df_vec + 1e-8))
-            idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1)) 
+            idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1))
             X = idf_matrix
         if self.normalization == 'zero-mean':
             mean_vec = X.mean(axis=0)
@@ -105,8 +105,8 @@ def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=Fals
         elif self.normalization == 'sigmoid':
             X[X != 0] = expit(X[X != 0])
         X_new = X
-        
-        print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1])) 
+
+        print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
         return X_new
 
     def transform(self, X_seq):
@@ -129,16 +129,15 @@ def transform(self, X_seq):
         X_df = pd.DataFrame(X_counts)
         X_df = X_df.fillna(0)
         empty_events = set(self.events) - set(X_df.columns)
-        for event in empty_events:
-            X_df[event] = [0] * len(X_df)
+        X_df = pd.concat([X_df, pd.DataFrame(np.zeros((len(X_df),len(empty_events))), columns=empty_events)], axis=1)
         X = X_df[self.events].values
         if self.oov:
             oov_vec = np.sum(X_df[X_df.columns.difference(self.events)].values > 0, axis=1)
             X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])
-        
+
         num_instance, num_event = X.shape
         if self.term_weighting == 'tf-idf':
-            idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1)) 
+            idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1))
             X = idf_matrix
         if self.normalization == 'zero-mean':
             X = X - np.tile(self.mean_vec, (num_instance, 1))
@@ -146,6 +145,6 @@ def transform(self, X_seq):
             X[X != 0] = expit(X[X != 0])
         X_new = X
 
-        print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1])) 
+        print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
 
         return X_new