diff --git a/notebooks/baseline.ipynb b/notebooks/baseline.ipynb
index 3d8e4b7..83538e9 100644
--- a/notebooks/baseline.ipynb
+++ b/notebooks/baseline.ipynb
@@ -61,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -75,7 +75,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn import preprocessing \n",
+    "\n",
+    "# encode labels to integers\n",
+    "label_encoder = preprocessing.LabelEncoder() \n",
+    "df['encoded_label'] = label_encoder.fit_transform(df['label']) \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -87,21 +100,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.model_selection import train_test_split\n",
     "\n",
-    "X = df.drop(columns=['label'])\n",
-    "y=df['label']\n",
+    "X = df.drop(columns=['encoded_label'])\n",
+    "y=df['encoded_label']\n",
     "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
     "\n",
     "# select small portion to get code running\n",
-    "X_train = X_train.iloc[0:50]\n",
-    "y_train = y_train.iloc[0:50]\n",
-    "X_test = X_test.iloc[0:10]\n",
-    "y_test = y_test.iloc[0:10]\n"
+    "# X_train = X_train.iloc[0:50]\n",
+    "# y_train = y_train.iloc[0:50]\n",
+    "# X_test = X_test.iloc[0:10]\n",
+    "# y_test = y_test.iloc[0:10]\n"
    ]
   },
   {
@@ -113,7 +126,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -122,8 +135,8 @@
     "\n",
     "\n",
     "def run_baseline(baseline_function, dataframe, text_col):\n",
-    "    X = dataframe.drop(columns=['label'])\n",
-    "    y=dataframe['label']\n",
+    "    X = dataframe.drop(columns=['encoded_label'])\n",
+    "    y=dataframe['encoded_label']\n",
     "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
     "\n",
     "    vectorizer = TfidfVectorizer()\n",
@@ -142,7 +155,7 @@
     "    print(\"Accuracy:\", accuracy)\n",
     "\n",
     "    predictions = X_test.copy()\n",
-    "    predictions['label'] = y_test\n",
+    "    predictions['encoded_label'] = y_test\n",
     "    predictions['prediction'] = y_pred\n",
     "    return predictions"
    ]
@@ -156,7 +169,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -189,7 +202,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -209,43 +222,76 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Baseline 3: Linear Regression + tf-idf"
+    "### Baseline 3: Logistic Regression + tf-idf"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
-     "ename": "ValueError",
-     "evalue": "could not convert string to float: 'Motie'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinear_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LinearRegression\n\u001b[0;32m----> 2\u001b[0m linear_reg \u001b[38;5;241m=\u001b[39m \u001b[43mrun_baseline\u001b[49m\u001b[43m(\u001b[49m\u001b[43mLinearRegression\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mclean_text\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
-      "Cell \u001b[0;32mIn[11], line 19\u001b[0m, in \u001b[0;36mrun_baseline\u001b[0;34m(baseline_function, dataframe, text_col)\u001b[0m\n\u001b[1;32m 16\u001b[0m model \u001b[38;5;241m=\u001b[39m baseline_function\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# Train the classifier on the training data\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_tfidf_bin\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mpredict(X_test_tfidf_bin)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# Calculate the accuracy of the classifier\u001b[39;00m\n",
-      "File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/base.py:1474\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1467\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m 1469\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 1470\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 1471\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1472\u001b[0m )\n\u001b[1;32m 1473\u001b[0m ):\n\u001b[0;32m-> 1474\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/linear_model/_base.py:578\u001b[0m, in \u001b[0;36mLinearRegression.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 574\u001b[0m n_jobs_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_jobs\n\u001b[1;32m 576\u001b[0m accept_sparse \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpositive \u001b[38;5;28;01melse\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcsr\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcsc\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcoo\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m--> 578\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maccept_sparse\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_numeric\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmulti_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\n\u001b[1;32m 580\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 582\u001b[0m has_sw \u001b[38;5;241m=\u001b[39m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 583\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_sw:\n",
-      "File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/base.py:650\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 648\u001b[0m y \u001b[38;5;241m=\u001b[39m check_array(y, input_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcheck_y_params)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 650\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_X_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 651\u001b[0m out \u001b[38;5;241m=\u001b[39m X, y\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m check_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mensure_2d\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m):\n",
-      "File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/utils/validation.py:1279\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1260\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mestimator_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m requires y to be passed, but the target y is None\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1261\u001b[0m )\n\u001b[1;32m 1263\u001b[0m X \u001b[38;5;241m=\u001b[39m check_array(\n\u001b[1;32m 1264\u001b[0m X,\n\u001b[1;32m 1265\u001b[0m accept_sparse\u001b[38;5;241m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1276\u001b[0m input_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 1277\u001b[0m )\n\u001b[0;32m-> 1279\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[43m_check_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmulti_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmulti_output\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_numeric\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_numeric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1281\u001b[0m check_consistent_length(X, y)\n\u001b[1;32m 1283\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m X, y\n",
-      "File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/utils/validation.py:1304\u001b[0m, in \u001b[0;36m_check_y\u001b[0;34m(y, multi_output, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1302\u001b[0m _ensure_no_complex_data(y)\n\u001b[1;32m 1303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y_numeric \u001b[38;5;129;01mand\u001b[39;00m y\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m-> 1304\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[43my\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat64\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m y\n",
-      "\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'Motie'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.9148737137511693\n"
+     ]
     }
    ],
    "source": [
-    "from sklearn.linear_model import LinearRegression\n",
-    "linear_reg = run_baseline(LinearRegression(), df, 'clean_text')"
+    "from sklearn.linear_model import LogisticRegression\n",
+    "log_reg = run_baseline(LogisticRegression(), df, 'clean_text')"
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Baseline 4: k Nearest Neighbors + tf-idf"
+   ]
+  },
+  {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.7090739008419084\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.neighbors import KNeighborsClassifier\n",
+    "knn = run_baseline(KNeighborsClassifier(), df, 'clean_text')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Baseline 5: RandomForest + tf-idf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy: 0.9195509822263798\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "random_forest = run_baseline(RandomForestClassifier(), df, 'clean_text')"
+   ]
   }
  ],
  "metadata": {