Commit

update baseline
FemkeBakker committed Apr 15, 2024
1 parent 12e80a5 commit c8f997c
Showing 1 changed file with 82 additions and 36 deletions.
118 changes: 82 additions & 36 deletions notebooks/baseline.ipynb
@@ -61,7 +61,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -75,7 +75,20 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import preprocessing \n",
"\n",
"# encode labels to integers\n",
"label_encoder = preprocessing.LabelEncoder() \n",
"df['encoded_label'] = label_encoder.fit_transform(df['label']) \n"
]
},
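The new cell above maps each distinct string label to an integer with scikit-learn's LabelEncoder; those integer codes are what the reworked baselines train on, and they can be mapped back to the original class names with inverse_transform. A minimal standalone sketch (the sample labels other than 'Motie' are invented for illustration):

```python
# Minimal sketch of the label-encoding step; sample values besides 'Motie'
# are illustrative, not taken from the real data.
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"label": ["Motie", "Brief", "Motie"]})

label_encoder = preprocessing.LabelEncoder()
df["encoded_label"] = label_encoder.fit_transform(df["label"])

# Map integer predictions back to class names later with:
# label_encoder.inverse_transform(df["encoded_label"])
print(df)
```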
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -87,21 +100,21 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X = df.drop(columns=['label'])\n",
"y=df['label']\n",
"X = df.drop(columns=['encoded_label'])\n",
"y=df['encoded_label']\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# select small portion to get code running\n",
"X_train = X_train.iloc[0:50]\n",
"y_train = y_train.iloc[0:50]\n",
"X_test = X_test.iloc[0:10]\n",
"y_test = y_test.iloc[0:10]\n"
"# X_train = X_train.iloc[0:50]\n",
"# y_train = y_train.iloc[0:50]\n",
"# X_test = X_test.iloc[0:10]\n",
"# y_test = y_test.iloc[0:10]\n"
]
},
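With the subset lines commented out, the split above now runs on the full data: an 80/20 train/test split with a fixed random_state. If the label distribution is skewed, a stratified split keeps class proportions equal across the two sets; a sketch of that variation (not part of this commit):

```python
# Sketch: same 80/20 split as in the notebook, but stratified by label.
# Assumes `df` with an 'encoded_label' column as in the cells above.
from sklearn.model_selection import train_test_split

X = df.drop(columns=["encoded_label"])
y = df["encoded_label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
```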
{
@@ -113,7 +126,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -122,8 +135,8 @@
"\n",
"\n",
"def run_baseline(baseline_function, dataframe, text_col):\n",
" X = dataframe.drop(columns=['label'])\n",
" y=dataframe['label']\n",
" X = dataframe.drop(columns=['encoded_label'])\n",
" y=dataframe['encoded_label']\n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
" vectorizer = TfidfVectorizer()\n",
@@ -142,7 +155,7 @@
" print(\"Accuracy:\", accuracy)\n",
"\n",
" predictions = X_test.copy()\n",
" predictions['label'] = y_test\n",
" predictions['encoded_label'] = y_test\n",
" predictions['prediction'] = y_pred\n",
" return predictions"
]
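Part of run_baseline sits in the collapsed hunk; the old traceback further down references X_train_tfidf_bin, so the hidden lines fit a TF-IDF vectorizer on the training text and transform both splits before the classifier is fitted. A self-contained sketch of a helper with that overall shape, with the hidden details filled in as assumptions:

```python
# Sketch of a TF-IDF + classifier baseline helper in the spirit of
# run_baseline; the lines hidden by the collapsed hunk may differ.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def run_baseline_sketch(model, dataframe, text_col):
    X = dataframe.drop(columns=["encoded_label"])
    y = dataframe["encoded_label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit TF-IDF on the training text only, then transform both splits.
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train[text_col])
    X_test_tfidf = vectorizer.transform(X_test[text_col])

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    print("Accuracy:", accuracy_score(y_test, y_pred))

    predictions = X_test.copy()
    predictions["encoded_label"] = y_test
    predictions["prediction"] = y_pred
    return predictions
```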
@@ -156,7 +169,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -189,7 +202,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -209,43 +222,76 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### Baseline 3: Linear Regression + tf-idf"
"### Baseline 3: Logistic Regression + tf-idf"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "could not convert string to float: 'Motie'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[14], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mlinear_model\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LinearRegression\n\u001b[0;32m----> 2\u001b[0m linear_reg \u001b[38;5;241m=\u001b[39m \u001b[43mrun_baseline\u001b[49m\u001b[43m(\u001b[49m\u001b[43mLinearRegression\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mclean_text\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
"Cell \u001b[0;32mIn[11], line 19\u001b[0m, in \u001b[0;36mrun_baseline\u001b[0;34m(baseline_function, dataframe, text_col)\u001b[0m\n\u001b[1;32m 16\u001b[0m model \u001b[38;5;241m=\u001b[39m baseline_function\n\u001b[1;32m 18\u001b[0m \u001b[38;5;66;03m# Train the classifier on the training data\u001b[39;00m\n\u001b[0;32m---> 19\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train_tfidf_bin\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 21\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mpredict(X_test_tfidf_bin)\n\u001b[1;32m 23\u001b[0m \u001b[38;5;66;03m# Calculate the accuracy of the classifier\u001b[39;00m\n",
"File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/base.py:1474\u001b[0m, in \u001b[0;36m_fit_context.<locals>.decorator.<locals>.wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1467\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m 1469\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 1470\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 1471\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1472\u001b[0m )\n\u001b[1;32m 1473\u001b[0m ):\n\u001b[0;32m-> 1474\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/linear_model/_base.py:578\u001b[0m, in \u001b[0;36mLinearRegression.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 574\u001b[0m n_jobs_ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_jobs\n\u001b[1;32m 576\u001b[0m accept_sparse \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpositive \u001b[38;5;28;01melse\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcsr\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcsc\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcoo\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[0;32m--> 578\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_validate_data\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maccept_sparse\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maccept_sparse\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_numeric\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmulti_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\n\u001b[1;32m 580\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 582\u001b[0m has_sw \u001b[38;5;241m=\u001b[39m sample_weight \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 583\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m has_sw:\n",
"File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/base.py:650\u001b[0m, in \u001b[0;36mBaseEstimator._validate_data\u001b[0;34m(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)\u001b[0m\n\u001b[1;32m 648\u001b[0m y \u001b[38;5;241m=\u001b[39m check_array(y, input_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124my\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mcheck_y_params)\n\u001b[1;32m 649\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 650\u001b[0m X, y \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_X_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mcheck_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 651\u001b[0m out \u001b[38;5;241m=\u001b[39m X, y\n\u001b[1;32m 653\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m no_val_X \u001b[38;5;129;01mand\u001b[39;00m check_params\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mensure_2d\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mTrue\u001b[39;00m):\n",
"File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/utils/validation.py:1279\u001b[0m, in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1260\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mestimator_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m requires y to be passed, but the target y is None\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1261\u001b[0m )\n\u001b[1;32m 1263\u001b[0m X \u001b[38;5;241m=\u001b[39m check_array(\n\u001b[1;32m 1264\u001b[0m X,\n\u001b[1;32m 1265\u001b[0m accept_sparse\u001b[38;5;241m=\u001b[39maccept_sparse,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1276\u001b[0m input_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mX\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 1277\u001b[0m )\n\u001b[0;32m-> 1279\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[43m_check_y\u001b[49m\u001b[43m(\u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmulti_output\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmulti_output\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_numeric\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43my_numeric\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mestimator\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mestimator\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1281\u001b[0m check_consistent_length(X, y)\n\u001b[1;32m 1283\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m X, y\n",
"File \u001b[0;32m/anaconda/envs/AmsterdamInContextLearning/lib/python3.9/site-packages/sklearn/utils/validation.py:1304\u001b[0m, in \u001b[0;36m_check_y\u001b[0;34m(y, multi_output, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 1302\u001b[0m _ensure_no_complex_data(y)\n\u001b[1;32m 1303\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m y_numeric \u001b[38;5;129;01mand\u001b[39;00m y\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;241m.\u001b[39mkind \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mO\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m-> 1304\u001b[0m y \u001b[38;5;241m=\u001b[39m \u001b[43my\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mastype\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfloat64\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m y\n",
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: 'Motie'"
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9148737137511693\n"
]
}
],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"linear_reg = run_baseline(LinearRegression(), df, 'clean_text')"
"from sklearn.linear_model import LogisticRegression\n",
"log_reg = run_baseline(LogisticRegression(), df, 'clean_text')"
]
},
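Logistic regression on the TF-IDF features reaches the reported accuracy of about 0.915. The default lbfgs solver stops after 100 iterations and can warn about non-convergence on high-dimensional sparse inputs; if that happens, raising max_iter is the usual fix. A sketch of that variation (not part of this commit):

```python
# Sketch: same baseline call with a higher iteration cap, in case lbfgs
# does not converge on the sparse TF-IDF features. Assumes run_baseline
# and df from the notebook.
from sklearn.linear_model import LogisticRegression

log_reg = run_baseline(LogisticRegression(max_iter=1000), df, "clean_text")
```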
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Baseline 4: k Nearest Neigbors + tf-idf"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.7090739008419084\n"
]
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"knn = run_baseline(KNeighborsClassifier(), df, 'clean_text')"
]
},
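The kNN baseline lands at roughly 0.709, noticeably below the other classifiers. TF-IDF vectors are often compared more meaningfully with cosine distance than with the default Euclidean metric, which is one knob worth trying; a sketch (not part of this commit):

```python
# Sketch: kNN variant using cosine distance, which often suits TF-IDF
# vectors better than the default metric. Assumes run_baseline and df
# from the notebook.
from sklearn.neighbors import KNeighborsClassifier

knn_cosine = run_baseline(KNeighborsClassifier(metric="cosine"), df, "clean_text")
```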
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Baseline 5: RandomForest + tf-idf"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9195509822263798\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"random_forest = run_baseline(RandomForestClassifier(), df, 'clean_text')"
]
}
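The three classifier baselines differ only in the estimator handed to run_baseline (reported accuracies: about 0.915 for logistic regression, 0.709 for kNN, 0.920 for the random forest), so they can also be run in a single loop. A sketch, assuming run_baseline and df from the notebook:

```python
# Sketch: run all classifier baselines in one loop; not part of this commit.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

baselines = {
    "logistic_regression": LogisticRegression(),
    "knn": KNeighborsClassifier(),
    "random_forest": RandomForestClassifier(),
}
results = {
    name: run_baseline(model, df, "clean_text")
    for name, model in baselines.items()
}
```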
],
"metadata": {