diff --git a/python/docs/source/tutorials/python_simplified_dftovw_tuto.ipynb b/python/docs/source/tutorials/python_simplified_dftovw_tuto.ipynb new file mode 100644 index 00000000000..85f9c1efc99 --- /dev/null +++ b/python/docs/source/tutorials/python_simplified_dftovw_tuto.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "51f41eaf-f24f-44fc-8178-3270efa46ec4", + "metadata": {}, + "source": [ + "# Simple pandas to vowpalwabbit conversion tutorial" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b9a21a43-39ad-4213-9c7f-814bbafd8a54", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from vowpalwabbit.dftovw import DFtoVW\n", + "from vowpalwabbit import Workspace" + ] + }, + { + "cell_type": "markdown", + "id": "fc831353-b5aa-4bb0-a928-c47b340397a5", + "metadata": {}, + "source": [ + "### Building simple examples using `DftoVW.from_column_names`" + ] + }, + { + "cell_type": "markdown", + "id": "c60089f1-ce41-49ee-a3a9-74f0fb2cb34f", + "metadata": {}, + "source": [ + "Let's create the following pandas dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a31118c2-b315-4129-b28a-2ea37d2dae50", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(\n", + " [\n", + " {\n", + " \"income\": 0,\n", + " \"age\": 27,\n", + " \"marital-status\": \"Separated\",\n", + " \"education\": \"HS-grad\",\n", + " \"occupation\": \"Handlers-cleaners\",\n", + " \"hours-per-week\": 25,\n", + " },\n", + " {\n", + " \"income\": 1,\n", + " \"age\": 34,\n", + " \"marital-status\": \"Married-civ-spouse\",\n", + " \"education\": \"Bachelors\",\n", + " \"occupation\": \"Prof-specialty\",\n", + " \"hours-per-week\": 40,\n", + " },\n", + " {\n", + " \"income\": 0,\n", + " \"age\": 44,\n", + " \"marital-status\": \"Never-married\",\n", + " \"education\": \"Assoc-voc\",\n", + " \"occupation\": \"Priv-house-serv\",\n", + " \"hours-per-week\": 25,\n", + " },\n", + " {\n", + " \"income\": 1,\n", + " \"age\": 38,\n", + " \"marital-status\": \"Married-civ-spouse\",\n", + " \"education\": \"Bachelors\",\n", + " \"occupation\": \"Prof-specialty\",\n", + " \"hours-per-week\": 60,\n", + " },\n", + " {\n", + " \"income\": 0,\n", + " \"age\": 34,\n", + " \"marital-status\": \"Married-civ-spouse\",\n", + " \"education\": \"HS-grad\",\n", + " \"occupation\": \"Other-service\",\n", + " \"hours-per-week\": 36,\n", + " },\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "473e5c72-ab6c-4d72-a466-7352ec604393", + "metadata": {}, + "source": [ + "The user builds the examples using the class method `DftoVW.from_column_names`. The method is called using the dataframe object (`df`) and its various column names. The conversion to vowpal wabbit examples is then performed by calling the `convert_df` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "2be83f6c-ecaa-45cb-bb3f-2f47827d6016", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['0 | age:27 marital-status=Separated education=HS-grad occupation=Handlers-cleaners hours-per-week:25',\n", + " '1 | age:34 marital-status=Married-civ-spouse education=Bachelors occupation=Prof-specialty hours-per-week:40',\n", + " '0 | age:44 marital-status=Never-married education=Assoc-voc occupation=Priv-house-serv hours-per-week:25',\n", + " '1 | age:38 marital-status=Married-civ-spouse education=Bachelors occupation=Prof-specialty hours-per-week:60',\n", + " '0 | age:34 marital-status=Married-civ-spouse education=HS-grad occupation=Other-service hours-per-week:36']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "converter = DFtoVW.from_column_names(\n", + " df=df,\n", + " y=\"income\",\n", + " x=[\"age\", \"marital-status\", \"education\", \"occupation\", \"hours-per-week\"],\n", + ")\n", + "examples = converter.convert_df()\n", + "examples" + ] + }, + { + "cell_type": "markdown", + "id": "6109f95e-cd17-485b-947d-8c2c33a5843a", + "metadata": {}, + "source": [ + "Note that the vowpal wabbit format for categorical features is `feature_name=feature_value` whereas for numerical features the format is `feature_name:feature_value`. When using `DFtoVW` class, the appropriate format will be inferred from the dataframe columns types.\n", + "\n", + "We then train the model on these examples:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c0269980-78b3-4123-84eb-27e0fba929b4", + "metadata": {}, + "outputs": [], + "source": [ + "model = Workspace(P=1, enable_logging=True)\n", + "\n", + "for ex in examples:\n", + " model.learn(ex)\n", + "model.finish()" + ] + }, + { + "cell_type": "markdown", + "id": "50470ca2-f33d-495e-a3f9-46ae1a618e6d", + "metadata": {}, + "source": [ + "### Building more complex examples" + ] + }, + { + "cell_type": "markdown", + "id": "30a526a6-7f8f-48e4-8dca-f9058a0d87fb", + "metadata": {}, + "source": [ + "The class method `DFtoVW.from_column_names` represents a quick and simple way to build the examples, but if the user needs more control over the way the examples are created, she or he can either use the class `Feature` or the class `Namespace` for building features, and any of the label class available (see below) based on the nature of the task. \n", + "\n", + "- When using `Namespace` class (see https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Namespaces for the meaning) the user specifies the name of the namespace with the `name` field, and will pass one or a list of `Feature` object to the `features` field.\n", + "\n", + "- The `Feature` class has a `value` field, which is the name of the column. The user can also rename the feature using the `rename_feature` field or choose to enforce a specific type (`\"numerical\"` or `\"categorical\"`) using `as_type` field.\n", + "\n", + "Regarding the labels, multiple classes are available:\n", + "- `SimpleLabel` for regression\n", + "- `MulticlassLabel` and `Multilabel` for classification\n", + "- `ContextualbanditLabel`.\n", + "\n", + "In the following examples we'll build 2 namespaces based on socio-demographic features and the job features." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "90a69d90-a0a6-42d4-8867-5d1b0e73f4ec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['0 |ns_sociodemo age:27 marital-status=Separated education=HS-grad |ns_job occupation=Handlers-cleaners hours-per-week:25',\n", + " '1 |ns_sociodemo age:34 marital-status=Married-civ-spouse education=Bachelors |ns_job occupation=Prof-specialty hours-per-week:40',\n", + " '0 |ns_sociodemo age:44 marital-status=Never-married education=Assoc-voc |ns_job occupation=Priv-house-serv hours-per-week:25',\n", + " '1 |ns_sociodemo age:38 marital-status=Married-civ-spouse education=Bachelors |ns_job occupation=Prof-specialty hours-per-week:60',\n", + " '0 |ns_sociodemo age:34 marital-status=Married-civ-spouse education=HS-grad |ns_job occupation=Other-service hours-per-week:36']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from vowpalwabbit.dftovw import SimpleLabel, Namespace, Feature\n", + "\n", + "ns_sociodemo = Namespace(\n", + " features=[Feature(col) for col in [\"age\", \"marital-status\", \"education\"]],\n", + " name=\"ns_sociodemo\",\n", + ")\n", + "ns_job = Namespace(\n", + " features=[Feature(col) for col in [\"occupation\", \"hours-per-week\"]], name=\"ns_job\"\n", + ")\n", + "label = SimpleLabel(\"income\")\n", + "\n", + "converter_advanced = DFtoVW(df=df, namespaces=[ns_sociodemo, ns_job], label=label)\n", + "examples_advanced = converter_advanced.convert_df()\n", + "examples_advanced[:5]" + ] + }, + { + "cell_type": "markdown", + "id": "071326d7-f969-4db1-a73e-3cee225921f4", + "metadata": {}, + "source": [ + "We train the model by also including interactions between the variables of the 2 namespaces:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f0ed661f-d9a0-4ebb-93b8-f5747347c7b4", + "metadata": {}, + "outputs": [], + "source": [ + "model_advanced = Workspace(\n", + " # arg_str=\"--interactions ns_sociodemo:ns_job\", P=1, enable_logging=True\n", + " arg_str=\"--redefine a:=ns_job b:=ns_sociodemo -q ab \",\n", + " P=1,\n", + " enable_logging=True,\n", + ")\n", + "\n", + "for ex in examples_advanced:\n", + " model_advanced.learn(ex)\n", + "\n", + "model_advanced.finish()" + ] + }, + { + "cell_type": "markdown", + "id": "5bb2208e-9d0e-44ef-8d91-faccedf41ac0", + "metadata": {}, + "source": [ + "Finally, we can get the estimated weights associated to each namespace and feature:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "06aabeab-2365-4f86-bf60-7043b0e59190", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('ns_job', 'occupation', 0.0),\n", + " ('ns_job', 'hours-per-week', 0.0019117757910862565),\n", + " ('ns_sociodemo', 'age', 0.001858704723417759),\n", + " ('ns_sociodemo', 'marital-status', 0.0),\n", + " ('ns_sociodemo', 'education', 0.0)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[\n", + " (ns.name, feature.name, model_advanced.get_weight_from_name(feature.name, ns.name))\n", + " for ns in [ns_job, ns_sociodemo]\n", + " for feature in ns.features\n", + "]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}