diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml deleted file mode 100644 index 0b6a410..0000000 --- a/.github/workflows/draft-pdf.yml +++ /dev/null @@ -1,22 +0,0 @@ -on: - push: - -jobs: - paper: - runs-on: ubuntu-latest - name: Paper Draft - steps: - - name: Checkout - uses: actions/checkout@v3 - - - name: Build draft PDF - uses: openjournals/openjournals-draft-action@master - with: - journal: joss - paper-path: paper/paper.md - - - name: Upload - uses: actions/upload-artifact@v3 - with: - name: paper - path: paper/paper.pdf diff --git a/paper/paper.bib b/paper/paper.bib deleted file mode 100644 index a0a9e1e..0000000 --- a/paper/paper.bib +++ /dev/null @@ -1,73 +0,0 @@ -@article{Gu, - title={Mamba: Linear-time sequence modeling with selective state spaces}, - author={Gu, Albert and Dao, Tri}, - journal={arXiv preprint arXiv:2312.00752}, - year={2023} -} - - -@article{Ahamed, - title={MambaTab: A Simple Yet Effective Approach for Handling Tabular Data}, - author={Ahamed, Md Atik and Cheng, Qiang}, - journal={arXiv preprint arXiv:2401.08867}, - year={2024} -} - - -@article{Gorishnyi1, - title={Revisiting deep learning models for tabular data}, - author={Gorishniy, Yury and Rubachev, Ivan and Khrulkov, Valentin and Babenko, Artem}, - journal={Advances in Neural Information Processing Systems}, - volume={34}, - pages={18932--18943}, - year={2021} -} - - -@article{Huang, - title={Tabtransformer: Tabular data modeling using contextual embeddings}, - author={Huang, Xin and Khetan, Ashish and Cvitkovic, Milan and Karnin, Zohar}, - journal={arXiv preprint arXiv:2012.06678}, - year={2020} -} - - -@inproceedings{Thielmann, - title={Neural additive models for location scale and shape: A framework for interpretable neural regression beyond the mean}, - author={Thielmann, Anton Frederik and Kruse, Ren{\'e}-Marcel and Kneib, Thomas and S{\"a}fken, Benjamin}, - booktitle={International Conference on Artificial Intelligence and Statistics}, - 
pages={1783--1791}, - year={2024}, - organization={PMLR} -} - - -@article{Kneib, - title={Rage against the mean--a review of distributional regression approaches}, - author={Kneib, Thomas and Silbersdorff, Alexander and S{\"a}fken, Benjamin}, - journal={Econometrics and Statistics}, - volume={26}, - pages={99--123}, - year={2023}, - publisher={Elsevier} -} - - -@article{Pedregosa, - title={Scikit-learn: Machine learning in Python}, - author={Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others}, - journal={Journal of Machine Learning Research}, - volume={12}, - pages={2825--2830}, - year={2011}, - publisher={JMLR.org} -} - -@article{natt, - title={Interpretable Additive Tabular Transformer Networks}, - author={Anton Frederik Thielmann and Arik Reuter and Thomas Kneib and David R{\"u}gamer and Benjamin S{\"a}fken}, - journal={Transactions on Machine Learning Research}, - issn={2835-8856}, - year={2024}, - url={https://openreview.net/forum?id=TdJ7lpzAkD}, -} \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md deleted file mode 100644 index 0bd32e1..0000000 --- a/paper/paper.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "Mambular: A User-Centric Python Library for Tabular Deep Learning Leveraging Mamba Architecture" -tags: - - Python - - Tabular Deep Learning - - Mamba - - Distributional Regression -authors: - - name: Anton Frederik Thielmann - orcid: 0000-0002-6768-8992 - affiliation: 1 - - name: Christoph Weisser - affiliation: 1 - - name: Manish Kumar - affiliation: 1 - - name: Benjamin Saefken - affiliation: 2 - - name: Soheila Samiee - affiliation: 3 -affiliations: - - name: BASF SE, Germany - index: 1 - - name: TU Clausthal, Germany - index: 2 - - name: BASF Canada Inc, Canada - index: 3 -date: 22 April 2024 -bibliography: paper.bib ---- - -# 1. 
Summary - -Mambular is a Python library designed to leverage the capabilities of the recently proposed Mamba architecture [@Gu] for deep learning tasks involving tabular datasets. The effectiveness of the attention mechanism, as demonstrated by models such as TabTransformer [@Ahamed] and FT-Transformer [@Gorishnyi1], is extended to these data types, showcasing the potential for sequence-focused architectures to excel in this domain. Thus, sequence-focused architectures can also achieve state-of-the-art performances for tabular data problems. [@Huang] already demonstrated that the Mamba architecture, similar to the attention mechanism, can effectively be used when dealing with tabular data. Mambular closely follows [@Gorishnyi1], but uses Mamba blocks instead of transformer blocks. -Furthermore, it offers enhanced flexibility in model architecture with respect to embedding activation, pooling layers, and task-specific head architectures. Choosing the appropriate settings, a user can thus easily implement the models presented in [@Huang]. - -# 2. Statement of Need -Transformer-based models for tabular data have become powerful alternatives to traditional gradient-based decision trees. [@Huang; @Gorishnyi1; @natt]. However, effectively training these models requires users to: **i)** deeply understand the intricacies of tabular transformer networks, **ii)** master various data type-dependent preprocessing techniques, **iii)** navigate complex deep learning libraries. -This either leads researchers and practitioners alike to develop extensive custom scripts and libraries to fit these models or discourages them from using these advanced tools altogether. However, since tabular transformer models are becoming more popular and powerful, they should be easy to use, also for practitioners. Mambular addresses this by offering a straightforward framework that allows users to easily train tabular models using the innovative Mamba architecture. - -# 3. 
Methodology -The Mambular default architecture, independent of the task, follows the straightforward architecture of tabular transformer models [@Ahamed; @Gorishnyi1; @Huang]: -If the numerical features are integer binned, they are treated as categorical features and each feature/variable is passed through an embedding layer. When other numerical preprocessing techniques are applied (or no preprocessing), the numerical features are passed through a single feed-forward dense layer with the same output dimensionality as the embedding layers [@Gorishnyi1]. By default, no activation is used on the created embeddings, but users can easily change that with the available arguments. The created embeddings are passed through a stack of Mamba layers, after which the contextualized embeddings are pooled (default is average pooling). Mambular also offers the use of cls token embeddings instead of pooling layers. After pooling, RMS layer normalization from [@Gu] is applied by default, followed by a task-specific model head. - -### 3.1 Models -Mambular includes the following three model classes: -**i)** *MambularRegressor* for regression tasks, **ii)** *MambularClassifier* for classification tasks and **iii)** *MambularLSS* for distributional regression tasks, similar to [@Thielmann].^[See e.g. [@Kneib] for an overview of distributional regression.] - - -The loss functions are, respectively, **i)** the mean squared error loss, **ii)** the categorical cross entropy (binary cross entropy for binary classification) and **iii)** the negative log-likelihood for distributional regression. For **iii)**, all distributional parameters have default activation/link functions that adhere to the distributional restrictions (e.g. positive variance for a normal distribution) but can be adapted to the user's preferences. 
The inclusion of a distributional model focusing on regression beyond the mean further allows users to account for aleatoric uncertainty [@Kneib] without increasing the number of parameters or the complexity of the model. - -# 4. Ecosystem Compatibility and Flexibility - -Mambular is seamlessly compatible with the scikit-learn [@Pedregosa] ecosystem, allowing users to incorporate Mambular models into their existing workflows with minimal friction. This compatibility extends to various stages of the machine learning process, including data preprocessing, model training, evaluation, and hyperparameter tuning. - -Furthermore, Mambular's design emphasizes flexibility and user-friendliness. The library offers a range of customizable options for model architecture, including the choice of preprocessing, activation functions, pooling layers, normalization layers, regularization and more. This level of customization ensures that practitioners can tailor their models to the specific requirements of their tabular data tasks, optimizing performance and achieving state-of-the-art results as demonstrated by [@Ahamed]. - - - -### 4.1 Preprocessing Capabilities - -Mambular includes a comprehensive preprocessing module that also follows scikit-learn's preprocessing pipeline. -The preprocessing module supports a wide range of data transformation techniques, including ordinal and one-hot encoding for categorical variables, decision tree-based binning for numerical features, and various strategies for handling missing values. By leveraging these preprocessing tools, users can ensure that their data is in the best possible shape for training Mambular models, leading to improved model performance. - -# Acknowledgements -We sincerely acknowledge and appreciate the financial support provided by the Key Digital Capability (KDC) for Generative AI at BASF and the BASF Data & AI Academy, which played a critical role in facilitating this research. - -# References - - -