Merge pull request #238 from amosproj/dev
Final sprint release
Showing 79 changed files with 2,979 additions and 1,543 deletions.
@@ -0,0 +1,39 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>

name: documentation

on: [push, pull_request, workflow_dispatch]

permissions:
  contents: write

jobs:
  docs:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.10
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pipenv
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
          pipenv install --dev
      - name: Generate Sphinx
        run: |
          cd src/docs
          pipenv run sphinx-apidoc -o . ..
          pipenv run make clean
          pipenv run make html
      - name: Deploy to GitHub Pages
        uses: peaceiris/actions-gh-pages@v3
        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
        with:
          publish_branch: gh-pages
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: src/docs/_build/html/
          force_orphan: true
@@ -0,0 +1,61 @@
# Automatic SBOM generation

```console
pipenv install
pipenv shell

pip install pipreqs
pip install cyclonedx-bom
pip install pip-licenses

# Create the SBOM (cyclonedx-bom) based on (pipreqs) requirements that are actually imported in the .py files
$sbom = pipreqs --print | cyclonedx-py -r -pb -o - -i -

# Create an XmlDocument object
$xml = New-Object System.Xml.XmlDocument

# Load XML content into the XmlDocument
$xml.LoadXml($sbom)

# Create an empty CSV file
$csvPath = "SBOM.csv"

# Initialize an empty array to store rows
$result = @()

# Iterate through the XML nodes and create rows for each node
$xml.SelectNodes("//*[local-name()='component']") | ForEach-Object {
    $row = @{
        "Version" = $_.Version
        "Context" = $_.Purl
        "Name"    = if ($_.Name -eq 'scikit_learn') { 'scikit-learn' } else { $_.Name }
    }

    # Get license information
    $match = pip-licenses --from=mixed --format=csv --with-system --packages $row.Name | ConvertFrom-Csv

    # Add license information to the row
    $result += [PSCustomObject]@{
        "Context" = $row.Context
        "Name"    = $row.Name
        "Version" = $row.Version
        "License" = $match.License
    }
}

# Export the data to the CSV file
$result | Export-Csv -Path $csvPath -NoTypeInformation

# Create the license file
$licensePath = $csvPath + '.license'
@"
SPDX-License-Identifier: CC-BY-4.0
SPDX-FileCopyrightText: 2023 Fabian-Paul Utech <f.utech@gmx.net>
"@ | Out-File -FilePath $licensePath

exit
```
@@ -0,0 +1,41 @@
<!--
SPDX-License-Identifier: MIT
SPDX-FileCopyrightText: 2024 Felix Zailskas <felixzailskas@gmail.com>
-->

# Unused Ideas

This document lists ideas and implementations which either have not been tried yet or have been deprecated because they are not used in the current product version but still carry some conceptual value.

## Deprecated

The original implementation of the deprecated modules can be found in the `deprecated/` directory.

### Controller

**_Note:_** This package has the additional dependency `pydantic==2.4.2`.

The controller module was originally planned to serve as a communication device between the EVP and the BDC. Whenever the salesperson interface registers a new lead, the controller is supposed to trigger the BDC pipeline to enrich the data of that lead and preprocess it to create a feature vector. The successful completion of the BDC pipeline is then registered at the controller, which in turn triggers an inference of the EVP to compute the predicted merchant size and write it back to the lead data. The computed merchant size can then be used to rank the leads, allowing the salesperson to assess the value of each lead and decide which one to call.
The current implementation of the module supports queueing messages from the BDC and EVP, as indicated by their type. Depending on the message type, a message is then routed to the corresponding module (EVP or BDC). The actual processing of the messages by the modules is not implemented. All of this is done asynchronously using the Python threading library.
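A minimal sketch of this queueing-and-routing idea follows. The `Message` envelope, handler names, and payloads are hypothetical; the actual deprecated module defines its own pydantic models (hence the `pydantic==2.4.2` dependency) and is more elaborate.

```python
import queue
import threading
from dataclasses import dataclass


# Hypothetical message envelope; the real module uses pydantic models instead.
@dataclass
class Message:
    sender: str  # "BDC" or "EVP"
    payload: dict


class Controller:
    """Queues incoming messages and routes each one to the opposite module."""

    def __init__(self) -> None:
        self._inbox: queue.Queue[Message] = queue.Queue()
        self._routes = {
            "BDC": self._trigger_evp,  # BDC finished -> run EVP inference
            "EVP": self._update_lead,  # EVP finished -> write size back to lead
        }
        threading.Thread(target=self._run, daemon=True).start()

    def enqueue(self, message: Message) -> None:
        self._inbox.put(message)

    def join(self) -> None:
        self._inbox.join()

    def _run(self) -> None:
        while True:
            message = self._inbox.get()
            self._routes[message.sender](message)
            self._inbox.task_done()

    def _trigger_evp(self, message: Message) -> None:
        print(f"BDC pipeline done for {message.payload}; would trigger EVP inference")

    def _update_lead(self, message: Message) -> None:
        print(f"EVP predicted size for {message.payload}; would write back to lead data")


if __name__ == "__main__":
    controller = Controller()
    controller.enqueue(Message(sender="BDC", payload={"lead_id": 42}))
    controller.join()  # wait until the queued message has been processed
```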
### FacebookGraphAPI

**_Note:_** This package has the additional dependency `facebook-sdk==3.1.0`. Additionally, the environment variables `FACEBOOK_APP_ID` and `FACEBOOK_APP_SECRET` need to be set with a valid token.

This step was supposed to query lead data from Facebook using either the business owner's name or the company name. The attempt was deprecated because the cost of the required API token was evaluated as too high and because the usage permissions of the Facebook API had changed. Furthermore, it is paramount to check the legal ramifications of querying Facebook for this kind of data, as searching for individuals on Facebook instead of their businesses might have legal consequences under data privacy regulations in the EU.
### ScrapeAddresses

This step was an early experiment, using only the custom domain from an email address. We check whether a live website is running for the domain and then try to parse the main site for a business address using a RegEx pattern. The pattern is not very precise, and calling the website as well as parsing it takes quite some time, which accumulates over many entries. The Google Places step yields better results for the business address and is faster, which is why `scrape_addresses.py` was deprecated.
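A condensed sketch of the approach, using the `requests` library; the RegEx shown here is an illustrative placeholder, not the pattern that `scrape_addresses.py` actually used:

```python
import re

import requests

# Illustrative pattern only: it roughly matches German "<street> <number>, <ZIP> <city>"
# addresses; the actual RegEx in scrape_addresses.py was different.
ADDRESS_PATTERN = re.compile(
    r"[A-ZÄÖÜ][a-zäöüß]+(?:straße|weg|platz|allee)\s*\d+[a-z]?,?\s*\d{5}\s+[A-ZÄÖÜ][a-zäöüß]+"
)


def scrape_address(email: str) -> str | None:
    """Try to find a business address on the website of an email's custom domain."""
    domain = email.split("@", 1)[1]
    try:
        response = requests.get(f"https://{domain}", timeout=5)
        response.raise_for_status()
    except requests.RequestException:
        return None  # no live website running for this domain
    match = ADDRESS_PATTERN.search(response.text)
    return match.group(0) if match else None


print(scrape_address("info@example.com"))
```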
## Possible ML improvements

### Creating data subsets

The data collected by the BDC pipeline has not been refined to include only semantically valuable data fields. It is possible that some data fields carry no predictive power, effectively polluting the dataset with unnecessary information. A proper analysis of the predictive power of all data fields would allow cutting down the amount of data per lead, reducing processing time and possibly making predictions more precise. This approach has been explored very briefly with subset 1, as described in `Classifier-Comparison.md`; however, the choice of included features has not been justified by experiments, making it somewhat arbitrary. Additionally, an analysis of this type could give insight into which data fields to expand on and what new data to collect in order to increase the EVP's performance in predicting merchant sizes.
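One possible starting point for such an analysis, sketched with scikit-learn's mutual information and random-forest importances; the file path and column names are assumptions, and the real feature table would come from the BDC pipeline:

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif

# Assumption: a preprocessed, fully numeric feature table with one row per lead
# and a "merchant_size" target column; both names are placeholders.
leads = pd.read_csv("leads_preprocessed.csv")
X = leads.drop(columns=["merchant_size"])
y = leads["merchant_size"]

# Rank fields by mutual information with the target ...
mi = pd.Series(mutual_info_classif(X, y), index=X.columns).sort_values(ascending=False)

# ... and cross-check against impurity-based importances from a quick forest fit.
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
importance = pd.Series(forest.feature_importances_, index=X.columns).sort_values(ascending=False)

# Fields scoring near zero on both measures are candidates for removal.
print(mi.head(10))
print(importance.head(10))
```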
Filtering the data by some quality metric could possibly also improve general performance. The `regional_atlas_score` and `google_confidence_score` have been tried for this but did not improve performance. However, these values are computed somewhat arbitrarily, and implementing a more refined quality metric might yield more promising results.
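A minimal sketch of such a quality filter, assuming the two scores are columns of a preprocessed lead table; the thresholds are placeholders that would have to be tuned experimentally:

```python
import pandas as pd

# Placeholder thresholds, not tuned values.
MIN_ATLAS_SCORE = 0.5
MIN_CONFIDENCE = 0.5

leads = pd.read_csv("leads_preprocessed.csv")
filtered = leads[
    (leads["regional_atlas_score"] >= MIN_ATLAS_SCORE)
    & (leads["google_confidence_score"] >= MIN_CONFIDENCE)
]
print(f"kept {len(filtered)} of {len(leads)} leads")
```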